# Scraper for the Wayback Machine's Sky News archive

### Import required libraries 

In [2]:
# General Libraries
import pandas as pd
import requests as rq
import os
import calendar
import time
from time import sleep
import json
from random import randint

# Libraries for scraping
import newspaper
from newspaper import Article
from newspaper import Config
from bs4 import BeautifulSoup


## Step 1: Collect all captures of the Sky News website from the Wayback Machine in a specific time period.

In [4]:
def get_urls_of_captures(url_of_interest, from_date, to_date, output_format, timeout=60):
    """Return the Wayback Machine links of all captures of a url in a time period.

    Sends one request to the Wayback CDX API (with ``collapse=digest``) and
    turns every returned row into a link of the form
    ``https://web.archive.org/web/<timestamp>/<original url>``.

    Args:
        url_of_interest: Url whose captures are requested (``url`` parameter).
        from_date: Start of the period (``from`` parameter).
        to_date: End of the period (``to`` parameter).
        output_format: Value of the ``output`` parameter. The response is
            parsed as JSON, so anything other than ``"json"`` will fail.
        timeout: Seconds to wait for the CDX server before giving up.

    Returns:
        A list of capture links; empty when the server returns no rows.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    base_url = "http://web.archive.org/cdx/search/cdx"

    # let requests build (and encode) the query string
    query = {
        "url": url_of_interest,
        "collapse": "digest",
        "from": from_date,
        "to": to_date,
        "output": output_format,
    }
    response = rq.get(base_url, params=query, timeout=timeout)
    response.raise_for_status()

    body = response.text
    if not body.strip():
        # nothing came back, so there is nothing to parse
        return []
    rows = json.loads(body)

    # The first row holds the column headers; in each data row index 1 is the
    # timestamp and index 2 the original url.
    return [f"https://web.archive.org/web/{row[1]}/{row[2]}" for row in rows[1:]]

In [13]:
# Parameters of the cdx request -- adjust these to change what gets collected
url_of_interest = "news.sky.com"
from_date = "20230101"
to_date = "20230201"  # include first of next month to be sure to scrape all articles of last day
output_format = "json"

# Ask the cdx server for the captures in that period
urls_of_captures = get_urls_of_captures(
    url_of_interest=url_of_interest,
    from_date=from_date,
    to_date=to_date,
    output_format=output_format,
)

# Put the capture urls into a single-column data frame
cdx_urls_df = pd.DataFrame(data=urls_of_captures, columns=["cdx_url"])

# Save the data frame as csv; the file name starts with the current unix timestamp
current_GMT = time.gmtime()
time_stamp = calendar.timegm(current_GMT)
cdx_urls_df.to_csv(f'{time_stamp}_cdx_urls_skynews.csv', sep=';')

## Step 2: Scrape articles

In [6]:
# Helper function to delete the cache of the newspaper3k library.
# The base url 'web.archive.org' stays the same for every capture, so the library reads from its cache
# and returns the cached results without taking into account that the rest of the url has changed.

def delete_feed_category_cache_path(feed_category_cache_path=None):
    """Delete all files inside the feed/category cache of newspaper3k.

    Only regular files and symlinks directly inside the directory are
    removed; sub-directories are left alone. A file that cannot be deleted
    is reported with ``print`` and skipped.

    Args:
        feed_category_cache_path: Directory to empty. When omitted,
            ``<system temp dir>/.newspaper_scraper/feed_category_cache`` is
            used. NOTE(review): this assumes newspaper3k keeps the cache
            under the system temp dir, as the previously hard-coded path
            suggests -- pass the path explicitly if your setup differs.

    Returns:
        None.
    """
    if feed_category_cache_path is None:
        import tempfile

        feed_category_cache_path = os.path.join(
            tempfile.gettempdir(), ".newspaper_scraper", "feed_category_cache"
        )

    if not os.path.isdir(feed_category_cache_path):
        # Report instead of crashing in os.listdir; staying visible matters
        # because a wrong path would leave a stale cache in place.
        print('Cache directory %s does not exist, nothing deleted.' % feed_category_cache_path)
        return

    # delete all files inside the cache
    for filename in os.listdir(feed_category_cache_path):
        file_path = os.path.join(feed_category_cache_path, filename)
        if not (os.path.isfile(file_path) or os.path.islink(file_path)):
            continue
        try:
            os.unlink(file_path)
        except OSError as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

In [7]:
# helper function to format data
def clean_scraped_date(data):
    """Lower-case the ``.string`` of every item and drop duplicates.

    Args:
        data: Iterable of objects exposing a ``.string`` attribute.

    Returns:
        A list of the lower-cased strings in order of first appearance.
    """
    lowered = (entry.string.lower() for entry in data)
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(lowered))

In [None]:
# Configs of newspaper3k
config = Config()
config.request_timeout = 10  # increase timeout because some sites need longer to load

scraped_urls = {}  # paths of the articles scraped so far (dict used as a set)
collection_of_scraped_articles = []
collection_of_failed_scraped_articles = []

counter_url = 0  # keep track of how many urls already got scraped

# The run takes a long time (several seconds of sleep per request). The
# try/finally makes sure that whatever has been collected is written to disk
# even if the loop aborts; the error itself is still raised afterwards.
try:
    # urls_of_captures is defined under step 1
    for url in urls_of_captures:
        print("url", counter_url)
        counter_url += 1
        # empty the newspaper3k cache before building the next capture
        delete_feed_category_cache_path()

        counter_sub_urls = 0

        sleep(randint(5, 8))  # timeout to avoid 429 error
        # building the news source
        try:
            sky_news = newspaper.build(url, memoize_articles=False, language='en')
        except Exception as error:
            # one capture that cannot be built must not end the whole run
            collection_of_failed_scraped_articles.append([url, error])
            continue

        for article in sky_news.articles:
            try:
                print("article", counter_sub_urls)
                counter_sub_urls += 1
                article_url = article.url

                # Part of the url after "news.sky.com" identifies the article
                # across captures. A url without "news.sky.com" raises an
                # IndexError here and is recorded as failed below.
                main_article_url = article_url.split("news.sky.com")[1]
                if main_article_url in scraped_urls:
                    continue

                sleep(randint(5, 8))  # timeout to avoid 429 error
                page = Article(article_url, config=config)
                page.download()

                # collect specific data of article
                soup = BeautifulSoup(page.html, 'html.parser')
                article_date = soup.find(attrs={"class": "sdc-article-date__date-time"})
                article_keywords = soup.find_all(attrs={"class": "sdc-article-tags__link"})
                article_authors = soup.find_all(attrs={"class": "sdc-article-author__link"})

                # collect title and text
                page.parse()
                article_title = page.title
                article_text = page.text

                # clean up scraped data
                if not article_title:
                    article_title = "N/A"
                if not article_text:
                    article_text = "N/A"
                if not article_url:
                    article_url = "N/A"
                if article_date:
                    article_date = article_date.string
                if article_keywords:
                    article_keywords = clean_scraped_date(article_keywords)
                if article_authors:
                    article_authors = clean_scraped_date(article_authors)

                collection_of_scraped_articles.append([article_title,
                                                       article_date,
                                                       article_authors,
                                                       article_keywords,
                                                       article_text,
                                                       article_url])
                # only successfully scraped articles are marked, so a failed
                # one is tried again when it shows up in a later capture
                scraped_urls[main_article_url] = 1

            except Exception as error:
                collection_of_failed_scraped_articles.append([article_url, error])
finally:
    # create data frames
    scraped_articles_df = pd.DataFrame(collection_of_scraped_articles, columns=["Title", "Date", "Authors", "Tags", "Text", "Url"])
    failed_articles_df = pd.DataFrame(collection_of_failed_scraped_articles, columns=["Url", "Error"])

    # save data frames as csv with current timestamp
    current_GMT = time.gmtime()
    time_stamp = calendar.timegm(current_GMT)
    scraped_articles_df.to_csv(f'{time_stamp}_articles.csv', sep=';')
    failed_articles_df.to_csv(f'{time_stamp}_failed.csv', sep=';')