# Scraper for the Wayback Machine BBC News archive

### import required packages

In [4]:
import os
from time import sleep
from random import randint
import pandas as pd
import newspaper
from bs4 import BeautifulSoup
import requests as rq
import json
import calendar
import time
from newspaper import Article
from newspaper import Config

### Define Wayback Machine archive URLs for scraping

In [41]:
# collect all urls for captures by the wayback machine in a specific timeframe
def get_urls_of_captures(url_of_interest="bbc.com/news",
                         from_date="20220101",
                         to_date="20220201",
                         timeout=60):
    """Return Wayback Machine replay URLs for the captures of a page.

    Queries the Wayback CDX API for all captures of ``url_of_interest``
    between ``from_date`` and ``to_date`` (captures with an identical content
    digest are collapsed by the server) and converts each returned row into a
    ``https://web.archive.org/web/<timestamp>/<original>`` link.

    Args:
        url_of_interest: URL whose captures are requested.
        from_date: Start of the time frame, formatted ``YYYYMMDD``.
        to_date: End of the time frame, formatted ``YYYYMMDD``. Pass the first
            day of the following month to be sure the articles of the last
            day are included.
        timeout: Seconds to wait for the CDX server before giving up.

    Returns:
        A list of replay URLs; empty when the server reports no captures.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status from the CDX server.
    """
    base_url = "http://web.archive.org/cdx/search/cdx"
    output_format = "json"

    # set final url together
    url = f"{base_url}?url={url_of_interest}&collapse=digest&from={from_date}&to={to_date}&output={output_format}"

    # request urls of captures from wayback cdx api; without a timeout a
    # stalled server would block the whole scrape indefinitely
    response = rq.get(url, timeout=timeout)
    response.raise_for_status()

    body = response.text.strip()
    if not body:
        return []
    rows = json.loads(body)
    if not rows:
        return []

    # The first row holds the column names; look the columns up by name
    # instead of relying on their position.
    header, *captures = rows
    timestamp_col = header.index("timestamp")
    original_col = header.index("original")

    return [
        f"https://web.archive.org/web/{capture[timestamp_col]}/{capture[original_col]}"
        for capture in captures
    ]



### Delete the cache for newspaper3k library

In [42]:
# helper to delete cache of the newspaper3k library (the cache will return items from the previous scrape if not deleted)
def delete_feed_category_cache_path(feed_category_cache_path=None):
    """Delete all files inside the newspaper3k feed-category cache directory.

    Args:
        feed_category_cache_path: Directory to clear. When omitted, the
            ``.newspaper_scraper/feed_category_cache`` folder inside the
            system temp directory is used, which is the same location the
            previously hard-coded Windows path pointed to, but resolved for
            the current user and operating system.

    Returns:
        None. A missing directory is treated as "nothing to delete".
        Sub-directories are left untouched; files that cannot be removed are
        reported on stdout and skipped.
    """
    import tempfile  # local import: only needed to locate the default cache

    if feed_category_cache_path is None:
        feed_category_cache_path = os.path.join(
            tempfile.gettempdir(), ".newspaper_scraper", "feed_category_cache"
        )

    # Nothing has been cached yet (or the path is wrong): nothing to clear.
    if not os.path.isdir(feed_category_cache_path):
        return

    # delete all files inside the cache
    for filename in os.listdir(feed_category_cache_path):
        file_path = os.path.join(feed_category_cache_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
        except OSError as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

### Convert scraped data to lowercase and remove duplicates

In [43]:
def clean_scraped_date(data):
    """Lower-case the text of scraped nodes and drop duplicates.

    Args:
        data: Iterable of objects exposing a ``.string`` attribute (for
            example the descendants of a BeautifulSoup tag).

    Returns:
        A list with the unique lower-cased strings in first-seen order.
        Nodes whose ``.string`` is ``None`` carry no single text value and
        are skipped.
    """
    seen = set()
    cleaned_data = []
    for item in data:
        text = item.string
        # A node without a single string child has .string == None; calling
        # .lower() on it would raise AttributeError.
        if text is None:
            continue
        text = text.lower()
        if text not in seen:
            seen.add(text)
            cleaned_data.append(text)
    return cleaned_data

### Get the URLs from the Wayback Machine CDX server

In [44]:

# Fetch the capture URLs once (this performs an HTTP request to the CDX API);
# the scraper loop iterates over this list.
urls_of_captures = get_urls_of_captures()

### Scraper

In [None]:
# Scrape every article linked from each Wayback Machine capture.
#
# Successfully scraped articles and failures are collected in lists and written
# to two timestamped CSV files after all captures have been processed.

# create lists to store articles and define the user agent (helps with avoiding timeouts due to too many requests)
counter_url = 0
collection_of_scraped_articles = []
collection_of_failed_scraped_articles = []
scraped_urls = []
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent
config.request_timeout = 10


for url in urls_of_captures:
    print("url", counter_url)
    counter_url += 1
    # the cache would otherwise return items from the previous capture
    delete_feed_category_cache_path() # throws error if path doesn't exists

    counter = 0

    # building the news source; a capture that cannot be built is recorded as
    # a failure instead of aborting the run and losing everything scraped so far
    try:
        bbc = newspaper.build(url, memoize_articles=False, language='en')
    except Exception as error:
        collection_of_failed_scraped_articles.append([url, error])
        continue

    for article in bbc.articles:
        print("article", counter)
        counter += 1
        # bound before the try block so the except handler can always report it
        article_url = article.url

        try:
            # the path after "bbc.com" identifies an article independently of
            # the capture it was found in; skip articles already scraped
            main_article_url = article_url.split("bbc.com")[1]
            if main_article_url in scraped_urls:
                continue

            sleep(randint(4, 7))

            page = Article(article_url, config=config)
            page.download()

            # collect specific data of article
            soup = BeautifulSoup(page.html, 'html.parser')
            page.parse()
            bbc_dictionary = json.loads("".join(soup.find("script", {"type":"application/ld+json"}).contents))

            # find date
            article_date = [value for (key, value) in bbc_dictionary.items() if key == 'datePublished']

            # find ul list containing tags
            parent = soup.find("section").find("ul")
            article_keywords = list(parent.descendants)

            # find author
            article_authors = [value['name'] for (key, value) in bbc_dictionary.items() if key == 'author']

            # collect title and text
            article_title = [value for (key, value) in bbc_dictionary.items() if key == 'headline']
            paragraphs = soup.find_all("p", {"class": "ssrcss-1q0x1qg-Paragraph eq5iqo00"})
            article_text = "".join(
                "\n" + "".join(paragraph.find_all(string=True))
                for paragraph in paragraphs
            )

            # clean up scraped data: empty values become the "N/A" placeholder
            if not article_title:
                article_title = "N/A"
            if not article_text:
                article_text = "N/A"
            if not article_url:
                article_url = "N/A"
            if article_keywords:
                article_keywords = clean_scraped_date(article_keywords)

            collection_of_scraped_articles.append([article_title,
                                                   article_date,
                                                   article_authors,
                                                   article_keywords,
                                                   article_text,
                                                   article_url])
            scraped_urls.append(main_article_url)
            sleep(randint(5, 8))

        except Exception as error:
            # per-article boundary: record the failure and carry on
            collection_of_failed_scraped_articles.append([article_url, error])
            continue

# create data frames
scraped_articles_df = pd.DataFrame(collection_of_scraped_articles, columns=["Title", "Date", "Authors", "Tags", "text", "Url"])
failed_articles_df = pd.DataFrame(collection_of_failed_scraped_articles, columns=["Url", "Error"])

# save data frames as csv with current timestamp
current_GMT = time.gmtime()
time_stamp = calendar.timegm(current_GMT)
scraped_articles_df.to_csv(f'{time_stamp}_articles.csv', sep=';')
failed_articles_df.to_csv(f'{time_stamp}_failed.csv', sep=';')

In [None]:
# Report the number of failed articles, then the number of scraped articles,
# each on its own line.
print(len(failed_articles_df), len(scraped_articles_df), sep="\n")

### pre-processing 

In [87]:
# Clean the scraped dataset.
# Removes rows that were scraped from pages that are not actually articles due
# to the way BBC is structured; those rows carry the "N/A" placeholder as
# their Title or text.
# NOTE: scraped_df is an alias of scraped_articles_df, so the rows are removed
# from both names.
scraped_df = scraped_articles_df
d = scraped_df.loc[scraped_df["Title"] == "N/A"]
d1 = scraped_df.loc[scraped_df["text"] == "N/A"]
# Drop both sets in one call: a row with "N/A" in Title AND text is part of
# both d and d1, and dropping it a second time would raise KeyError.
scraped_df.drop(d.index.union(d1.index), inplace=True)
# NOTE: Title cells that were scraped successfully are lists, not plain strings.
current_GMT = time.gmtime()
time_stamp = calendar.timegm(current_GMT)
scraped_df.to_csv(f'{time_stamp}_articles.csv', sep=';')