In [1]:
# import required packages
import os
from time import sleep
from random import randint
import pandas as pd
import newspaper
from bs4 import BeautifulSoup
import requests as rq
import json
import nltk
import ssl
import calendar
import time


In [2]:
# Download the NLTK "punkt" package.
# NOTE(review): presumably needed by newspaper's article.nlp() used later in this notebook - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\steni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Hand-written replacement for get_urls_of_captures, used while the archive api is down
def manual_get_urls_of_captures():
    """Return a fixed list of Wayback Machine capture URLs of the BBC front page.

    The list holds one capture per day for 01.01.2022 - 31.01.2022, in
    chronological order.

    Returns:
        list[str]: 31 URLs of the form
        "https://web.archive.org/web/<timestamp>/<scheme>://www.bbc.com/".
    """
    archive_prefix = "https://web.archive.org/web/"
    bbc_suffix = "://www.bbc.com/"

    # (capture timestamp, scheme of the archived original url)
    daily_captures = (
        ("20220101101506", "http"),
        ("20220102101510", "http"),
        ("20220103101535", "https"),
        ("20220104104015", "https"),
        ("20220105111551", "http"),
        ("20220106111553", "http"),
        ("20220107111707", "https"),
        ("20220108113014", "http"),
        ("20220109121526", "https"),
        ("20220110121539", "http"),
        ("20220111122739", "https"),
        ("20220112124750", "https"),
        ("20220113131043", "https"),
        ("20220114131546", "http"),
        ("20220115131717", "http"),
        ("20220116141608", "http"),
        ("20220117141525", "https"),
        ("20220118151542", "http"),
        ("20220119161414", "http"),
        ("20220120163833", "https"),
        ("20220121171555", "http"),
        ("20220122171711", "https"),
        ("20220123171737", "http"),
        ("20220124173041", "https"),
        ("20220125174040", "https"),
        ("20220126174110", "https"),
        ("20220127175131", "https"),
        ("20220128175546", "https"),
        ("20220129181124", "https"),
        ("20220130182141", "https"),
        ("20220131183052", "https"),
    )

    return [
        archive_prefix + stamp + "/" + scheme + bbc_suffix
        for stamp, scheme in daily_captures
    ]


In [4]:
# collect all urls for captures by the wayback machine in a specific timeframe
def get_urls_of_captures(url_of_interest="bbc.com", from_date="20220101", to_date="20220201", timeout=120):
    """Query the Wayback Machine CDX API and return the replay URLs of all captures found.

    Args:
        url_of_interest: Site whose captures are requested.
        from_date: First day of the range, formatted "YYYYMMDD".
        to_date: Last day of the range, formatted "YYYYMMDD". The default
            includes the first of the next month to be sure to scrape all
            articles of the last day.
        timeout: Seconds passed to the HTTP request as its timeout, so a hanging
            server cannot block the notebook forever.

    Returns:
        list[str]: One "https://web.archive.org/web/<timestamp>/<original url>"
        string per capture, in the order the API returned them. Empty when the
        API reports no captures.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    base_url = "http://web.archive.org/cdx/search/cdx"
    output_format = "json"

    # set final url together
    url = f"{base_url}?url={url_of_interest}&collapse=digest&from={from_date}&to={to_date}&output={output_format}"

    # request urls of captures from wayback cdx api
    response = rq.get(url, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of trying to parse an error page

    body = response.text
    if not body.strip():
        # an empty body means there is nothing to parse
        return []
    capture_rows = json.loads(body)

    # Row 0 holds the column headers; in every following row column 1 is the
    # capture timestamp and column 2 the original url.
    return [
        f"https://web.archive.org/web/{row[1]}/{row[2]}"
        for row in capture_rows[1:]
    ]



In [5]:
# helper to delete cache of the newspaper3k library
def delete_feed_category_cache_path(feed_category_cache_path=None):
    """Delete every file inside newspaper3k's feed_category_cache directory.

    Sub-directories are left untouched. A file that cannot be removed is
    reported on stdout and skipped. A missing cache directory is treated as
    "nothing to delete" instead of raising.

    Args:
        feed_category_cache_path: Directory to empty. When omitted, defaults to
            "<system temp dir>/.newspaper_scraper/feed_category_cache".
            NOTE(review): this is where newspaper3k is expected to keep the
            cache on every operating system - verify against the installed
            version.

    Returns:
        None
    """
    import tempfile  # local import: only needed to derive the default location

    if feed_category_cache_path is None:
        feed_category_cache_path = os.path.join(
            tempfile.gettempdir(), ".newspaper_scraper", "feed_category_cache"
        )

    # nothing has been cached yet (or the cache lives elsewhere): nothing to do
    if not os.path.isdir(feed_category_cache_path):
        return

    # delete all files inside the cache
    for filename in os.listdir(feed_category_cache_path):
        file_path = os.path.join(feed_category_cache_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

In [6]:
def clean_scraped_date(data):
    """Lower-case the text of scraped elements and drop duplicates.

    Args:
        data: Iterable of objects exposing a ``.string`` attribute (the caller
            passes BeautifulSoup elements).

    Returns:
        list[str]: The unique lower-cased strings, in order of first appearance.
        Elements whose ``.string`` is None (BeautifulSoup returns None for a tag
        with more than one child) are skipped instead of raising AttributeError.
    """
    cleaned_data = []
    seen = set()  # O(1) duplicate check; cleaned_data keeps the order
    for item in data:
        text = item.string
        if text is None:
            continue
        text = text.lower()
        if text not in seen:
            seen.add(text)
            cleaned_data.append(text)
    return cleaned_data

In [11]:
# Fetch the capture URLs from the Wayback Machine CDX API (network call)
urls_of_captures = get_urls_of_captures()

In [12]:
# number of capture URLs that will be crawled
len(urls_of_captures)

3780

In [33]:
# Scrape every article linked from each archived capture, then store the
# successfully scraped articles and the failures in two ';'-separated csv files.
counter_url = 0
collection_of_scraped_articles = []         # one row per successfully scraped article
collection_of_failed_scraped_articles = []  # [url, exception] per failure

# url part after "bbc.com" of every article already stored, so an article that
# shows up in several captures is only scraped once
scraped_urls = []

for url in urls_of_captures:
    print("url", counter_url)
    counter_url += 1
    # empty newspaper's feed/category cache before building the next source
    delete_feed_category_cache_path()

    counter = 0

    # building the news source; a capture that cannot be built is recorded as
    # failed instead of aborting the whole (very long) run
    try:
        bbc = newspaper.build(url, memoize_articles=False, language='en')
    except Exception as error:
        collection_of_failed_scraped_articles.append([url, error])
        continue

    for article in bbc.articles:
        try:
            print("article", counter)
            counter += 1
            article_url = article.url

            # raises IndexError (-> recorded as failed) for urls without "bbc.com"
            main_article_url = article_url.split("bbc.com")[1]
            if main_article_url in scraped_urls:
                continue

            article.download()

            # collect specific data of article
            # find() returns None for a missing element, so pages without a
            # <section>/<ul> or <time> raise AttributeError below and end up
            # in the failed collection
            soup = BeautifulSoup(article.html, 'html.parser')
            article_date = soup.find("time")
            parent = soup.find("section").find("ul")
            article_keywords = list(parent.descendants)
            article_authors = soup.select("div.ssrcss-68pt20-Text-TextContributorName.e8mq1e96")

            # collect title
            article.parse()
            article_title = article.title

            # collect text: every paragraph on its own line
            # (string=True replaces the deprecated text=True argument)
            paragraphs = soup.find_all("p", {"class": "ssrcss-1q0x1qg-Paragraph eq5iqo00"})
            article_text = "".join(
                "\n" + "".join(paragraph.find_all(string=True))
                for paragraph in paragraphs
            )

            # clean up scraped data
            if not article_title:
                article_title = "N/A"
            if not article_text:
                article_text = "N/A"
            if not article_url:
                article_url = "N/A"
            if article_date.has_attr("datetime"):
                article_date = article_date["datetime"]
            if article_keywords:
                article_keywords = clean_scraped_date(article_keywords)
            if article_authors:
                article_authors = clean_scraped_date(article_authors)

            # use nlp to get keywords of article
            article.nlp()
            article_nlp_keywords = article.keywords
            if not article_nlp_keywords:
                article_nlp_keywords = "N/A"

            collection_of_scraped_articles.append([article_title,
                                                    article_date,
                                                    article_authors,
                                                    article_keywords,
                                                    article_nlp_keywords,
                                                    article_text,
                                                    article_url])
            scraped_urls.append(main_article_url)
            # wait 5-8 seconds between successful scrapes
            # NOTE(review): failed articles skip this pause - confirm that is intended
            sleep(randint(5, 8))

        except Exception as error:
            collection_of_failed_scraped_articles.append([article_url, error])

# create data frames
scraped_articles_df = pd.DataFrame(collection_of_scraped_articles, columns=["Title", "Date", "Authors", "Tags", "NLP Keywords", "text", "Url"])
failed_articles_df = pd.DataFrame(collection_of_failed_scraped_articles, columns=["Url", "Error"])

# save data frames as csv with current timestamp (whole seconds since the epoch)
time_stamp = int(time.time())
scraped_articles_df.to_csv(f'{time_stamp}_articles.csv', sep=';')
failed_articles_df.to_csv(f'{time_stamp}_failed.csv', sep=';')

url 0
article 0
article 1
article 2
article 3
article 4
article 5


  article_text += "\n" + "".join(element.findAll(text = True))


article 6
article 7
article 8
article 9
article 10
article 11
article 12
article 13
article 14
article 15
article 16
article 17
article 18
article 19
article 20
article 21
article 22
article 23
article 24
article 25
article 26
article 27
article 28
article 29
article 30
article 31
article 32
article 33
article 34
article 35
article 36
article 37
article 38
article 39
article 40
article 41
article 42
article 43
article 44
article 45
article 46
article 47
article 48
article 49
article 50
article 51
article 52
article 53
article 54
article 55
article 56
article 57
article 58
article 59
article 60
article 61
article 62
article 63
article 64
article 65
article 66
article 67
article 68
article 69
article 70
article 71
article 72
article 73
article 74
article 75
article 76
article 77
article 78
article 79
article 80
article 81
article 82
article 83
article 84
article 85
article 86
article 87
article 88
article 89
url 1
url 2
url 3
url 4
url 5
article 0
article 1
article 2
article 3
article 4


In [34]:
# number of failed scrapes, then number of successfully scraped articles
print(len(failed_articles_df))
print(len(scraped_articles_df))

42765
451


In [35]:
# show the table of scraped articles
scraped_articles_df

Unnamed: 0,Title,Date,Authors,Tags,NLP Keywords,text,Url
0,Covid-19: WHO chief optimistic disease will be...,2021-12-31T19:28:26.000Z,[],"[world health organization (who), coronavirus ...","[dose, received, europe, target, end, rates, c...",\nThis video can not be played\nThe World Heal...,https://web.archive.org/web/20220101000057/htt...
1,BBC News,2021-12-31T19:28:26.000Z,[],"[skip to content, accessibility help]","[standup, video, bbc, englishwhy, comedians, c...",,https://web.archive.org/web/20220101000057/htt...
2,US & Canada,2021-12-31T21:05:12.000Z,[],"[skip to content, accessibility help]","[grooming, ghislaine, convicted, does, canada,...",,https://web.archive.org/web/20220101000057/htt...
3,Virginia Giuffre: Prince Andrew accuser seeks ...,2021-12-31T23:54:13.000Z,[],"[prince andrew, duke of york, ghislaine maxwel...","[reasonably, private, andrew, lead, giuffre, a...",\nLawyers for a US woman who has accused Princ...,https://web.archive.org/web/20220101000057/htt...
4,Covid: Cathay Pacific flights cut after Hong K...,2021-12-31T03:13:55.000Z,[],"[global supply chain management, hong kong, ai...","[covid, world, cathay, hong, major, kong, paci...",\nCathay Pacific has announced immediate major...,https://web.archive.org/web/20220101000057/htt...
...,...,...,...,...,...,...,...
446,True story? Lie detection systems go high-tech,2022-01-31T00:03:00.000Z,[],[israel],"[remained, dry, mouth, lied, true, person, ner...",\nProf Yael Hanein sticks a number of electrod...,https://web.archive.org/web/20220201200221/htt...
447,Laura Kuenssberg: MPs resolved to leave PM on ...,2022-01-31T23:29:50.000Z,[],"[conservative party, coronavirus lockdown meas...","[resolved, gone, laura, probation, know, kuens...","\nIt was only 12 pages, but there was one very...",https://web.archive.org/web/20220201200922/htt...
448,Tom Brady retires: The rise and success of sev...,2022-02-01T23:06:39+00:00,[],"[skip to content, accessibility help]","[winning, retires, quarterback, tom, brady, se...",,https://web.archive.org/web/20220201234042/htt...
449,Pierre-Emerick Aubameyang: Striker's Arsenal e...,2022-02-01T20:36:40+00:00,[],"[skip to content, accessibility help]","[striker, pierreemerick, barcelona, goals, con...",,https://web.archive.org/web/20220201234042/htt...
