In [1]:
# import required packages
import os
from time import sleep
from random import randint
import pandas as pd
import newspaper
from bs4 import BeautifulSoup
import requests as rq
import json
import nltk
import ssl
import calendar
import time


In [2]:
# only needed for nlp
# Replace the ssl module's default HTTPS context factory with the unverified one, so the
# NLTK download below also works on Python installations without a usable CA bundle.
# NOTE(review): this turns off certificate verification for every HTTPS context that is
# created through ssl._create_default_https_context in this kernel, not only for NLTK.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # this Python build has no ssl._create_unverified_context; keep the default behaviour
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# nltk.download() without arguments opens the interactive downloader and blocks a
# "Restart & Run All"; request only the resource that is needed instead.
# NOTE(review): newspaper3k's article.nlp() is expected to load the "punkt" sentence
# tokenizer - confirm against the installed newspaper/nltk versions.
nltk.download('punkt')



showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# collect all urls for captures by the wayback machine in a specific timeframe
def get_urls_of_captures(url_of_interest="news.sky.com",
                         from_date="20220101",
                         to_date="20220201",
                         timeout=120):
    """Collect the Wayback Machine urls of all captures of a site in a timeframe.

    Sends one request to the Wayback CDX search endpoint (collapsed by digest,
    JSON output) and turns every returned capture row into a url of the form
    ``https://web.archive.org/web/<timestamp>/<original url>``.

    Args:
        url_of_interest: site whose captures are requested.
        from_date: first day of the timeframe, written as ``YYYYMMDD``.
        to_date: last day of the timeframe, written as ``YYYYMMDD``. The default
            includes the first of the next month to be sure to scrape all
            articles of the last day.
        timeout: seconds to wait for the CDX server before giving up.

    Returns:
        A list of capture urls (strings); empty if the server reports no captures.

    Raises:
        requests.exceptions.RequestException: the request failed, timed out or
            returned an HTTP error status.
        ValueError: the response body is not valid JSON.
    """
    base_url = "http://web.archive.org/cdx/search/cdx"

    # let requests build (and encode) the query string instead of formatting it by hand
    query = {
        "url": url_of_interest,
        "collapse": "digest",
        "from": from_date,
        "to": to_date,
        "output": "json",
    }

    # request urls of captures from wayback cdx api
    response = rq.get(base_url, params=query, timeout=timeout)
    response.raise_for_status()  # do not try to parse an error page as JSON

    # an empty body cannot be parsed as JSON; treat it as "no captures"
    if not response.text.strip():
        return []
    capture_rows = json.loads(response.text)

    # The first row holds the column headers, so it is skipped. In each remaining
    # row, index 1 is read as the timestamp and index 2 as the original url.
    return [
        'https://web.archive.org/web/' + row[1] + '/' + row[2]
        for row in capture_rows[1:]
    ]



In [4]:
# helper to delete cache of the newspaper3k library
def delete_feed_category_cache_path(feed_category_cache_path=None):
    """Delete all files inside the feed category cache of the newspaper3k library.

    Only regular files and symlinks directly inside the directory are removed;
    sub-directories are left untouched. A file that cannot be deleted is reported
    with ``print`` and skipped.

    Args:
        feed_category_cache_path: directory to clean. If omitted, the directory
            ``.newspaper_scraper/feed_category_cache`` inside the system temp
            directory is used.

    Returns:
        None. A missing cache directory is not an error: there is nothing to delete.
    """
    import tempfile  # local import keeps this helper self-contained

    if feed_category_cache_path is None:
        # Resolve the temp directory at runtime instead of hard-coding the path of one
        # specific machine.
        # NOTE(review): assumes newspaper3k keeps this cache below tempfile.gettempdir()
        # - confirm against newspaper/settings.py of the installed version.
        feed_category_cache_path = os.path.join(
            tempfile.gettempdir(), ".newspaper_scraper", "feed_category_cache"
        )

    # nothing has been cached yet (or the location differs): nothing to clean up
    if not os.path.isdir(feed_category_cache_path):
        return

    # delete all files inside the cache
    for filename in os.listdir(feed_category_cache_path):
        file_path = os.path.join(feed_category_cache_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
        except OSError as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

In [5]:
def clean_scraped_date(data):
    """Return the lower-cased, de-duplicated texts of scraped elements.

    Args:
        data: iterable of elements that expose a ``.string`` attribute and a
            ``get_text()`` method (the scraping loop passes the result of a
            BeautifulSoup ``find_all``).

    Returns:
        A list of lower-cased strings in order of first appearance, without
        duplicates.
    """
    cleaned_data = []
    seen = set()  # constant-time duplicate check; the list keeps the order
    for item in data:
        text = item.string
        if text is None:
            # .string is None when the element does not consist of exactly one text
            # node (e.g. nested markup); use the combined text instead of failing
            # on None.lower()
            text = item.get_text()
        text = text.lower()
        if text not in seen:
            seen.add(text)
            cleaned_data.append(text)
    return cleaned_data

In [6]:
# fetch the capture urls once; the scraping loop below iterates over this list
urls_of_captures = get_urls_of_captures()

In [7]:
# Scrape every article that is linked from each capture and export the results.
# Successful scrapes and failures are collected in separate lists and written to
# two csv files after all captures have been processed.
counter_url = 0
collection_of_scraped_articles = []
collection_of_failed_scraped_articles = []

# url paths (the part after "news.sky.com") of the articles that were already scraped,
# so an article that shows up in several captures is only downloaded once
scraped_urls = []

for url in urls_of_captures:
    print("url", counter_url)
    counter_url += 1

    # Clear newspaper's feed category cache before the source is built from this capture.
    # Deliberately not wrapped in the try below: an error raised here is a setup problem,
    # not a failed capture.
    delete_feed_category_cache_path()

    counter = 0

    # building the news source
    try:
        sky_news = newspaper.build(url, memoize_articles=False, language='en')
    except Exception as error:
        # one broken capture must not abort the whole run (the csv files are only
        # written at the very end); record the capture url and go on with the next one
        collection_of_failed_scraped_articles.append([url, error])
        continue

    for article in sky_news.articles:
        # read the url before the try so the except branch always reports this article
        article_url = article.url
        try:
            print("article", counter)
            counter += 1

            # the path after the host identifies the article independently of the capture
            main_article_url = article_url.split("news.sky.com")[1]
            if main_article_url in scraped_urls:
                continue

            article.download()

            # collect specific data of article
            soup = BeautifulSoup(article.html, 'html.parser')
            article_date = soup.find(attrs={"class": "sdc-article-date__date-time"})
            article_keywords = soup.find_all(attrs={"class": "sdc-article-tags__link"})
            article_authors = soup.find_all(attrs={"class": "sdc-article-author__link"})

            # collect title
            article.parse()
            article_title = article.title
            article_text = article.text

            # clean up scraped data
            if not article_title:
                article_title = "N/A"
            if not article_text:
                article_text = "N/A"
            if not article_url:
                article_url = "N/A"
            if article_date:
                article_date = article_date.string
            if article_keywords:
                article_keywords = clean_scraped_date(article_keywords)
            if article_authors:
                article_authors = clean_scraped_date(article_authors)

            # use nlp to get keywords of article
            article.nlp()
            article_nlp_keywords = article.keywords
            if not article_nlp_keywords:
                article_nlp_keywords = "N/A"

            # the order of the values has to match the column names of the data frame below
            collection_of_scraped_articles.append([article_title,
                                                    article_date,
                                                    article_authors,
                                                    article_keywords,
                                                    article_nlp_keywords,
                                                    article_text,
                                                    article_url])
            scraped_urls.append(main_article_url)

            # random pause between article downloads
            sleep(randint(4, 7))

        except Exception as error:
            # keep the url together with the error so failed articles can be inspected later
            collection_of_failed_scraped_articles.append([article_url, error])
            continue

# create data frames
scraped_articles_df = pd.DataFrame(collection_of_scraped_articles, columns=["Title", "Date", "Authors", "Tags", "NLP Keywords", "Text", "Url"])
failed_articles_df = pd.DataFrame(collection_of_failed_scraped_articles, columns=["Url", "Error"])

# save data frames as csv with current timestamp
current_GMT = time.gmtime()
time_stamp = calendar.timegm(current_GMT)
scraped_articles_df.to_csv(f'{time_stamp}_articles.csv', sep=';')
failed_articles_df.to_csv(f'{time_stamp}_failed.csv', sep=';')

url 0
article 0
article 1
article 2
article 3
article 4
article 5
article 6
article 7
article 8
article 9
article 10
article 11
article 12
article 13
article 14
article 15
article 16
article 17
article 18
article 19
article 20
article 21
article 22
article 23
article 24
article 25
article 26
article 27
article 28
article 29
article 30
article 31
article 32
article 33
article 34
article 35
article 36
article 37
article 38
article 39
article 40
article 41
article 42
article 43
article 44
article 45
article 46
article 47
article 48
article 49
article 50
article 51
article 52
article 53
article 54
article 55
article 56
article 57
article 58
article 59
article 60
article 61
article 62
article 63
article 64
article 65
article 66
article 67
article 68
article 69
article 70
article 71
article 72
article 73
article 74
article 75
article 76
article 77
article 78
article 79
article 80
article 81
article 82
article 83
article 84
article 85
article 86
article 87
article 88
article 89
article 90
art