Import the JSON data scraped with subreddit_scraper.py and annotate the linked Guardian articles with the SUTime module from the Stanford NLP Group.

In [None]:
import pandas as pd
import json
import time
import requests 
from datetime import datetime 
import os 
from sutime import SUTime
import dateparser
from bs4 import BeautifulSoup
from urllib import request

In [2]:
def progress_bar(current, total, barLength = 20):
    """Print a single-line text progress bar that overwrites itself.

    Parameters
    ----------
    current : int or float
        Number of items processed so far.
    total : int or float
        Total number of items. A value of zero or less is treated as
        "nothing to do" and rendered as 100 % instead of dividing by zero.
    barLength : int, optional
        Width of the bar in characters (default 20).

    Returns
    -------
    None
        The bar is written to stdout terminated by a carriage return, so the
        next call redraws over the same line.
    """
    if total <= 0:
        # Empty workload: avoid ZeroDivisionError and show a completed bar.
        percent = 100.0
    else:
        # Clamp so an over- or under-run never draws outside the bar.
        percent = min(max(float(current) * 100 / total, 0.0), 100.0)
    arrow   = '-' * int(percent/100 * barLength - 1) + '>'
    spaces  = ' ' * (barLength - len(arrow))
    print('Progress: [%s%s] %d %%' % (arrow, spaces, percent), end='\r')

In [3]:
# Guardian API key, which you can request here: https://open-platform.theguardian.com/access/
# The key is read from the GUARDIAN_API_KEY environment variable so the secret
# never has to be saved inside the notebook. When the variable is unset the
# key falls back to an empty string.
key = os.environ.get("GUARDIAN_API_KEY", "")

In [None]:
# SUTime set-up: point the wrapper at the "jars" folder located under the
# current working directory and enable time-range marking.
jar_path = 'jars'
working_dir = os.path.abspath('')
jar_files = os.path.join(working_dir, jar_path)
sutime = SUTime(
    jars=jar_files,
    mark_time_ranges=True,
)

In [5]:
# Load the complete line-delimited JSON dump produced by subreddit_scraper.py
# (one JSON record per line) into a DataFrame.
fs = pd.read_json(
    'august-december-2020.json',
    lines=True,
)

In [6]:
# Keep only the rows whose domain is theguardian.com. reset_index() renumbers
# the rows from 0 and keeps the previous index as an "index" column.
is_guardian = fs['domain'] == 'theguardian.com'
submissions = fs.loc[is_guardian].reset_index()

In [7]:
# grab web content from urls
#
# For every Guardian submission: fetch the article body from the Guardian
# Content API, run SUTime over it (using the article's publication date as the
# reference date), and record one row per DATE expression that dateparser can
# interpret. Rows are accumulated in the four parallel lists below.
urls = submissions['url'].tolist()
created, date, title, link = [], [], [], []
failed = []  # (url, exception class name) for every article that was skipped
for i, url in enumerate(urls):
    try:
        # Map the article URL onto the API endpoint; drop any query string or
        # fragment so it cannot corrupt the API request.
        path = url.split("guardian.com/", 1)[1].split("?", 1)[0].split("#", 1)[0]
        page = requests.get(
            "https://content.guardianapis.com/" + path,
            params={"api-key": key, "show-fields": "bodyText"},
            timeout=30,  # never hang the whole run on one stalled request
        )
        page.raise_for_status()
        content = page.json()["response"]["content"]
        text = content["fields"]["bodyText"]
        ref = content["webPublicationDate"].split("T", 1)[0]
        h1 = content["webTitle"]
        # Computed once per article, before any append, so a bad timestamp
        # cannot leave the four lists with different lengths.
        month = datetime.utcfromtimestamp(submissions['created_utc'][i]).strftime('%Y-%m')
        for j in sutime.parse(text, ref):
            if j['type'] != "DATE":
                continue
            parsed = dateparser.parse(j['value'])
            if parsed is None:
                continue
            date.append(parsed.strftime("%Y"))
            created.append(month)
            link.append(url)
            title.append(h1)
    except Exception as exc:
        # Best effort: skip this article but remember that it failed. Only the
        # exception class is kept because HTTP error messages embed the full
        # request URL, which contains the API key.
        failed.append((url, type(exc).__name__))
    # Report after the article is done so the bar reaches 100 % on the last one.
    progress_bar(i + 1, len(urls))

print()  # move past the carriage-return progress line
if failed:
    print('Skipped %d of %d articles:' % (len(failed), len(urls)))
    for bad_url, reason in failed:
        print('  %s (%s)' % (bad_url, reason))

Progress: [------------------> ] 99 %

In [8]:
# Assemble the four parallel result lists into one table, one row per
# (article, mentioned year) pair.
articles = pd.DataFrame(
    {
        'month_published': created,
        'year_mentioned': date,
        'article': title,
        'url': link,
    }
)

In [9]:
# optional drop results where article mentions year it was written:
# Compare the year part of month_published ("YYYY-MM") with year_mentioned
# exactly, and keep only the rows where they differ. Building a new frame from
# a boolean mask (instead of dropping labels in place while looping) lets this
# cell be re-run safely; .copy() detaches the result from the original frame.
same_year = (
    articles['month_published'].astype(str).str[:4]
    == articles['year_mentioned'].astype(str)
)
articles = articles.loc[~same_year].copy()

In [10]:
# Remove fully identical rows (keeping the first occurrence), renumber the
# rows from 0 without keeping the old index, then display the table.
articles = (
    articles
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
articles

Unnamed: 0,month_published,year_mentioned,article,url
0,2020-08,2019,More coal power generation closed than opened ...,https://www.theguardian.com/environment/2020/a...
1,2020-08,2016,More coal power generation closed than opened ...,https://www.theguardian.com/environment/2020/a...
2,2020-08,2038,More coal power generation closed than opened ...,https://www.theguardian.com/environment/2020/a...
3,2020-08,2030,More coal power generation closed than opened ...,https://www.theguardian.com/environment/2020/a...
4,2020-08,2018,More coal power generation closed than opened ...,https://www.theguardian.com/environment/2020/a...
...,...,...,...,...
377,2020-12,2021,Floating 'mini-nukes' could power countries by...,https://www.theguardian.com/environment/2020/d...
378,2020-12,2019,Early humans may have survived the harsh winte...,https://www.theguardian.com/science/2020/dec/2...
379,2020-12,2019,ESA signs deals for its first reusable transpo...,https://www.theguardian.com/science/2020/dec/2...
380,2020-12,2023,ESA signs deals for its first reusable transpo...,https://www.theguardian.com/science/2020/dec/2...


In [None]:
# Write the final table to CSV. The row index is written as the first,
# unnamed column (pandas' default, spelled out here).
articles.to_csv(
    'guardian_articles_august-december_2020.csv',
    index=True,
)