In [38]:
import spacy
import en_core_web_lg   
from newsapi import NewsApiClient 
import pickle
import json # to work with json data 
import pandas as pd

nlp_eng= en_core_web_lg.load() # spacy English language model
newsapi= NewsApiClient(api_key='0fbb92da9e004a85834fdf57627a1e32') # uses my from news api

# get articles from the past 30 days 
articles = newsapi.get_everything(q='coronavirus', language='en', 
        from_param='2020-09-29', to='2020-10-29', sort_by='relevancy',
        page=2)

In [39]:
# print response from newsapi  
output=json.dumps(articles,indent=4)
print(output)

{
    "status": "ok",
    "totalResults": 185079,
    "articles": [
        {
            "source": {
                "id": null,
                "name": "New York Times"
            },
            "author": "Richard P\u00e9rez-Pe\u00f1a",
            "title": "Coronavirus Deaths Pass One Million Worldwide",
            "description": "Over the past 10 months, the virus has taken more lives than H.I.V., malaria, influenza and cholera. And as it sows destruction in daily life around the globe, it is still growing quickly.",
            "url": "https://www.nytimes.com/2020/09/28/world/covid-1-million-deaths.html",
            "urlToImage": "https://static01.nyt.com/images/2020/05/15/world/28virus-onemillion/merlin_172508022_69add9d4-cc46-4118-96a3-0f0c1654ec4e-facebookJumbo.jpg",
            "publishedAt": "2020-09-29T00:42:32Z",
            "content": "And, crucially, people are most contagious when they first show symptoms or even earlier, not days or weeks later, when they are sickest

In [40]:
data=[] 
# extract only title, publish date, description and content 
for i, article in enumerate(articles):
    for x in articles['articles']:
        title=x['title']
        description=x['description']
        publishedAt=x['publishedAt']
        content=x['content']
        data.append({'title':title, 'publishedAt':publishedAt, 
        'description':description, 'content':content})
        
df=pd.DataFrame(data) # create pandas dataframe from articles
df=df.dropna() # remove missing values
df.head() # view first 5 rows of articles 

Unnamed: 0,title,publishedAt,description,content
0,Coronavirus Deaths Pass One Million Worldwide,2020-09-29T00:42:32Z,"Over the past 10 months, the virus has taken m...","And, crucially, people are most contagious whe..."
1,Where Is the Coronavirus Pandemic Headed?,2020-10-07T10:08:00Z,The coronavirus “is not going to be over by fa...,There are a lot of ways to listen to The Daily...
2,Jets Confirm Player's Positive Coronavirus Test,2020-10-09T15:08:48Z,Sunday’s Jets game against the Cardinals could...,"If Tennessee and Buffalo play on Tuesday, the ..."
3,The Conners Return to Confront the Coronavirus,2020-10-18T15:55:57Z,This ABC sitcom has dealt with real-world chal...,Gilbert said the series will not dwell on the ...
4,Coronavirus in California: An Update on Cases,2020-10-13T12:42:09Z,Tuesday: Catch up on the Lakers’ 17th champion...,This is the Lakers first title with Jeanie Bus...


In [41]:
df.to_csv('COVID_articles.csv') # save cleaned output to csv 

In [47]:
# function to find source of most relevant keywords: in titles, descriptions, or content?
def compare_num_keywords():
    title_key_ct=0
    desc_key_ct=0
    content_key_ct=0
    pos_tag=['VERB','NOUN','PROPN'] # types of tokens we want to keep 
    
    # keywords from keywords 
    for title in df['title']:
        doc=nlp_eng(title) # use Spacy English lang model on each title
        for token in doc:
            if(not token.is_stop and not token.is_punct and token.pos_ in pos_tag):
                title_key_ct+=1
    
    # keywords from descriptions 
    for desc in df['description']:
        doc=nlp_eng(desc) # use Spacy English lang model on each desc
        for token in doc:
            if(not token.is_stop and not token.is_punct and token.pos_ in pos_tag):
                desc_key_ct+=1
    
    # keywords from content 
    for content in df['content']:
        doc=nlp_eng(content) # use Spacy English lang model on all content
        for token in doc:
            if(not token.is_stop and not token.is_punct and token.pos_ in pos_tag):
                content_key_ct+=1
        
    print(title_key_ct, ' title keywords')
    print(desc_key_ct, ' description keywords')
    print(content_key_ct, ' content keywords')

compare_num_keywords()

255  title keywords
579  description keywords
903  content keywords


In [60]:
# function to extract keywords
def get_keywords(src):
    keywords=[]
    pos_tag=['VERB','NOUN','PROPN']
    doc=nlp_eng(df.iat[src,3]) # src is the row, content in 3rd column
    for token in doc:
        if(not token.is_stop and not token.is_punct and token.pos_ in pos_tag):
            keywords.append(token.text)
    return keywords

In [64]:
from collections import Counter
words = [] 

# for each piece of content, find the keywords and take the top 5 most common  
for content in range (len(df['content'])):
    words.append([('#' + x[0]) for x in Counter(get_keywords(content)).most_common(5)])
df['keywords'] = words # add a new column for keywords 

In [65]:
df.head()

Unnamed: 0,title,publishedAt,description,content,keywords
0,Coronavirus Deaths Pass One Million Worldwide,2020-09-29T00:42:32Z,"Over the past 10 months, the virus has taken m...","And, crucially, people are most contagious whe...","[#people, #symptoms, #days, #weeks, #reversal]"
1,Where Is the Coronavirus Pandemic Headed?,2020-10-07T10:08:00Z,The coronavirus “is not going to be over by fa...,There are a lot of ways to listen to The Daily...,"[#lot, #ways, #listen, #Daily, #want]"
2,Jets Confirm Player's Positive Coronavirus Test,2020-10-09T15:08:48Z,Sunday’s Jets game against the Cardinals could...,"If Tennessee and Buffalo play on Tuesday, the ...","[#game, #Tennessee, #Buffalo, #play, #Tuesday]"
3,The Conners Return to Confront the Coronavirus,2020-10-18T15:55:57Z,This ABC sitcom has dealt with real-world chal...,Gilbert said the series will not dwell on the ...,"[#said, #Gilbert, #series, #dwell, #parts]"
4,Coronavirus in California: An Update on Cases,2020-10-13T12:42:09Z,Tuesday: Catch up on the Lakers’ 17th champion...,This is the Lakers first title with Jeanie Bus...,"[#Lakers, #Buss, #title, #Jeanie, #daughter]"


In [66]:
df.to_csv('covid_articles_keywords.csv') # save dataset