In [4]:
import requests
import json
import pandas as pd
from config import nyt_key
import matplotlib.pyplot as plt
%matplotlib inline

In [59]:
def pull_tokenized(year):
    months = ["3","4","5","6","7"]
    
    publication_date = []
    document_type = []
    headline = []
    abstract = []
    snippet = []
    lead_paragraph = []
    keyword_1 = []
    keyword_2 = []
    keyword_3 = []
    
    for month in months:
        base_url = f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={nyt_key}"
        response = requests.get(base_url).json()["response"]["docs"]
        for i in response:
            publication_date.append(i["pub_date"])
            document_type.append(i["document_type"])
            headline.append(i["headline"]["main"])
            abstract.append(i["abstract"])
            lead_paragraph.append(i["lead_paragraph"])

    data = {
        "publication_date":publication_date,
        "document_type":document_type,
        "headline":headline,
        "abstract":abstract,
        "lead_paragraph":lead_paragraph,
    }
    df = pd.DataFrame(data)

    # converting data points into one large string value
    all_headline = df['headline'].str.lower().str.cat(sep=' ')
    all_abstract = df['abstract'].str.lower().str.cat(sep=' ')
    all_lead_paragraph = df['lead_paragraph'].str.lower().str.cat(sep=' ')
    all_words = all_headline + all_abstract + all_lead_paragraph
    words_count = all_words.split(sep=" ")
    print(f"Total number of raw words: {len(words_count):,}")

    # natural language toolkit
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.util import ngrams

    # tokenizing the massive string value and filtering out all stopwords, puncuation and numbers
    stop_words=set(stopwords.words('english'))
    word_tokenize=word_tokenize(all_words)
    alpha_word_tokenize=[word for word in word_tokenize if word.isalpha()]
    filtered_tokenize=[word for word in alpha_word_tokenize if not word in stop_words]
    ngram_two=list(ngrams(filtered_tokenize, 2))
    filtered_tokenize=ngram_two+filtered_tokenize
    print(f"Total number of tokenized words after filters applied: {len(filtered_tokenize):,}")

    # creating dictionary with keys=keywords and values=number_of_keyword_mentions 
    term_freq={}
    for token in filtered_tokenize: 
        if token in term_freq: 
            term_freq[token]+=1
        else: 
            term_freq[token]=1

    # getting the top 100 mentions of all headlines, abstracts, lead_paragraphs and keywords
    import math
    sort_freq=sorted(term_freq.items(), key=lambda x: x[1], reverse=True)
    top_terms_freq=sort_freq[:]
    top_terms_dict={}
    for each_term_freq in top_terms_freq: 
        if type(each_term_freq[0])==tuple: 
            top_terms_dict[' '.join(each_term_freq[0])]=each_term_freq[1]
        else: 
            top_terms_dict[each_term_freq[0]]=each_term_freq[1]
            
    series = pd.Series(top_terms_dict,index=top_terms_dict.keys())
    return series

## Evaluating word mentions with the Natural Language Toolkit
* This code tokenizes all headlines, abstracts, lead_paragraphs and keywords into one large string
* All stopwords and words containing puncuation or numbers are dropped from the tokenized data
* The top 1000 words and their counts are displayed in dictionary format

In [66]:
# series_2020 = pull_tokenized(2020)
# series_2016 = pull_tokenized(2016)
# series_2012 = pull_tokenized(2012)
series_2008 = pull_tokenized(2008)

Total number of raw words: 6,126,983
Total number of tokenized words after filters applied: 6,839,837


In [67]:
series_2008

new           27846
said          16947
one           14286
york          11345
new york      11293
              ...  
silkier           1
chitarra          1
gluey             1
fleecing          1
limoncello        1
Length: 1362233, dtype: int64

### Pull data for each year above before running below code

In [68]:
df_combined = pd.concat([series_2020,series_2016,series_2012,series_2008],axis=1)
df_combined = df_combined.rename(columns={df_combined.columns[0]:"2020",df_combined.columns[1]:"2016",df_combined.columns[2]:"2012",df_combined.columns[3]:"2008"})
df_combined

Unnamed: 0,2020,2016,2012,2008
coronavirus,14456.0,,,
new,10989.0,17045.0,24894.0,27846.0
pandemic,6454.0,7.0,11.0,7.0
trump,6062.0,6727.0,92.0,132.0
one,5691.0,6631.0,10153.0,14286.0
...,...,...,...,...
kickier,,,,1.0
rattan,,,,1.0
ketchupy,,,,1.0
chitarra,,,,1.0


In [69]:
# df_combined.to_csv("Output/DFs/tokens_raw.csv")