In [None]:
import datetime as dt
import pytz
import pickle
import os
import json
import re

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import requests
import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


# Shared UTC timezone object; passed as tzinfo when the analysis-window datetimes are built.
tz = pytz.timezone("UTC")

In [None]:
# NOTE(review): the literal below is a credential committed to the notebook. It is
# kept only as a fallback so existing runs keep working; prefer exporting
# NYT_API_KEY in the environment, then rotate the key and remove the literal.
NYT_API_KEY = os.environ.get('NYT_API_KEY', '9dBQIHF8gNTdsM0JSjUIGMQp6xORJGN1')

# Analysis window (timezone-aware, UTC) used to filter articles by publication date.
DATE_FROM = dt.datetime(2020,8,3, tzinfo=tz)
DATE_TO = dt.datetime(2020,11,3, tzinfo=tz)

# 1 Data Collection
## 1.1 Get raw data
### 1.1.1 Get graph data
Keywords of NYT articles are used as nodes, and two keywords are connected when they appear together in the same article.

In [None]:
def get_nyt_articles(year, month):
    """Fetch the NYT archive listing for one month.

    Parameters
    ----------
    year, month
        Interpolated as-is into the archive URL path (``.../{year}/{month}.json``).

    Returns
    -------
    list
        The ``response.docs`` entries of the JSON reply, or an empty list when
        the reply body is not valid JSON.

    Raises
    ------
    KeyError
        When the reply is JSON but lacks ``response``/``docs``; the payload is
        printed first so the API error message is visible.
    """
    url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={NYT_API_KEY}'
    call = requests.get(url)

    # Decode the body once; a non-JSON body is treated as "no articles".
    try:
        payload = call.json()
    except ValueError:
        return []

    try:
        return payload['response']['docs']
    except KeyError:
        print(payload)
        raise

In [None]:
def save_article_keywordsin_db(year):
    """Download one year of NYT archive data and merge it into the local feather cache.

    Reads ``article_keywords.feather`` (the file must already exist), fetches
    every month of ``year`` that is not yet represented in the cache via
    ``get_nyt_articles``, stores each article's ``pub_date`` and ``keywords``
    under its ``uri``, and writes the table back to the same file.

    Parameters
    ----------
    year
        Year to download; also used to build the per-month lookup key.
    """
    article_keywords = pd.read_feather('article_keywords.feather')
    # The cache stores the row labels in a regular 'index' column; restore them as the index.
    article_keywords.index = article_keywords['index']
    article_keywords = article_keywords.drop(columns='index')

    # Walk the months from December down to January.
    for month in range(12, 0, -1):
        # A month counts as downloaded if any cached pub_date maps to the same
        # '<year><month>' key (no zero padding, e.g. '20208').
        # NOTE(review): the key is wrapped in a one-element list, so the membership
        # test relies on NumPy's element-wise comparison — confirm this is intended.
        if [f'{year}{month}'] in article_keywords.pub_date.apply(lambda x: f'{x.year}{x.month}').values:
            print(f'{year}-{month}: Articles already downloaded.')
            continue

        content = get_nyt_articles(year,month)
        print(f'{year}-{month}: Found {len(content)} articles.')

        # Insert or overwrite one row per article, labelled by the article's 'uri'.
        for article in content:
            article_keywords.at[article['uri'], 'pub_date'] = pd.to_datetime(article['pub_date'])
            article_keywords.at[article['uri'], 'keywords'] = list(article['keywords'])

    # Move the row labels back into a column before writing the cache.
    article_keywords.reset_index().to_feather('article_keywords.feather')

# save_article_keywordsin_db(year=2020)


In [None]:
# Load the cached articles and keep those published inside the analysis window (both bounds inclusive).
article_keywords = pd.read_feather('article_keywords.feather')
published_in_window = (article_keywords.pub_date >= DATE_FROM) & (article_keywords.pub_date <= DATE_TO)
article_keywords = article_keywords[published_in_window]

In [None]:
# Map every keyword 'name' to the distinct 'value's seen for it, in order of first appearance.
unique_keywords = {}
for _, article in article_keywords.iterrows():
    for entry in article.keywords:
        known_values = unique_keywords.setdefault(entry['name'], [])
        if entry['value'] not in known_values:
            known_values.append(entry['value'])

unique_keywords

In [None]:
def get_keyword_features(concept, value):
    """Query the NYT article search API for articles tagged with a keyword.

    Parameters
    ----------
    concept
        Field name used on the left-hand side of the ``fq`` filter.
    value
        Keyword value; it is wrapped in ``("...")`` inside the ``fq`` filter.

    Returns
    -------
    list
        The ``response.docs`` entries of the JSON reply.
    """
    # Pass the filter through `params` so requests URL-encodes it; splicing the raw
    # value into the URL breaks for keywords containing characters such as '&' or '#'.
    call = requests.get(
        'https://api.nytimes.com/svc/search/v2/articlesearch.json',
        params={'fq': f'{concept}:("{value}")', 'api-key': NYT_API_KEY},
    )
    return call.json()['response']['docs']

get_keyword_features('subject', 'Presidential Election of 2020')

In [None]:
## 1.2 Process raw data
### 1.2.1 Build Adjacency matrix

In [None]:
# Reload the cached articles and restrict them to the analysis window (both bounds inclusive).
article_keywords = pd.read_feather('article_keywords.feather')
published_in_window = (article_keywords.pub_date >= DATE_FROM) & (article_keywords.pub_date <= DATE_TO)
article_keywords = article_keywords[published_in_window]

In [None]:
# Collect, per keyword 'name', the distinct 'value's that occur in the filtered articles.
unique_keywords = {}
for _, article in article_keywords.iterrows():
    for entry in article.keywords:
        known_values = unique_keywords.setdefault(entry['name'], [])
        if entry['value'] not in known_values:
            known_values.append(entry['value'])

unique_keywords

In [None]:
def get_keyword_features(concept, value):
    """Query the NYT article search API for articles tagged with a keyword.

    Parameters
    ----------
    concept
        Field name used on the left-hand side of the ``fq`` filter.
    value
        Keyword value; it is wrapped in ``("...")`` inside the ``fq`` filter.

    Returns
    -------
    list
        The ``response.docs`` entries of the JSON reply.
    """
    # Let requests URL-encode the filter; a raw value spliced into the URL breaks
    # for keywords containing characters such as '&' or '#'.
    call = requests.get(
        'https://api.nytimes.com/svc/search/v2/articlesearch.json',
        params={'fq': f'{concept}:("{value}")', 'api-key': NYT_API_KEY},
    )
    return call.json()['response']['docs']

get_keyword_features('subject', 'Presidential Election of 2020')

In [None]:
## 1.2 Process raw data
### 1.2.1 Build Adjacency matrix

In [None]:
def get_adjacency_matrix(date_from, date_to):
    """Build a keyword co-occurrence adjacency matrix for a publication window.

    Reads ``article_keywords.feather``, keeps the articles whose ``pub_date``
    lies between ``date_from`` and ``date_to`` (both inclusive), one-hot
    encodes the keyword values per article and accumulates a symmetric
    pairwise co-occurrence matrix. The result is also pickled to
    ``AdjacencyMatrices/<from>_<to>_AdjacencyMatrix.pickle`` with the dates
    formatted as ``%y%m%d``.

    Parameters
    ----------
    date_from, date_to : datetime
        Window bounds, compared against ``pub_date`` and used in the file name.

    Returns
    -------
    adja_matrix : numpy.ndarray
        ``(num_keywords, num_keywords)`` co-occurrence totals divided by
        ``keyword_count``.
    keyword_count : numpy.ndarray
        Row sums of the co-occurrence matrix, with zeros replaced by 1.
    unique_keywords : list
        Keyword values in the order used for the matrix rows and columns.
    """
    df = pd.read_feather('article_keywords.feather')
    # The cache stores the row labels in an 'index' column; restoring it also names
    # the index 'index', which the groupby below relies on.
    df.index = df['index']
    df = df.drop(columns='index')
    # Copy the filtered slice so the column assignment below does not write into a view.
    df = df[(df.pub_date >= date_from) & (df.pub_date <= date_to)].copy()
    # Reduce each keyword record to its plain 'value'.
    df.keywords = df.keywords.apply(lambda x: [y['value'] for y in x])

    unique_keywords = list(set([item for sublist in df.keywords for item in sublist]))
    num_keywords = len(unique_keywords)
    print(f'{num_keywords} Keywords found.')
    df_exploded = df.explode('keywords')['keywords']

    print('Start One Hot Encoding.', end=' ')

    # scikit-learn renamed `sparse` to `sparse_output`; fall back for releases that
    # only know the old name.
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    art_key_matrix = enc.fit_transform(df_exploded.values.reshape(-1, 1))
    # Sum the per-keyword rows back to one row per article and order the columns
    # like `unique_keywords`.
    art_key_matrix = pd.DataFrame(art_key_matrix, index=df_exploded.index, columns=list(enc.categories_[0])).groupby('index').sum()[unique_keywords].to_numpy()

    print('Done.')
    total_occurance_matrix = np.zeros((num_keywords,num_keywords))

    print('Calc total_occurance_matrix:')
    for i in range(num_keywords):
        print(f'{i}/{num_keywords}', end='\r')
        column_i = art_key_matrix[:,i]
        # Only pairs with j < i are computed; the result is mirrored into both triangles.
        for j in range(i):
            temp = column_i + art_key_matrix[:,j]
            # NOTE(review): this sums the entries equal to 2, so with 0/1 columns every
            # co-occurring article contributes 2 rather than 1 — confirm this is intended.
            total_occurance = temp[temp==2].sum()
            total_occurance_matrix[[i,j], [j,i]] = total_occurance

    keyword_count = total_occurance_matrix.sum(axis=1)
    # Avoid division by zero for keywords that never co-occur with another one.
    keyword_count[keyword_count == 0] = 1

    # Broadcasting divides column j by keyword_count[j].
    adja_matrix = total_occurance_matrix / keyword_count

    # Make sure the output directory exists so the computation is not lost at the very end.
    os.makedirs('AdjacencyMatrices', exist_ok=True)
    with open(f'AdjacencyMatrices/{date_from.strftime("%y%m%d")}_{date_to.strftime("%y%m%d")}_AdjacencyMatrix.pickle', 'wb') as file:
        pickle.dump([adja_matrix, keyword_count, unique_keywords], file)

    return adja_matrix, keyword_count, unique_keywords

In [None]:
# Use the configured analysis window instead of repeating the date literals, so this
# cell stays in sync with the filtering done elsewhere in the notebook.
adja_matrix, keyword_count, unique_keywords = get_adjacency_matrix(DATE_FROM, DATE_TO)
adja_matrix

In [None]:
# Load the cached articles, keep the analysis window (bounds inclusive) and renumber the rows from 0.
article_keywords = pd.read_feather('article_keywords.feather')
published_in_window = (article_keywords.pub_date >= DATE_FROM) & (article_keywords.pub_date <= DATE_TO)
article_keywords = article_keywords[published_in_window].reset_index(drop=True)

In [None]:
# Collect every distinct (name, value) keyword pair in order of first appearance.
# A set tracks what has been seen and the frame is built once at the end, instead of
# scanning and enlarging a DataFrame for every single keyword.
seen_keywords = set()
keyword_records = []
for index, row in article_keywords.iterrows():
    print(f'{index}/{len(article_keywords)}', end='\r')
    for keyword in row['keywords']:
        pair = (keyword['name'], keyword['value'])
        if pair not in seen_keywords:
            seen_keywords.add(pair)
            keyword_records.append({'name': pair[0], 'value': pair[1]})

unique_keywords = pd.DataFrame(keyword_records, columns=['name', 'value'])
unique_keywords.to_feather('unique_keywords.feather')
unique_keywords

In [None]:
# Reload the keyword table from its feather cache.
unique_keywords = pd.read_feather('unique_keywords.feather')

In [None]:
# Derive a party label per person from the JSON profiles in 'BioguideProfiles'.
# Records are gathered in a list and turned into a frame once, which also keeps the
# expected columns present when no profile qualifies.
label_records = []

for file in os.listdir('BioguideProfiles'):
    with open(f'BioguideProfiles/{file}') as json_data:
        data = json.load(json_data)

    # The party is detected by plain substring search over the whole profile.
    profile_text = str(data)
    mentions_republican = 'Republican' in profile_text
    mentions_democrat = 'Democrat' in profile_text

    # Skip profiles that mention both parties or neither.
    if mentions_republican == mentions_democrat:
        continue

    label_records.append({
        'familyName': data['familyName'],
        'givenName': data['givenName'],
        'party': 'Republican Party' if mentions_republican else 'Democratic Party',
    })

labels = pd.DataFrame(label_records, columns=['familyName', 'givenName', 'party'])
# Lower-cased 'family, given' key; only the first profile per key is kept.
labels['cleaned_string'] = labels['familyName'].str.lower() + ', ' + labels['givenName'].str.lower()
labels = labels[~labels.cleaned_string.duplicated()]
labels

In [None]:
def _leading_name_key(value):
    """Return the lower-cased prefix of `value` matched by the name pattern, or None without a match."""
    hit = re.match(r'^.+, \w+', value)
    return hit.group(0).lower() if hit else None


# First party listed for each cleaned name in `labels`.
party_by_name = {}
for name, party in zip(labels.cleaned_string, labels.party):
    party_by_name.setdefault(name, party)

unique_keywords['cleaned_string'] = unique_keywords['value'].apply(_leading_name_key)
# Keywords without a matching label get None.
unique_keywords['party'] = unique_keywords.cleaned_string.apply(lambda key: party_by_name.get(key))
unique_keywords.party.value_counts()

In [None]:
# Load the adjacency matrix written by get_adjacency_matrix. The context manager
# closes the file handle, which the bare open() call left to the garbage collector.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open('AdjacencyMatrices/200803_201103_AdjacencyMatrix.pickle', 'rb') as file:
    adja_matrix, keyword_count, unique_keywords_list = pickle.load(file)

In [None]:
# Node table: one row per keyword from the loaded pickle, paired with its keyword_count entry.
temp = pd.DataFrame({'node_name': unique_keywords_list, 'keyword_count': keyword_count})

In [None]:
# Copy each keyword's party label onto the node(s) carrying the same name.
for _, keyword_row in unique_keywords.iterrows():
    is_same_node = temp.node_name == keyword_row['value']
    temp.loc[is_same_node, 'party'] = keyword_row['party']

In [None]:
# Persist the adjacency matrix together with the labelled node table.
labeled_pickle_path = 'AdjacencyMatrices/200803_201103_AdjacencyMatrix_labeled.pickle'
with open(labeled_pickle_path, 'wb') as out_file:
    pickle.dump([adja_matrix, temp], out_file)