In [33]:
import pandas as pd
import numpy as np
import re
import scipy
from scipy import stats

In [19]:
data = pd.read_csv('../data/csv_data/dtm_data.csv')
data.head()

Unnamed: 0,id,length,date,country,publisher,snippet,body,body_tokens,body_tokens_normalized,snippet_tokens,trunc_sentence,sentiment_score,neg_sentim_score,body_tokens_reduced
0,88033929,554,21-10-01,US,New York Times,A pilot program hints at an eventual end to Au...,A pilot program hints at an eventual end to A...,"['A', 'pilot', 'program', 'hints', 'at', 'an',...","['pilot', 'program', 'hints', 'eventual', 'end...","['A', 'pilot', 'program', 'hints', 'at', 'an',...","[""A pilot program hints at an eventual end to ...","[{'label': 'NEGATIVE', 'score': 0.978422582149...",0.4,"['pilot', 'program', 'hints', 'eventual', 'end..."
1,88035715,7205,21-10-01,US,New York Times,Merck says a trial shows it has produced the f...,Covid News : U.S. Parents ' Views Are Shiftin...,"['Covid', 'News', 'U.S.', 'Parents', 'Views', ...","['covid', 'news', 'us', 'parents', 'views', 's...","['Merck', 'says', 'a', 'trial', 'shows', 'it',...","['Covid News : U', ""Parents ' Views Are Shifti...","[{'label': 'NEGATIVE', 'score': 0.732781469821...",0.666667,"['covid', 'news', 'parents', 'views', 'shiftin..."
2,88046027,300,21-10-03,US,Fox News,Fauci defends California school vaccine mandat...,""" I have been and I still am in favor of thes...","['I', 'have', 'been', 'and', 'I', 'still', 'am...","['favor', 'kinds', 'mandates', 'fauci', 'said'...","['Fauci', 'defends', 'California', 'school', '...","['"" I have been and I still am in favor of the...","[{'label': 'POSITIVE', 'score': 0.997584581375...",0.5,"['favor', 'kinds', 'mandates', 'fauci', 'abc',..."
3,88046227,501,21-10-03,US,New York Times,Israel will require a booster shot to be consi...,Israel will require a booster shot to be cons...,"['Israel', 'will', 'require', 'a', 'booster', ...","['israel', 'require', 'booster', 'shot', 'cons...","['Israel', 'will', 'require', 'a', 'booster', ...",['Israel will require a booster shot to be con...,"[{'label': 'NEGATIVE', 'score': 0.997277677059...",0.8,"['israel', 'require', 'booster', 'shot', 'cons..."
4,88055023,1417,21-10-05,US,New York Times,How Do Thousands Prepare for a Climate Summit?...,"In a few weeks , an estimated 20,000 minister...","['In', 'a', 'few', 'weeks', 'an', 'estimated',...","['weeks', 'estimated', 'ministers', 'activists...","['How', 'Do', 'Thousands', 'Prepare', 'for', '...","['In a few weeks , an estimated 20,000 ministe...","[{'label': 'POSITIVE', 'score': 0.997138023376...",0.5,"['weeks', 'estimated', 'ministers', 'activists..."


In [23]:
[re.sub('[^0-9a-zA-Z]+', '', k) for k in data['body_tokens_normalized'][0].split(',')]

['pilot',
 'program',
 'hints',
 'eventual',
 'end',
 'australia',
 'hotel',
 'quarantine',
 'system',
 'australian',
 'state',
 'new',
 'south',
 'wales',
 'allow',
 'returning',
 'international',
 'travelers',
 'quarantine',
 'home',
 'starting',
 'end',
 'month',
 'possibly',
 'signaling',
 'beginning',
 'end',
 'country',
 'strict',
 'hotel',
 'quarantine',
 'system',
 'pilot',
 'program',
 'allow',
 'fully',
 'vaccinated',
 'people',
 'isolate',
 'homes',
 'days',
 'spend',
 'weeks',
 'government',
 'appointed',
 'facility',
 'stuart',
 'ayres',
 'new',
 'south',
 'wales',
 'government',
 'official',
 'announced',
 'friday',
 'police',
 'employ',
 'location',
 'based',
 'tracking',
 'facial',
 'recognition',
 'technology',
 'monitor',
 'new',
 'arrivals',
 'movements',
 'added',
 'similar',
 'technology',
 'western',
 'australia',
 'november',
 'program',
 'help',
 'country',
 'plan',
 'steps',
 'ending',
 'current',
 'system',
 'mr',
 'ayres',
 'said',
 'news',
 'conference',
 'v

In [24]:
data['body_tokens_normalized'] = data['body_tokens_normalized'].apply(lambda x: [re.sub('[^0-9a-zA-Z]+', '', k)
                                                                     for k in x.split(',')])

In [25]:
nyt = data[data['publisher'] == 'New York Times'].copy()
fox = data[data['publisher'] == 'Fox News'].copy()

In [26]:
nytWords = set(nyt['body_tokens_normalized'].sum())
foxWords = set(fox['body_tokens_normalized'].sum())

In [28]:
overlapWords = nytWords & foxWords

overlapWordsDict = {word: index for index, word in enumerate(overlapWords)}
overlapWordsDict['covid']

38645

In [29]:
def makeProbsArray(dfColumn, overlapDict):
    words = dfColumn.sum()
    countList = [0] * len(overlapDict)
    for word in words:
        try:
            countList[overlapDict[word]] += 1
        except KeyError:
            #The word is not common so we skip it
            pass
    countArray = np.array(countList)
    return countArray / countArray.sum()

nytProbArray = makeProbsArray(nyt['body_tokens_normalized'], overlapWordsDict)
foxProbArray = makeProbsArray(fox['body_tokens_normalized'], overlapWordsDict)
foxProbArray.sum()

1.0

In [34]:
nyt_foxDivergence = scipy.stats.entropy(nytProbArray, foxProbArray)
print(nyt_foxDivergence)
ken_whDivergence = scipy.stats.entropy(foxProbArray, nytProbArray)
print(ken_whDivergence)

0.06855089382623267
0.07118029246931237


In [35]:
nyt_foxDivergence_ew = scipy.special.kl_div(nytProbArray, foxProbArray)
kl_df = pd.DataFrame(list(overlapWordsDict.keys()), columns = ['word'], index = list(overlapWordsDict.values()))
kl_df = kl_df.sort_index()
kl_df['elementwise divergence'] = nyt_foxDivergence_ew
kl_df[:10]

Unnamed: 0,word,elementwise divergence
0,,6.777982e-06
1,angela,5.995161e-07
2,cobbled,5.202145e-07
3,farthest,1.699706e-07
4,underlines,5.854059e-07
5,puzzled,3.107422e-07
6,indicating,2.502555e-07
7,strident,6.496174e-07
8,biopic,1.416422e-07
9,gecko,2.832844e-08


In [36]:
kl_df.sort_values(by='elementwise divergence', ascending=False)[:10]

Unnamed: 0,word,elementwise divergence
29593,mr,0.002295
3760,credit,0.000786
12497,image,0.00075
10515,ms,0.000635
31190,clip,0.000349
29761,fox,0.000275
5205,gutfeld,0.000254
7042,swisher,0.000244
9926,york,0.000242
7091,doyle,0.000235
