## Run Fightin' Words (Monroe et al.) analysis on the groups used for the language model analyses

In [3]:
import csv
import json
import numpy as np
import pandas as pd
import nltk
from IPython.display import display, HTML
from sklearn.feature_extraction.text import CountVectorizer as CV

### Preprocess data

In [4]:
# Language sample read from the "sfp" CSV; later cells use its
# author / subreddit / body / created_utc columns.
sfp_data = pd.read_csv(filepath_or_buffer="../sample_data/sfp_langsample_v4.csv")

In [5]:
# Language sample read from the "td" CSV; later cells use its
# author / subreddit / body / created_utc columns.
td_data = pd.read_csv(filepath_or_buffer="../sample_data/td_langsample_v4.csv")

In [6]:
# The politics sample file has no header row, so column names are supplied here.
politics_data = pd.read_csv(
    "../sample_data/politics_sample1.csv",
    names=['author', 'subreddit', 'body', 'score', 'created_dt', 'rand'],
    header=None,
)

In [7]:
# Comment bodies of the politics sample, with missing bodies dropped.
politics_text = politics_data['body'].dropna().values

In [8]:
# Distinct authors in each sample.
sfp_authors = sfp_data['author'].unique()
td_authors = td_data['author'].unique()
for author_list in (sfp_authors, td_authors):
    print(len(author_list))

# Sanity check: how many posts each sample has in each candidate subreddit.
for sample in (sfp_data, td_data):
    for subreddit_name in ('SandersForPresident', 'The_Donald'):
        print((sample['subreddit'] == subreddit_name).sum())

2000
2000
82916
1885
4476
62877


In [9]:
# Posts each sample made in its own candidate subreddit.
sfp_posts = sfp_data[sfp_data['subreddit'].eq('SandersForPresident')]
td_posts = td_data[td_data['subreddit'].eq('The_Donald')]

In [10]:
# Timestamp of each author's earliest post in their candidate subreddit,
# aligned position-by-position with sfp_authors / td_authors.
# A single groupby replaces one full-frame scan per author. Authors with no
# post in the subreddit get NaN (instead of raising on an empty min()), which
# makes every later "<" / ">" comparison against them False.
sfp_min_times = sfp_posts.groupby('author')['created_utc'].min().reindex(sfp_authors).tolist()
td_min_times = td_posts.groupby('author')['created_utc'].min().reindex(td_authors).tolist()

In [11]:
# r/politics posts written by the authors of each sample.
sfp_politics = sfp_data[sfp_data['subreddit'].eq('politics')]
td_politics = td_data[td_data['subreddit'].eq('politics')]
for politics_subset in (sfp_politics, td_politics):
    print(politics_subset.shape)

(204584, 5)
(82119, 5)


In [12]:
# Split each author's r/politics posts into those written before vs. after
# that author's first SandersForPresident post.
# Vectorized replacement for a per-author loop built on DataFrame.append
# (removed in pandas 2.0, and quadratic when growing a frame row-block by
# row-block). Rows keep the order and index of sfp_politics rather than being
# grouped by author; posts at exactly the first-post timestamp fall in neither
# split, as before.
sfp_first_home_post = sfp_politics['author'].map(dict(zip(sfp_authors, sfp_min_times)))
sfp_politics_before = sfp_politics.loc[sfp_politics['created_utc'] < sfp_first_home_post]
sfp_politics_after = sfp_politics.loc[sfp_politics['created_utc'] > sfp_first_home_post]
    

In [13]:
# Split each author's r/politics posts into those written before vs. after
# that author's first The_Donald post.
# Vectorized replacement for a per-author loop built on DataFrame.append
# (removed in pandas 2.0, and quadratic when growing a frame row-block by
# row-block). Rows keep the order and index of td_politics rather than being
# grouped by author; posts at exactly the first-post timestamp fall in neither
# split, as before.
td_first_home_post = td_politics['author'].map(dict(zip(td_authors, td_min_times)))
td_politics_before = td_politics.loc[td_politics['created_utc'] < td_first_home_post]
td_politics_after = td_politics.loc[td_politics['created_utc'] > td_first_home_post]
    

In [14]:
# Pull the non-missing comment bodies out of every group as plain arrays.
sfp_politics_before_text = sfp_politics_before['body'].dropna().values
sfp_politics_after_text = sfp_politics_after['body'].dropna().values

td_politics_before_text = td_politics_before['body'].dropna().values
td_politics_after_text = td_politics_after['body'].dropna().values

sfp_posts_text = sfp_posts['body'].dropna().values
td_posts_text = td_posts['body'].dropna().values

In [32]:
# NOTE(review): this cell recomputes names already defined by earlier cells
# (sfp_posts / td_posts and their *_text arrays) with the same expressions.
sfp_posts = sfp_data[sfp_data['subreddit'].eq('SandersForPresident')]
td_posts = td_data[td_data['subreddit'].eq('The_Donald')]
sfp_posts_text = sfp_posts['body'].dropna().values
td_posts_text = td_posts['body'].dropna().values

In [53]:
# How many politics-sample posts mention "deleted" / "removed" at all?
posts_with_deleted = [post for post in politics_text if 'deleted' in post]
posts_with_removed = [post for post in politics_text if 'removed' in post]
print(len(posts_with_deleted))
print(len(posts_with_removed))

# Take the first hit of each as the candidate placeholder text (presumably the
# '[deleted]' / '[removed]' markers) and count the posts that equal it exactly.
deleted_str = posts_with_deleted[0]
removed_str = posts_with_removed[0]
print(sum(1 for post in politics_text if post == deleted_str))
print(sum(1 for post in politics_text if post == removed_str))
print(len(politics_text))

539
294
527
151
9999


### Fightin' Words implementation

In [15]:
def compare1(l1, l2, prior=0.01):
    """Rank vocabulary terms by how strongly they separate two corpora.

    For every unigram kept by the vectorizer, computes the z-score of the
    log-odds-ratio between the two groups with a uniform pseudo-count prior
    (the "Fightin' Words" statistic).

    Parameters
    ----------
    l1, l2 : sequence of str
        Documents of the first and the second group.
    prior : float, default 0.01
        Pseudo-count added to every vocabulary term in both groups.

    Returns
    -------
    list of (str, float)
        ``(term, z_score)`` pairs sorted by ascending z-score: the head of the
        list is most characteristic of ``l2``, the tail of ``l1``.
    """
    # Accept any sequence (lists, numpy arrays, Series); "+" on arrays would
    # not concatenate.
    docs_1, docs_2 = list(l1), list(l2)

    cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, 1),
            binary=False,
            max_features=15000)

    # Keep the document-term matrix sparse: a dense docs x vocabulary array
    # does not fit in memory for corpora with hundreds of thousands of posts.
    doc_term = cv.fit_transform(docs_1 + docs_2)
    n_docs_1 = len(docs_1)

    # Per-term totals for each group, in float64 so large counts stay exact.
    counts_1 = np.asarray(doc_term[:n_docs_1].sum(axis=0), dtype=np.float64).ravel()
    counts_2 = np.asarray(doc_term[n_docs_1:].sum(axis=0), dtype=np.float64).ravel()

    vocab_size = counts_1.shape[0]
    a0 = prior * vocab_size          # total prior mass over the vocabulary
    n1 = counts_1.sum()
    n2 = counts_2.sum()

    smoothed_1 = counts_1 + prior
    smoothed_2 = counts_2 + prior

    # Difference of the smoothed log-odds of each term in the two groups.
    delta = (np.log(smoothed_1 / (n1 + a0 - smoothed_1))
             - np.log(smoothed_2 / (n2 + a0 - smoothed_2)))
    # Approximate variance of that difference.
    var = 1. / smoothed_1 + 1. / smoothed_2
    z_scores = delta / np.sqrt(var)

    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

In [33]:
# SFP users vs. TD users on r/politics, before joining their candidate subreddit.
comparison_1 = compare1(l1=list(sfp_politics_before_text),
                        l2=list(td_politics_before_text))

In [24]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of sfp politics before", comparison_1[-10:]),
    ("Most characteristic of td politics before", comparison_1[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of sfp politics before
[('marriage', 11.464935237332657), ('senate', 11.666904300053812), ('law', 12.369936902956647), ('constitutional', 12.398412318618059), ('oregon', 12.843061455890009), ('to', 13.135759740394793), ('constitution', 13.460413829227422), ('rights', 15.013835239133643), ('court', 15.612126652409339), ('gt', 20.22825399579643)]
Most characteristic of td politics before
[('trump', -32.75576158410538), ('he', -18.28811511546875), ('govt', -17.65519879227717), ('hillary', -17.006993055651904), ('bernie', -16.15253762246233), ('yea', -13.543247367014422), ('cruz', -13.261377935702438), ('immigration', -12.774719059468499), ('min', -12.704376847798697), ('rubio', -12.462579315326607)]


In [25]:
# SFP users vs. TD users on r/politics, after joining their candidate subreddit.
comparison_2 = compare1(l1=list(sfp_politics_after_text),
                        l2=list(td_politics_after_text))

In [26]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of sfp politics after", comparison_2[-10:]),
    ("Most characteristic of td politics after", comparison_2[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of sfp politics after
[('not', 9.084190621669212), ('bernie', 9.247175590576232), ('hillary', 9.688116419502283), ('dnc', 9.893651064192591), ('county', 10.068695852105929), ('but', 11.527496337648081), ('clinton', 17.74746989991353), ('her', 20.065874753437463), ('sanders', 20.20842176488534), ('she', 23.95615946074161)]
Most characteristic of td politics after
[('trump', -49.10091692080196), ('archive', -23.426147638009763), ('immigrants', -19.24262703323421), ('illegal', -18.835392812574074), ('racist', -18.241522420577073), ('immigration', -16.707503355122242), ('donald', -14.830037213464056), ('mexico', -14.032720949955062), ('he', -13.98517061447056), ('http', -13.062854105091295)]


In [27]:
# SFP users on r/politics: before vs. after joining SandersForPresident.
comparison_3 = compare1(l1=list(sfp_politics_before_text),
                        l2=list(sfp_politics_after_text))

In [29]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of sfp politics before", comparison_3[-10:]),
    ("Most characteristic of sfp politics after", comparison_3[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of sfp politics before
[('our', 22.841195890851253), ('you', 24.045287598150708), ('federal', 24.875366175014232), ('paul', 25.24087801163513), ('rights', 25.509074313831967), ('constitution', 25.56940777379856), ('law', 26.114006358355493), ('police', 26.414350824918362), ('romney', 28.303927422561138), ('government', 45.697546316858485)]
Most characteristic of sfp politics after
[('she', -99.37430468000342), ('sanders', -93.79984484750177), ('clinton', -92.42763698113937), ('bernie', -85.65424300764772), ('hillary', -85.5357975901374), ('her', -83.29228040875994), ('trump', -80.73451755332773), ('he', -53.36343367370652), ('supporters', -49.685523537946786), ('https', -43.49626676467192)]


In [30]:
# TD users on r/politics: before vs. after joining The_Donald.
comparison_4 = compare1(l1=list(td_politics_before_text),
                        l2=list(td_politics_after_text))

In [34]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of td politics before", comparison_4[-10:]),
    ("Most characteristic of td politics after", comparison_4[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of td politics before
[('police', 9.726026916835153), ('are', 9.731362615146697), ('society', 10.32383957220608), ('insurance', 10.352052192535245), ('that', 11.427934755735768), ('of', 12.017707559222051), ('romney', 12.583836480078473), ('ron', 13.140120201698315), ('paul', 15.854880354064962), ('government', 17.69098951244687)]
Most characteristic of td politics after
[('trump', -79.80049059960629), ('hillary', -46.20842611512671), ('clinton', -44.491381346111496), ('bernie', -43.43896437005444), ('sanders', -40.62619609373871), ('supporters', -33.46782279863542), ('he', -31.865821890010107), ('she', -31.500113111343868), ('https', -30.343246246803005), ('her', -27.585533288481326)]


In [35]:
# SFP users' pre-join r/politics posts vs. the general politics sample.
comparison_5 = compare1(l1=list(sfp_politics_before_text),
                        l2=list(politics_text))

In [38]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of sfp politics before", comparison_5[-10:]),
    ("Most characteristic of /r/politics", comparison_5[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of sfp politics before
[('we', 7.0180043370529654), ('ron', 7.373011587469217), ('police', 7.409467427666796), ('corporations', 7.734280304272829), ('paul', 7.751734793513766), ('rights', 7.8148380111125375), ('law', 7.838935354287101), ('constitution', 8.36464350607692), ('romney', 9.483828713524865), ('government', 12.790609012596748)]
Most characteristic of /r/politics
[('trump', -53.226675747400996), ('hillary', -49.309612785378995), ('bernie', -47.688979320537555), ('clinton', -42.0508836506008), ('sanders', -40.643073390688656), ('https', -38.61523376930448), ('deleted', -35.62353579128365), ('she', -33.4967231687057), ('removed', -32.242303752003686), ('politics', -30.829457295269354)]


In [37]:
# SFP users' post-join r/politics posts vs. the general politics sample.
comparison_6 = compare1(l1=list(sfp_politics_after_text),
                        l2=list(politics_text))

In [39]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of sfp politics after", comparison_6[-10:]),
    ("Most characteristic of /r/politics", comparison_6[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of sfp politics after
[('county', 6.37011745530198), ('he', 6.544536821059878), ('campaign', 6.947383541162555), ('dnc', 7.625767721619438), ('bernie', 9.53348579491636), ('hillary', 11.292429635297127), ('her', 13.368043773852706), ('she', 15.407222997992658), ('clinton', 15.82050777573038), ('sanders', 15.948349171830394)]
Most characteristic of /r/politics
[('deleted', -39.26407911094589), ('removed', -28.602341744738556), ('removal', -23.24503097164692), ('message', -22.591999820608844), ('politics', -22.312123566594636), ('submission', -21.15363206220953), ('regarding', -20.821980017327064), ('reddit', -19.06091273209774), ('moderators', -18.11726929988101), ('hi', -15.751426887179619)]


In [49]:
# TD users' pre-join r/politics posts vs. the general politics sample.
comparison_7 = compare1(l1=list(td_politics_before_text),
                        l2=list(politics_text))

In [50]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of td politics before", comparison_7[-10:]),
    ("Most characteristic of /r/politics", comparison_7[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of td politics before
[('economy', 5.849488485534582), ('corporations', 6.148013237854099), ('police', 6.261071754758695), ('company', 6.635697338158269), ('society', 6.66882326838764), ('are', 7.039459576997191), ('romney', 9.086734562390058), ('ron', 9.512556507595994), ('paul', 10.659595105758761), ('government', 12.956763803359888)]
Most characteristic of /r/politics
[('clinton', -33.98861361030492), ('hillary', -33.967017898955945), ('bernie', -33.00222434558593), ('sanders', -32.38004102599821), ('politics', -29.668786261454454), ('https', -28.57200912623476), ('trump', -28.47970951299201), ('she', -28.380485262031836), ('deleted', -28.35149024770632), ('removed', -25.955764904718)]


In [51]:
# TD users' post-join r/politics posts vs. the general politics sample.
comparison_8 = compare1(l1=list(td_politics_after_text),
                        l2=list(politics_text))

In [52]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of td politics after", comparison_8[-10:]),
    ("Most characteristic of /r/politics", comparison_8[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of td politics after
[('immigration', 7.291139689293844), ('supporters', 7.453902461036277), ('archive', 7.657131053186096), ('http', 7.700607700384159), ('donald', 8.002104766971586), ('immigrants', 8.178537998588288), ('racist', 8.553047472204018), ('his', 9.262026724935232), ('he', 13.88107723575582), ('trump', 27.420548957205554)]
Most characteristic of /r/politics
[('politics', -17.56579470344), ('deleted', -17.06632631698139), ('message', -16.454235020353885), ('removed', -16.09689004221014), ('removal', -12.278088536229408), ('regarding', -11.827408573705215), ('reddit', -11.118566340664577), ('please', -10.6106734086735), ('comment', -10.094967378579845), ('question', -9.965952922677747)]


In [54]:
# SFP users' pre-join r/politics posts vs. their SandersForPresident posts.
comparison_9 = compare1(l1=list(sfp_politics_before_text),
                        l2=list(sfp_posts_text))

In [55]:
# SFP users' post-join r/politics posts vs. their SandersForPresident posts.
comparison_10 = compare1(l1=list(sfp_politics_after_text),
                         l2=list(sfp_posts_text))

In [56]:
# TD users' pre-join r/politics posts vs. their The_Donald posts.
comparison_11 = compare1(l1=list(td_politics_before_text),
                         l2=list(td_posts_text))

In [57]:
# TD users' post-join r/politics posts vs. their The_Donald posts.
comparison_12 = compare1(l1=list(td_politics_after_text),
                         l2=list(td_posts_text))

In [58]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of sfp politics before", comparison_9[-10:]),
    ("Most characteristic of sfp", comparison_9[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of sfp politics before
[('tax', 27.10484955379054), ('federal', 28.274509476757373), ('obama', 29.126054720568423), ('of', 29.303916403919732), ('their', 29.604686023318386), ('that', 30.79382583394437), ('law', 33.7298181343646), ('you', 35.93130752375819), ('gt', 43.35096918681282), ('government', 50.14668114212394)]
Most characteristic of sfp
[('bernie', -114.63841857556642), ('sanders', -90.59465354930272), ('hillary', -81.87531885981572), ('clinton', -81.6799953781497), ('https', -71.79580630735121), ('she', -68.49335757669984), ('her', -59.5205617536509), ('com', -59.275896263748656), ('trump', -54.76905491318175), ('campaign', -52.221234894486244)]


In [59]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of sfp politics after", comparison_10[-10:]),
    ("Most characteristic of sfp", comparison_10[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of sfp politics after
[('shit', 15.817596805510243), ('of', 16.455022704157788), ('server', 17.24778367059468), ('emails', 17.56540555296091), ('fbi', 18.11365211305663), ('obama', 18.18497716744409), ('her', 22.00288948917088), ('she', 29.27894290293328), ('gt', 34.40343780115682), ('trump', 35.05512855607713)]
Most characteristic of sfp
[('bernie', -62.73556912686443), ('https', -45.3100928014652), ('sandersforpresident', -42.69615738722605), ('reddit', -40.29689338941674), ('com', -39.99542671069134), ('message', -36.85573152767616), ('this', -31.059425171027033), ('please', -30.98559069004854), ('www', -28.44069976550496), ('removed', -28.095841217573813)]


In [60]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of td politics before", comparison_11[-10:]),
    ("Most characteristic of td", comparison_11[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of td politics before
[('paul', 21.46555528001312), ('system', 21.745256097876805), ('obama', 22.208049303046717), ('money', 22.312061328034535), ('tax', 22.39263647291169), ('you', 25.152586175528345), ('of', 31.78147598059674), ('that', 33.33715351989539), ('the', 34.33563486038037), ('government', 34.690637562088114)]
Most characteristic of td
[('trump', -88.80827523708332), ('hillary', -85.01979716906384), ('com', -82.87177912490827), ('clinton', -78.84576299425267), ('http', -63.80062007050364), ('https', -59.54064319514849), ('imgur', -51.44909239575585), ('isis', -40.54510915217822), ('donald', -37.77058532376729), ('energy', -33.25411687069987)]


In [61]:
# Tail of the ranking favours the first group, head favours the second.
for heading, top_terms in (
    ("Most characteristic of td politics after", comparison_12[-10:]),
    ("Most characteristic of td", comparison_12[:10]),
):
    print(heading)
    print(top_terms)

Most characteristic of td politics after
[('republicans', 14.675318708520809), ('you', 15.165289444591957), ('obama', 16.02968800023907), ('supporters', 16.11098117622025), ('taxes', 16.214485045190614), ('archive', 16.274146336365124), ('bernie', 17.982481656610236), ('tax', 19.341530304390766), ('party', 19.643693893190783), ('sanders', 23.937767413156294)]
Most characteristic of td
[('com', -40.80950947281718), ('hillary', -37.0934995013998), ('imgur', -33.31918146623839), ('http', -33.2530605818253), ('clinton', -32.95966014129759), ('https', -25.833114820756197), ('isis', -24.301512017715158), ('if', -24.13147766140712), ('this', -20.068094174014043), ('fucking', -19.977844319266826)]


### Notes

Need to do more preprocessing (removing links and '[removed]' posts). In addition, it seems that the "after" posts (and perhaps politics in general?) become more focused on people (Clinton, Sanders, Trump), which in itself isn't necessarily interesting (election-related news coverage increases, so presumably these people are on everyone's minds more regardless). However, it may be interesting if coupled with differences in the variability of language.