## Run Fightin' Words (Monroe et al.) analysis on the groups used for the language model analyses

In [22]:
import csv
import json
import numpy as np
import pandas as pd
import nltk
import re
from IPython.display import display, HTML
from sklearn.feature_extraction.text import CountVectorizer as CV

### Preprocess data

In [3]:
sfp_data = pd.read_csv("../sample_data/sfp_langsample_v4.csv")

In [4]:
td_data = pd.read_csv("../sample_data/td_langsample_v4.csv")

In [5]:
politics_data = pd.read_csv("../sample_data/politics_sample1.csv", 
                            header=None, 
                            names=['author','subreddit','body','score', 'created_dt', 'rand'])

In [6]:
politics_text = politics_data.loc[politics_data['body'].notna()]['body'].values

In [7]:
# Distinct author values found in each of the two frames.
sfp_authors = sfp_data['author'].unique()
td_authors = td_data['author'].unique()
for author_array in (sfp_authors, td_authors):
    print(len(author_array))

# Row counts per subreddit of interest, first for sfp_data and then for td_data.
for frame in (sfp_data, td_data):
    for subreddit_name in ('SandersForPresident', 'The_Donald'):
        print(np.sum(frame['subreddit'] == subreddit_name))

2000
2000
82916
1885
4476
62877


In [8]:
sfp_posts = sfp_data.loc[sfp_data['subreddit'] == 'SandersForPresident']
td_posts = td_data.loc[td_data['subreddit'] == 'The_Donald']

In [9]:
# Earliest created_utc per author within each home subreddit. A single groupby pass
# replaces re-filtering the whole frame once for every author.
sfp_first_post = sfp_posts.groupby('author')['created_utc'].min()
td_first_post = td_posts.groupby('author')['created_utc'].min()

# Lists aligned position-by-position with sfp_authors / td_authors. An author with no
# rows in the home subreddit is still an error (KeyError here) rather than a silent gap.
sfp_min_times = [sfp_first_post.loc[auth] for auth in sfp_authors]
td_min_times = [td_first_post.loc[auth] for auth in td_authors]

In [10]:
sfp_politics = sfp_data.loc[sfp_data['subreddit'] == 'politics']
td_politics = td_data.loc[td_data['subreddit'] == 'politics']
print(sfp_politics.shape)
print(td_politics.shape)

(204584, 5)
(82119, 5)


In [11]:
# Split the /r/politics rows of sfp_data around each author's earliest
# SandersForPresident timestamp (sfp_min_times[i] belongs to sfp_authors[i]).
#
# The per-author DataFrame.append loop was quadratic and DataFrame.append no longer
# exists in pandas >= 2.0, so the cutoff is mapped onto every row and both halves are
# selected with boolean masks. Rows whose timestamp equals the cutoff are excluded from
# both halves, as before. Row order now follows sfp_politics instead of being grouped
# by author.
sfp_cutoff = sfp_politics['author'].map(pd.Series(sfp_min_times, index=sfp_authors))

sfp_politics_before = sfp_politics.loc[sfp_politics['created_utc'] < sfp_cutoff]
sfp_politics_after = sfp_politics.loc[sfp_politics['created_utc'] > sfp_cutoff]
    

In [12]:
# Split the /r/politics rows of td_data around each author's earliest
# The_Donald timestamp (td_min_times[i] belongs to td_authors[i]).
#
# The per-author DataFrame.append loop was quadratic and DataFrame.append no longer
# exists in pandas >= 2.0, so the cutoff is mapped onto every row and both halves are
# selected with boolean masks. Rows whose timestamp equals the cutoff are excluded from
# both halves, as before. Row order now follows td_politics instead of being grouped
# by author.
td_cutoff = td_politics['author'].map(pd.Series(td_min_times, index=td_authors))

td_politics_before = td_politics.loc[td_politics['created_utc'] < td_cutoff]
td_politics_after = td_politics.loc[td_politics['created_utc'] > td_cutoff]
    

In [13]:
# Non-null comment bodies of every group, as plain arrays.
sfp_politics_before_text = sfp_politics_before['body'].dropna().values
sfp_politics_after_text = sfp_politics_after['body'].dropna().values

td_politics_before_text = td_politics_before['body'].dropna().values
td_politics_after_text = td_politics_after['body'].dropna().values

sfp_posts_text = sfp_posts['body'].dropna().values
td_posts_text = td_posts['body'].dropna().values

In [14]:
# Rows posted in each home subreddit, followed by their non-null comment bodies.
sfp_posts = sfp_data[sfp_data['subreddit'].eq('SandersForPresident')]
td_posts = td_data[td_data['subreddit'].eq('The_Donald')]
sfp_posts_text = sfp_posts['body'].dropna().values
td_posts_text = td_posts['body'].dropna().values

In [18]:
# Comments that merely contain each word; this includes ordinary prose as well as
# whatever placeholder body the dataset uses.
deleted_matches = [text for text in politics_text if 'deleted' in text]
removed_matches = [text for text in politics_text if 'removed' in text]
print(len(deleted_matches))
print(len(removed_matches))
# print(deleted_matches[:20])
# print(removed_matches[:20])

# Treat the most frequent matching body as the placeholder. Taking the first match
# instead would silently select an ordinary comment whenever one happens to precede
# the real placeholder in the sample. Raises ValueError if nothing matches.
deleted_str = pd.Series(deleted_matches, dtype=object).value_counts().idxmax()
removed_str = pd.Series(removed_matches, dtype=object).value_counts().idxmax()

# Exact placeholder counts, the number of comments that survive filtering, and the total.
print(sum(1 for text in politics_text if text == deleted_str))
print(sum(1 for text in politics_text if text == removed_str))
print(sum(1 for text in politics_text if (text != removed_str and text != deleted_str)))
print(len(politics_text))

539
294
527
151
9321
9999


### Fightin' Words implementation

In [49]:
def compare1(l1, l2, prior=0.01):
    """Rank unigrams by how strongly they distinguish two groups of documents.

    Implements the z-scored log-odds ratio with a uniform Dirichlet prior
    ("Fightin' Words", Monroe et al.) over the vocabulary kept by the vectorizer
    (terms in at least 10 documents and at most half of them, capped at 15000 features).

    Parameters
    ----------
    l1, l2 : sequence of str
        Documents of the first and second group. Any sequence is accepted; both are
        copied into lists before being concatenated.
    prior : float, optional
        Pseudo-count added to every term in both groups.

    Returns
    -------
    list of (str, float)
        ``(term, z_score)`` pairs sorted by ascending z-score: the most negative scores
        (terms over-represented in ``l2``) come first, the most positive (terms
        over-represented in ``l1``) come last.
    """
    # list() guards against array inputs, for which `+` would not mean concatenation.
    docs1, docs2 = list(l1), list(l2)

    cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, 1),
            binary=False,
            max_features=15000)

    # Keep the document-term matrix sparse: only the per-group column totals are needed,
    # and densifying (n_docs x vocab) can exhaust memory on large samples.
    doc_term = cv.fit_transform(docs1 + docs2)
    n_docs1 = len(docs1)
    counts1 = np.asarray(doc_term[:n_docs1].sum(axis=0), dtype=np.float64).ravel()
    counts2 = np.asarray(doc_term[n_docs1:].sum(axis=0), dtype=np.float64).ravel()

    vocab_size = counts1.shape[0]
    a0 = prior * vocab_size  # total prior mass over the vocabulary
    n1 = counts1.sum()
    n2 = counts2.sum()

    # Smoothed counts, shared by the log-odds and the variance estimate.
    y1 = counts1 + prior
    y2 = counts2 + prior

    # Difference of the two groups' smoothed log-odds for each term.
    delta = np.log(y1 / (n1 + a0 - y1)) - np.log(y2 / (n2 + a0 - y2))
    # Approximate variance of that difference.
    var = 1. / y1 + 1. / y2
    z_scores = delta / np.sqrt(var)

    index_to_term = {idx: term for term, idx in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

In [33]:
comparison_1 = compare1(list(sfp_politics_before_text), list(td_politics_before_text))

In [24]:
print("Most characteristic of sfp politics before")
print(comparison_1[-10:])
print("Most characteristic of td politics before")
print(comparison_1[:10])

Most characteristic of sfp politics before
[('marriage', 11.464935237332657), ('senate', 11.666904300053812), ('law', 12.369936902956647), ('constitutional', 12.398412318618059), ('oregon', 12.843061455890009), ('to', 13.135759740394793), ('constitution', 13.460413829227422), ('rights', 15.013835239133643), ('court', 15.612126652409339), ('gt', 20.22825399579643)]
Most characteristic of td politics before
[('trump', -32.75576158410538), ('he', -18.28811511546875), ('govt', -17.65519879227717), ('hillary', -17.006993055651904), ('bernie', -16.15253762246233), ('yea', -13.543247367014422), ('cruz', -13.261377935702438), ('immigration', -12.774719059468499), ('min', -12.704376847798697), ('rubio', -12.462579315326607)]


In [25]:
comparison_2 = compare1(list(sfp_politics_after_text), list(td_politics_after_text))

In [26]:
print("Most characteristic of sfp politics after")
print(comparison_2[-10:])
print("Most characteristic of td politics after")
print(comparison_2[:10])

Most characteristic of sfp politics after
[('not', 9.084190621669212), ('bernie', 9.247175590576232), ('hillary', 9.688116419502283), ('dnc', 9.893651064192591), ('county', 10.068695852105929), ('but', 11.527496337648081), ('clinton', 17.74746989991353), ('her', 20.065874753437463), ('sanders', 20.20842176488534), ('she', 23.95615946074161)]
Most characteristic of td politics after
[('trump', -49.10091692080196), ('archive', -23.426147638009763), ('immigrants', -19.24262703323421), ('illegal', -18.835392812574074), ('racist', -18.241522420577073), ('immigration', -16.707503355122242), ('donald', -14.830037213464056), ('mexico', -14.032720949955062), ('he', -13.98517061447056), ('http', -13.062854105091295)]


In [27]:
comparison_3 = compare1(list(sfp_politics_before_text), list(sfp_politics_after_text))

In [29]:
print("Most characteristic of sfp politics before")
print(comparison_3[-10:])
print("Most characteristic of sfp politics after")
print(comparison_3[:10])

Most characteristic of sfp politics before
[('our', 22.841195890851253), ('you', 24.045287598150708), ('federal', 24.875366175014232), ('paul', 25.24087801163513), ('rights', 25.509074313831967), ('constitution', 25.56940777379856), ('law', 26.114006358355493), ('police', 26.414350824918362), ('romney', 28.303927422561138), ('government', 45.697546316858485)]
Most characteristic of sfp politics after
[('she', -99.37430468000342), ('sanders', -93.79984484750177), ('clinton', -92.42763698113937), ('bernie', -85.65424300764772), ('hillary', -85.5357975901374), ('her', -83.29228040875994), ('trump', -80.73451755332773), ('he', -53.36343367370652), ('supporters', -49.685523537946786), ('https', -43.49626676467192)]


In [30]:
comparison_4 = compare1(list(td_politics_before_text), list(td_politics_after_text))

In [34]:
print("Most characteristic of td politics before")
print(comparison_4[-10:])
print("Most characteristic of td politics after")
print(comparison_4[:10])

Most characteristic of td politics before
[('police', 9.726026916835153), ('are', 9.731362615146697), ('society', 10.32383957220608), ('insurance', 10.352052192535245), ('that', 11.427934755735768), ('of', 12.017707559222051), ('romney', 12.583836480078473), ('ron', 13.140120201698315), ('paul', 15.854880354064962), ('government', 17.69098951244687)]
Most characteristic of td politics after
[('trump', -79.80049059960629), ('hillary', -46.20842611512671), ('clinton', -44.491381346111496), ('bernie', -43.43896437005444), ('sanders', -40.62619609373871), ('supporters', -33.46782279863542), ('he', -31.865821890010107), ('she', -31.500113111343868), ('https', -30.343246246803005), ('her', -27.585533288481326)]


In [35]:
comparison_5 = compare1(list(sfp_politics_before_text), list(politics_text))

In [38]:
print("Most characteristic of sfp politics before")
print(comparison_5[-10:])
print("Most characteristic of /r/politics")
print(comparison_5[:10])

Most characteristic of sfp politics before
[('we', 7.0180043370529654), ('ron', 7.373011587469217), ('police', 7.409467427666796), ('corporations', 7.734280304272829), ('paul', 7.751734793513766), ('rights', 7.8148380111125375), ('law', 7.838935354287101), ('constitution', 8.36464350607692), ('romney', 9.483828713524865), ('government', 12.790609012596748)]
Most characteristic of /r/politics
[('trump', -53.226675747400996), ('hillary', -49.309612785378995), ('bernie', -47.688979320537555), ('clinton', -42.0508836506008), ('sanders', -40.643073390688656), ('https', -38.61523376930448), ('deleted', -35.62353579128365), ('she', -33.4967231687057), ('removed', -32.242303752003686), ('politics', -30.829457295269354)]


In [37]:
comparison_6 = compare1(list(sfp_politics_after_text), list(politics_text))

In [39]:
print("Most characteristic of sfp politics after")
print(comparison_6[-10:])
print("Most characteristic of /r/politics")
print(comparison_6[:10])

Most characteristic of sfp politics after
[('county', 6.37011745530198), ('he', 6.544536821059878), ('campaign', 6.947383541162555), ('dnc', 7.625767721619438), ('bernie', 9.53348579491636), ('hillary', 11.292429635297127), ('her', 13.368043773852706), ('she', 15.407222997992658), ('clinton', 15.82050777573038), ('sanders', 15.948349171830394)]
Most characteristic of /r/politics
[('deleted', -39.26407911094589), ('removed', -28.602341744738556), ('removal', -23.24503097164692), ('message', -22.591999820608844), ('politics', -22.312123566594636), ('submission', -21.15363206220953), ('regarding', -20.821980017327064), ('reddit', -19.06091273209774), ('moderators', -18.11726929988101), ('hi', -15.751426887179619)]


In [49]:
comparison_7 = compare1(list(td_politics_before_text), list(politics_text))

In [50]:
print("Most characteristic of td politics before")
print(comparison_7[-10:])
print("Most characteristic of /r/politics")
print(comparison_7[:10])

Most characteristic of td politics before
[('economy', 5.849488485534582), ('corporations', 6.148013237854099), ('police', 6.261071754758695), ('company', 6.635697338158269), ('society', 6.66882326838764), ('are', 7.039459576997191), ('romney', 9.086734562390058), ('ron', 9.512556507595994), ('paul', 10.659595105758761), ('government', 12.956763803359888)]
Most characteristic of /r/politics
[('clinton', -33.98861361030492), ('hillary', -33.967017898955945), ('bernie', -33.00222434558593), ('sanders', -32.38004102599821), ('politics', -29.668786261454454), ('https', -28.57200912623476), ('trump', -28.47970951299201), ('she', -28.380485262031836), ('deleted', -28.35149024770632), ('removed', -25.955764904718)]


In [51]:
comparison_8 = compare1(list(td_politics_after_text), list(politics_text))

In [52]:
print("Most characteristic of td politics after")
print(comparison_8[-10:])
print("Most characteristic of /r/politics")
print(comparison_8[:10])

Most characteristic of td politics after
[('immigration', 7.291139689293844), ('supporters', 7.453902461036277), ('archive', 7.657131053186096), ('http', 7.700607700384159), ('donald', 8.002104766971586), ('immigrants', 8.178537998588288), ('racist', 8.553047472204018), ('his', 9.262026724935232), ('he', 13.88107723575582), ('trump', 27.420548957205554)]
Most characteristic of /r/politics
[('politics', -17.56579470344), ('deleted', -17.06632631698139), ('message', -16.454235020353885), ('removed', -16.09689004221014), ('removal', -12.278088536229408), ('regarding', -11.827408573705215), ('reddit', -11.118566340664577), ('please', -10.6106734086735), ('comment', -10.094967378579845), ('question', -9.965952922677747)]


In [54]:
comparison_9 = compare1(list(sfp_politics_before_text), list(sfp_posts_text))

In [55]:
comparison_10 = compare1(list(sfp_politics_after_text), list(sfp_posts_text))

In [56]:
comparison_11 = compare1(list(td_politics_before_text), list(td_posts_text))

In [57]:
comparison_12 = compare1(list(td_politics_after_text), list(td_posts_text))

In [58]:
print("Most characteristic of sfp politics before")
print(comparison_9[-10:])
print("Most characteristic of sfp")
print(comparison_9[:10])

Most characteristic of sfp politics before
[('tax', 27.10484955379054), ('federal', 28.274509476757373), ('obama', 29.126054720568423), ('of', 29.303916403919732), ('their', 29.604686023318386), ('that', 30.79382583394437), ('law', 33.7298181343646), ('you', 35.93130752375819), ('gt', 43.35096918681282), ('government', 50.14668114212394)]
Most characteristic of sfp
[('bernie', -114.63841857556642), ('sanders', -90.59465354930272), ('hillary', -81.87531885981572), ('clinton', -81.6799953781497), ('https', -71.79580630735121), ('she', -68.49335757669984), ('her', -59.5205617536509), ('com', -59.275896263748656), ('trump', -54.76905491318175), ('campaign', -52.221234894486244)]


In [59]:
print("Most characteristic of sfp politics after")
print(comparison_10[-10:])
print("Most characteristic of sfp")
print(comparison_10[:10])

Most characteristic of sfp politics after
[('shit', 15.817596805510243), ('of', 16.455022704157788), ('server', 17.24778367059468), ('emails', 17.56540555296091), ('fbi', 18.11365211305663), ('obama', 18.18497716744409), ('her', 22.00288948917088), ('she', 29.27894290293328), ('gt', 34.40343780115682), ('trump', 35.05512855607713)]
Most characteristic of sfp
[('bernie', -62.73556912686443), ('https', -45.3100928014652), ('sandersforpresident', -42.69615738722605), ('reddit', -40.29689338941674), ('com', -39.99542671069134), ('message', -36.85573152767616), ('this', -31.059425171027033), ('please', -30.98559069004854), ('www', -28.44069976550496), ('removed', -28.095841217573813)]


In [60]:
print("Most characteristic of td politics before")
print(comparison_11[-10:])
print("Most characteristic of td")
print(comparison_11[:10])

Most characteristic of td politics before
[('paul', 21.46555528001312), ('system', 21.745256097876805), ('obama', 22.208049303046717), ('money', 22.312061328034535), ('tax', 22.39263647291169), ('you', 25.152586175528345), ('of', 31.78147598059674), ('that', 33.33715351989539), ('the', 34.33563486038037), ('government', 34.690637562088114)]
Most characteristic of td
[('trump', -88.80827523708332), ('hillary', -85.01979716906384), ('com', -82.87177912490827), ('clinton', -78.84576299425267), ('http', -63.80062007050364), ('https', -59.54064319514849), ('imgur', -51.44909239575585), ('isis', -40.54510915217822), ('donald', -37.77058532376729), ('energy', -33.25411687069987)]


In [61]:
print("Most characteristic of td politics after")
print(comparison_12[-10:])
print("Most characteristic of td")
print(comparison_12[:10])

Most characteristic of td politics after
[('republicans', 14.675318708520809), ('you', 15.165289444591957), ('obama', 16.02968800023907), ('supporters', 16.11098117622025), ('taxes', 16.214485045190614), ('archive', 16.274146336365124), ('bernie', 17.982481656610236), ('tax', 19.341530304390766), ('party', 19.643693893190783), ('sanders', 23.937767413156294)]
Most characteristic of td
[('com', -40.80950947281718), ('hillary', -37.0934995013998), ('imgur', -33.31918146623839), ('http', -33.2530605818253), ('clinton', -32.95966014129759), ('https', -25.833114820756197), ('isis', -24.301512017715158), ('if', -24.13147766140712), ('this', -20.068094174014043), ('fucking', -19.977844319266826)]


### Notes

More preprocessing is needed (removing links and '[removed]' posts). In addition, it seems that the "after" posts (and perhaps /r/politics in general?) become more focused on people (Clinton, Sanders, Trump), which in itself isn't necessarily interesting (election-related news coverage increased, so the candidates were presumably on people's minds more regardless). However, it may be interesting if coupled with differences in the variability of language. Lastly, I should rethink whether the timeframe for the /r/politics sample is ideal, as it seems to be more in line with the "after" posts than the "before" posts.

### Additional preprocessing

In [16]:
sfp_politics_before_text1 = [i for i in sfp_politics_before_text if (i != deleted_str and i != removed_str)]
print(len(sfp_politics_before_text))
print(len(sfp_politics_before_text1))

112162
112162


In [19]:
print(len([i for i in td_posts_text if (i != removed_str and i != deleted_str)]))
print(len(td_posts_text))

62862
62873


In [20]:
# For every group: how many comments are not a placeholder body, then the group size.
for data in (sfp_politics_before_text, sfp_politics_after_text,
             td_politics_before_text, td_politics_after_text,
             sfp_posts_text, td_posts_text):
    kept = sum(1 for text in data if text not in (removed_str, deleted_str))
    print(kept)
    print(len(data))
    print()

112162
112162

92361
92376

60931
60931

21187
21187

82913
82915

62862
62873



In [21]:
# For every group: how many comments contain no 'http' substring, then the group size.
for data in (sfp_politics_before_text, sfp_politics_after_text,
             td_politics_before_text, td_politics_after_text,
             sfp_posts_text, td_posts_text):
    link_free = sum(1 for text in data if 'http' not in text)
    print(link_free)
    print(len(data))
    print()

104886
112162

86890
92376

57412
60931

19679
21187

76503
82915

58139
62873



In [36]:
for data in [sfp_politics_before_text, sfp_politics_after_text, td_politics_before_text, td_politics_after_text, sfp_posts_text, td_posts_text]:
    print([re.sub(r'https?:\/\/[\S]+', ' ', i, flags=re.MULTILINE) for i in data if ('http' in i)][:10])
    print()

['[You serious?]( ', 'So here are the facts:  ', 'The person who submitted the URL noticed that the article had already been submitted before and so he prefixed the URL with an unnecessary login sequence (name@domain). Firefox notices this and also notices that the website doesn\'t even ask for a username/password and it asks you if you still want to continue. I\'m not sure why it would ever be risky to "log into" a site that requires no authentication, but maybe somebody else can chime in there.\r\n\r\nJust to add though, he could have easily appended the URL with a fake variable instead. (e.g.  ', ' \n\nJust sayin...', " \n\nMeh. People aren't even trying anymore. We saw this a month ago.", ' ', "This is what we have now:\n \n\nWhen he was running under the Office of President-Elect, change.gov had a forum where you could post and vote on ideas. It was wrapped up and from what I understand they're evaluating all of the ideas that were submitted. I don't really know all of the specifi

In [37]:
def transform_data(data, placeholders=None):
    """Strip URLs from every comment and drop comments that are only a placeholder.

    Parameters
    ----------
    data : iterable of str
        Comment bodies.
    placeholders : str or collection of str, optional
        Exact bodies to discard. When omitted, the notebook-level ``removed_str`` and
        ``deleted_str`` are used, so existing one-argument calls behave as before.

    Returns
    -------
    list of str
        The surviving comments, in input order, with each ``http(s)://...`` run replaced
        by a single space.
    """
    if placeholders is None:
        # Backward-compatible default: fall back to the globals set earlier in the notebook.
        placeholders = (removed_str, deleted_str)
    elif isinstance(placeholders, str):
        # A bare string would otherwise be treated as a collection of characters.
        placeholders = (placeholders,)
    unwanted = set(placeholders)

    # URLs are removed first, so the placeholder comparison sees the cleaned text.
    cleaned = (re.sub(r'https?:\/\/[\S]+', ' ', text, flags=re.MULTILINE) for text in data)
    return [text for text in cleaned if text not in unwanted]

In [47]:
sfp_politics_before_text1 = transform_data(sfp_politics_before_text) 
sfp_politics_after_text1 = transform_data(sfp_politics_after_text)
td_politics_before_text1 = transform_data(td_politics_before_text) 
td_politics_after_text1 = transform_data(td_politics_after_text)
sfp_posts_text1 = transform_data(sfp_posts_text)
td_posts_text1 = transform_data(td_posts_text)  
politics_text1 = transform_data(politics_text)

In [50]:
comparison_1m = compare1(list(sfp_politics_before_text1), list(td_politics_before_text1))

In [51]:
print("Most characteristic of sfp politics before")
print(comparison_1m[-10:])
print("Most characteristic of td politics before")
print(comparison_1m[:10])

Most characteristic of sfp politics before
[('senate', 11.33825642704323), ('supreme', 11.43457215451083), ('law', 12.216910657300389), ('constitutional', 12.454600979152794), ('oregon', 12.643869118052905), ('constitution', 13.480319243180645), ('to', 13.688963058189264), ('rights', 14.941687843504596), ('court', 15.318217067238812), ('gt', 20.4103132080107)]
Most characteristic of td politics before
[('trump', -32.49884658395796), ('he', -18.1003879800118), ('govt', -17.757555639824165), ('hillary', -16.978311977382415), ('bernie', -16.125939743083148), ('yea', -13.514768717920674), ('cruz', -13.240139184070538), ('immigration', -13.167591917994798), ('min', -12.662809804833188), ('dont', -12.410391480379772)]


In [52]:
comparison_2m = compare1(list(sfp_politics_after_text1), list(td_politics_after_text1))

In [53]:
print("Most characteristic of sfp politics after")
print(comparison_2m[-10:])
print("Most characteristic of td politics after")
print(comparison_2m[:10])

Most characteristic of sfp politics after
[('vote', 8.314071134623605), ('not', 8.523710428250661), ('county', 9.517415947213648), ('dnc', 9.743512349413985), ('but', 11.021882611025196), ('hillary', 12.122772784762587), ('sanders', 18.714577928984117), ('her', 19.802033635805415), ('clinton', 20.559682335336703), ('she', 23.58709350467939)]
Most characteristic of td politics after
[('trump', -48.23520485166701), ('racist', -18.525859798576295), ('illegal', -18.236056847019235), ('immigrants', -18.054581614855422), ('immigration', -16.56241511326069), ('he', -14.589956072242405), ('mexico', -14.187367356217099), ('violent', -12.601002383557137), ('judge', -12.350752006491321), ('donald', -11.834146523563374)]


In [54]:
comparison_3m = compare1(list(sfp_politics_before_text1), list(sfp_politics_after_text1))

In [55]:
print("Most characteristic of sfp politics before")
print(comparison_3m[-10:])
print("Most characteristic of sfp politics after")
print(comparison_3m[:10])

Most characteristic of sfp politics before
[('our', 22.63618847165981), ('you', 23.108303415348654), ('paul', 24.802535284378322), ('federal', 25.05086880660778), ('rights', 25.349935308448387), ('constitution', 25.393447966640263), ('law', 25.76215756254381), ('police', 26.155298368795254), ('romney', 27.895593077256382), ('government', 45.597474807132535)]
Most characteristic of sfp politics after
[('she', -99.63481055197533), ('sanders', -91.30655248916086), ('clinton', -89.48524903282994), ('her', -83.52300314779959), ('bernie', -83.38833814154691), ('hillary', -83.163422094394), ('trump', -79.60332693603942), ('he', -53.994195411244995), ('supporters', -49.66154068284942), ('candidate', -42.91558911513844)]


In [56]:
comparison_4m = compare1(list(td_politics_before_text1), list(td_politics_after_text1))

In [57]:
print("Most characteristic of td politics before")
print(comparison_4m[-10:])
print("Most characteristic of td politics after")
print(comparison_4m[:10])

Most characteristic of td politics before
[('health', 9.605338641991418), ('police', 9.697746152009769), ('that', 9.7001219121693), ('insurance', 10.095910729300737), ('society', 10.297531299435308), ('of', 10.46643225031583), ('romney', 12.339484337345187), ('ron', 12.940685026305786), ('paul', 15.58010121050074), ('government', 17.59716010238678)]
Most characteristic of td politics after
[('trump', -78.5721004416358), ('bernie', -43.27115570024865), ('hillary', -41.72602806919734), ('sanders', -40.22940096316957), ('clinton', -38.866661987615764), ('supporters', -33.63294636275014), ('he', -32.91649213002195), ('she', -31.97704867012858), ('her', -27.903650348952805), ('candidate', -20.341453244991396)]


In [58]:
comparison_5m = compare1(list(sfp_politics_before_text1), list(politics_text1))

In [59]:
print("Most characteristic of sfp politics before")
print(comparison_5m[-10:])
print("Most characteristic of /r/politics")
print(comparison_5m[:10])

Most characteristic of sfp politics before
[('debt', 6.87180335880116), ('ron', 7.1577581833756625), ('police', 7.228279659656836), ('paul', 7.437709583169424), ('corporations', 7.635419982100003), ('law', 7.661732903639682), ('rights', 7.746801018511705), ('constitution', 8.290022399371527), ('romney', 9.34395822536099), ('government', 12.711725960608248)]
Most characteristic of /r/politics
[('trump', -52.597549453807716), ('bernie', -47.09258666591536), ('hillary', -46.995112954561435), ('sanders', -39.74582884871466), ('clinton', -38.71654842823778), ('she', -33.729123585960245), ('removal', -28.24798035653597), ('her', -27.07405740186644), ('submission', -26.132622606910704), ('supporters', -23.39038610494753)]


In [60]:
comparison_6m = compare1(list(sfp_politics_after_text1), list(politics_text1))

In [61]:
print("Most characteristic of sfp politics after")
print(comparison_6m[-10:])
print("Most characteristic of /r/politics")
print(comparison_6m[:10])

Most characteristic of sfp politics after
[('reporting', 6.186242197924017), ('he', 6.281900033408276), ('campaign', 6.488732481108509), ('dnc', 7.449797898200622), ('bernie', 8.621198325896472), ('hillary', 11.330983009861932), ('her', 13.148171151136998), ('sanders', 15.206749186008965), ('she', 15.2440390998092), ('clinton', 16.01634211275044)]
Most characteristic of /r/politics
[('removal', -23.27480439447159), ('submission', -21.182106422643933), ('regarding', -20.839424744189735), ('moderators', -18.15329685528378), ('removed', -17.49662699338669), ('message', -16.70684840106849), ('hi', -15.736384044641971), ('participating', -15.118699917756029), ('please', -14.79023809915942), ('questions', -13.18170606973819)]


In [62]:
comparison_7m = compare1(list(td_politics_before_text1), list(politics_text1))

In [63]:
print("Most characteristic of td politics before")
print(comparison_7m[-10:])
print("Most characteristic of /r/politics")
print(comparison_7m[:10])

Most characteristic of td politics before
[('economy', 6.027359107417494), ('police', 6.032855745925215), ('corporations', 6.049607580788157), ('are', 6.454405546579397), ('society', 6.5606035755065255), ('company', 6.648193846061862), ('romney', 8.975624136270351), ('ron', 9.381365254109744), ('paul', 10.489512267744932), ('government', 12.793824227571333)]
Most characteristic of /r/politics
[('bernie', -32.543136343029516), ('hillary', -32.00335968825887), ('sanders', -31.596571416743174), ('clinton', -31.387897295407097), ('she', -28.673030135401152), ('trump', -28.127368020590815), ('her', -24.42146038044632), ('removal', -21.88179381610442), ('message', -20.552649318734854), ('regarding', -18.9461401043056)]


In [66]:
comparison_8m = compare1(list(td_politics_after_text1), list(politics_text1))

In [67]:
print("Most characteristic of td politics after")
print(comparison_8m[-10:])
print("Most characteristic of /r/politics")
print(comparison_8m[:10])

Most characteristic of td politics after
[('illegal', 6.382071503687986), ('america', 6.593795852093893), ('mexico', 6.860171941179147), ('immigration', 7.119996332338804), ('supporters', 7.443892070100035), ('immigrants', 7.685372694988114), ('racist', 8.65674790197013), ('his', 9.260103552537375), ('he', 13.987407986237903), ('trump', 26.771585840957357)]
Most characteristic of /r/politics
[('message', -13.618468648244862), ('removal', -12.267029914346196), ('regarding', -12.112597373765198), ('please', -10.57472416623357), ('removed', -10.412374643653605), ('comment', -10.044050454897063), ('questions', -9.488604917658936), ('moderators', -9.153565973985467), ('submission', -8.962233528232792), ('hi', -8.63708568972849)]


In [68]:
comparison_9m = compare1(list(sfp_politics_before_text1), list(sfp_posts_text1))

In [69]:
comparison_10m = compare1(list(sfp_politics_after_text1), list(sfp_posts_text1))

In [70]:
comparison_11m = compare1(list(td_politics_before_text1), list(td_posts_text1))

In [71]:
comparison_12m = compare1(list(td_politics_after_text1), list(td_posts_text1))

In [72]:
print("Most characteristic of sfp politics before")
print(comparison_9m[-10:])
print("Most characteristic of sfp")
print(comparison_9m[:10])

Most characteristic of sfp politics before
[('of', 26.621615177143394), ('tax', 26.85040784735617), ('obama', 27.832312807345424), ('federal', 27.880620721232393), ('that', 28.46403105504504), ('their', 28.713335003023218), ('law', 33.16342483591756), ('you', 34.02296568540546), ('gt', 42.54648218114958), ('government', 49.6965501161795)]
Most characteristic of sfp
[('bernie', -113.11871203007799), ('sanders', -88.94579815259455), ('hillary', -81.03789564128498), ('clinton', -80.78315716775295), ('she', -69.23166907339403), ('her', -60.23693414128286), ('trump', -54.55730589913392), ('campaign', -52.1667651888893), ('vote', -49.8980005382998), ('win', -47.25299685386287)]


In [73]:
print("Most characteristic of sfp politics after")
print(comparison_10m[-10:])
print("Most characteristic of sfp")
print(comparison_10m[:10])

Most characteristic of sfp politics after
[('fucking', 15.550178431227177), ('shit', 15.58287631418329), ('emails', 16.592063243396364), ('server', 16.9938624936156), ('obama', 17.3196900417105), ('fbi', 17.614995076999655), ('her', 21.36538501698456), ('she', 28.6002638929509), ('trump', 33.61983416849249), ('gt', 33.97110423903559)]
Most characteristic of sfp
[('bernie', -64.11317973582892), ('this', -31.96328935140618), ('please', -31.195461451102503), ('we', -28.795805410721833), ('message', -28.605724506906398), ('removed', -28.274334177953076), ('link', -26.48911091590091), ('thank', -25.891211736658168), ('moderators', -25.190729223616298), ('my', -24.810599119367517)]


In [74]:
print("Most characteristic of td politics before")
print(comparison_11m[-10:])
print("Most characteristic of td")
print(comparison_11m[:10])

Most characteristic of td politics before
[('paul', 20.842670538835574), ('system', 21.055873992821454), ('money', 21.19826852039764), ('you', 21.26861616230126), ('obama', 21.448082508246188), ('tax', 21.778963909722126), ('the', 26.833515282246136), ('of', 27.137634536888626), ('that', 28.86118925279632), ('government', 33.74133087464238)]
Most characteristic of td
[('trump', -89.16611639255923), ('hillary', -85.6770377327289), ('clinton', -79.21315127405852), ('isis', -40.86583082503335), ('donald', -36.46815217086809), ('if', -34.19756928972393), ('energy', -33.949489880036346), ('fucking', -32.70580049516698), ('source', -32.54108911054748), ('open', -31.59971178540883)]


In [75]:
print("Most characteristic of td politics after")
print(comparison_12m[-10:])
print("Most characteristic of td")
print(comparison_12m[:10])

Most characteristic of td politics after
[('you', 13.592991800160796), ('gt', 13.867330527833213), ('republicans', 14.234732954512907), ('obama', 14.574439054426476), ('supporters', 15.556351834802426), ('taxes', 15.589580623625212), ('bernie', 16.80317770894974), ('tax', 18.5997131511903), ('party', 19.070020661570535), ('sanders', 22.944165349278393)]
Most characteristic of td
[('hillary', -40.9500816114052), ('clinton', -37.594972939797614), ('if', -25.22374064297781), ('isis', -24.434953841306783), ('this', -21.436952692068473), ('fucking', -20.32270190105012), ('fuck', -19.584349445751155), ('my', -18.062387142344875), ('donald', -18.01815332892516), ('energy', -17.999115399673375)]


In [93]:
# Explore which posts in politics_text1 mention "submission", to see how many of them
# look like removal notices or bot boilerplate rather than ordinary comments.
# Only the 'i am a bot' check is case-insensitive (it lowercases the post first);
# every other substring check here matches case exactly.

# Sample of up to ten posts containing 'submission', followed by the total count.
print([post for post in politics_text1 if 'submission' in post][:10])
print(sum(1 for post in politics_text1 if 'submission' in post))

# Counts for the two candidate boilerplate markers on their own.
print(sum(1 for post in politics_text1 if 'your submission' in post))
print(sum(1 for post in politics_text1 if 'i am a bot' in post.lower()))

# Up to ten 'submission' posts that do not carry the bot marker.
print([post for post in politics_text1
       if 'submission' in post and 'i am a bot' not in post.lower()][:10])

# Number of posts matched by either the removal-notice pair of phrases or the bot marker.
print(sum(1 for post in politics_text1
          if ('your submission' in post and
              'has been removed for the following reason' in post)
          or 'i am a bot' in post.lower()))

# Every 'submission' post that matches none of the three boilerplate phrases.
print([post for post in politics_text1
       if 'submission' in post and not (
           'your submission' in post or
           'has been removed for the following reason' in post or
           'i am a bot' in post.lower())])

["Hi `DragonPup`. Thank you for participating in /r/Politics. However, [your submission](  has been removed for the following reason(s):\n\n* [Unacceptable Title](  - Your headline must be comprised only of the copied and pasted headline of the article OR a continuous quote taken from the article. If using a quote, it should reflect the article as a whole\n\n    **We recommend not using the Reddit 'suggest a title' as it tends to not give the exact title of the article.**\n\nYou are welcome to resubmit your link, but please follow the submissions rules listed in the sidebar.\n\n\n\nIf you have any questions about this removal, please feel free to [message the moderators.](  regarding the removal of this submission by /u/DragonPup&amp;message=I have a question regarding the removal of this [submission.]( ", '&gt; **The /r/Politics On Topic Statement**\n\n&gt; All submissions to /r/politics need to be *explicitly* about current US politics.  This means that if a subject has political *im

In [119]:
# Explore which posts in politics_text1 mention "message" and how the candidate
# boilerplate phrases overlap. The 'i am a bot' and 'thank you for participating in'
# checks are case-insensitive; the remaining substring checks match case exactly.

# Total number of posts containing 'message'.
print(sum(1 for post in politics_text1 if 'message' in post))

# Count, then a sample of up to ten, of the 'message' posts that match none of the
# three removal/bot phrases.
print(sum(1 for post in politics_text1
          if 'message' in post and not (
              'your submission' in post or
              'has been removed for the following reason' in post or
              'i am a bot' in post.lower())))
print([post for post in politics_text1
       if 'message' in post and not (
           'your submission' in post or
           'has been removed for the following reason' in post or
           'i am a bot' in post.lower())][:10])

# Posts caught by the removal-notice/bot test but lacking the
# 'thank you for participating in' phrase. The inner parentheses only spell out the
# grouping that Python's and-before-or precedence already applies.
print(sum(1 for post in politics_text1
          if (('your submission' in post and
               'has been removed for the following reason' in post)
              or 'i am a bot' in post.lower())
          and 'thank you for participating in' not in post.lower()))

# Posts containing either of the two case-insensitive phrases.
print(sum(1 for post in politics_text1
          if 'i am a bot' in post.lower() or
          'thank you for participating in' in post.lower()))

155
42
["I think that was because he knew they weren't really going to address healthcare during the debate. You have to keep in mind that he's always getting new eyeballs each debate. People who don't know his message. He has to make sure to get as much of his platform in as possible. If Bernie wanted to play dirty, he would have pivoted from the Boeing talking point to the donations to the Clinton Foundation and the investigation. ", "Good. Everybody should get the chance to vote for him even if it's just to send a message to Clinton.", "It's a good move, they're not firing anyone, they're just bringing some additional help in. This is more than likely a result of a longer then expected primary season.\n\nClinton has a good message, she really does... and being moderate helps her tremendously in general regard, but her message has been quieted by overzealous media outlets who would prefer higher ratings to actual facts. Sanders has been bit by this reality as well.\n\nIn fact, I woul

In [108]:
# Check politics_text2 for leftover moderation boilerplate; all checks are
# case-insensitive because each post is lowercased before the substring test.
# NOTE(review): politics_text2 is assigned in a later cell of this notebook, and that
# assignment drops every post containing 'thank you for participating in'. The first
# count below can therefore only be nonzero against a different politics_text2 than
# the one that later cell builds - confirm the intended cell order.

# Counts of posts containing each phrase.
print(sum(1 for post in politics_text2 if 'thank you for participating in' in post.lower()))
print(sum(1 for post in politics_text2 if 'comment' in post.lower()))
print(sum(1 for post in politics_text2 if 'removal' in post.lower()))

# Up to ten posts that mention 'comment' without mentioning 'removal'.
print([post for post in politics_text2
       if 'comment' in post.lower() and 'removal' not in post.lower()][:10])

# Every post that mentions 'removal' but lacks the 'thank you for participating in' phrase.
print([post for post in politics_text2
       if 'removal' in post.lower() and
       'thank you for participating in' not in post.lower()])

32
216
36
['Wow the Clinton team is stepping up their game, next time try not writing your comment like a Huffington Post headline.\n\nThe past 50 days of your posts are only about Hillary Clinton.\n\nAnd the only things you have commented on in the past 60 days has been Hillary Clinton related.\n\nDo you have any thing to say about that or is it just a coincidence?', "No, I mean Congress. That's where laws are proposed and voted on for the governor to sign into law. The ironic part of your comment is that the examples of far away pharmacies are myths. Mail order pharmacies operate overnight! I could mail you birth control and it show up at your door by noon. There really isn't an excuse. ", 'My question had no downvotes. Nearly every comment in the entire post was at 1 or 2 with maybe 3 out of 60 comments at 0. There was no brigading, they just banned me under the guise of me being "a troll", despite all evidence to the contrary. They have thin skin. They want everyone to be PC to the

In [120]:
# Build politics_text2 from politics_text1 by discarding every post whose lowercased
# text contains either of the two boilerplate phrases; all other posts keep their order.
politics_text2 = [
    post for post in politics_text1
    if not ('i am a bot' in post.lower() or
            'thank you for participating in' in post.lower())
]

In [121]:
# Number of posts before (politics_text1) and after (politics_text2) the boilerplate filter,
# one per line.
print(len(politics_text1), len(politics_text2), sep="\n")

9321
9208


In [111]:
# Run compare1 (defined elsewhere in this notebook) once per group corpus, each time
# against the filtered /r/politics corpus politics_text2. Every call receives fresh
# list copies of both inputs. The calls happen in the order listed: sfp before,
# sfp after, td before, td after.
comparison_5m1, comparison_6m1, comparison_7m1, comparison_8m1 = (
    compare1(list(group_text), list(politics_text2))
    for group_text in (
        sfp_politics_before_text1,
        sfp_politics_after_text1,
        td_politics_before_text1,
        td_politics_after_text1,
    )
)

In [112]:
# Show both ends of comparison_5m1: the last ten entries first, then the first ten.
# The saved output lists (word, score) pairs in ascending score order, so the tail
# carries the largest positive scores and the head the most negative ones.
print("Most characteristic of sfp politics before", comparison_5m1[-10:], sep="\n")
print("Most characteristic of /r/politics", comparison_5m1[:10], sep="\n")

Most characteristic of sfp politics before
[('police', 7.014041284693117), ('ron', 7.043397557612062), ('paul', 7.243742338170755), ('law', 7.342045359670338), ('rights', 7.459722431358123), ('corporations', 7.470839808185907), ('our', 7.542933575519699), ('constitution', 8.132196792845198), ('romney', 9.198866170299086), ('government', 12.240014604686287)]
Most characteristic of /r/politics
[('trump', -53.052602624894135), ('bernie', -47.629796214023116), ('hillary', -47.49294573689576), ('sanders', -40.293017533870334), ('clinton', -39.23439687817787), ('she', -34.47022807918367), ('her', -27.70539083431693), ('supporters', -23.72478862022055), ('cruz', -17.954262098183698), ('candidate', -17.883703960711227)]


In [113]:
# Show both ends of comparison_6m1: the last ten entries first, then the first ten.
# The saved output lists (word, score) pairs in ascending score order, so the tail
# carries the largest positive scores and the head the most negative ones.
print("Most characteristic of sfp politics after", comparison_6m1[-10:], sep="\n")
print("Most characteristic of /r/politics", comparison_6m1[:10], sep="\n")

Most characteristic of sfp politics after
[('county', 5.795103638983482), ('reporting', 6.082541305219561), ('campaign', 6.124506488009943), ('dnc', 7.2638276949494), ('bernie', 7.96473047466728), ('hillary', 10.700433324232744), ('her', 12.475585226609809), ('she', 14.450338635428047), ('sanders', 14.572201635445609), ('clinton', 15.410235827020582)]
Most characteristic of /r/politics
[('you', -7.620880338632236), ('dont', -6.675575310294289), ('government', -6.598668481544169), ('business', -6.537356677804144), ('property', -6.476151159736168), ('court', -6.214930738703188), ('sex', -6.151623110628008), ('ip', -6.135266791450714), ('wage', -5.85731470988063), ('salary', -5.744778042515255)]


In [114]:
# Show both ends of comparison_7m1: the last ten entries first, then the first ten.
# The saved output lists (word, score) pairs in ascending score order, so the tail
# carries the largest positive scores and the head the most negative ones.
print("Most characteristic of td politics before", comparison_7m1[-10:], sep="\n")
print("Most characteristic of /r/politics", comparison_7m1[:10], sep="\n")

Most characteristic of td politics before
[('are', 5.6403245572373075), ('economy', 5.789961318293567), ('police', 5.821499441991935), ('corporations', 5.886860792813392), ('society', 6.3645764162043434), ('company', 6.475422739330804), ('romney', 8.83089147496581), ('ron', 9.26655982607467), ('paul', 10.29539647788375), ('government', 12.32760875061358)]
Most characteristic of /r/politics
[('bernie', -33.07725584563781), ('hillary', -32.508494766136884), ('sanders', -32.106034097794456), ('clinton', -31.864746577339034), ('she', -29.369296550840005), ('trump', -28.646197537824094), ('her', -25.010841370784554), ('supporters', -19.028483792306485), ('candidate', -15.628986265505242), ('vote', -14.429838297281627)]


In [115]:
# Show both ends of comparison_8m1: the last ten entries first, then the first ten.
# The saved output lists (word, score) pairs in ascending score order, so the tail
# carries the largest positive scores and the head the most negative ones.
print("Most characteristic of td politics after", comparison_8m1[-10:], sep="\n")
print("Most characteristic of /r/politics", comparison_8m1[:10], sep="\n")

Most characteristic of td politics after
[('america', 6.297913490043675), ('media', 6.746404327449269), ('mexico', 6.752136262043538), ('immigration', 6.96895684957936), ('supporters', 7.087463996369), ('immigrants', 7.541939158324698), ('racist', 8.439878938689047), ('his', 8.589213610511443), ('he', 12.913930956118062), ('trump', 26.13163206753394)]
Most characteristic of /r/politics
[('but', -6.08177072914486), ('republicans', -5.630261697734938), ('that', -5.355312654082056), ('gop', -5.267652448356456), ('malley', -5.0502942537097), ('debates', -4.957944284499719), ('religious', -4.837211203400614), ('aca', -4.792065210916407), ('not', -4.7650177377684955), ('logical', -4.724057758080657)]
