## Run the Fightin' Words (Monroe et al.) analysis on the groups used for the language model analyses

In [1]:
import csv
import json
import numpy as np
import pandas as pd
import nltk
import re
from IPython.display import display, HTML
from sklearn.feature_extraction.text import CountVectorizer as CV

### Preprocess data

In [None]:
sfp_data = pd.read_csv("../sample_data/sfp_langsample_v4.csv")

In [None]:
td_data = pd.read_csv("../sample_data/td_langsample_v4.csv")

In [None]:
politics_data = pd.read_csv("../sample_data/politics_sample1.csv", 
                            header=None, 
                            names=['author','subreddit','body','score', 'created_dt', 'rand'])

In [None]:
# Comment bodies of the /r/politics sample, with missing bodies dropped.
politics_text = politics_data['body'].dropna().values

In [None]:
# Distinct authors in each sample, in order of first appearance.
sfp_authors = sfp_data['author'].unique()
td_authors = td_data['author'].unique()
for author_ids in (sfp_authors, td_authors):
    print(len(author_ids))

# How many rows of each sample come from each candidate subreddit.
for sample in (sfp_data, td_data):
    for candidate_sub in ('SandersForPresident', 'The_Donald'):
        print(np.sum(sample['subreddit'] == candidate_sub))

In [None]:
# Rows of each sample that were posted in that sample's own candidate subreddit.
in_sfp_sub = sfp_data['subreddit'] == 'SandersForPresident'
in_td_sub = td_data['subreddit'] == 'The_Donald'
sfp_posts = sfp_data[in_sfp_sub]
td_posts = td_data[in_td_sub]

In [None]:
# Earliest created_utc of each author's posts in their candidate subreddit,
# as lists aligned position-by-position with sfp_authors / td_authors.
# One groupby pass replaces re-filtering the whole frame once per author.
# An author with no post in the subreddit gets NaN instead of aborting the
# cell on min() of an empty selection; NaN compares False against any
# timestamp, so such an author falls in neither the "before" nor the "after"
# split computed from these lists.
sfp_min_times = sfp_posts.groupby('author')['created_utc'].min().reindex(sfp_authors).tolist()
td_min_times = td_posts.groupby('author')['created_utc'].min().reindex(td_authors).tolist()

In [None]:
# Rows of each sample that were posted in /r/politics.
sfp_politics = sfp_data[sfp_data['subreddit'] == 'politics']
td_politics = td_data[td_data['subreddit'] == 'politics']
for politics_subset in (sfp_politics, td_politics):
    print(politics_subset.shape)

In [None]:
# Split the SFP authors' /r/politics comments into those made strictly before
# and strictly after the author's first SandersForPresident post (comments at
# exactly that timestamp land in neither frame).
#
# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every call, so instead the per-author cutoff is looked up for every row at
# once and applied as a boolean mask.  Rows keep the sample's original order
# rather than being grouped by author; the cells that consume these frames only
# extract the 'body' column as a bag of texts.
sfp_first_post_time = dict(zip(sfp_authors, sfp_min_times))
sfp_cutoffs = sfp_politics['author'].map(sfp_first_post_time)

sfp_politics_before = sfp_politics.loc[sfp_politics['created_utc'] < sfp_cutoffs]
sfp_politics_after = sfp_politics.loc[sfp_politics['created_utc'] > sfp_cutoffs]
    

In [None]:
# Split the T_D authors' /r/politics comments into those made strictly before
# and strictly after the author's first The_Donald post (comments at exactly
# that timestamp land in neither frame).
#
# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every call, so instead the per-author cutoff is looked up for every row at
# once and applied as a boolean mask.  Rows keep the sample's original order
# rather than being grouped by author; the cells that consume these frames only
# extract the 'body' column as a bag of texts.
td_first_post_time = dict(zip(td_authors, td_min_times))
td_cutoffs = td_politics['author'].map(td_first_post_time)

td_politics_before = td_politics.loc[td_politics['created_utc'] < td_cutoffs]
td_politics_after = td_politics.loc[td_politics['created_utc'] > td_cutoffs]
    

In [None]:
# Non-missing comment bodies for each group, as numpy arrays.
sfp_politics_before_text = sfp_politics_before['body'].dropna().values
sfp_politics_after_text = sfp_politics_after['body'].dropna().values

td_politics_before_text = td_politics_before['body'].dropna().values
td_politics_after_text = td_politics_after['body'].dropna().values

sfp_posts_text = sfp_posts['body'].dropna().values
td_posts_text = td_posts['body'].dropna().values

In [None]:
# Recomputes sfp_posts / td_posts and their text arrays with the same
# expressions used in earlier cells, so this cell can be run on its own.
sfp_posts = sfp_data[sfp_data['subreddit'] == 'SandersForPresident']
td_posts = td_data[td_data['subreddit'] == 'The_Donald']
sfp_posts_text = sfp_posts['body'].dropna().values
td_posts_text = td_posts['body'].dropna().values

In [None]:
# Reddit replaces the body of a user-deleted comment with "[deleted]" and of a
# moderator-removed comment with "[removed]".  The placeholders are spelled out
# here instead of being taken from the first comment that merely *contains*
# the word: that could pick an ordinary comment (e.g. "why was this removed?")
# and raised IndexError when no comment matched.
# NOTE(review): assumes the sample stores the standard placeholders verbatim;
# compare the substring counts with the exact-match counts below to confirm.
deleted_str = '[deleted]'
removed_str = '[removed]'

# Comments that mention the word anywhere in the body.
print(sum('deleted' in body for body in politics_text))
print(sum('removed' in body for body in politics_text))

# Comments that are exactly a placeholder, then the remainder and the total.
n_deleted = sum(body == deleted_str for body in politics_text)
n_removed = sum(body == removed_str for body in politics_text)
print(n_deleted)
print(n_removed)
print(len(politics_text) - n_deleted - n_removed)
print(len(politics_text))

### Fightin' Words implementation

In [None]:
def compare1(l1, l2, prior=0.01):
    """Rank vocabulary terms by their Fightin' Words z-score (Monroe et al.).

    A unigram CountVectorizer (min_df=10, max_df=0.5, at most 15000 features)
    is fitted on both corpora together.  For every term, the difference in
    log-odds between the two corpora is computed with ``prior`` added to each
    term count in both corpora, then divided by its estimated standard
    deviation.

    Args:
        l1: sequence of documents (strings) forming the first corpus.
        l2: sequence of documents (strings) forming the second corpus.
        prior: pseudo-count added to every term in both corpora.

    Returns:
        A list of ``(term, z_score)`` tuples sorted by ascending z-score, so
        terms over-represented in ``l2`` come first and terms
        over-represented in ``l1`` come last.
    """
    # Accept any sequence (list, tuple, numpy array); `+` on two arrays would
    # not concatenate them.
    docs1, docs2 = list(l1), list(l2)

    cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, 1),
            binary=False,
            max_features=15000)

    # Keep the document-term matrix sparse: converting it to a dense array
    # allocates n_documents * vocab_size cells only to take two column sums.
    counts_mat = cv.fit_transform(docs1 + docs2)
    split = len(docs1)

    # Per-term totals for each corpus, in float64 so large counts stay exact.
    counts1 = np.asarray(counts_mat[:split].sum(axis=0), dtype=np.float64).ravel()
    counts2 = np.asarray(counts_mat[split:].sum(axis=0), dtype=np.float64).ravel()

    vocab_size = counts1.shape[0]
    a0 = prior * vocab_size  # total prior mass over the vocabulary
    n1 = counts1.sum()
    n2 = counts2.sum()

    # Smoothed counts, log-odds difference and its variance, for all terms at once.
    y1 = counts1 + prior
    y2 = counts2 + prior
    delta = np.log(y1 / (n1 + a0 - y1)) - np.log(y2 / (n2 + a0 - y2))
    var = 1. / y1 + 1. / y2
    z_scores = delta / np.sqrt(var)

    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

In [None]:
# compare1(a, b) returns (term, z) pairs sorted by ascending z-score, so in
# every cell below the tail ([-10:]) holds the terms most characteristic of the
# first argument and the head ([:10]) those most characteristic of the second.
# 1: SFP authors vs. T_D authors on /r/politics, "before" comments.
comparison_1 = compare1(list(sfp_politics_before_text), list(td_politics_before_text))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_1[-10:])
print("Most characteristic of td politics before")
print(comparison_1[:10])

In [None]:
# 2: SFP authors vs. T_D authors on /r/politics, "after" comments.
comparison_2 = compare1(list(sfp_politics_after_text), list(td_politics_after_text))

In [None]:
print("Most characteristic of sfp politics after")
print(comparison_2[-10:])
print("Most characteristic of td politics after")
print(comparison_2[:10])

In [None]:
# 3: SFP authors on /r/politics, "before" vs. "after".
comparison_3 = compare1(list(sfp_politics_before_text), list(sfp_politics_after_text))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_3[-10:])
print("Most characteristic of sfp politics after")
print(comparison_3[:10])

In [None]:
# 4: T_D authors on /r/politics, "before" vs. "after".
comparison_4 = compare1(list(td_politics_before_text), list(td_politics_after_text))

In [None]:
print("Most characteristic of td politics before")
print(comparison_4[-10:])
print("Most characteristic of td politics after")
print(comparison_4[:10])

In [None]:
# 5-8: each author group's /r/politics comments vs. the general /r/politics sample.
comparison_5 = compare1(list(sfp_politics_before_text), list(politics_text))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_5[-10:])
print("Most characteristic of /r/politics")
print(comparison_5[:10])

In [None]:
comparison_6 = compare1(list(sfp_politics_after_text), list(politics_text))

In [None]:
print("Most characteristic of sfp politics after")
print(comparison_6[-10:])
print("Most characteristic of /r/politics")
print(comparison_6[:10])

In [None]:
comparison_7 = compare1(list(td_politics_before_text), list(politics_text))

In [None]:
print("Most characteristic of td politics before")
print(comparison_7[-10:])
print("Most characteristic of /r/politics")
print(comparison_7[:10])

In [None]:
comparison_8 = compare1(list(td_politics_after_text), list(politics_text))

In [None]:
print("Most characteristic of td politics after")
print(comparison_8[-10:])
print("Most characteristic of /r/politics")
print(comparison_8[:10])

In [None]:
# 9-12: each author group's /r/politics comments vs. the same sample's posts
# in its own candidate subreddit.
comparison_9 = compare1(list(sfp_politics_before_text), list(sfp_posts_text))

In [None]:
comparison_10 = compare1(list(sfp_politics_after_text), list(sfp_posts_text))

In [None]:
comparison_11 = compare1(list(td_politics_before_text), list(td_posts_text))

In [None]:
comparison_12 = compare1(list(td_politics_after_text), list(td_posts_text))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_9[-10:])
print("Most characteristic of sfp")
print(comparison_9[:10])

In [None]:
print("Most characteristic of sfp politics after")
print(comparison_10[-10:])
print("Most characteristic of sfp")
print(comparison_10[:10])

In [None]:
print("Most characteristic of td politics before")
print(comparison_11[-10:])
print("Most characteristic of td")
print(comparison_11[:10])

In [None]:
print("Most characteristic of td politics after")
print(comparison_12[-10:])
print("Most characteristic of td")
print(comparison_12[:10])

### Notes

More preprocessing is needed (removing links and '[removed]' posts). In addition, the "after" posts (and perhaps /r/politics in general?) seem to become more focused on people (Clinton, Sanders, Trump). That in itself isn't necessarily interesting: election-related news coverage increased, so presumably the candidates were on people's minds more regardless. However, it may become interesting if coupled with differences in the variability of language. Lastly, I should rethink whether the timeframe for the /r/politics sample is ideal, as it seems to be more in line with the "after" posts than the "before" posts.

### Additional preprocessing

In [None]:
# Drop comments that are exactly the deleted/removed placeholder, then show
# the size before and after filtering.
sfp_politics_before_text1 = [
    body for body in sfp_politics_before_text
    if not (body == deleted_str or body == removed_str)
]
print(len(sfp_politics_before_text))
print(len(sfp_politics_before_text1))

In [None]:
# The_Donald posts that are not a deleted/removed placeholder, then the total.
print(sum(1 for body in td_posts_text if body != removed_str and body != deleted_str))
print(len(td_posts_text))

In [None]:
# For every group: number of non-placeholder comments, total comments, blank line.
for data in (
    sfp_politics_before_text,
    sfp_politics_after_text,
    td_politics_before_text,
    td_politics_after_text,
    sfp_posts_text,
    td_posts_text,
):
    print(sum(1 for body in data if body != removed_str and body != deleted_str))
    print(len(data))
    print()

In [None]:
# For every group: number of comments without 'http', total comments, blank line.
for data in (
    sfp_politics_before_text,
    sfp_politics_after_text,
    td_politics_before_text,
    td_politics_after_text,
    sfp_posts_text,
    td_posts_text,
):
    print(sum(1 for body in data if 'http' not in body))
    print(len(data))
    print()

In [None]:
# Preview the URL stripping on the first ten link-containing comments of each group.
for data in (
    sfp_politics_before_text,
    sfp_politics_after_text,
    td_politics_before_text,
    td_politics_after_text,
    sfp_posts_text,
    td_posts_text,
):
    with_links_stripped = [
        re.sub(r'https?:\/\/[\S]+', ' ', body, flags=re.MULTILINE)
        for body in data
        if 'http' in body
    ]
    print(with_links_stripped[:10])
    print()

In [None]:
def transform_data(data, excluded=None):
    """Strip URLs from comments and drop placeholder comments.

    Args:
        data: iterable of comment strings.
        excluded: optional collection (e.g. tuple or set) of exact strings to
            drop after URL stripping.  When omitted, the notebook-level
            ``removed_str`` and ``deleted_str`` placeholders are used, which
            is the original behaviour.

    Returns:
        A new list in which every ``http://`` / ``https://`` URL has been
        replaced by a single space and every comment equal to one of the
        excluded strings has been removed.
    """
    if excluded is None:
        # Resolved at call time so the notebook globals need not exist when
        # an explicit collection is passed.
        excluded = (removed_str, deleted_str)

    # Compiled once instead of being looked up for every comment.
    url_pattern = re.compile(r'https?:\/\/[\S]+', flags=re.MULTILINE)
    stripped = (url_pattern.sub(' ', comment) for comment in data)
    return [comment for comment in stripped if comment not in excluded]

In [None]:
# Cleaned (URL-stripped, placeholder-free) version of every text group.
(
    sfp_politics_before_text1,
    sfp_politics_after_text1,
    td_politics_before_text1,
    td_politics_after_text1,
    sfp_posts_text1,
    td_posts_text1,
    politics_text1,
) = [
    transform_data(texts)
    for texts in (
        sfp_politics_before_text,
        sfp_politics_after_text,
        td_politics_before_text,
        td_politics_after_text,
        sfp_posts_text,
        td_posts_text,
        politics_text,
    )
]

In [None]:
# Same twelve comparisons as on the raw text, rerun on the *_text1 lists
# produced by transform_data.  compare1(a, b) sorts by ascending z-score, so
# the tail ([-10:]) is characteristic of the first argument and the head
# ([:10]) of the second.
# 1m: SFP authors vs. T_D authors on /r/politics, "before" comments.
comparison_1m = compare1(list(sfp_politics_before_text1), list(td_politics_before_text1))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_1m[-10:])
print("Most characteristic of td politics before")
print(comparison_1m[:10])

In [None]:
# 2m: SFP authors vs. T_D authors on /r/politics, "after" comments.
comparison_2m = compare1(list(sfp_politics_after_text1), list(td_politics_after_text1))

In [None]:
print("Most characteristic of sfp politics after")
print(comparison_2m[-10:])
print("Most characteristic of td politics after")
print(comparison_2m[:10])

In [None]:
# 3m: SFP authors on /r/politics, "before" vs. "after".
comparison_3m = compare1(list(sfp_politics_before_text1), list(sfp_politics_after_text1))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_3m[-10:])
print("Most characteristic of sfp politics after")
print(comparison_3m[:10])

In [None]:
# 4m: T_D authors on /r/politics, "before" vs. "after".
comparison_4m = compare1(list(td_politics_before_text1), list(td_politics_after_text1))

In [None]:
print("Most characteristic of td politics before")
print(comparison_4m[-10:])
print("Most characteristic of td politics after")
print(comparison_4m[:10])

In [None]:
# 5m-8m: each author group's /r/politics comments vs. the general /r/politics sample.
comparison_5m = compare1(list(sfp_politics_before_text1), list(politics_text1))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_5m[-10:])
print("Most characteristic of /r/politics")
print(comparison_5m[:10])

In [None]:
comparison_6m = compare1(list(sfp_politics_after_text1), list(politics_text1))

In [None]:
print("Most characteristic of sfp politics after")
print(comparison_6m[-10:])
print("Most characteristic of /r/politics")
print(comparison_6m[:10])

In [None]:
comparison_7m = compare1(list(td_politics_before_text1), list(politics_text1))

In [None]:
print("Most characteristic of td politics before")
print(comparison_7m[-10:])
print("Most characteristic of /r/politics")
print(comparison_7m[:10])

In [None]:
comparison_8m = compare1(list(td_politics_after_text1), list(politics_text1))

In [None]:
print("Most characteristic of td politics after")
print(comparison_8m[-10:])
print("Most characteristic of /r/politics")
print(comparison_8m[:10])

In [None]:
# 9m-12m: each author group's /r/politics comments vs. the same sample's posts
# in its own candidate subreddit.
comparison_9m = compare1(list(sfp_politics_before_text1), list(sfp_posts_text1))

In [None]:
comparison_10m = compare1(list(sfp_politics_after_text1), list(sfp_posts_text1))

In [None]:
comparison_11m = compare1(list(td_politics_before_text1), list(td_posts_text1))

In [None]:
comparison_12m = compare1(list(td_politics_after_text1), list(td_posts_text1))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_9m[-10:])
print("Most characteristic of sfp")
print(comparison_9m[:10])

In [None]:
print("Most characteristic of sfp politics after")
print(comparison_10m[-10:])
print("Most characteristic of sfp")
print(comparison_10m[:10])

In [None]:
print("Most characteristic of td politics before")
print(comparison_11m[-10:])
print("Most characteristic of td")
print(comparison_11m[:10])

In [None]:
print("Most characteristic of td politics after")
print(comparison_12m[-10:])
print("Most characteristic of td")
print(comparison_12m[:10])

In [None]:
# Explore how often "submission" boilerplate appears in the cleaned
# /r/politics sample and how it overlaps with "i am a bot" messages.
mentions_submission = [c for c in politics_text1 if 'submission' in c]
print(mentions_submission[:10])
print(len(mentions_submission))
print(sum('your submission' in c for c in politics_text1))
print(sum('i am a bot' in c.lower() for c in politics_text1))
print([c for c in mentions_submission if 'i am a bot' not in c.lower()][:10])
print(sum(
    ('your submission' in c and 'has been removed for the following reason' in c)
    or 'i am a bot' in c.lower()
    for c in politics_text1
))
# Comments mentioning "submission" that match none of the boilerplate markers.
print([
    c for c in mentions_submission
    if 'your submission' not in c
    and 'has been removed for the following reason' not in c
    and 'i am a bot' not in c.lower()
])

In [None]:
# Same exploration for comments mentioning "message", plus counts for the
# "thank you for participating in" marker.
mentions_message = [c for c in politics_text1 if 'message' in c]
print(len(mentions_message))

message_not_boilerplate = [
    c for c in mentions_message
    if 'your submission' not in c
    and 'has been removed for the following reason' not in c
    and 'i am a bot' not in c.lower()
]
print(len(message_not_boilerplate))
print(message_not_boilerplate[:10])

print(sum(
    (
        ('your submission' in c and 'has been removed for the following reason' in c)
        or 'i am a bot' in c.lower()
    )
    and 'thank you for participating in' not in c.lower()
    for c in politics_text1
))
print(sum(
    'i am a bot' in c.lower() or 'thank you for participating in' in c.lower()
    for c in politics_text1
))

In [None]:
# NOTE(review): politics_text2 is assigned in the following cell; on a fresh
# top-to-bottom run this cell would raise NameError -- confirm the intended
# cell order.
# Pair every comment with its lower-cased form so each check lower-cases once.
lowered_pairs = [(c, c.lower()) for c in politics_text2]
print(sum('thank you for participating in' in low for _, low in lowered_pairs))
print(sum('comment' in low for _, low in lowered_pairs))
print(sum('removal' in low for _, low in lowered_pairs))
print([c for c, low in lowered_pairs if 'comment' in low and 'removal' not in low][:10])
print([c for c, low in lowered_pairs
       if 'removal' in low and 'thank you for participating in' not in low])

In [None]:
# /r/politics sample without comments containing either boilerplate marker
# (case-insensitive).
politics_text2 = [
    c for c in politics_text1
    if not ('i am a bot' in c.lower()
            or 'thank you for participating in' in c.lower())
]

In [None]:
# Sample size before and after dropping the boilerplate comments.
for texts in (politics_text1, politics_text2):
    print(len(texts))

In [None]:
# Comparisons 5-8 again, now against politics_text2 as the /r/politics
# baseline.  compare1(a, b) sorts by ascending z-score, so the tail ([-10:])
# is characteristic of the first argument and the head ([:10]) of the second.
comparison_5m1 = compare1(list(sfp_politics_before_text1), list(politics_text2))
comparison_6m1 = compare1(list(sfp_politics_after_text1), list(politics_text2))
comparison_7m1 = compare1(list(td_politics_before_text1), list(politics_text2))
comparison_8m1 = compare1(list(td_politics_after_text1), list(politics_text2))

In [None]:
print("Most characteristic of sfp politics before")
print(comparison_5m1[-10:])
print("Most characteristic of /r/politics")
print(comparison_5m1[:10])

In [None]:
print("Most characteristic of sfp politics after")
print(comparison_6m1[-10:])
print("Most characteristic of /r/politics")
print(comparison_6m1[:10])

In [None]:
print("Most characteristic of td politics before")
print(comparison_7m1[-10:])
print("Most characteristic of /r/politics")
print(comparison_7m1[:10])

In [None]:
print("Most characteristic of td politics after")
print(comparison_8m1[-10:])
print("Most characteristic of /r/politics")
print(comparison_8m1[:10])