In [1]:
import csv
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CV

In [2]:
# Read the three cached JSON mappings. Later cells index each one by author
# and iterate the keys of the nested mapping as subreddit names.
with open('stored_variables/sfp_subreddits_by_author.json', 'r') as sfp_file:
    sfp_subreddits_by_author = json.loads(sfp_file.read())

with open('stored_variables/td_subreddits_by_author.json', 'r') as td_file:
    td_subreddits_by_author = json.loads(td_file.read())

with open('stored_variables/neither_subreddits_by_author.json', 'r') as neither_file:
    neither_subreddits_by_author = json.loads(neither_file.read())

## First: uniform prior, un-normalized

In [3]:
# One "document" per author: the author's subreddit names, each followed by a
# single space (so every document ends with a trailing space), in the order
# the loaded mapping yields them.
sfp_subreddits_list = [
    "".join(subreddit + " " for subreddit in subreddits.keys())
    for subreddits in sfp_subreddits_by_author.values()
]

# Same construction for the second author group.
td_subreddits_list = [
    "".join(subreddit + " " for subreddit in subreddits.keys())
    for subreddits in td_subreddits_by_author.values()
]

In [12]:
def compare1(l1, l2, prior=0.01):
    """Rank vocabulary terms by how strongly they separate two document groups.

    Every document is a string that ``CountVectorizer`` tokenizes. Term counts
    are summed per group, smoothed with a uniform additive prior, and turned
    into a z-score of the log-odds ratio between the groups (the statistic has
    the form of the Dirichlet-prior log-odds ratio of Monroe et al. 2008,
    "Fightin' Words").

    Parameters
    ----------
    l1, l2 : list of str
        Documents of the first and of the second group.
    prior : float, optional
        Pseudo-count added to every term in both groups.

    Returns
    -------
    list of (str, float)
        ``(term, z_score)`` pairs sorted by ascending z-score. Positive scores
        mark terms over-represented in ``l1``, negative ones in ``l2``.
    """
    cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, 1),
            binary=False, max_features=15000)

    # fit_transform returns a scipy sparse matrix. Only the per-group column
    # totals are needed, so sum it directly instead of materialising the full
    # dense (documents x vocabulary) array.
    doc_term = cv.fit_transform(l1 + l2)
    vocab_size = len(cv.vocabulary_)
    priors = np.full(vocab_size, prior)
    split = len(l1)  # rows [0, split) belong to l1, the rest to l2

    # Group totals are stored as float32, which keeps the scores identical to
    # the ones previously recorded for this function.
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.asarray(doc_term[:split].sum(axis=0)).ravel()
    count_matrix[1, :] = np.asarray(doc_term[split:].sum(axis=0)).ravel()
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])

    # Do all remaining arithmetic in float64. The explicit upcast avoids any
    # dependence on how NumPy promotes float32 arrays against float64 scalars.
    counts = count_matrix.astype(np.float64)
    total1 = float(n1 + a0)
    total2 = float(n2 + a0)
    smoothed = counts + priors

    # compute delta: difference of the smoothed log-odds of each term
    term1 = np.log(smoothed[0] / (total1 - counts[0] - priors))
    term2 = np.log(smoothed[1] / (total2 - counts[1] - priors))
    delta = term1 - term2
    # compute variance on delta
    var = 1. / smoothed[0] + 1. / smoothed[1]
    z_scores = delta / np.sqrt(var)

    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

In [13]:
# Score every subreddit, then show both ends of the ascending ranking:
# first the ten highest z-scores, then the ten lowest.
comparison_1 = compare1(sfp_subreddits_list, td_subreddits_list)
print(comparison_1[-10:], comparison_1[:10], sep="\n")

[('thebutton', 3.777473337176859), ('askscience', 3.97654623114658), ('progressive', 4.044684957814867), ('restorethefourth', 4.074991038357249), ('seattle', 4.126992184683207), ('secretsanta', 4.343393488401389), ('basicincome', 4.488484208518572), ('doctorwho', 4.499111229201192), ('occupywallstreet', 5.036168985066751), ('blog', 5.523181789875759)]
[('pokemongo', -7.587610454482942), ('cringeanarchy', -7.480103708528793), ('overwatch', -7.409909049309773), ('globaloffensive', -6.2801295838618945), ('roastme', -6.062970625925755), ('4chan', -5.892696460418769), ('dota2', -4.887291774343714), ('thedivision', -4.858327105652409), ('theredpill', -4.789393729985014), ('2007scape', -4.7164541864877565)]


## Normalize by author

In [26]:
def compare2(l1, l2, prior=0.01):
    """Rank terms between two document groups using per-document normalised counts.

    Same log-odds z-score as ``compare1``, except that each document's term
    counts are first divided by ``(that document's total count + 1)``, so a
    document contributes a (slightly shrunk) share per term rather than its
    raw counts.

    Parameters
    ----------
    l1, l2 : list of str
        Documents of the first and of the second group.
    prior : float, optional
        Pseudo-count added to every term in both groups.

    Returns
    -------
    list of (str, float)
        ``(term, z_score)`` pairs sorted by ascending z-score. Positive scores
        mark terms over-represented in ``l1``, negative ones in ``l2``.
    """
    cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, 1),
            binary=False, max_features=15000)

    # Keep the document-term matrix sparse; the dense form is never needed.
    doc_term = cv.fit_transform(l1 + l2)
    vocab_size = len(cv.vocabulary_)
    priors = np.full(vocab_size, prior)
    split = len(l1)  # rows [0, split) belong to l1, the rest to l2

    # normalization: weight every document by 1 / (its total count + 1).
    # The +1 keeps documents without any in-vocabulary token from dividing
    # by zero.
    row_totals = np.asarray(doc_term.sum(axis=1)).ravel()
    doc_weights = 1. / (row_totals + 1)

    # Weighted column sums per group: (vocab x docs) . (docs,) -> (vocab,).
    # Equal to summing count / (row_total + 1) over the group's documents,
    # up to floating-point rounding. Totals are stored as float32, matching
    # the precision the scores were previously recorded with.
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = doc_term[:split].T.dot(doc_weights[:split])
    count_matrix[1, :] = doc_term[split:].T.dot(doc_weights[split:])
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])

    # Do all remaining arithmetic in float64. The explicit upcast avoids any
    # dependence on how NumPy promotes float32 arrays against float64 scalars.
    counts = count_matrix.astype(np.float64)
    total1 = float(n1 + a0)
    total2 = float(n2 + a0)
    smoothed = counts + priors

    # compute delta: difference of the smoothed log-odds of each term
    term1 = np.log(smoothed[0] / (total1 - counts[0] - priors))
    term2 = np.log(smoothed[1] / (total2 - counts[1] - priors))
    delta = term1 - term2
    # compute variance on delta
    var = 1. / smoothed[0] + 1. / smoothed[1]
    z_scores = delta / np.sqrt(var)

    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

In [27]:
# Per-author normalised scores; show the ten highest z-scores, then the ten
# lowest.
comparison_2 = compare2(sfp_subreddits_list, td_subreddits_list)
print(comparison_2[-10:], comparison_2[:10], sep="\n")

[('portland', 0.7519315076409359), ('basicincome', 0.7529277737864452), ('doctorwho', 0.7550536575541459), ('asoiaf', 0.7717712754236867), ('atheism', 0.7755427008407607), ('blog', 0.790035117701019), ('worldpolitics', 0.801693016767784), ('occupywallstreet', 0.8136298870756509), ('seattle', 0.8260242994134416), ('technology', 0.9088369615983086)]
[('overwatch', -1.2927113614115437), ('pokemongo', -1.2785102857437642), ('globaloffensive', -1.2767144216669855), ('leagueoflegends', -1.2089432795482127), ('4chan', -1.1500965027148624), ('theredpill', -1.0171051377670564), ('cringeanarchy', -0.9970115620357181), ('dota2', -0.9467274678179401), ('roastme', -0.9076702534043156), ('2007scape', -0.8671520506447393)]


## Use Neither as a prior

In [28]:
# One "document" per author of the third group: that author's subreddit
# names, each followed by a single space.
neither_subreddits_list = [
    "".join(subreddit + " " for subreddit in subreddits.keys())
    for subreddits in neither_subreddits_by_author.values()
]


In [29]:
def compare3(l1, l2, baseline, normalize=False):
    """Rank terms between two document groups with a prior taken from a baseline corpus.

    The vocabulary and the per-term prior pseudo-counts both come from
    ``baseline``: the prior of a term is its total count over the baseline
    documents. ``l1`` and ``l2`` are then counted over that fixed vocabulary
    and compared with the same log-odds z-score used by ``compare1``.

    Parameters
    ----------
    l1, l2 : list of str
        Documents of the first and of the second group.
    baseline : list of str
        Documents that define the vocabulary and the per-term prior.
    normalize : bool, optional
        If true, divide each document's counts by ``(its total count + 1)``
        before summing them per group.
        NOTE(review): the prior stays on the raw-count scale in this mode, so
        it weighs far more against the normalised totals - confirm this is
        intended.

    Returns
    -------
    list of (str, float)
        ``(term, z_score)`` pairs sorted by ascending z-score. Positive scores
        mark terms over-represented in ``l1``, negative ones in ``l2``.
    """
    cv_baseline = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, 1),
                     binary=False, max_features=15000)

    # Prior pseudo-counts: column totals of the (sparse) baseline matrix.
    priors = np.asarray(cv_baseline.fit_transform(baseline).sum(axis=0)).ravel()

    # Reuse the baseline vocabulary so the columns line up with `priors`.
    # Per the scikit-learn docs, min_df / max_df / max_features are ignored
    # once `vocabulary` is given; they are kept for parity with the baseline.
    cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, 1),
            binary=False, max_features=15000, vocabulary=cv_baseline.vocabulary_)

    # Keep the document-term matrix sparse; the dense form is never needed.
    doc_term = cv.fit_transform(l1 + l2)
    vocab_size = len(cv.vocabulary_)
    split = len(l1)  # rows [0, split) belong to l1, the rest to l2

    # Group totals are stored as float32, matching the precision the scores
    # were previously recorded with.
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    if normalize:
        # normalization: weight every document by 1 / (its total count + 1).
        # The +1 keeps documents without any in-vocabulary token from
        # dividing by zero. The weighted column sums equal the sum of
        # count / (row_total + 1) up to floating-point rounding.
        row_totals = np.asarray(doc_term.sum(axis=1)).ravel()
        doc_weights = 1. / (row_totals + 1)
        count_matrix[0, :] = doc_term[:split].T.dot(doc_weights[:split])
        count_matrix[1, :] = doc_term[split:].T.dot(doc_weights[split:])
    else:
        count_matrix[0, :] = np.asarray(doc_term[:split].sum(axis=0)).ravel()
        count_matrix[1, :] = np.asarray(doc_term[split:].sum(axis=0)).ravel()
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])

    # Do all remaining arithmetic in float64. The explicit upcast avoids any
    # dependence on how NumPy promotes float32 arrays against other scalars.
    counts = count_matrix.astype(np.float64)
    total1 = float(n1 + a0)
    total2 = float(n2 + a0)
    smoothed = counts + priors

    # compute delta: difference of the smoothed log-odds of each term
    term1 = np.log(smoothed[0] / (total1 - counts[0] - priors))
    term2 = np.log(smoothed[1] / (total2 - counts[1] - priors))
    delta = term1 - term2
    # compute variance on delta
    var = 1. / smoothed[0] + 1. / smoothed[1]
    z_scores = delta / np.sqrt(var)

    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

In [30]:
# Baseline-informed prior, raw counts; show the ten highest z-scores, then the
# ten lowest.
comparison_3 = compare3(sfp_subreddits_list, td_subreddits_list, neither_subreddits_list)
print(comparison_3[-10:], comparison_3[:10], sep="\n")

[('frugal', 3.2837962683991084), ('mapporn', 3.3301398109000924), ('netflixbestof', 3.3877090531313887), ('politics', 3.6386095624894472), ('thebutton', 3.7618815974563584), ('askscience', 3.77833925460946), ('seattle', 3.822106582141636), ('secretsanta', 4.141789141188557), ('doctorwho', 4.2055421054635715), ('blog', 5.388076676853797)]
[('pokemongo', -6.941230985241581), ('overwatch', -6.60472364308496), ('globaloffensive', -5.810640385635588), ('roastme', -5.63380768403611), ('4chan', -5.586541302382348), ('dota2', -4.454882825665547), ('darksouls3', -3.798243471527248), ('mma', -3.753678837700957), ('imgoingtohellforthis', -3.6227099401951213), ('tumblrinaction', -3.5172477773774764)]


In [32]:
# Normalized: same baseline-informed prior, but with per-document normalised
# counts; show the ten highest z-scores, then the ten lowest.
comparison_4 = compare3(
    sfp_subreddits_list,
    td_subreddits_list,
    neither_subreddits_list,
    normalize=True,
)
print(comparison_4[-10:], comparison_4[:10], sep="\n")

[('doctorwho', 0.30158956085646954), ('mapporn', 0.3051431127897479), ('seattle', 0.3102731333529693), ('askscience', 0.3287725189128407), ('frugal', 0.3298617991362475), ('thebutton', 0.37994196943699454), ('technology', 0.39849558972243077), ('blog', 0.40533975775212594), ('asoiaf', 0.43100721190189023), ('politics', 0.4484807437591217)]
[('4chan', -0.5882915193747954), ('pokemongo', -0.5672638141093601), ('globaloffensive', -0.565718367800873), ('overwatch', -0.5200039468047248), ('pcmasterrace', -0.48442485852551403), ('dota2', -0.4800917097580366), ('leagueoflegends', -0.4525732634171881), ('guns', -0.43718812090301995), ('roastme', -0.40368119202936303), ('tumblrinaction', -0.3957858400269076)]
