In [1]:
import glob
import os
import re
from collections import defaultdict
import math
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np

In [2]:
stops = set(stopwords.words('english'))

### Plot distribution of comments across all subs

In [3]:
dataDir = "./comment_count/"
yearMonth = "count_RC_2012-12"

sub_comments = {}
file = open(dataDir + yearMonth +'.txt')
for line in file:
    value, key = line.split() #count, word
    if key not in stops:
            sub_comments[key] = int(value)
print("Number of comments",sum(sub_comments.values()))
print("Number of subreddits", len(sub_comments))

Number of comments 26080276
Number of subreddits 22622


## Test reading in a single subreddit

In [4]:
dataDir = "./word_count/"
yearMonth = "RC_2012-12_"
subreddit = '"anime"'

word_dict = {}
file = open(dataDir + yearMonth + subreddit +'.txt')
for line in file:
    value, key = line.split() #count, word
    if key not in stops:
            word_dict[key] = int(value)

list(word_dict.items())[:5]

[('anime', 6809),
 ('like', 6776),
 ('one', 4906),
 ('really', 4383),
 ('im', 3938)]

## Read in all subreddits to dictionary 

In [5]:
dataDir = "./word_count/"

sub_dict = {}
for filename in glob.glob(os.path.join(dataDir, '*.txt')):
    subreddit = ''.join(re.findall('"([^"]*)"', filename)) #get name of sub
    word_dict = {}
    with open(os.path.join(os.getcwd(), filename), 'r') as f: 
        for line in f:
            value, key = line.split() #count, word
            if key not in stops:
                word_dict[key] = int(value)
    sub_dict[subreddit] = word_dict
print(f"We have {len(sub_dict)} files")

We have 373 files


In [6]:
sub_dict['PS3']['game']

2856

In [7]:
N = len(sub_dict)

df = defaultdict(int)
for s in sub_dict:
    for w in sub_dict[s]: #count of subs where w appears
        df[w] += 1
df['game']

371

In [4]:
for i in range(1000):
    if i
    print(i, end="\r")

012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627

In [10]:
s = {"a": 1, "b": 2}
s1 = s
s['c'] = 3
s1['d'] = 4
s

{'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [8]:
for s in sub_dict:
    for key, value in sub_dict[s].items():
        tf = (value/
                  (sum(sub_dict[s].values())-value+1))
        idf = (math.log2((N-df[key]+1) / df[key])) #log inverse doc freq
        tfidf = tf*idf
        sub_dict[s][key] = tfidf
    

In [9]:
dict(sorted(sub_dict['PS3'].items(), key=lambda item: item[1], 
            reverse = True)[:5]).items()

dict_items([('1470', 2.553293516708434), ('128', 0.980996442992062), ('1943', 0.8638640511398717), ('2099', 0.8614357038717592), ('2nps3', 0.7554085741975543)])

In [10]:
for s in sub_dict:
    print(s, 
          list(dict(sorted(sub_dict[s].items(), 
                           key=lambda item: item[1], reverse = True)
                    [:3]).items()))

japan [('2500', 0.4088389768877432), ('akasaka', 0.2716332342889575), ('aerospace', 0.18257308483291484)]
Anarchism [('1917', 1.1725782289193905), ('1918', 0.990296668384143), ('1936', 0.8589018827411479)]
gamedev [('025', 2.855662411467426), ('199', 0.7869125838063824), ('1d', 0.7642235785371414)]
SquaredCircle [('00s', 1.1861280298257806), ('1000000', 0.6130823882212911), ('5mod', 0.47246047909675715)]
actuallesbians [('150', 0.953370596835816), ('511', 0.6878619427439621), ('29', 0.36705683329494304)]
startrek [('1967n', 1.9005414462979024), ('100th', 1.863903496136976), ('1982', 0.5467102597856879)]
airsoft [('048', 2.2270549768183003), ('020', 2.148629662227624), ('152a', 1.9610078389542094)]
tifu [('26', 1.7516499290234522), ('4g', 0.9012758823337873), ('asiatic', 0.2772062712222055)]
DebateAChristian [('1025', 1.6889374261295174), ('1952', 0.8429619954161438), ('2219', 0.6326083055932126)]
playitforward [('333', 1.6216463966962267), ('444', 1.3146009969386558), ('150', 0.6442343

Games [('012538n', 1.1349391003490155), ('017', 0.7802147478438233), ('0001', 0.6374921956672296)]
MakeupAddiction [('115', 0.887057300668257), ('3oz', 0.7527377733797476), ('1520', 0.3950967051471907)]
videos [('009', 0.7470460442282715), ('042', 0.6727283776925084), ('0000', 0.6431453989388685)]
swtor [('1440', 1.7336546104126345), ('1499', 1.165166402428484), ('26s', 0.797113538659965)]
news [('1022', 0.6352789506650727), ('03', 0.5519388214668012), ('146000', 0.5276259652341507)]
CanadaPolitics [('1947', 1.8809672527630503), ('1900', 1.6188855749393853), ('1960s', 0.7708395380791372)]
Drugs [('05l', 1.4319177243142658), ('05mg', 1.253523547977238), ('005', 0.7619241117104008)]
pokemon [('10yearold', 1.2038955276817764), ('1024', 0.9263584939193383), ('1012', 0.601203702962816)]
Steam [('099', 18.30301962728503), ('120gb', 1.2995793291068949), ('1100', 0.9863748431985053)]
truegaming [('001', 0.7373489478261973), ('12n', 0.679725800911248), ('04', 0.5000506858144219)]
runescape [('1

anime [('0083', 3.1712624305128827), ('2526', 0.6906067071690598), ('3x3', 0.28169087124915415)]
Libertarian [('12632', 0.8523069856739544), ('1022', 0.6767216421139085), ('17t', 0.6673413715129906)]
trees [('100quarter', 0.9686253681742648), ('009', 0.873859780817859), ('10guy', 0.6015932839560642)]
books [('1q84n56', 0.7715480550896485), ('129', 0.6836144591141057), ('000', 0.6129285575174136)]
LadyBoners [('640', 1.1010917038229755), ('2013', 0.4089926928709881), ('adaptation', 0.27587504462532386)]
nosleep [('1am', 5.411113722605263), ('16th', 2.6148292930289583), ('abalam', 1.1796548864595697)]
magicTCG [('077', 1.0005027889235762), ('010', 0.529961228422381), ('099', 0.492622708520642)]
Coffee [('16g', 3.2585343008946923), ('17g', 2.2280532574320273), ('24g', 1.441085064749636)]
explainlikeimfive [('0125', 0.7495568689288188), ('011', 0.6793134358323833), ('002', 0.6222340278960327)]
comics [('001', 9.219286121440591), ('0n', 1.931124002888371), ('01', 1.5168364652263828)]
Suicid

## Cosine Similarity

In [11]:
def cosine_dic(dic1,dic2):
    numerator = 0
    dena = 0
    for key1,val1 in dic1.items():
        numerator += val1*dic2.get(key1,0.0)
        dena += val1*val1
    if(dena == 0): 
        dena=10
    
    denb = 0
    for val2 in dic2.values():
        denb += val2*val2
    if(denb == 0): 
        denb=10
    return numerator/math.sqrt(dena*denb)

In [12]:
similarity = {}
for s in sub_dict:
    similarity[s] = cosine_dic(sub_dict['politics'], sub_dict[s])
    
dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[:4]).items()

dict_items([('politics', 1.0), ('todayilearned', 0.11546575794042174), ('bodybuilding', 0.0978649080391264), ('technology', 0.09352939317034799)])

In [13]:
similarity_all = {}
for s1 in sub_dict:
    similarity = {}
    for s2 in sub_dict:
        if s2 != s1:
            similarity[s2] = cosine_dic(sub_dict[s1], sub_dict[s2])
    similarity_all[s1] = similarity
similarity_all

{'japan': {'Anarchism': 0.008793579842323901,
  'gamedev': 0.006125066124981807,
  'SquaredCircle': 0.011011309536758902,
  'actuallesbians': 0.007182012930263022,
  'startrek': 0.006779364436333378,
  'airsoft': 0.006052586703952842,
  'tifu': 0.007120333606451569,
  'DebateAChristian': 0.09394328889932535,
  'playitforward': 0.008209913497709569,
  'Austin': 0.005585114978719015,
  'Metal': 0.015147808799643668,
  'space': 0.001858233065848758,
  'techsupport': 0.005882848941385372,
  'boston': 0.0356610791363422,
  'fitnesscirclejerk': 0.003905980673417526,
  'iphone': 0.006055098695244987,
  'apple': 0.006416452304088992,
  'WorldofTanks': 0.0072340862973039035,
  'sharedota2': 0.003287348310606908,
  'personalfinance': 0.007570466318289823,
  'applehelp': 0.0055430700681006785,
  'Israel': 0.006177141351885564,
  'Naruto': 0.005823362657678781,
  'todayilearned': 0.008037865502995683,
  'Music': 0.008437680361082157,
  'RandomActsOfGaming': 0.0108051491726999,
  'harrypotter': 0.1

In [14]:
[key for key, value in similarity_all['pics'].items() 
 if value == max(similarity_all['pics'].values())]

['AskReddit']

In [15]:
out = []
for s in similarity_all:
#     val = max(similarity_all[s].values())
#     if similarity_all[s].values()
#     out.append(list(sorted(similarity_all[s].items(), 
#            key=lambda item: item[1], reverse = True)[:1]))
    out.append([[s,key,value] for key, value in similarity_all[s].items() 
 if value == max(similarity_all[s].values())])
out = [val for sublist in out for val in sublist]
out[:5]

[['japan', 'harrypotter', 0.16025555187759938],
 ['Anarchism', 'RedditLaqueristas', 0.1611230979961829],
 ['gamedev', 'bestof', 0.4977701998445554],
 ['SquaredCircle', 'comicbooks', 0.4050046804525535],
 ['actuallesbians', 'GetMotivated', 0.8752063368485509]]

In [50]:
sorted(out, key=lambda x: x[2], reverse = False)[:50]

[['Atlanta', 'japan', 0.0],
 ['Atlanta', 'Anarchism', 0.0],
 ['Atlanta', 'gamedev', 0.0],
 ['Atlanta', 'SquaredCircle', 0.0],
 ['Atlanta', 'actuallesbians', 0.0],
 ['Atlanta', 'startrek', 0.0],
 ['Atlanta', 'airsoft', 0.0],
 ['Atlanta', 'tifu', 0.0],
 ['Atlanta', 'DebateAChristian', 0.0],
 ['Atlanta', 'playitforward', 0.0],
 ['Atlanta', 'Austin', 0.0],
 ['Atlanta', 'Metal', 0.0],
 ['Atlanta', 'space', 0.0],
 ['Atlanta', 'techsupport', 0.0],
 ['Atlanta', 'boston', 0.0],
 ['Atlanta', 'fitnesscirclejerk', 0.0],
 ['Atlanta', 'iphone', 0.0],
 ['Atlanta', 'apple', 0.0],
 ['Atlanta', 'WorldofTanks', 0.0],
 ['Atlanta', 'sharedota2', 0.0],
 ['Atlanta', 'personalfinance', 0.0],
 ['Atlanta', 'applehelp', 0.0],
 ['Atlanta', 'Israel', 0.0],
 ['Atlanta', 'Naruto', 0.0],
 ['Atlanta', 'todayilearned', 0.0],
 ['Atlanta', 'Music', 0.0],
 ['Atlanta', 'RandomActsOfGaming', 0.0],
 ['Atlanta', 'harrypotter', 0.0],
 ['Atlanta', 'LiverpoolFC', 0.0],
 ['Atlanta', 'pettyrevenge', 0.0],
 ['Atlanta', 'NoFap', 0.0

In [24]:
similarity = {}
for s in sub_dict:
    similarity[s] = cosine_dic(sub_dict['rage'], sub_dict[s])
    
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[:4]).items())
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[-4:]).items())

dict_items([('rage', 1.0), ('confession', 0.5407195900167088), ('programming', 0.11644740437156918), ('worldpolitics', 0.0903456653544835)])
dict_items([('gonewildcurvy', -0.03248033008149309), ('offbeat', -0.03411467509707272), ('gaymers', -0.04095434600338531), ('TumblrInAction', -0.22479994919868146)])


In [30]:
similarity = {}
for s in sub_dict:
    similarity[s] = cosine_dic(sub_dict['politics'], sub_dict[s])
    
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[:4]).items())
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[-4:]).items())

dict_items([('politics', 1.0), ('todayilearned', 0.11546575794042174), ('bodybuilding', 0.0978649080391264), ('technology', 0.09352939317034799)])
dict_items([('shittyaskscience', 0.00020735121179027522), ('cigars', 0.00014736752415208607), ('vegan', 2.657706344399436e-05), ('Atlanta', 0.0)])


In [39]:
similarity = {}
for s in sub_dict:
    similarity[s] = cosine_dic(sub_dict['Christianity'], sub_dict[s])
    
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[:4]).items())
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[-4:]).items())

dict_items([('Christianity', 1.0), ('Libertarian', 0.17563528048430344), ('Homebrewing', 0.16098987815408705), ('news', 0.13993452276857865)])
dict_items([('Atlanta', 0.0), ('chelseafc', -0.00408854269971283), ('reactiongifs', -0.02649995098410768), ('vegan', -0.09198850406878491)])


In [49]:
similarity = {}
for s in sub_dict:
    similarity[s] = cosine_dic(sub_dict['batman'], sub_dict[s])
    
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[:4]).items())
print(dict(sorted(similarity.items(), key=lambda item: item[1], 
            reverse = True)[-4:]).items())

dict_items([('batman', 1.0), ('minecraftsuggestions', 0.39179288232395354), ('ForeverAlone', 0.33009755214738506), ('creepyPMs', 0.14593208971273589)])
dict_items([('pettyrevenge', -0.26288519302116264), ('creepy', -0.39850590176624057), ('newzealand', -0.42294737623041884), ('lgbt', -0.6387867667470255)])
