In [10]:
import os
import time
from IPython.display import clear_output
from collections import defaultdict, Counter
import math
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
import copy
import random
import pickle
import warnings
warnings.filterwarnings("ignore")

In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
# must have n lines
# read in contents of single file as dictionary
# load all files as dict of dicts
# calculate tf-idf for each word
# get top n 

#sparse matrix representation

In [4]:
## Function to extract top N for specific sub
## Function to get idf for specific word

## Read in files to dictionary

In [5]:
src_dir = '/Users/choldawa/Documents/Projects/RedditCorpus/word_count/RC_2012-12'

tick = time.time()

d = {}
problem_subs = []
cnt = 0
for file in os.listdir(src_dir):
    path = src_dir+"/"+file
    with open(path) as f:
        filename = os.path.basename(file).split("_")[2].split(".")[0]
        clear_output(wait=True)
        print(f"working on: {filename}")
        tock = time.time()
        print(f"Time elapsed: {tock - tick} seconds")
        sub_d = {}
        num_lines = sum(1 for line in open(path)) #must have at least 100 unique words
        if num_lines > 100:
            for line in f:
                try:
                    (val, key) = line.strip().split() #invert the order 
                    if not any(map(str.isdigit, key)) and int(val)>1: # each word must occur more than once
                         if key not in stop_words: #check if stopword   
                            sub_d[key] = int(val)
                except:
                    problem_subs.append(filename)
            d[filename] = sub_d
end = time.time()
print(f"Total Time elapsed: {end - tick} seconds")
print(f"We have {len(d)} Subreddits")
print(f"Found problems with {len(problem_subs)} subreddits")

working on: CrowdfundedBoardgames
Time elapsed: 104.78139781951904 seconds
Total Time elapsed: 104.7824158668518 seconds
We have 11427 Subreddits
Found problems with 122 subreddits


In [44]:
list(d.keys())[:11]

['terrariums',
 'oklahoma',
 'hackerspaces',
 'TheContinuum',
 'GothicMetal',
 'metalgearsolid',
 'auckland',
 'Mabinogi',
 'smashbros',
 'fightporn',
 'crazystairs']

In [6]:
subreddit = 'politics'
print(f"Here are some example common words in r/{subreddit}:")
print(list(d[subreddit].items())[:5])

Here are some example common words in r/politics:
[('people', 116175), ('would', 84858), ('dont', 77698), ('like', 70930), ('one', 56556)]


In [7]:
def get_word_sub_count(word, month_dictionary):
    '''
    get the count subs in a given month that contain a gieven word
    requires a word, and a dictionary for that month
    '''
    N = len(month_dictionary)
    
    cnt = 0
    for s in month_dictionary:
        if word in month_dictionary[s]: #count of subs where w appears
            cnt += 1
    return cnt

In [8]:
get_word_sub_count('bitcoin', d)

113

## SKLEARN tf-idf approach

In [9]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
dv = DictVectorizer()
#D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] #EXAMPLE FORMAT
X = dv.fit_transform(list(d.values()))
tv = TfidfTransformer()
tfidf = tv.fit_transform(X)
print(tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [29]:
dv.get_feature_names()[535048]

'staircases'

In [None]:
tfidf.toarray().shape

In [40]:
def get_top_k_tfidf(k, d, tfidf_matrix, subreddit):
    '''
    Get the top tfidf embeddings for a given subreddit
    Takes: k (top k tfidf words), 
            month dictionary,
            the tfidf matrix, 
            and target subreddit
    '''
    index = list(d.keys()).index(subreddit)
    # Take the indices of the largest k elements from each row
    top_k_inds = np.argsort(tfidf[index,:].toarray())[:, -1:-k - 1:-1]
#     # Take the values at those indices
#     top_k = np.take_along_axis(tfidf.toarray(), top_k_inds, axis=-1)
#     top_k_pairs = np.stack((top_k_inds, top_k), axis=2)
    
    #return list of top words
    top_tfidf_words = []
    for ind in top_k_inds[0]:
        top_tfidf_words.append(dv.get_feature_names()[ind])
    return top_tfidf_words

In [45]:
k, subreddit = 5, 'crazystairs'

print(f'The top {k} words for r/{subreddit} are:')
get_top_k_tfidf(k, d, tfidf, subreddit)

The top 5 words for r/crazystairs are:


['stairs', 'nsfw', 'xpost', 'staircase', 'staircases']

### Plot histogram of word counts

In [None]:
print(f"We have {len(df)} unique words and {len(d)} unique subreddits")
logD = {k:np.log10(v) for k, v in df.items()}
plt.hist(logD.values(), bins = 25)
plt.xlabel("Log10 num of subreddits")
plt.ylabel("Count of words")
plt.plot()

### Plotting sorted dict

In [None]:
sub_reddit = 'theoffice' # theoffice
topN = 6
x = list(dict(sorted(sub_dict[sub_reddit].items(), 
                           key=lambda item: item[1], reverse = True)
                    [:topN]).keys())
y = list(dict(sorted(sub_dict[sub_reddit].items(), 
                           key=lambda item: item[1], reverse = True)
                    [:topN]).values())


plt.bar(x,y, align='center')
plt.xticks(rotation=70)
plt.title(sub_reddit)
plt.show()