# TF-IDF calculator for Reddit months

This notebook calculates tf-idf embeddings for the words in each subreddit in a given month, treating each subreddit as a document. Subreddits with fewer than 100 distinct words are excluded, as are all stopwords. The final output is represented as a sparse matrix. 

This notebook also provides functionality for counting the number of subs containing a given word and tracking these values over time.


In [2]:
import os
import time
from IPython.display import clear_output
from collections import defaultdict, Counter
import math
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
import copy
import random
import pickle
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [3]:
# Month snapshots to load. Each entry is read from a `dict_<date>.pickle`
# file in the current working directory.
date_list = ['2005-12',
             '2006-12',
             '2007-12',
             '2008-12',
             '2009-12',
             '2010-12',
             '2011-12',
             '2012-12',
             '2013-12',
             '2014-12',
             '2015-12',
             '2016-12']

# dict_list[i] holds the dictionary loaded for date_list[i].
dict_list = []
for date in date_list:
    # Report the month being loaded (the loop variable, not the file handle).
    print(f"loading {date}...")
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only load pickle files you produced or otherwise trust.
    with open(f'dict_{date}.pickle', 'rb') as pickle_file:
        d = pickle.load(pickle_file)
    dict_list.append(d)
# `d` stays bound to the last month loaded once the loop ends; the tf-idf
# cells further down read it.

For example, here are some example common words in r/politics:


KeyError: 'politics'

In [5]:
# Peek at the first five (word, count) entries stored for one subreddit.
# Index 3 of dict_list is the fourth month that was loaded.
subreddit = 'politics'
print(f"For example, here are some example common words in r/{subreddit}:")
subreddit_word_counts = dict_list[3][subreddit]
print(list(subreddit_word_counts.items())[:5])

For example, here are some example common words in r/politics:
[('people', 12075), ('would', 10133), ('dont', 8886), ('like', 8712), ('deleted', 8626)]


In [11]:
# Pick out the two comparison years (2013 and 2014): positions 8 and 9
# of dict_list.
d13, d14 = dict_list[8], dict_list[9]

## Get most common subreddits

In [6]:
def get_top_N_subs(N, month_dictionary):
    """
    Rank the subreddits of one month by the sum of their word counts.

    Takes: N (how many subreddits to keep),
           month_dictionary (maps subreddit -> {word: count})
    Returns: a list of (subreddit, total) tuples sorted by total,
             largest first, cut down to the first N entries.
    """
    totals = {name: sum(word_counts.values())
              for name, word_counts in month_dictionary.items()}
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]

In [12]:
# Five largest subreddits by total word count, for d13 and then d14.
for month_dictionary in (d13, d14):
    print(get_top_N_subs(5, month_dictionary))

[('AskReddit', 54074734), ('AdviceAnimals', 13637911), ('leagueoflegends', 10108980), ('funny', 9534077), ('pics', 7317629)]
[('leagueoflegends', 9801783), ('worldnews', 9195902), ('DestinyTheGame', 9090857), ('news', 8844423), ('nfl', 8454921)]


## Get tf-idf transformer and matrix using sklearn

In [10]:
def build_vectorizer_tfidf_matrix(d):
    """
    Turn a word count dictionary into a tf-idf matrix.

    Takes: d, a dictionary whose values are word-count dictionaries;
           listed out they look like
           [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    Returns two objects:
        - the fitted DictVectorizer, used to access the column names (words)
        - the tf-idf transformed matrix (sparse matrix) holding the tf-idf
          values, one row per value of d in d's iteration order
    """
    print("Building tfidf...")
    vectorizer = DictVectorizer()
    count_rows = list(d.values())
    count_matrix = vectorizer.fit_transform(count_rows)
    transformer = TfidfTransformer()
    tfidf_matrix = transformer.fit_transform(count_matrix)
    print("done")
    return vectorizer, tfidf_matrix

In [None]:
# Run vectorizer, this can take some time.
# `d` is whichever month dictionary is currently bound to that name; on a
# fresh top-to-bottom run that is the last month loaded by the loading loop.
# `tfidf` is the (vectorizer, matrix) pair returned by
# build_vectorizer_tfidf_matrix, so tfidf[1] is the tf-idf matrix itself.
tfidf = build_vectorizer_tfidf_matrix(d)
# Read the shape straight off the matrix: converting it to a dense array
# first would materialise every zero entry just to print two numbers.
print("shape:", tfidf[1].shape)

In [None]:
def get_top_k_tfidf(k, d, tfidf, subreddit):
    '''
    Get the top tfidf words for a given subreddit
    Takes: k (top k tfidf words),
            d (month dictionary; the position of a subreddit among its keys
               is used as the row number in the tfidf matrix, so the matrix
               must have been built from this same dictionary),
            tfidf (pair whose element 0 is the fitted vectorizer and whose
                   element 1 is the tfidf matrix),
            subreddit (target subreddit, must be a key of d)
    Returns: list of the k words with the largest tfidf values in the
             subreddit's row, highest value first
    Raises: ValueError if subreddit is not a key of d
    '''
    vectorizer, matrix = tfidf[0], tfidf[1]

    try:
        index = list(d.keys()).index(subreddit)
    except ValueError:
        raise ValueError(
            f"subreddit {subreddit!r} is not in the month dictionary"
        ) from None

    # Densify only this subreddit's row, then take the column indices of its
    # k largest values, ordered from largest to smallest.
    row = matrix[index, :].toarray()
    top_k_inds = np.argsort(row)[:, -1:-k - 1:-1]

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both, and fetch the names once rather
    # than once per returned word.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return [feature_names[ind] for ind in top_k_inds[0]]

In [None]:
# Example lookup: the highest-scoring tf-idf words for a single subreddit.
k = 5
subreddit = 'leagueoflegends'

print(f'The top {k} words for r/{subreddit} are:')
get_top_k_tfidf(k, d, tfidf, subreddit)

## Get count of subs containing given word over time

In [None]:
# Get count of subs containing given word over time.
# This rebinds `date_list` and `dict_list` to a shorter list of months.
# NOTE(review): build_word_count_dictionary is not defined in the visible
# part of this notebook — confirm it is defined or imported before this cell.
date_list = ['2008-12', '2010-12', '2012-12', '2014-12']
dict_list = []
for date in date_list:
    month_word_counts = build_word_count_dictionary(date)
    dict_list.append(month_word_counts)



In [None]:
# For each loaded month, compute the fraction of subreddits that contain
# `word`, then plot that fraction over time.
# NOTE(review): get_word_sub_count is not defined in the visible part of this
# notebook — confirm it is available before running this cell.
word = 'bitcoin'
sub_count_list = []

# The loop variable is intentionally `d`: it stays bound to the last month
# afterwards, and the histogram cell below reads len(d).
for d in dict_list:
    subs_with_word = get_word_sub_count(word, d)
    sub_count_list.append(subs_with_word / len(d))
print(sub_count_list)

plt.plot(date_list, sub_count_list)
# Label every point with its value to three decimals, centred horizontally
# and offset one point upwards.
for month, proportion in zip(date_list, sub_count_list):
    plt.annotate("{:.3f}".format(proportion),
                 (month, proportion),
                 textcoords="offset points",
                 xytext=(0, 1),
                 ha='center')
plt.title(f"Word: '{word}'")
plt.ylabel("Prop of Subs")
plt.show()

In [None]:
# Display the list of per-month proportions as the cell's output.
sub_count_list

### Plot histogram of word counts

In [None]:
# NOTE(review): `df` is not defined in the visible part of this notebook; from
# its use here it is presumably a {word: number of subreddits containing it}
# mapping — confirm. `d` is whichever month dictionary is currently bound.
print(f"We have {len(df)} unique words and {len(d)} unique subreddits")

# Work on a log10 scale so the long tail of values stays readable.
logD = {word_key: np.log10(sub_count) for word_key, sub_count in df.items()}

# Pass a concrete list rather than a dict view to the histogram.
plt.hist(list(logD.values()), bins=25)
plt.xlabel("Log10 num of subreddits")
plt.ylabel("Count of words")
# Render the figure; a bare plt.plot() draws nothing and only leaves an
# empty list as the cell output.
plt.show()

### Plotting sorted dict

In [None]:
# Bar chart of the topN entries with the largest values for one subreddit.
# NOTE(review): sub_dict is not defined in the visible part of this notebook;
# presumably it maps subreddit -> {word: count} — confirm.
sub_reddit = 'theoffice'
topN = 6

# Sort the subreddit's items by value, largest first, and keep the first topN.
top_items = sorted(sub_dict[sub_reddit].items(),
                   key=lambda item: item[1], reverse=True)[:topN]
x = [key for key, _ in top_items]
y = [value for _, value in top_items]


plt.bar(x, y, align='center')
plt.xticks(rotation=70)
plt.title(sub_reddit)
plt.show()