# TF-IDF notebook
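This notebook applies TF-IDF to a corpus of text files read from the `by_decade` directory and extracts the highest-weighted terms for each decade.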

In [1]:
import gensim
import os
import re
import pandas as pd

In [20]:
# Read every non-hidden file in by_decade/ and collect its lines as documents.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# while later cells address documents by position.
# NOTE(review): each *line* of a file becomes one document; the per-decade
# indexing further down presumably relies on one line per file — confirm.
documents = []
for f in sorted(os.listdir('by_decade')):
    if f.startswith('.'):
        continue  # skip dot-files
    with open(os.path.join('by_decade', f), 'r') as file:
        documents.extend(file.readlines())

In [23]:
# Strip trailing whitespace from each document, then delete digits, periods
# and "---" runs. The pattern is a raw string: "\." inside a normal string
# literal is an invalid escape sequence and triggers a warning.
_noise_pattern = re.compile(r"[0-9]|\.|---")
documents = [_noise_pattern.sub("", d.rstrip()) for d in documents]

In [24]:
# Tokenise each document on whitespace.
texts = list(map(str.split, documents))

In [25]:
# Build the token <-> integer id mapping from the tokenised documents.
dictionary = gensim.corpora.Dictionary(documents=texts)

In [26]:
# Convert every tokenised document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [27]:
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(corpus=corpus, id2word=dictionary)

In [28]:
# Apply the fitted TF-IDF model to the whole bag-of-words corpus.
corpus_tfidf = tfidf[corpus]

In [38]:
# Start from an empty results table whose columns already carry their dtypes.
column_types = {'year': int, 'term': str, 'tfidf': float}
df = pd.DataFrame(columns=['year', 'term', 'tfidf']).astype(column_types)

In [39]:
# For each decade, keep the 100 terms with the highest TF-IDF weight and add
# them to df as (year, term, tfidf) rows.
# NOTE(review): assumes corpus_tfidf[i] is the document for decade
# 1910 + 10*i — confirm against the order in which the files were loaded.
# Rows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for position, decade in enumerate(range(1910, 2010, 10)):
    ranked = sorted(corpus_tfidf[position], key=lambda pair: pair[1], reverse=True)
    rows.extend((decade, dictionary[term_id], weight) for term_id, weight in ranked[:100])
top_terms = pd.DataFrame(rows, columns=['year', 'term', 'tfidf']).astype(
    {'year': int, 'term': str, 'tfidf': float})
# ignore_index gives every row a unique label (per-row appends left every
# label at 0).
df = top_terms if df.empty else pd.concat([df, top_terms], ignore_index=True)

In [41]:
# Display the table of top terms per decade.
df

Unnamed: 0,year,term,tfidf
0,1910,kelvin,0.477322
0,1910,elsbeth,0.280477
0,1910,ryder,0.195372
0,1910,joshes,0.155574
0,1910,truelove,0.155173
0,1910,josh,0.150989
0,1910,taber,0.143656
0,1910,lieut,0.123445
0,1910,scarlett,0.115216
0,1910,karo,0.103334


In [42]:
# Write the table to CSV; the index is written too, as an unnamed first column.
df.to_csv('tfidf_decade.csv')

In [None]:
# NOTE(review): raw_counts is not assigned in any visible cell — confirm where
# it is defined before running this part of the notebook.
raw_counts

In [None]:
# NOTE(review): percentage is not assigned in any visible cell — confirm where
# it is defined before running this part of the notebook.
percentage

In [None]:
# Attach raw_counts as a new column; .values drops the Series index, so the
# assignment is positional and needs one value per row of df.
df['raw_counts'] = pd.Series(raw_counts).values

In [None]:
# Attach percentage as a new column; .values drops the Series index, so the
# assignment is positional and needs one value per row of df.
df['percentage'] = pd.Series(percentage).values

In [None]:
# Display the table with the added columns.
df

In [None]:
# Show the rows ordered by descending percentage. The sorted copy is only
# displayed; df itself is not reassigned and keeps its original order.
df.sort_values(by = 'percentage', ascending=False)

In [None]:
# Write the extended table to CSV (index included as an unnamed first column).
df.to_csv('tfidf.csv')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Draw one horizontal bar chart of term weights per decade.
# The decades are read from df itself rather than from a hand-written list:
# the previous list skipped 1910 and 2000 and asked for 2010, a year the
# table-building cell never produces.
years = sorted(df['year'].unique())
for y in years:
    plt.figure(figsize=(15, 7))
    plt.title(str(y))
    sns.barplot(x='tfidf', y='term', data=df[df['year'] == y]);