# tf-idf Manual Calculation
https://en.wikipedia.org/wiki/Tf%E2%80%93idf  
https://baike.baidu.com/item/tf-idf/8816134

In [1]:
import numpy as np
import pandas as pd

# Load the bag-of-words table. Judging by the frames displayed in this
# notebook, each row is one quarterly statement and every other column holds
# the count of one term.
bow = pd.read_csv('bag_of_words.csv')

# Keep only the term-count columns. The label column is dropped by name rather
# than by position, so this keeps working if the CSV's column order changes.
bow_1 = bow.drop(columns='quarter_statement')
bow_1.head()

Unnamed: 0,a.,abstain,abstent,acceler,accept,accompani,accord,accordingli,account,accru,...,taster,tey,tuggl,unannounc,unif,unnecessari,valser,websit,wisconsin,www.coca-cola.com
0,1.0,1.0,1.0,1.0,4.0,1.0,2.0,1.0,25.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,3.0,2.0,5.0,2.0,37.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,25.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.0,0.0,2.0,1.0,1.0,3.0,35.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,9.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Term frequency: each count expressed as a share of its row's total count.
# .sum(axis='columns') adds up every row; .divide(..., axis='index') then
# divides each cell by the total of the row it sits in.
row_totals = bow_1.sum(axis='columns')
tf = bow_1.divide(row_totals, axis='index')
tf.head()

Unnamed: 0,a.,abstain,abstent,acceler,accept,accompani,accord,accordingli,account,accru,...,taster,tey,tuggl,unannounc,unif,unnecessari,valser,websit,wisconsin,www.coca-cola.com
0,0.000403,0.000403,0.000403,0.000403,0.001613,0.000403,0.000806,0.000403,0.010081,0.00121,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000505,0.000337,0.000842,0.000337,0.006231,0.000674,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000332,0.0,0.0,0.0,0.000665,0.000332,0.000665,0.0,0.008311,0.00133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000282,0.0,0.0,0.0,0.000282,0.000141,0.000141,0.000423,0.004934,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000812,0.000406,0.000406,0.0,0.000812,0.000406,0.000406,0.000406,0.003654,0.001218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Inverse document frequency: log10(N / number of documents containing the term).
# N        -> total number of documents (rows of bow_1)
# .ne(0)   -> True wherever a term's count in a document is non-zero
# .sum(axis='index') -> column totals, i.e. how many documents contain each term
N = bow_1.shape[0]
doc_freq = bow_1.ne(0).sum(axis='index')
idf = np.log10(N / doc_freq)
idf.head()

a.         0.234083
abstain    0.857332
abstent    0.653213
acceler    0.301030
accept     0.024824
dtype: float64

In [4]:
# Weight every term frequency by the idf of its column; the idf Series is
# matched to the frame's columns by term label.
tf_idf = tf.mul(idf, axis='columns')
tf_idf.head()

Unnamed: 0,a.,abstain,abstent,acceler,accept,accompani,accord,accordingli,account,accru,...,taster,tey,tuggl,unannounc,unif,unnecessari,valser,websit,wisconsin,www.coca-cola.com
0,9.4e-05,0.000346,0.000263,0.000121,4e-05,2.6e-05,4.1e-05,0.000131,0.0,6.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.3e-05,2.2e-05,4.3e-05,0.00011,0.0,3.4e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.8e-05,0.0,0.0,0.0,1.7e-05,2.2e-05,3.4e-05,0.0,0.0,6.8e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.6e-05,0.0,0.0,0.0,7e-06,9e-06,7e-06,0.000138,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00019,0.000348,0.000265,0.0,2e-05,2.6e-05,2.1e-05,0.000132,0.0,6.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Discard every term whose weight is 0 in all documents, then put the
# statement label back as the first column.
tf_idf_drop_zero = (
    tf_idf
    .mask(tf_idf == 0)                   # zeros -> NaN so dropna can detect them
    .dropna(axis='columns', how='all')   # remove columns that are NaN in every row
    .fillna(0)                           # turn the remaining NaN cells back into 0
)
tf_idf_drop_zero.insert(loc=0, column='quarter_statement', value=bow['quarter_statement'])
tf_idf_drop_zero.head()

Unnamed: 0,quarter_statement,a.,abstain,abstent,acceler,accept,accompani,accord,accordingli,accru,...,taster,tey,tuggl,unannounc,unif,unnecessari,valser,websit,wisconsin,www.coca-cola.com
0,1994Q1,9.4e-05,0.000346,0.000263,0.000121,4e-05,2.6e-05,4.1e-05,0.000131,6.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1994Q2,0.0,0.0,0.0,0.0,1.3e-05,2.2e-05,4.3e-05,0.00011,3.4e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1994Q3,7.8e-05,0.0,0.0,0.0,1.7e-05,2.2e-05,3.4e-05,0.0,6.8e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1994Q4,6.6e-05,0.0,0.0,0.0,7e-06,9e-06,7e-06,0.000138,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1995Q1,0.00019,0.000348,0.000265,0.0,2e-05,2.6e-05,2.1e-05,0.000132,6.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Save the filtered tf-idf table as "tf_idf.csv"; index=False leaves the row
# index out of the file.
tf_idf_drop_zero.to_csv(path_or_buf='tf_idf.csv', index=False)