In [41]:
import numpy as np
import pandas as pd
import sklearn.feature_selection as skf
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import ast
import pickle
import os

In [42]:
# Load the pickled chi-squared scores (keyed by word, per the printed output)
# and expose them both as a plain dict and as a pandas Series.
# NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
with open('chi_squared.pkl', mode='rb') as chi_sq:
    chi_squared_dict = pickle.load(chi_sq)
chi_squared_dict = dict(chi_squared_dict)  # normalise whatever was pickled into a dict
chi_squared_series = pd.Series(data=chi_squared_dict)
print(chi_squared_series)

faustian        3.000180
shaffner        1.000020
outdoorsman     0.333353
donlon          1.000020
unsur           1.258582
                  ...   
repertori       0.333353
bodypress       1.000020
wonderley       1.000020
wain            0.000000
multimillion    1.000020
Length: 48447, dtype: float64


In [50]:
# Load the pickled per-word frequency tables, then keep only the words whose
# chi-squared score (from chi_squared_dict) is strictly greater than 2.
# NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
with open('freq_table.pkl', mode='rb') as freq_table:
    freq_table_dict = dict(pickle.load(freq_table))
freq_table_series = pd.Series(
    {
        word: table
        for word, table in freq_table_dict.items()
        if chi_squared_dict[word] > 2
    }
)
print(freq_table_series)

faustian                            [[25000, 0], [24997, 3]]
steven      [[24862, 104, 25, 5, 4], [24872, 113, 14, 1, 0]]
initi             [[24832, 155, 13, 0], [24771, 215, 13, 1]]
raciest                             [[25000, 0], [24998, 2]]
lilt                                [[24996, 4], [25000, 0]]
                                  ...                       
ugghhh                              [[24998, 2], [25000, 0]]
unrev                               [[25000, 0], [24998, 2]]
thrust                [[24981, 19, 0, 0], [24964, 34, 1, 1]]
lehar                               [[24998, 2], [25000, 0]]
ditch                 [[24980, 20, 0, 0], [24989, 10, 0, 1]]
Length: 17839, dtype: object


In [51]:
# Grand total of every cell in the first word's frequency table
# (presumably the number of documents in the corpus -- TODO confirm).
total = sum(map(sum, freq_table_series.values[0]))
def get_idf(freq : list[list[int]]) -> float:
    """Return a smoothed inverse-document-frequency style score for one word.

    Args:
        freq: The word's frequency table, a list of rows of integer counts.
            The first entry of every row is skipped; the remaining entries
            are summed (presumably the documents in which the word occurs
            at least once -- TODO confirm the table layout).

    Returns:
        ``log(total / (count + 1))`` where ``total`` is the module-level
        global and ``count`` is the sum described above. The ``+ 1`` keeps
        the denominator non-zero.
    """
    occurrences = 0
    for row in freq:
        # Column 0 is excluded from the count.
        occurrences += sum(row[1:])
    return np.log(total / (occurrences + 1))

# Build a word-indexed frame holding each selected word's frequency table
# and its IDF score, then show it ordered from lowest to highest IDF.
selected_words = pd.DataFrame({"freq table": freq_table_series})
idf_list = list(map(get_idf, selected_words["freq table"]))
selected_words["idf"] = idf_list  # becomes the second column
print(selected_words.sort_values(by="idf"))

                                                  freq table       idf
movi       [[16390, 2412, 1846, 1417, 953, 675, 454, 302,...  1.136937
film       [[17759, 2423, 1651, 1067, 686, 452, 296, 230,...  1.223427
one        [[18024, 3686, 1829, 800, 344, 159, 78, 37, 20...  1.264714
like       [[18430, 3596, 1670, 677, 325, 170, 68, 35, 16...  1.394972
time       [[20092, 3312, 1085, 349, 100, 39, 17, 2, 3, 0...  1.615758
...                                                      ...       ...
grandaddi                           [[25000, 0], [24998, 2]]  9.721166
namedrop                            [[25000, 0], [24998, 2]]  9.721166
tiburon                             [[24998, 2], [25000, 0]]  9.721166
flagstaff                           [[24998, 2], [25000, 0]]  9.721166
desplat                       [[25000, 0, 0], [24998, 1, 1]]  9.721166

[17839 rows x 2 columns]


In [52]:
# Read the comments table; the printed output shows Text, Positive and Freq columns.
all_comments_df = pd.read_csv(
    filepath_or_buffer="all_comments.csv",
    encoding="utf-8",
)
print(all_comments_df)

                                                    Text  Positive  \
0      bromwell high is a cartoon comedy. it ran at t...      True   
1      homelessness (or houselessness as george carli...      True   
2      brilliant over-acting by lesley ann warren. be...      True   
3      this is easily the most underrated film inn th...      True   
4      this is not the typical mel brooks film. it wa...      True   
...                                                  ...       ...   
24995  towards the end of the movie, i felt it was to...     False   
24996  this is the kind of movie that my enemies cont...     False   
24997  i saw 'descent' last night at the stockholm fi...     False   
24998  some films that you pick up for a pound turn o...     False   
24999  this is one of the dumbest films, i've ever se...     False   

                                                    Freq  
0      {'bromwel': 4, 'high': 5, 'cartoon': 1, 'comed...  
1      {'homeless': 4, 'houseless': 1, 'g

In [53]:
# For every comment, score each of its selected words by TF-IDF and keep the
# (up to) ten highest-scoring words as that comment's tags. Each tag dict maps
# the chosen word to its raw count in the comment.
tag_list = []
# Hoist the word -> idf lookup out of the loop; a plain dict avoids repeated
# pandas label lookups for every word of every comment.
idf_lookup = selected_words["idf"].to_dict()
for word_freq_str in all_comments_df["Freq"]:
    # The "Freq" column holds stringified dicts (word -> count); literal_eval
    # parses them without executing arbitrary code.
    word_freq = ast.literal_eval(word_freq_str)
    total_words_cnt = sum(word_freq.values())
    # Term frequency (count / total words in the comment) times the word's idf,
    # restricted to words that survived feature selection.
    tf_idf = {
        w: cnt / total_words_cnt * idf_lookup[w]
        for w, cnt in word_freq.items()
        if w in idf_lookup
    }
    # Rank by TF-IDF score. Sorting the dict without a key would order the
    # words alphabetically and ignore the scores entirely.
    top_words = sorted(tf_idf, key=tf_idf.get, reverse=True)[:10]
    tag_list.append({k: word_freq[k] for k in top_words})


In [54]:
# Attach the per-comment tag dicts as a trailing "tags" column.
# Plain column assignment appends the column on the first run and overwrites it
# on later runs, so the cell can be re-executed; DataFrame.insert would raise
# ValueError once "tags" already exists.
all_comments_df["tags"] = tag_list
print(all_comments_df)

                                                    Text  Positive  \
0      bromwell high is a cartoon comedy. it ran at t...      True   
1      homelessness (or houselessness as george carli...      True   
2      brilliant over-acting by lesley ann warren. be...      True   
3      this is easily the most underrated film inn th...      True   
4      this is not the typical mel brooks film. it wa...      True   
...                                                  ...       ...   
24995  towards the end of the movie, i felt it was to...     False   
24996  this is the kind of movie that my enemies cont...     False   
24997  i saw 'descent' last night at the stockholm fi...     False   
24998  some films that you pick up for a pound turn o...     False   
24999  this is one of the dumbest films, i've ever se...     False   

                                                    Freq  \
0      {'bromwel': 4, 'high': 5, 'cartoon': 1, 'comed...   
1      {'homeless': 4, 'houseless': 1, 

In [55]:
# Persist the tagged comments to disk, leaving the row index out of the file.
all_comments_df.to_csv(
    path_or_buf="tagged_comments.csv",
    encoding="utf-8",
    index=False,
)