In [1]:
import ast  # literal_eval: safe parsing of the stored token lists
from collections import Counter #count word occurrences

import pandas as pd

In [2]:
# Absolute, machine-specific location of the pre-processed reviews CSV.
# NOTE(review): this path only resolves on the original author's machine.
file_path = r"C:\Users\night\Downloads\Processed_Reviews.csv"
# Read the whole CSV into a DataFrame; later cells read its 'tokenized' column.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Each non-null 'tokenized' cell is parsed from its string form back into a
# Python object (presumably a list of tokens -- confirm against the CSV).
# SECURITY: this used to call eval(), which executes arbitrary code found in
# the CSV. ast.literal_eval only accepts Python literals (lists, strings,
# numbers, ...) and raises ValueError/SyntaxError on anything else.
tokenized_reviews = df['tokenized'].dropna().apply(ast.literal_eval)

In [4]:
# Flatten every review's tokens into one list, keeping duplicates and order.
all_words = []
for review in tokenized_reviews:
    all_words.extend(review)

# Distinct tokens; the order comes from set iteration and is not meaningful.
unique_words = list(set(all_words))

In [5]:
# Corpus-wide occurrence count for every token.
word_freq = Counter(all_words)
# most_common() with no argument returns all (word, count) pairs sorted by
# descending count; words with equal counts stay in first-seen order.
sorted_word_freq = dict(word_freq.most_common())

In [6]:
# Binary bag-of-words: one row per review, one 0/1 entry per vocabulary word,
# with the vocabulary ordered as in sorted_word_freq.
vocabulary = list(sorted_word_freq)
document_vectors = []
for review in tokenized_reviews:
    # Membership is checked against a set so each lookup is O(1) instead of a
    # linear scan of the review's token list for every vocabulary word.
    review_words = set(review)
    document_vector = [1 if word in review_words else 0 for word in vocabulary]
    document_vectors.append(document_vector)

In [7]:
# Tabular form of the binary vectors: one column per vocabulary word, in the
# same order as sorted_word_freq; one row per review.
doc_vectors_df = pd.DataFrame(data=document_vectors, columns=list(sorted_word_freq))

In [8]:
# Write the document-vector table to the working directory, without the
# DataFrame index column.
doc_vectors_df.to_csv(path_or_buf="document_vectors.csv", index=False)

In [9]:
# Two-column table of (word, count) pairs, in the order of sorted_word_freq.
word_freq_df = pd.DataFrame(
    data=list(sorted_word_freq.items()),
    columns=["Word", "Frequency"],
)
# Heading and table on separate lines.
print("Word Frequency Table:", word_freq_df, sep="\n")

Word Frequency Table:
           Word  Frequency
0       product          7
1       quality          3
2         great          2
3       amazing          2
4          love          2
5       awesome          2
6          work          2
7     perfectly          2
8          life          2
9        expect          2
10       arrive          1
11         time          1
12    packaging          1
13        amaze          1
14          buy          1
15        phone          1
16           hz          1
17      display          1
18      totally          1
19        worth          1
20          wow          1
21          bit          1
22    expensive          1
23       laptop          1
24         fine          1
25        check          1
26         full          1
27       detail          1
28     purchase          1
29        happy          1
30      battery          1
31    excellent          1
32       charge          1
33        cable          1
34        short          1
35    

In [10]:
import pandas as pd
import math # use for log function
from collections import Counter

In [11]:
# Re-read the same pre-processed reviews CSV for the TF-IDF part of the notebook.
# NOTE(review): absolute, machine-specific path.
file_path = r"C:\Users\night\Downloads\Processed_Reviews.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [12]:
# Parse each non-null 'tokenized' cell from its string form back into a Python
# object (presumably a list of tokens -- confirm against the CSV).
# SECURITY: eval() would execute arbitrary code stored in the CSV;
# ast.literal_eval only accepts Python literals and raises
# ValueError/SyntaxError on anything else.
tokenized_reviews = df['tokenized'].dropna().apply(ast.literal_eval)

In [13]:
def compute_tf(document):
    """Return the term frequency of every distinct token in ``document``.

    Args:
        document: A sized sequence of hashable tokens (e.g. a list of words).

    Returns:
        dict mapping each distinct token to ``occurrences / len(document)``,
        keyed in first-seen order. An empty document yields an empty dict.
    """
    total_terms = len(document)
    tf = {}
    for word, occurrences in Counter(document).items():
        tf[word] = occurrences / total_terms
    return tf

In [14]:
def compute_idf(documents):
    """Compute the inverse document frequency of every word in a corpus.

    For each word, ``idf = ln(N / df)``, where ``N`` is ``len(documents)`` and
    ``df`` is the number of documents containing the word at least once. A
    word present in every document therefore scores ``0.0``.

    Args:
        documents: A sized collection of documents, each an iterable of
            hashable tokens.

    Returns:
        dict mapping each word to its IDF. Keys are in first-seen order
        (document order, then token order), so the result -- and anything
        built from it, such as CSV columns -- is the same on every run.
        An empty corpus yields an empty dict.
    """
    N = len(documents)  # Total number of documents

    # Single pass over the corpus to collect document frequencies, rather than
    # rescanning every document once per vocabulary word.
    doc_freq = Counter()
    for doc in documents:
        # dict.fromkeys de-duplicates the tokens while keeping their order, so
        # each document contributes at most 1 per word.
        for word in dict.fromkeys(doc):
            doc_freq[word] += 1

    # Every counted word occurs in at least one document, so count >= 1.
    return {word: math.log(N / count) for word, count in doc_freq.items()}

In [15]:
def compute_tfidf(document, idf):
    """Return the TF-IDF weight of every distinct token in ``document``.

    Args:
        document: Sequence of tokens; its term frequencies come from
            ``compute_tf``.
        idf: Mapping from word to IDF value. It is indexed with ``idf[word]``,
            so with a plain dict a token missing from it raises ``KeyError``.

    Returns:
        dict mapping each distinct token to ``tf * idf[token]``.
    """
    term_frequencies = compute_tf(document)
    return {
        term: frequency * idf[term]
        for term, frequency in term_frequencies.items()
    }

In [16]:
# Plain Python list of the parsed reviews, one entry per review.
documents = tokenized_reviews.to_list()

In [17]:
# One TF dict per document.
tf_data = list(map(compute_tf, documents))
# Rows = documents, columns = words; a word absent from a document gets 0.
tf_df = pd.DataFrame(data=tf_data).fillna(value=0)
tf_df.to_csv(path_or_buf="tf_scores.csv", index=False)

In [18]:
# Corpus-level IDF value for every word.
idf = compute_idf(documents)
# Single-row table: one column per word, holding that word's IDF.
idf_df = pd.DataFrame(data=[idf]).fillna(value=0)
idf_df.to_csv(path_or_buf="idf_scores.csv", index=False)

In [19]:
# One TF-IDF dict per document, all sharing the corpus-level idf table.
tfidf_data = [compute_tfidf(tokens, idf) for tokens in documents]
# Rows = documents, columns = words; a word absent from a document gets 0.
tfidf_df = pd.DataFrame(data=tfidf_data).fillna(value=0)
tfidf_df.to_csv(path_or_buf="tfidf_scores.csv", index=False)