In [3]:
import pandas as pd
import math  # For log function
from collections import Counter

# Read the preprocessed reviews from disk
file_path = "Processed_Reviews.csv"
df = pd.read_csv(file_path)

# Discard rows whose 'tokenized' value is missing, then lowercase the rest
tokenized_reviews = (
    df['tokenized']
    .dropna()
    .apply(lambda text: text.lower())
)

# Function to compute Term Frequency (TF)
def compute_tf(document):
    """Compute the term frequency of every word in a document.

    Args:
        document: A string of whitespace-separated tokens.

    Returns:
        A dict mapping each distinct word to its occurrence count divided
        by the total number of tokens in the document. An empty or
        whitespace-only document yields an empty dict.
    """
    # Split once and reuse the token list for both the counts and the total,
    # instead of re-splitting the document for every distinct word.
    tokens = document.split()
    total = len(tokens)
    return {word: count / total for word, count in Counter(tokens).items()}

# Function to compute Inverse Document Frequency (IDF)
def compute_idf(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: A sized iterable of strings, each holding
            whitespace-separated tokens.

    Returns:
        A dict mapping each distinct word to ``log(N / df)``, where ``N``
        is the number of documents and ``df`` is the number of documents
        containing the word. An empty corpus yields an empty dict.
    """
    total_docs = len(documents)

    # One pass over the corpus: de-duplicate each document's tokens so a
    # word is counted at most once per document. This avoids re-splitting
    # and rescanning every document for every vocabulary word.
    doc_freq = Counter()
    for doc in documents:
        doc_freq.update(set(doc.split()))

    # Every counted word occurs in at least one document, so df >= 1 and
    # the division can never be by zero.
    return {word: math.log(total_docs / df) for word, df in doc_freq.items()}

# Function to compute TF-IDF
def compute_tfidf(document, idf):
    """Weight each word's term frequency in a document by its IDF score.

    Args:
        document: A string of whitespace-separated tokens.
        idf: A mapping from word to its inverse document frequency.

    Returns:
        A dict mapping each word of the document to ``tf * idf``. A word
        absent from ``idf`` gets a score of 0 and a warning is printed.
    """
    scores = {}

    for word, weight in compute_tf(document).items():
        # Handle the out-of-vocabulary case first, then fall through to
        # the ordinary product.
        if word not in idf:
            print(f"Warning: '{word}' not found in IDF. Setting TF-IDF to 0.")
            scores[word] = 0
            continue
        scores[word] = weight * idf[word]

    return scores

# Materialise the lowercased reviews as a plain Python list
documents = tokenized_reviews.tolist()

# Term frequencies: one row per review, one column per word; words a
# review does not contain are filled with 0 before writing the CSV
tf_data = list(map(compute_tf, documents))
tf_df = pd.DataFrame(tf_data).fillna(0)
tf_df.to_csv("tf_scores.csv", index=False)

# Inverse document frequencies: a single-row table, one column per word
idf = compute_idf(documents)
idf_df = pd.DataFrame([idf]).fillna(0)
idf_df.to_csv("idf_scores.csv", index=False)

# TF-IDF: same layout as the TF table, each score weighted by the IDF
tfidf_data = list(map(lambda review: compute_tfidf(review, idf), documents))
tfidf_df = pd.DataFrame(tfidf_data).fillna(0)
tfidf_df.to_csv("tfidf_scores.csv", index=False)

print("TF, IDF, and TF-IDF scores successfully computed and saved!")


TF, IDF, and TF-IDF scores successfully computed and saved!
