<a href="https://colab.research.google.com/github/Santapaji/NBC_for_rotten_tomatoes/blob/main/DM_assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/rt_reviews.csv", encoding='iso-8859-1')

print(df)
# Divide the dataset into train, development, and test sets
train_df = df.sample(frac=0.6, random_state=42)
dev_df = df.drop(train_df.index).sample(frac=0.5, random_state=42)
test_df = df.drop(train_df.index).drop(dev_df.index)


       Freshness                                             Review
0          fresh   Manakamana doesn't answer any questions, yet ...
1          fresh   Wilfully offensive and powered by a chest-thu...
2         rotten   It would be difficult to imagine material mor...
3         rotten   Despite the gusto its star brings to the role...
4         rotten   If there was a good idea at the core of this ...
...          ...                                                ...
479995    rotten   Zemeckis seems unable to admit that the motio...
479996     fresh   Movies like The Kids Are All Right -- beautif...
479997    rotten   Film-savvy audiences soon will catch onto Win...
479998     fresh                        An odd yet enjoyable film. 
479999     fresh   No other animation studio, even our beloved P...

[480000 rows x 2 columns]


In [73]:
from collections import Counter

# Build the vocabulary
# Set the threshold for rare words
threshold = 10

# Build the vocabulary
word_counts = Counter()
for i, row in train_df.iterrows():
    words = row["tokens"]
    word_counts.update(words)

vocab = [w for w, c in word_counts.items() if c >= threshold]
word2idx = {w: i for i, w in enumerate(vocab)}


In [72]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Tokenization
train_df["tokens"] = train_df["Review"].apply(nltk.word_tokenize)

# Stop Word Removal
stop_words = set(stopwords.words('english'))
train_df["tokens"] = train_df["tokens"].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Stemming
stemmer = PorterStemmer()
train_df["tokens"] = train_df["tokens"].apply(lambda x: [stemmer.stem(word) for word in x])

# Lemmatization
lemmatizer = WordNetLemmatizer()
train_df["tokens"] = train_df["tokens"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [74]:
import numpy as np
import math

# Calculate the prior probabilities
num_docs = len(train_df)
num_fresh = len(train_df[train_df["Freshness"] == "fresh"])
num_rotten = len(train_df[train_df["Freshness"] == "rotten"])
p_fresh = num_fresh / num_docs
p_rotten = num_rotten / num_docs

# Batch Processing
batch_size = 10000
word_counts_fresh = np.zeros(len(vocab), dtype=int)
word_counts_rotten = np.zeros(len(vocab), dtype=int)
for i in range(math.ceil(num_docs/batch_size)):
    batch_df = train_df.iloc[i*batch_size:(i+1)*batch_size]
    for j, row in batch_df.iterrows():
        tokens = row["tokens"]
        if row["Freshness"] == "fresh":
            word_counts_fresh[[word2idx[w] for w in tokens if w in vocab]] += 1
        else:
            word_counts_rotten[[word2idx[w] for w in tokens if w in vocab]] += 1

alpha = 1.0 # smoothing parameter
p_word_fresh = (word_counts_fresh + alpha) / (num_fresh + alpha * len(vocab))
p_word_rotten = (word_counts_rotten + alpha) / (num_rotten + alpha * len(vocab))


In [82]:
def predict_review(review):
    # Split the review into words and convert to lowercase
    words = review.lower().split()

    # Calculate the log-likelihood of each class
    log_p_fresh = np.log(p_fresh) + np.sum(np.log(p_word_fresh[[word2idx[w] for w in words if w in vocab]]))
    log_p_rotten = np.log(p_rotten) + np.sum(np.log(p_word_rotten[[word2idx[w] for w in words if w in vocab]]))

    # Return the predicted class
    if log_p_fresh > log_p_rotten:
        return "fresh"
    else:
        return "rotten"
        
# Test the NBC on a sample review
review = "amazing movie totally worth it ."
predicted_class = predict_review(review)
print(predicted_class)


fresh


In [83]:
def evaluate(df):
    y_true = df["Freshness"]
    y_pred = df["Review"].apply(predict_review)
    
    tp = ((y_true == "fresh") & (y_pred == "fresh")).sum()
    fp = ((y_true == "rotten") & (y_pred == "fresh")).sum()
    fn = ((y_true == "fresh") & (y_pred == "rotten")).sum()
    tn = ((y_true == "rotten") & (y_pred == "rotten")).sum()
    
    accuracy = (tp + tn) / len(df)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * precision * recall / (precision + recall)
    
    return accuracy, precision, recall, f1_score
    
# Evaluate the NBC on the development set
accuracy, precision, recall, f1_score = evaluate(dev_df)
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1_score:.2f}")


Accuracy: 0.63
Precision: 0.60
Recall: 0.78
F1-score: 0.68


In [84]:
top_words_fresh = [vocab[i] for i in np.argsort(-p_word_fresh)[:10]]
top_words_rotten = [vocab[i] for i in np.argsort(-p_word_rotten)[:10]]
print("Top 10 words that predict freshness:", top_words_fresh)
print("Top 10 words that predict rottenness:", top_words_rotten)


Top 10 words that predict freshness: ['.', ',', "'s", 'film', 'movi', 'one', "n't", '...', 'make', 'like']
Top 10 words that predict rottenness: ['.', ',', "'s", 'film', 'movi', "n't", 'like', 'one', '...', "''"]


In [85]:
import matplotlib.pyplot as plt

alphas = np.logspace(-3, 3, num=7)
accuracies = []
for alpha in alphas:
    # Calculate the conditional probabilities with smoothing
    p_word_fresh = (word_counts_fresh + alpha) / (num_fresh + alpha * len(vocab))
    p_word_rotten = (word_counts_rotten + alpha) / (num_rotten + alpha * len(vocab))

    # Train the NBC with smoothing
    def predict_review(review):
        words = review.lower().split()
        log_p_fresh = np.log(p_fresh) + np.sum(np.log(p_word_fresh[[word2idx[w] for w in words if w in vocab]]))
        log_p_rotten = np.log(p_rotten) + np.sum(np.log(p_word_rotten[[word2idx[w] for w in words if w in vocab]]))
        if log_p_fresh > log_p_rotten:
            return "fresh"
        else:
            return "rotten"
    accuracy, _, _, _ = evaluate(dev_df)
    accuracies.append(accuracy)
    
plt.semilogx(alphas, accuracies, "bo-")
plt


KeyboardInterrupt: ignored