In [1]:
# importing necessary libraries 
import pandas as pd
import numpy as np

In [2]:
pip install "numpy<2"

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Reading the Dataset
df = pd.read_csv('movie.csv')

In [4]:
# checking the first 5 rows
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [5]:
# Checking for missing values
df.isnull().sum()

text     0
label    0
dtype: int64

In [6]:
# importing necessary libraries
import re
import math
from collections import defaultdict, Counter

In [7]:
# importing stop words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
STOP_WORDS = set(ENGLISH_STOP_WORDS)

In [8]:
# splitting the data for testing and traning
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_df = df_shuffled.iloc[:1000].copy()
remaining_df = df_shuffled.iloc[1000:].copy()
test_pool_class_0 = remaining_df[remaining_df['label'] == 0]
test_pool_class_1 = remaining_df[remaining_df['label'] == 1]
test_class_0 = test_pool_class_0.sample(n=100, random_state=42)
test_class_1 = test_pool_class_1.sample(n=100, random_state=42)
test_df = pd.concat([test_class_0, test_class_1]).sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# traing data
train_df

Unnamed: 0,text,label
0,The central theme in this movie seems to be co...,0
1,"An excellent example of ""cowboy noir"", as it's...",1
2,The ending made my heart jump up into my throa...,0
3,Only the chosen ones will appreciate the quali...,1
4,"This is a really funny film, especially the se...",1
...,...,...
995,"Together with the even more underrated , The S...",1
996,I bought this DVD after seeing it highly ranke...,0
997,"This is a known fact, Mr. Seagal cannot smile,...",0
998,I must have been in a good mood to give this s...,0


In [10]:
# testing data
test_df

Unnamed: 0,text,label
0,I'd have to agree with the previous reviewer: ...,0
1,I have been a huge Lynn Peterson fan ever sinc...,0
2,I am one of the biggest fans of silent comedia...,0
3,Atlantis was much better than I had anticipate...,1
4,Seth McFarlane is a true genius. He has crafte...,1
...,...,...
195,There is a lot of crap coming out of Hollywood...,1
196,This is one for the Golden Turkey book. It's a...,0
197,"As I write this user-comment, Tim Burton's int...",0
198,"LE GRAND VOYAGE is a gentle miracle of a film,...",1


In [11]:
# testing data of negative class
test_class_0

Unnamed: 0,text,label
20111,When recounting these events that took place s...,0
1814,No mention if Ann Rivers Siddons adapted the m...,0
31656,"This is one strange hacked together film, you ...",0
9386,"No redeeming features, this film is rubbish. I...",0
18452,Advertised by channel seven in Australia as th...,0
...,...,...
5396,I'd have to agree with the previous reviewer: ...,0
11153,Even my five year old was bored.<br /><br />Ve...,0
4476,"Wow. Not because of the 3-D imagery, which at ...",0
30287,This movie was o.k. but it could have been muc...,0


In [12]:
# testing data of positive class
test_class_1

Unnamed: 0,text,label
28101,This probably ranks in my Top-5 list of the fu...,1
7086,"Zachary Scott does what he does best, i.e., pl...",1
33940,I just wanted to say that I am watching Nation...,1
20581,This movie is the Latino Godfather. An unlikel...,1
36006,I'll be brief: I normally hate films like this...,1
...,...,...
20625,"A grumpy old baronet, happily unmarried, decid...",1
23568,Back in 74 Eric Monte made the classic T.V sho...,1
22148,Nicely and intelligently played by the two you...,1
3145,Bwana Devil is reputedly the first major studi...,1


In [13]:
# preprocessing: lemmatisation, removing stop words, handling negation, removing symbols.
def preprocess_text(text, lemmatize_words=False, remove_stop_words=False, handle_logical_negation=False):
    text = re.sub(r'<s>|</s>|<br>|<br/>', '. ', text)
    tokens = text.split()
    processed_tokens = []
    
    if handle_logical_negation:
        negation_words = ['not', "n't", 'no']
        negate_next = False
        for token in tokens:
            if negate_next:
                processed_tokens.append(f"{token}_NEG")
                negate_next = False
            elif token.lower() in negation_words:
                processed_tokens.append(token)
                negate_next = True
            else:
                processed_tokens.append(token)
        tokens = processed_tokens
        processed_tokens = []
    if remove_stop_words:
        for token in tokens:
            if token.lower() not in STOP_WORDS:
                processed_tokens.append(token)
        tokens = processed_tokens
        processed_tokens = []
    
    processed_tokens = tokens
    
    return " ".join(processed_tokens)    

In [14]:
def preprocess_text_simple(text):
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
train_df.loc[:, 'text_cleaned'] = train_df['text'].apply(preprocess_text_simple)

In [15]:
# Developing Naive bayes
class NaiveBayes:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_priors = {}
        self.word_counts = defaultdict(lambda: defaultdict(int))
        self.total_words_per_class = defaultdict(int)
        self.vocabulary = set()

    def fit(self, X_train, y_train):
        num_reviews = len(y_train)
        
        for i, text in enumerate(X_train):
            label = y_train.iloc[i]
            words = text.split()
            for word in words:
                self.word_counts[label][word] += 1
                self.total_words_per_class[label] += 1
                self.vocabulary.add(word)
        
        # Calculate log prior probabilities
        self.class_priors[0] = math.log(y_train.value_counts()[0] / num_reviews)
        self.class_priors[1] = math.log(y_train.value_counts()[1] / num_reviews)
    
    def predict(self, X_test):
        predictions = []
        vocab_size = len(self.vocabulary)
        
        for text in X_test:
            scores = {0: self.class_priors[0], 1: self.class_priors[1]}
            words = text.split()
            
            for label in [0, 1]:
                total_words = self.total_words_per_class[label]
                for word in words:
                    # Apply Laplace smoothing
                    word_count = self.word_counts[label][word] + self.alpha
                    total_count = total_words + self.alpha * vocab_size
                    likelihood = word_count / total_count
                    if likelihood > 0: # Avoid log(0)
                        scores[label] += math.log(likelihood)
            
            if scores[0] > scores[1]:
                predictions.append(0)
            else:
                predictions.append(1)
                
        return predictions


In [16]:
# Evaluation metrics
def calculate_metrics(y_true, y_pred):
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    
    for true, pred in zip(y_true, y_pred):
        if pred == 1 and true == 1:
            true_pos += 1
        elif pred == 0 and true == 0:
            true_neg += 1
        elif pred == 1 and true == 0:
            false_pos += 1
        elif pred == 0 and true == 1:
            false_neg += 1
            
    # Precision, Recall, F1 for each class
    precision_0 = true_neg / (true_neg + false_neg) if (true_neg + false_neg) > 0 else 0
    recall_0 = true_neg / (true_neg + false_pos) if (true_neg + false_pos) > 0 else 0
    f1_0 = 2 * (precision_0 * recall_0) / (precision_0 + recall_0) if (precision_0 + recall_0) > 0 else 0

    precision_1 = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
    recall_1 = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
    f1_1 = 2 * (precision_1 * recall_1) / (precision_1 + recall_1) if (precision_1 + recall_1) > 0 else 0
    
    return [[true_neg, false_pos], [false_neg, true_pos]], [precision_0, precision_1], [recall_0, recall_1], [f1_0, f1_1]

In [17]:
# Calculating Prior Probabilities for Each Class
class_counts = train_df['label'].value_counts()
total_reviews = len(train_df)

prior_positive = class_counts[1] / total_reviews
prior_negative = class_counts[0] / total_reviews

print(f"Total reviews in training set: {total_reviews}")
print(f"Number of positive reviews: {class_counts[1]}")
print(f"Number of negative reviews: {class_counts[0]}")
print(f"\nPrior Probability of Positive Class: {prior_positive:.4f}")
print(f"Prior Probability of Negative Class: {prior_negative:.4f}")

Total reviews in training set: 1000
Number of positive reviews: 494
Number of negative reviews: 506

Prior Probability of Positive Class: 0.4940
Prior Probability of Negative Class: 0.5060


In [18]:
# Calculating Likelihood Values of Each Word Given Each Class
# Example: calculate the likelihood of the word 'great'
train_df.loc[:, 'text_cleaned'] = train_df['text'].apply(preprocess_text_simple)

word_counts_by_class = defaultdict(lambda: defaultdict(int))
total_words_by_class = defaultdict(int)
vocabulary = set()

for index, row in train_df.iterrows():
    label = row['label']
    words = row['text_cleaned'].split()
    for word in words:
        word_counts_by_class[label][word] += 1
        total_words_by_class[label] += 1
        vocabulary.add(word)

word_to_check = 'great'
positive_word_count = word_counts_by_class[1][word_to_check]
negative_word_count = word_counts_by_class[0][word_to_check]
total_positive_words = total_words_by_class[1]
total_negative_words = total_words_by_class[0]

likelihood_positive = positive_word_count / total_positive_words
likelihood_negative = negative_word_count / total_negative_words

print(f"Likelihood of '{word_to_check}' in Positive Class: {likelihood_positive:.8f}")
print(f"Likelihood of '{word_to_check}' in Negative Class: {likelihood_negative:.8f}")

Likelihood of 'great' in Positive Class: 0.00206769
Likelihood of 'great' in Negative Class: 0.00088692


In [19]:
# Calculating the Posterior Probability using Logarithmic Representation
num_reviews = len(train_df)
log_prior_positive = math.log(train_df['label'].value_counts()[1] / num_reviews)
log_prior_negative = math.log(train_df['label'].value_counts()[0] / num_reviews)

# Example Review
example_review = "This movie was great and I loved it."
processed_review = preprocess_text_simple(example_review)

log_posterior_positive = log_prior_positive
log_posterior_negative = log_prior_negative

for word in processed_review.split():
    likelihood_positive = (word_counts_by_class[1][word] + 1) / (total_words_by_class[1] + len(vocabulary))
    likelihood_negative = (word_counts_by_class[0][word] + 1) / (total_words_by_class[0] + len(vocabulary))

    log_posterior_positive += math.log(likelihood_positive)
    log_posterior_negative += math.log(likelihood_negative)

print(f"Log Posterior Score for Positive Class: {log_posterior_positive:.4f}")
print(f"Log Posterior Score for Negative Class: {log_posterior_negative:.4f}")

predicted_class = 1 if log_posterior_positive > log_posterior_negative else 0
print(f"Predicted Class: {predicted_class}")

Log Posterior Score for Positive Class: -42.2946
Log Posterior Score for Negative Class: -43.3638
Predicted Class: 1


In [20]:
# using laplace smoothing
class DummyNaiveBayes:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.word_counts = defaultdict(lambda: defaultdict(int))
        self.total_words_per_class = defaultdict(int)
        self.vocabulary = {'great', 'movie', 'terrible', 'plot'} # Example vocabulary

    def calculate_likelihood(self, word, label):
        vocab_size = len(self.vocabulary)
        
        # Word and total counts with Laplace smoothing (alpha=1)
        word_count = self.word_counts[label][word] + self.alpha
        total_count = self.total_words_per_class[label] + self.alpha * vocab_size
        
        likelihood = word_count / total_count
        return likelihood

model = DummyNaiveBayes()
model.word_counts[1]['great'] = 10  # 'great' appears 10 times in the positive class
model.total_words_per_class[1] = 100 # Total words in the positive class

word_to_check = 'terrible'

# Likelihood without smoothing would be 0 for this unseen word
# But with Laplace smoothing, the likelihood is non-zero
likelihood_with_smoothing = model.calculate_likelihood(word_to_check, 1)

print(f"Likelihood of '{word_to_check}' in positive class with Laplace smoothing (alpha=1): {likelihood_with_smoothing:.4f}")

Likelihood of 'terrible' in positive class with Laplace smoothing (alpha=1): 0.0096


In [21]:
# Removing Unknown Words
training_vocabulary = set()
for text in train_df['text_cleaned']:
    for word in text.split():
        training_vocabulary.add(word)

test_review_with_unknown = "The movie was fantastic, absolutely stellar!"
processed_test_review = preprocess_text_simple(test_review_with_unknown)

unknown_words = [word for word in processed_test_review.split() if word not in training_vocabulary]

print(f"The number of unique words in the training vocabulary is: {len(training_vocabulary)}")
print(f"The word 'fantastic' is in the training vocabulary: {'fantastic' in training_vocabulary}")
print(f"The word 'stellar' is in the training vocabulary: {'stellar' in training_vocabulary}")
print(f"\nThe unknown words in the test review are: {unknown_words}")
print("\nDuring prediction, words not in the training vocabulary are simply ignored by the model.")

The number of unique words in the training vocabulary is: 17715
The word 'fantastic' is in the training vocabulary: True
The word 'stellar' is in the training vocabulary: True

The unknown words in the test review are: []

During prediction, words not in the training vocabulary are simply ignored by the model.


In [22]:
# Performing Sentiment Analysis on the Test Data
train_df.loc[:, 'text_cleaned'] = train_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=True))
test_df.loc[:, 'text_cleaned'] = test_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=True))

classifier = NaiveBayes(alpha=1.0)
classifier.fit(train_df['text_cleaned'], train_df['label'])

y_pred = classifier.predict(test_df['text_cleaned'])

print("First 10 Predicted Labels:")
print(y_pred[:10])

print("\nFirst 10 Actual Labels:")
print(test_df['label'].iloc[:10].tolist())

First 10 Predicted Labels:
[0, 0, 0, 0, 1, 0, 0, 1, 0, 0]

First 10 Actual Labels:
[0, 0, 0, 1, 1, 1, 0, 1, 1, 0]


In [23]:
# Calculating Confusion Matrix, Precision, Recall, and F1-Score
classifier = NaiveBayes(alpha=1.0)
classifier.fit(train_df['text_cleaned'], train_df['label'])

y_pred = classifier.predict(test_df['text_cleaned'])
y_test = test_df['label']

conf_matrix, precision, recall, f1 = calculate_metrics(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)

print("\nPrecision per Class:")
print(f"Negative (Class 0): {precision[0]:.4f}")
print(f"Positive (Class 1): {precision[1]:.4f}")

print("\nRecall per Class:")
print(f"Negative (Class 0): {recall[0]:.4f}")
print(f"Positive (Class 1): {recall[1]:.4f}")

print("\nF-1 Score per Class:")
print(f"Negative (Class 0): {f1[0]:.4f}")
print(f"Positive (Class 1): {f1[1]:.4f}")

Confusion Matrix:
[[89, 11], [39, 61]]

Precision per Class:
Negative (Class 0): 0.6953
Positive (Class 1): 0.8472

Recall per Class:
Negative (Class 0): 0.8900
Positive (Class 1): 0.6100

F-1 Score per Class:
Negative (Class 0): 0.7807
Positive (Class 1): 0.7093


In [24]:
# confusion matrix, Precision, Reacll, F1 score without preprocessing 
train_df.loc[:, 'text_cleaned'] = train_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=False, remove_stop_words=False, handle_logical_negation=False))
test_df.loc[:, 'text_cleaned'] = test_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=False, remove_stop_words=False, handle_logical_negation=False))

X_train = train_df['text_cleaned']
y_train = train_df['label']
X_test = test_df['text_cleaned']
y_test = test_df['label']

classifier = NaiveBayes(alpha=1.0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

conf_matrix, precision, recall, f1 = calculate_metrics(y_test, y_pred)

print("\n--- Scenario 1: No Preprocessing ---")
print("\nConfusion Matrix:")
print(conf_matrix)

print("\nMetrics per Class:")
print(f"Negative (Class 0) - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Positive (Class 1) - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")


--- Scenario 1: No Preprocessing ---

Confusion Matrix:
[[89, 11], [38, 62]]

Metrics per Class:
Negative (Class 0) - Precision: 0.7008, Recall: 0.8900, F1: 0.7841
Positive (Class 1) - Precision: 0.8493, Recall: 0.6200, F1: 0.7168


In [25]:
# confusion matrix, Precision, Reacll, F1 score with lemmatisation only
train_df.loc[:, 'text_cleaned'] = train_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=False, handle_logical_negation=False))
test_df.loc[:, 'text_cleaned'] = test_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=False, handle_logical_negation=False))

X_train = train_df['text_cleaned']
y_train = train_df['label']
X_test = test_df['text_cleaned']
y_test = test_df['label']

classifier = NaiveBayes(alpha=1.0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

conf_matrix, precision, recall, f1 = calculate_metrics(y_test, y_pred)

print("\n--- Scenario 2: With Lemmatization Only ---")
print("\nConfusion Matrix:")
print(conf_matrix)

print("\nMetrics per Class:")
print(f"Negative (Class 0) - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Positive (Class 1) - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")


--- Scenario 2: With Lemmatization Only ---

Confusion Matrix:
[[89, 11], [38, 62]]

Metrics per Class:
Negative (Class 0) - Precision: 0.7008, Recall: 0.8900, F1: 0.7841
Positive (Class 1) - Precision: 0.8493, Recall: 0.6200, F1: 0.7168


In [26]:
# confusion matrix, Precision, Reacll, F1 score with lemmatisation and removal of stop words 
train_df.loc[:, 'text_cleaned'] = train_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=False))
test_df.loc[:, 'text_cleaned'] = test_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=False))

# Extract data and labels
X_train = train_df['text_cleaned']
y_train = train_df['label']
X_test = test_df['text_cleaned']
y_test = test_df['label']

classifier = NaiveBayes(alpha=1.0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

conf_matrix, precision, recall, f1 = calculate_metrics(y_test, y_pred)

print("\n--- Scenario 3: With Lemmatization and Removal of Stop Words ---")
print("\nConfusion Matrix:")
print(conf_matrix)

print("\nMetrics per Class:")
print(f"Negative (Class 0) - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Positive (Class 1) - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")


--- Scenario 3: With Lemmatization and Removal of Stop Words ---

Confusion Matrix:
[[90, 10], [38, 62]]

Metrics per Class:
Negative (Class 0) - Precision: 0.7031, Recall: 0.9000, F1: 0.7895
Positive (Class 1) - Precision: 0.8611, Recall: 0.6200, F1: 0.7209


In [27]:
# confusion matrix, Precision, Reacll, F1 score with lemmatisation, removal of stop words and handling negation 
train_df.loc[:, 'text_cleaned'] = train_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=True))
test_df.loc[:, 'text_cleaned'] = test_df['text'].apply(
    lambda x: preprocess_text(x, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=True))

X_train = train_df['text_cleaned']
y_train = train_df['label']
X_test = test_df['text_cleaned']
y_test = test_df['label']

classifier = NaiveBayes(alpha=1.0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

conf_matrix, precision, recall, f1 = calculate_metrics(y_test, y_pred)

# Report results
print("\n--- Scenario 4: All Preprocessing ---")
print("\nConfusion Matrix:")
print(conf_matrix)

print("\nMetrics per Class:")
print(f"Negative (Class 0) - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Positive (Class 1) - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")


--- Scenario 4: All Preprocessing ---

Confusion Matrix:
[[89, 11], [39, 61]]

Metrics per Class:
Negative (Class 0) - Precision: 0.6953, Recall: 0.8900, F1: 0.7807
Positive (Class 1) - Precision: 0.8472, Recall: 0.6100, F1: 0.7093
