# **Sentiment Classification Using a Bigram Model**

In [388]:
import re
import string
from collections import Counter

import nltk
from nltk import bigrams
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

### **Import Libraries and Download Required NLTK Data**

In [389]:
# Fetch the NLTK data packages used by this notebook.
# Presumably: tokenizer models for word_tokenize — confirm against the NLTK version in use.
nltk.download('punkt')
# Stop-word lists read later through stopwords.words("english").
nltk.download('stopwords')
# Presumably the lexical data behind WordNetLemmatizer.
nltk.download('wordnet')
# Presumably the tagger model behind pos_tag. This call is the cell's last
# expression, so its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### **Read Text Data**

In [414]:
# Read the whole review file into a single string.
# The encoding is pinned so decoding does not depend on the platform's locale
# default (assumes the file is UTF-8 — adjust here if it was saved differently).
with open("Movie_Reviews.txt", "r", encoding="utf-8") as file:
    m_reviews = file.read()

### **Separate the Movie Reviews into Positive and Negative**

In [415]:
# Locate the two section headers that divide the file into positive and negative reviews
positive_reviews_begin = m_reviews.find("Positive Reviews")
negative_reviews_begin = m_reviews.find("Negative Reviews")

# str.find returns -1 when a header is absent; using -1 as a slice bound would
# silently yield wrong review text, so stop here with a clear error instead.
if positive_reviews_begin == -1 or negative_reviews_begin == -1:
    raise ValueError(
        'The review file must contain both a "Positive Reviews" and a "Negative Reviews" header'
    )

In [416]:
# Positive section: the text from its header up to (not including) the negative header
positive_reviews = m_reviews[positive_reviews_begin:negative_reviews_begin].strip()

# Position of the first newline followed by the "=" divider (-1 when there is none)
divider_index = positive_reviews.find("\n================")

# When the divider exists, keep only the text that comes after it
positive_reviews = (
    positive_reviews
    if divider_index == -1
    else positive_reviews[divider_index + len("\n================"):]
)

# One list entry per line of the remaining text
list_positive_reviews = positive_reviews.split('\n')


In [417]:
# Drop the empty strings that blank lines leave behind after the newline split
list_positive_reviews = list(filter(None, list_positive_reviews))

In [418]:
# Negative section: the text from its header to the end of the file
negative_reviews = m_reviews[negative_reviews_begin:].strip()

# Position of the first newline followed by the "=" divider (-1 when there is none)
divider_index = negative_reviews.find("\n================")

# When the divider exists, keep only the text that comes after it
negative_reviews = (
    negative_reviews
    if divider_index == -1
    else negative_reviews[divider_index + len("\n================"):]
)

# One list entry per non-empty line of the remaining text
list_negative_reviews = list(filter(None, negative_reviews.split('\n')))


In [419]:
# Drop any empty strings from the negative review list
list_negative_reviews = list(filter(None, list_negative_reviews))

In [420]:
# Show both parsed review lists, positive first, one list per output line
print(list_positive_reviews, list_negative_reviews, sep="\n")

['1. "Forrest Gump is an absolute masterpiece! Tom Hanks delivers an unforgettable performance, and the storytelling is heartwarming. This movie is a journey through life that will make you laugh, cry, and appreciate the simple beauties of existence."', '2. "The Shawshank Redemption is a timeless classic. The powerful themes of hope, friendship, and redemption make it a must-watch. Morgan Freeman and Tim Robbins give exceptional performances in this brilliantly crafted film."', '3. "The epic conclusion to The Lord of the Rings trilogy, The Return of the King, is a cinematic triumph. The breathtaking visuals, epic battles, and emotionally resonant story make it a monumental achievement in filmmaking."', '4. "La La Land is a love letter to the magic of Hollywood and dreams. The chemistry between Ryan Gosling and Emma Stone is enchanting, and the music and dance sequences are a pure delight. A modern musical masterpiece."', '5. "Wes Anderson\'s whimsical style shines in The Grand Budapest

# **Preprocessing**

### **Remove punctuation**

In [421]:
# Function to remove punctuation
def remove_punctuations(text):
    """Return ``text`` with every character that is neither a regex word
    character (``\\w``) nor whitespace (``\\s``) deleted.

    Letters, digits and underscores are kept; removed characters are not
    replaced by anything, so text on either side of them is joined together.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

# Strip punctuation from every review in each sentiment group
positive_reviews_after_remove__punct = list(map(remove_punctuations, list_positive_reviews))
negative_reviews_after_remove__punct = list(map(remove_punctuations, list_negative_reviews))

In [423]:
# Show the punctuation-free reviews, positive group first
for _heading, _reviews in (
    ("Positive Reviews:", positive_reviews_after_remove__punct),
    ("Negative Reviews:", negative_reviews_after_remove__punct),
):
    print(_heading, _reviews)

Positive Reviews: ['1 Forrest Gump is an absolute masterpiece Tom Hanks delivers an unforgettable performance and the storytelling is heartwarming This movie is a journey through life that will make you laugh cry and appreciate the simple beauties of existence', '2 The Shawshank Redemption is a timeless classic The powerful themes of hope friendship and redemption make it a mustwatch Morgan Freeman and Tim Robbins give exceptional performances in this brilliantly crafted film', '3 The epic conclusion to The Lord of the Rings trilogy The Return of the King is a cinematic triumph The breathtaking visuals epic battles and emotionally resonant story make it a monumental achievement in filmmaking', '4 La La Land is a love letter to the magic of Hollywood and dreams The chemistry between Ryan Gosling and Emma Stone is enchanting and the music and dance sequences are a pure delight A modern musical masterpiece', '5 Wes Andersons whimsical style shines in The Grand Budapest Hotel With its quir

### **Convert into Lowercase**

In [424]:
# Lower-case every review in each sentiment group
lower_positive_reviews = list(map(str.lower, positive_reviews_after_remove__punct))
lower_negative_reviews = list(map(str.lower, negative_reviews_after_remove__punct))

In [425]:
# Show the lower-cased reviews, positive group first
for _heading, _reviews in (
    ("Positive Reviews:", lower_positive_reviews),
    ("Negative Reviews:", lower_negative_reviews),
):
    print(_heading, _reviews)

Positive Reviews: ['1 forrest gump is an absolute masterpiece tom hanks delivers an unforgettable performance and the storytelling is heartwarming this movie is a journey through life that will make you laugh cry and appreciate the simple beauties of existence', '2 the shawshank redemption is a timeless classic the powerful themes of hope friendship and redemption make it a mustwatch morgan freeman and tim robbins give exceptional performances in this brilliantly crafted film', '3 the epic conclusion to the lord of the rings trilogy the return of the king is a cinematic triumph the breathtaking visuals epic battles and emotionally resonant story make it a monumental achievement in filmmaking', '4 la la land is a love letter to the magic of hollywood and dreams the chemistry between ryan gosling and emma stone is enchanting and the music and dance sequences are a pure delight a modern musical masterpiece', '5 wes andersons whimsical style shines in the grand budapest hotel with its quir

### **Word Tokenization**

In [426]:
# Function to tokenize words
def tokenize(sentences):
    """Tokenize every item of ``sentences`` and return all tokens in one flat list.

    Parameters
    ----------
    sentences : iterable of str
        Each item is passed to ``word_tokenize`` on its own. Note that a bare
        string is itself an iterable of characters, so pass ``[text]`` to
        tokenize a single piece of text.

    Returns
    -------
    list
        The tokens of all items, concatenated in input order.
    """
    return [token for sentence in sentences for token in word_tokenize(sentence)]

In [427]:
# Flatten the positive reviews into one token list and show it
positive_words = tokenize(lower_positive_reviews)
print("Positive Reviews:", positive_words)

# Same for the negative reviews
negative_words = tokenize(lower_negative_reviews)
print("Negative Reviews:", negative_words)

Positive Reviews: ['1', 'forrest', 'gump', 'is', 'an', 'absolute', 'masterpiece', 'tom', 'hanks', 'delivers', 'an', 'unforgettable', 'performance', 'and', 'the', 'storytelling', 'is', 'heartwarming', 'this', 'movie', 'is', 'a', 'journey', 'through', 'life', 'that', 'will', 'make', 'you', 'laugh', 'cry', 'and', 'appreciate', 'the', 'simple', 'beauties', 'of', 'existence', '2', 'the', 'shawshank', 'redemption', 'is', 'a', 'timeless', 'classic', 'the', 'powerful', 'themes', 'of', 'hope', 'friendship', 'and', 'redemption', 'make', 'it', 'a', 'mustwatch', 'morgan', 'freeman', 'and', 'tim', 'robbins', 'give', 'exceptional', 'performances', 'in', 'this', 'brilliantly', 'crafted', 'film', '3', 'the', 'epic', 'conclusion', 'to', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'the', 'return', 'of', 'the', 'king', 'is', 'a', 'cinematic', 'triumph', 'the', 'breathtaking', 'visuals', 'epic', 'battles', 'and', 'emotionally', 'resonant', 'story', 'make', 'it', 'a', 'monumental', 'achievement', 'in',

### **Lemmatization and Stop-Word Removal**

In [428]:
def get_wordnet_pos(tag):
    """Translate a POS tag into the single-letter POS code used for lemmatization.

    Only the first character of ``tag`` is inspected: ``N`` -> ``'n'``,
    ``V`` -> ``'v'``, ``R`` -> ``'r'``, ``J`` -> ``'a'``. Any other first
    character falls back to ``'n'`` (noun).
    """
    first_letter = tag[0]
    wordnet_by_first_letter = {"J": "a", "N": "n", "R": "r", "V": "v"}
    if first_letter in wordnet_by_first_letter:
        return wordnet_by_first_letter[first_letter]
    # Unknown tag family: treat the word as a noun
    return 'n'

def lemmatize_words(reviews):
    """Drop stop words and non-alphabetic tokens, then lemmatize what is left.

    Parameters
    ----------
    reviews : iterable of str
        Word tokens. The stop-word test is an exact, case-sensitive membership
        check, so tokens are presumably expected to be lower-cased already —
        verify against the caller.

    Returns
    -------
    list of str
        One lemma per surviving token, in the original order.
    """
    stop_words = set(stopwords.words("english"))

    # Keep purely alphabetic tokens that are not stop words (this also removes
    # numbers). The tokens are filtered directly: POS tags are only needed for
    # the lemmatization step below, so tagging the unfiltered list first would
    # be a wasted pass.
    reviews_filtered = [
        word for word in reviews
        if word not in stop_words and word.isalpha()
    ]

    # Tag the filtered tokens and lemmatize each one with the POS code that
    # get_wordnet_pos derives from its tag
    lemmatizer = WordNetLemmatizer()
    reviews_lemmatized = [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos))
        for word, pos in pos_tag(reviews_filtered)
    ]

    return reviews_lemmatized

# Clean the positive tokens (stop-word removal + lemmatization) and show the result
cleaned_positive_reviews = lemmatize_words(positive_words)
print("Positive Reviews (Preprocessed):", cleaned_positive_reviews)

# Same for the negative tokens
cleaned_negative_reviews = lemmatize_words(negative_words)
print("Negative Reviews (Preprocessed):", cleaned_negative_reviews)

Positive Reviews (Preprocessed): ['forrest', 'gump', 'absolute', 'masterpiece', 'tom', 'hank', 'delivers', 'unforgettable', 'performance', 'storytelling', 'heartwarming', 'movie', 'journey', 'life', 'make', 'laugh', 'cry', 'appreciate', 'simple', 'beauty', 'existence', 'shawshank', 'redemption', 'timeless', 'classic', 'powerful', 'theme', 'hope', 'friendship', 'redemption', 'make', 'mustwatch', 'morgan', 'freeman', 'tim', 'robbins', 'give', 'exceptional', 'performance', 'brilliantly', 'craft', 'film', 'epic', 'conclusion', 'lord', 'ring', 'trilogy', 'return', 'king', 'cinematic', 'triumph', 'breathtaking', 'visuals', 'epic', 'battle', 'emotionally', 'resonant', 'story', 'make', 'monumental', 'achievement', 'filmmaking', 'la', 'la', 'land', 'love', 'letter', 'magic', 'hollywood', 'dream', 'chemistry', 'ryan', 'gosling', 'emma', 'stone', 'enchant', 'music', 'dance', 'sequence', 'pure', 'delight', 'modern', 'musical', 'masterpiece', 'wes', 'anderson', 'whimsical', 'style', 'shine', 'grand

# **Implement Bigram Model**

In [454]:
# Materialize the bigram sequence of each cleaned token list
positive_bigrams = [*bigrams(cleaned_positive_reviews)]
negative_bigrams = [*bigrams(cleaned_negative_reviews)]

In [None]:
# Frequency of each distinct positive bigram.
# Counter tallies the list in a single pass and yields every distinct bigram
# once, in order of first appearance, so a repeated bigram is no longer
# printed once per occurrence.
for bigram, bigram_count in Counter(positive_bigrams).items():
    print(f"{bigram}: {bigram_count}")

In [None]:
# Frequency of each distinct negative bigram.
# Counter tallies the list in a single pass and yields every distinct bigram
# once, in order of first appearance, so a repeated bigram is no longer
# printed once per occurrence.
for bigram, bigram_count in Counter(negative_bigrams).items():
    print(f"{bigram}: {bigram_count}")

In [457]:
# Function to calculate bigram probabilities
def calculate_bigram_probabilities(bigrams,unigrams):
    """Estimate P(word2 | word1) for every distinct bigram.

    Each probability is ``count(word1, word2) / count(word1)``, where the
    bigram count comes from ``bigrams`` and the word count from ``unigrams``.

    Parameters
    ----------
    bigrams : iterable of 2-tuples
        The bigram sequence (shadows the imported ``bigrams`` function inside
        this function only).
    unigrams : iterable
        The token sequence the bigrams were built from.

    Returns
    -------
    dict
        Maps each distinct bigram to its probability, keyed in order of first
        appearance in ``bigrams``. Empty when ``bigrams`` is empty.

    Raises
    ------
    ZeroDivisionError
        If the first word of a bigram never occurs in ``unigrams``.
    """
    # Tally both sequences once instead of calling list.count for every bigram,
    # which rescans the full lists on each iteration.
    bigram_counts = Counter(bigrams)
    unigram_counts = Counter(unigrams)

    bigram_probabilities = {}
    for bigram, bigram_count in bigram_counts.items():
        first_word, _ = bigram
        bigram_probabilities[bigram] = bigram_count / unigram_counts[first_word]

    return bigram_probabilities


In [458]:
# Conditional bigram probabilities of the positive corpus
positive_bigram_probabilities = calculate_bigram_probabilities(positive_bigrams,cleaned_positive_reviews)
print("Positive Bigram Probabilities: \n",positive_bigram_probabilities)

# Conditional bigram probabilities of the negative corpus
negative_bigram_probabilities = calculate_bigram_probabilities(negative_bigrams,cleaned_negative_reviews)
print("Negative Bigram Probabilities: \n",negative_bigram_probabilities)

Positive Bigram Probabilities: 
 {('forrest', 'gump'): 1.0, ('gump', 'absolute'): 1.0, ('absolute', 'masterpiece'): 1.0, ('masterpiece', 'tom'): 0.3333333333333333, ('tom', 'hank'): 1.0, ('hank', 'delivers'): 1.0, ('delivers', 'unforgettable'): 1.0, ('unforgettable', 'performance'): 1.0, ('performance', 'storytelling'): 0.5, ('storytelling', 'heartwarming'): 1.0, ('heartwarming', 'movie'): 1.0, ('movie', 'journey'): 1.0, ('journey', 'life'): 0.5, ('life', 'make'): 1.0, ('make', 'laugh'): 0.2, ('laugh', 'cry'): 1.0, ('cry', 'appreciate'): 1.0, ('appreciate', 'simple'): 1.0, ('simple', 'beauty'): 1.0, ('beauty', 'existence'): 1.0, ('existence', 'shawshank'): 1.0, ('shawshank', 'redemption'): 1.0, ('redemption', 'timeless'): 0.5, ('timeless', 'classic'): 0.5, ('classic', 'powerful'): 0.5, ('powerful', 'theme'): 1.0, ('theme', 'hope'): 1.0, ('hope', 'friendship'): 1.0, ('friendship', 'redemption'): 1.0, ('redemption', 'make'): 0.5, ('make', 'mustwatch'): 0.2, ('mustwatch', 'morgan'): 1.0, 

In [459]:
# Sample review used to try out the classifier
test_review = (
    "It's clear that the movie has both its enthusiasts and critics. "
    "While it may not be to everyone's taste, it's worth watching with "
    "an open mind to form your own opinion."
)


In [460]:

def movie_review_classifier(test_review):
    """Classify a review as positive, negative or neutral using the bigram models.

    The review is preprocessed the same way as the training reviews
    (punctuation removal, lower-casing, tokenization, stop-word removal and
    lemmatization) and turned into bigrams. For each class, the probabilities
    of the review's bigrams found in that class's table
    (``positive_bigram_probabilities`` / ``negative_bigram_probabilities``)
    are multiplied together; the class with the larger product wins.

    Parameters
    ----------
    test_review : str
        Raw review text.

    Returns
    -------
    str
        A sentence stating the predicted class. Both class scores are also
        printed before returning.
    """
    # Normalize exactly like the training text: strip punctuation, then lower-case
    # (the stop-word filter and the bigram tables are built from lower-cased words).
    normalized_review = remove_punctuations(test_review).lower()

    # tokenize() iterates over its argument, so a bare string would be split
    # into single characters; wrap the review in a list to get word tokens.
    test_words = tokenize([normalized_review])
    test_words = lemmatize_words(test_words)

    # Generate bigrams for the test review
    test_bigrams = list(bigrams(test_words))

    positive_probability = 1.0
    negative_probability = 1.0

    # NOTE(review): bigrams unseen in a class are skipped rather than smoothed,
    # so a class that matches nothing keeps a score of 1.0. Consider add-one
    # smoothing if unseen bigrams should count against a class.
    for bigram in test_bigrams:
        if bigram in positive_bigram_probabilities:
            positive_probability *= positive_bigram_probabilities[bigram]
        if bigram in negative_bigram_probabilities:
            negative_probability *= negative_bigram_probabilities[bigram]

    # Predict the category of the test movie review
    if positive_probability > negative_probability:
        prediction_explanation = "The review is classified as positive."
    elif positive_probability < negative_probability:
        prediction_explanation = "The review is classified as negative."
    else:
        prediction_explanation = "The review is classified as neutral."

    print(f"Positive Probability: {positive_probability}")
    print(f"Negative Probability: {negative_probability}")
    return prediction_explanation

In [461]:
# Classify the sample review. The classifier prints both class scores itself
# and returns the verdict sentence, which is printed here.
output = movie_review_classifier(test_review)
print(output)

Positive Probability: 1.0
Negative Probability: 1.0
The review is classified as neutral.
