In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tabulate



In [3]:
import pandas as pd
import nltk
import re
from tabulate import tabulate
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.preprocessing import LabelEncoder
from nltk.util import ngrams
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


## **1. Creating the dataset**

In [4]:
with open('/content/drive/MyDrive/L4S1/CM 4340 | Natural Language Processing/Assignment 02/Movie_Reviews.txt', 'r') as file:
    lines = file.readlines()

* Removed lines with empty strings



In [5]:
filtered_lines = [line for line in lines if line != '\n']
filtered_lines

['Positive Reviews\n',
 '1. "Forrest Gump is an absolute masterpiece! Tom Hanks delivers an unforgettable performance, and the storytelling is heartwarming. This movie is a journey through life that will make you laugh, cry, and appreciate the simple beauties of existence."\n',
 '2. "The Shawshank Redemption is a timeless classic. The powerful themes of hope, friendship, and redemption make it a must-watch. Morgan Freeman and Tim Robbins give exceptional performances in this brilliantly crafted film."\n',
 '3. "The epic conclusion to The Lord of the Rings trilogy, The Return of the King, is a cinematic triumph. The breathtaking visuals, epic battles, and emotionally resonant story make it a monumental achievement in filmmaking."\n',
 '4. "La La Land is a love letter to the magic of Hollywood and dreams. The chemistry between Ryan Gosling and Emma Stone is enchanting, and the music and dance sequences are a pure delight. A modern musical masterpiece."\n',
 '5. "Wes Anderson\'s whimsical

* Finding the indexes of lines having "=" characters

In [6]:
indexes = [index for index, item in enumerate(filtered_lines) if "===" in item]
indexes

[1, 15, 28]

* Splitting lines into positive, negative and test review sets

In [7]:
positive_reviews = filtered_lines[2:14]
negative_reviews = filtered_lines[16:28]
test_reviews = filtered_lines[29:]

In [8]:
positive_reviews

['1. "Forrest Gump is an absolute masterpiece! Tom Hanks delivers an unforgettable performance, and the storytelling is heartwarming. This movie is a journey through life that will make you laugh, cry, and appreciate the simple beauties of existence."\n',
 '2. "The Shawshank Redemption is a timeless classic. The powerful themes of hope, friendship, and redemption make it a must-watch. Morgan Freeman and Tim Robbins give exceptional performances in this brilliantly crafted film."\n',
 '3. "The epic conclusion to The Lord of the Rings trilogy, The Return of the King, is a cinematic triumph. The breathtaking visuals, epic battles, and emotionally resonant story make it a monumental achievement in filmmaking."\n',
 '4. "La La Land is a love letter to the magic of Hollywood and dreams. The chemistry between Ryan Gosling and Emma Stone is enchanting, and the music and dance sequences are a pure delight. A modern musical masterpiece."\n',
 '5. "Wes Anderson\'s whimsical style shines in The Gr

In [9]:
negative_reviews

['1. "The Last Airbender is a disaster of a film adaptation. It butchers the beloved animated series with wooden acting, convoluted storytelling, and cringe-worthy special effects. A letdown for fans and newcomers alike."\n',
 '2. "Another Transformers movie, and it\'s just more of the same: mindless explosions, incoherent plotlines, and an overreliance on CGI. This franchise desperately needs an overhaul."\n',
 '3. "The Emoji Movie is a blatant cash grab with a shallow, uninspired plot. It fails to deliver clever humor or meaningful messages, making it a forgettable and disappointing animated film."\n',
 '4. "Fifty Shades of Grey is a cringe-inducing attempt at romance. Poorly written dialogue and unconvincing chemistry between the leads make it an awkward and unfulfilling cinematic experience."\n',
 '5. "Jack and Jill is an unbearable comedy that relies on stale humor and a painfully unfunny portrayal of Adam Sandler in a dual role. It\'s a prime example of lazy filmmaking."\n',
 '6.

In [10]:
test_reviews

["It's clear that the movie has both its enthusiasts and critics. While it may not be to everyone's taste, it's worth watching with an open mind to form your own opinion. \n"]

* Removing numbers and dots associated with numbers

In [11]:
postive_reviews_without_numbers = [re.sub(r'\d+\.', '', string) for string in positive_reviews]
negtive_reviews_without_numbers = [re.sub(r'\d+\.', '', string) for string in negative_reviews]

* Creating the dataframe

In [12]:
df_positive = pd.DataFrame({'reviews': postive_reviews_without_numbers, 'sentiments': 'positive'})
df_negative = pd.DataFrame({'reviews': negtive_reviews_without_numbers, 'sentiments': 'negative'})

In [13]:
df = pd.concat([df_positive, df_negative], ignore_index=True)

In [14]:
df.head()

Unnamed: 0,reviews,sentiments
0,"""Forrest Gump is an absolute masterpiece! Tom...",positive
1,"""The Shawshank Redemption is a timeless class...",positive
2,"""The epic conclusion to The Lord of the Rings...",positive
3,"""La La Land is a love letter to the magic of ...",positive
4,"""Wes Anderson's whimsical style shines in The...",positive


In [15]:

nltk.download('stopwords')
tokenizer=word_tokenize
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **2. Pre-processing Reviews**

In [16]:
# Removing special characters from reviews

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

df['reviews']=df['reviews'].apply(remove_special_characters)

In [17]:
# Lowercasing reviews

def lowercase_text(text):
  return text.lower()

df['reviews']=df['reviews'].apply(lowercase_text)

In [18]:
# Tokenizing reviews

def tokenize_reviews(text):
  tokens = tokens = tokenizer(text)
  tokens = [token.strip() for token in tokens]
  return tokens

df['tokens']=df['reviews'].apply(tokenize_reviews)

In [19]:
#removing the stopwords from tokens

stop=set(stopwords.words('english'))
print(stop)

def remove_stopwords(tokens):
  filtered_tokens = [token for token in tokens if token not in stopword_list]
  return filtered_tokens

df['tokens']=df['tokens'].apply(remove_stopwords)

{'who', 'me', 'of', 'shan', "that'll", 'that', 'all', 'doing', 'a', 'after', 'aren', 'it', 'but', 'because', 'most', 'does', 'am', 'same', 'mightn', 'them', 'be', 'itself', 'so', 'below', 'him', 'no', 'you', 'their', 'yours', 'is', 'at', 'here', 'was', 'don', 'during', 'your', 'do', 'doesn', 'has', 'these', 'before', "weren't", 'where', 's', 't', 'which', 'once', 'hadn', 'couldn', 'than', 'under', 'm', 'up', "you'll", 'whom', 'again', 'as', "doesn't", 'wasn', 'what', "you're", "hadn't", "hasn't", 'very', 'being', 'she', "don't", 'and', 'didn', 'are', "couldn't", 'why', 'down', 'above', 'only', 'just', "wouldn't", 'shouldn', 'or', 'between', 'some', 'themselves', 'from', 'each', 'myself', "isn't", 'few', 'can', 'other', 'did', "mustn't", 'had', "you've", 'our', 'when', 'such', "wasn't", "mightn't", 'i', 'his', 'y', 'hers', 'nor', 'haven', 'won', 'on', "didn't", 'mustn', 'my', 'should', 'he', 'with', 'herself', 'yourselves', "you'd", 'this', 'have', 'how', "aren't", 'o', 'off', 'through'

In [20]:
df['tokens'].head(10)

0    [forrest, gump, absolute, masterpiece, tom, ha...
1    [shawshank, redemption, timeless, classic, pow...
2    [epic, conclusion, lord, rings, trilogy, retur...
3    [la, la, land, love, letter, magic, hollywood,...
4    [wes, andersons, whimsical, style, shines, gra...
5    [inception, mindbending, brilliance, christoph...
6    [social, network, captivating, exploration, cr...
7    [smiths, portrayal, chris, gardner, pursuit, h...
8    [eternal, sunshine, spotless, mind, beautifull...
9    [princess, bride, timeless, fairy, tale, perfe...
Name: tokens, dtype: object

In [21]:
# Lemmatizing tokens to get them in to their base form

nltk.download('wordnet')

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(token, pos='n') for token in tokens]
    lemmatized_words = [lemmatizer.lemmatize(token, pos='v') for token in lemmatized_words]  # Also lemmatize verbs
    lemmatized_words = [lemmatizer.lemmatize(token, pos='a') for token in lemmatized_words]  # Also lemmatize adjectives
    lemmatized_words = [lemmatizer.lemmatize(token, pos='r') for token in lemmatized_words]  # Also lemmatize adverbs
    return lemmatized_words
#Apply function on review column
df['lem_tokens']=df['tokens'].apply(lemmatize_tokens)


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [22]:
df['lem_tokens'].iloc[2]

['epic',
 'conclusion',
 'lord',
 'ring',
 'trilogy',
 'return',
 'king',
 'cinematic',
 'triumph',
 'breathtaking',
 'visuals',
 'epic',
 'battle',
 'emotionally',
 'resonant',
 'story',
 'make',
 'monumental',
 'achievement',
 'filmmaking']

In [23]:
df['tokens'].iloc[2]

['epic',
 'conclusion',
 'lord',
 'rings',
 'trilogy',
 'return',
 'king',
 'cinematic',
 'triumph',
 'breathtaking',
 'visuals',
 'epic',
 'battles',
 'emotionally',
 'resonant',
 'story',
 'make',
 'monumental',
 'achievement',
 'filmmaking']

In [24]:
# encoding sentiment class labels usin labelEncoder

le = LabelEncoder()

df['sentiments'] = le.fit_transform(df['sentiments'])
print(le.classes_)


['negative' 'positive']


In [25]:
print("Positive label is encoded as : ",le.transform(['positive'])[0])
print("Negative label is encoded as : ",le.transform(['negative'])[0])

Positive label is encoded as :  1
Negative label is encoded as :  0


In [26]:
# Concatinatin the lemmatized tokens to get the lemmatized review text

def lemmatized_text(tokens):
  lem_text = " ".join(tokens)
  return lem_text

df['lem_text'] = df['lem_tokens'].apply(lemmatized_text)

In [27]:
df.head()

Unnamed: 0,reviews,sentiments,tokens,lem_tokens,lem_text
0,forrest gump is an absolute masterpiece tom h...,1,"[forrest, gump, absolute, masterpiece, tom, ha...","[forrest, gump, absolute, masterpiece, tom, ha...",forrest gump absolute masterpiece tom hank del...
1,the shawshank redemption is a timeless classi...,1,"[shawshank, redemption, timeless, classic, pow...","[shawshank, redemption, timeless, classic, pow...",shawshank redemption timeless classic powerful...
2,the epic conclusion to the lord of the rings ...,1,"[epic, conclusion, lord, rings, trilogy, retur...","[epic, conclusion, lord, ring, trilogy, return...",epic conclusion lord ring trilogy return king ...
3,la la land is a love letter to the magic of h...,1,"[la, la, land, love, letter, magic, hollywood,...","[la, la, land, love, letter, magic, hollywood,...",la la land love letter magic hollywood dream c...
4,wes andersons whimsical style shines in the g...,1,"[wes, andersons, whimsical, style, shines, gra...","[wes, anderson, whimsical, style, shine, grand...",wes anderson whimsical style shine grand budap...


## **3. Calculating N-gram Probabilities**

In [28]:
# generate bigrams

def generate_bigrams(tokens):
    return list(ngrams(tokens, 2))

df['bigrams'] = df['tokens'].apply(generate_bigrams)

In [29]:
df['bigrams']

0     [(forrest, gump), (gump, absolute), (absolute,...
1     [(shawshank, redemption), (redemption, timeles...
2     [(epic, conclusion), (conclusion, lord), (lord...
3     [(la, la), (la, land), (land, love), (love, le...
4     [(wes, andersons), (andersons, whimsical), (wh...
5     [(inception, mindbending), (mindbending, brill...
6     [(social, network), (network, captivating), (c...
7     [(smiths, portrayal), (portrayal, chris), (chr...
8     [(eternal, sunshine), (sunshine, spotless), (s...
9     [(princess, bride), (bride, timeless), (timele...
10    [(fifty, shades), (shades, grey), (grey, manag...
11    [(film, conversation), (conversation, starter)...
12    [(last, airbender), (airbender, disaster), (di...
13    [(another, transformers), (transformers, movie...
14    [(emoji, movie), (movie, blatant), (blatant, c...
15    [(fifty, shades), (shades, grey), (grey, cring...
16    [(jack, jill), (jill, unbearable), (unbearable...
17    [(superman, iv), (iv, colossal), (colossal

In [30]:
# getting all unique bigrams

df_exploded = df['bigrams'].explode()
unique_bigrams = df_exploded.unique().tolist()

In [31]:
print("Unique bigrams : ",len(unique_bigrams))

Unique bigrams :  473


In [32]:
# calculating bigram probabilities

def calculate_bigram_probabilities(bigram):
  phrase = " ".join(map(str, bigram))
  positive_bigram_count = sum(phrase in sentence.lower() for sentence in postive_reviews_without_numbers)
  postive_prior_word_count = sum(bigram[0] in sentence.lower() for sentence in postive_reviews_without_numbers)
  if(postive_prior_word_count == 0):
      postive_bigram_probability = 0
  else:
      postive_bigram_probability = positive_bigram_count/postive_prior_word_count

  positive_bigram_probabilities[bigram] = postive_bigram_probability

  negative_bigram_count = sum(phrase in sentence.lower() for sentence in negtive_reviews_without_numbers)
  negative_prior_word_count = sum(bigram[0] in sentence.lower() for sentence in negtive_reviews_without_numbers)
  if(negative_prior_word_count == 0):
      negative_bigram_probability = 0
  else:
      negative_bigram_probability = negative_bigram_count/negative_prior_word_count

  negative_bigram_probabilities[bigram] = negative_bigram_probability

In [33]:
# adding positive and negative bigram probability values for seperate dictionries

positive_bigram_probabilities = {}
negative_bigram_probabilities = {}

for bigram in unique_bigrams:
  calculate_bigram_probabilities(bigram)

In [34]:
# positive bigram probabilities
table_data_positive = [(key, value) for key, value in positive_bigram_probabilities.items()]
print(tabulate(table_data_positive, headers=["Key", "Value"], tablefmt="fancy_grid"))

╒═══════════════════════════════════╤══════════╕
│ Key                               │    Value │
╞═══════════════════════════════════╪══════════╡
│ ('forrest', 'gump')               │ 1        │
├───────────────────────────────────┼──────────┤
│ ('gump', 'absolute')              │ 0        │
├───────────────────────────────────┼──────────┤
│ ('absolute', 'masterpiece')       │ 1        │
├───────────────────────────────────┼──────────┤
│ ('masterpiece', 'tom')            │ 0        │
├───────────────────────────────────┼──────────┤
│ ('tom', 'hanks')                  │ 1        │
├───────────────────────────────────┼──────────┤
│ ('hanks', 'delivers')             │ 1        │
├───────────────────────────────────┼──────────┤
│ ('delivers', 'unforgettable')     │ 0        │
├───────────────────────────────────┼──────────┤
│ ('unforgettable', 'performance')  │ 1        │
├───────────────────────────────────┼──────────┤
│ ('performance', 'storytelling')   │ 0        │
├───────────────────

In [35]:
# negative bigram probabilities

table_data_negative = [(key, value) for key, value in negative_bigram_probabilities.items()]
print(tabulate(table_data_negative, headers=["Key", "Value"], tablefmt="fancy_grid"))

╒═══════════════════════════════════╤══════════╕
│ Key                               │    Value │
╞═══════════════════════════════════╪══════════╡
│ ('forrest', 'gump')               │ 0        │
├───────────────────────────────────┼──────────┤
│ ('gump', 'absolute')              │ 0        │
├───────────────────────────────────┼──────────┤
│ ('absolute', 'masterpiece')       │ 0        │
├───────────────────────────────────┼──────────┤
│ ('masterpiece', 'tom')            │ 0        │
├───────────────────────────────────┼──────────┤
│ ('tom', 'hanks')                  │ 0        │
├───────────────────────────────────┼──────────┤
│ ('hanks', 'delivers')             │ 0        │
├───────────────────────────────────┼──────────┤
│ ('delivers', 'unforgettable')     │ 0        │
├───────────────────────────────────┼──────────┤
│ ('unforgettable', 'performance')  │ 0        │
├───────────────────────────────────┼──────────┤
│ ('performance', 'storytelling')   │ 0        │
├───────────────────

In [36]:
# pre-processing test review

test_review = "It's clear that the movie has both its enthusiasts and critics. While it may not be to everyone's taste, it's worth watching with an open mind to form your own opinion. "

test_review = remove_special_characters(test_review)
test_review = lowercase_text(test_review)
tokenized_test_review = tokenize_reviews(test_review)
tokenized_test_review = remove_stopwords(tokenized_test_review)
tokenized_test_review = lemmatize_tokens(tokenized_test_review)

tokenized_test_review

['clear',
 'movie',
 'enthusiast',
 'critic',
 'may',
 'everyones',
 'taste',
 'worth',
 'watch',
 'open',
 'mind',
 'form',
 'opinion']

In [37]:
test_bigrams = generate_bigrams(tokenized_test_review)
test_bigrams

[('clear', 'movie'),
 ('movie', 'enthusiast'),
 ('enthusiast', 'critic'),
 ('critic', 'may'),
 ('may', 'everyones'),
 ('everyones', 'taste'),
 ('taste', 'worth'),
 ('worth', 'watch'),
 ('watch', 'open'),
 ('open', 'mind'),
 ('mind', 'form'),
 ('form', 'opinion')]

* Since there are unseen bigrams, I use "Add-One (Laplace) Smoothing here to get the probability of the test review with regards to each class.



In [38]:
def get_smoothed_probabilities_positive(bigram):
  phrase = " ".join(map(str, bigram))
  bigram_count = sum(phrase in sentence.lower() for sentence in postive_reviews_without_numbers) + 1
  prior_word_count = sum(bigram[0] in sentence.lower() for sentence in postive_reviews_without_numbers) + 465
  probability = bigram_count/prior_word_count
  return probability;

In [39]:
exploded_df = df['tokens'].explode().reset_index(drop=True)
unique_words = exploded_df.unique()
V = len(unique_words)
V


394

In [40]:
def get_smoothed_probabilities_negative(bigram):
  phrase = " ".join(map(str, bigram))
  bigram_count = sum(phrase in sentence.lower() for sentence in negtive_reviews_without_numbers) + 1
  prior_word_count = sum(bigram[0] in sentence.lower() for sentence in negtive_reviews_without_numbers) + V
  probability = bigram_count/prior_word_count
  return probability;

In [41]:
positive_probability = 1
for bigram in test_bigrams:
  positive_probability = positive_probability * get_smoothed_probabilities_positive(bigram)

print("Probability for the test review to be positive : ",positive_probability)

Probability for the test review to be positive :  1.9031805771933046e-32


In [42]:
negative_probability = 1
for bigram in test_bigrams:
  negative_probability = negative_probability * get_smoothed_probabilities_negative(bigram)

print("Probability for the test review to be negative : ", negative_probability)

Probability for the test review to be negative :  1.3759011137590818e-31



> test(positive) probability < test(negative) probability <br>
**Therefore as the conclusion we can say the sentiment of the test review is Negative**



## **4. Sentiment Analysis Using in-built Naive Bayes Model and Functions**

In [43]:
# Splitting data in to train set and test set

X_train, X_test, y_train, y_test = train_test_split(df['lem_text'], df['sentiments'], test_size=0.3, random_state=42)


In [44]:
X_train.head()

21    gigli train wreck romantic comedy pair ben aff...
5     inception mindbending brilliance christopher n...
2     epic conclusion lord ring trilogy return king ...
12    last airbender disaster film adaptation butch ...
15    fifty shade grey cringeinducing attempt romanc...
Name: lem_text, dtype: object

In [45]:
# transforming train and test set to Bag of Words using CounterVectorizer

cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
cv_train_reviews=cv.fit_transform(X_train)
cv_test_reviews=cv.transform(X_test)

print('BOW_train :',cv_train_reviews.shape)
print('BOW_test :',cv_test_reviews.shape)


BOW_train : (16, 841)
BOW_test : (8, 841)


In [46]:
X_test.head()

8     eternal sunshine spotless mind beautifully unc...
16    jack jill unbearable comedy rely stale humor p...
0     forrest gump absolute masterpiece tom hank del...
18    cat hat chaotic misguide adaptation dr seusss ...
11    film conversation starter clear wont everyones...
Name: lem_text, dtype: object

In [47]:
# Fitting train data to Naive Bayes Model

mnb=MultinomialNB()
nb_model=mnb.fit(cv_train_reviews,y_train)


In [48]:
# Predicting the sentiments for test set using countervectorized features
nb_model_predict = mnb.predict(cv_test_reviews)
print(nb_model_predict)


[1 0 0 0 0 1 0 0]


In [49]:
# Accuracy of the Naive Bayes Model

nb_model_score=accuracy_score(y_test,nb_model_predict)
print("nb_model_score :",nb_model_score)

nb_model_score : 0.625


In [50]:
# Classification report for Naive Bayes Model

nb_model_report=classification_report(y_test,nb_model_predict,target_names=['Positive','Negative'])
print(nb_model_report)

              precision    recall  f1-score   support

    Positive       0.50      1.00      0.67         3
    Negative       1.00      0.40      0.57         5

    accuracy                           0.62         8
   macro avg       0.75      0.70      0.62         8
weighted avg       0.81      0.62      0.61         8



### **4.1. Prediction**

In [51]:
test_review = "It's clear that the movie has both its enthusiasts and critics. While it may not be to everyone's taste, it's worth watching with an open mind to form your own opinion. "

test_review = remove_special_characters(test_review)
test_review = lowercase_text(test_review)
tokenized_test_review = tokenize_reviews(test_review)
tokenized_test_review = remove_stopwords(tokenized_test_review)
tokenized_test_review = lemmatize_tokens(tokenized_test_review)

lemmatized_test_review = lemmatized_text(tokenized_test_review)
lemmatized_test_review

'clear movie enthusiast critic may everyones taste worth watch open mind form opinion'

In [52]:
text = [lemmatized_test_review]

In [53]:
# test_review_vectorized = mnb.fit([' '.join(test_review)])
x = cv.transform(text)


In [54]:
sentiment_prediction = nb_model.predict(x)


In [55]:
if sentiment_prediction[0]:
  print("Predicted Sentiment: Positive")
else:
  print("Predicted Sentiment: Negative")



Predicted Sentiment: Negative




---

