### gloVe Word Embeddings and RNN (LSTM) Used to predict on Real Data

In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import pymongo
import spacy
from pymongo import MongoClient
import string
import re
import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import spatial
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
import random
import time
from tqdm import tqdm
tqdm.pandas()
import pickle
import matplotlib.pyplot as plt
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
%matplotlib inline


[nltk_data] Downloading package wordnet to C:\Users\Nitro
[nltk_data]     5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
lemmatizer = WordNetLemmatizer()

!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 762.6 kB/s eta 0:00:17
     - -------------------------------------- 0.3/12.8 MB 2.3 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 3.0 MB/s eta 0:00:05
     --- ------------------------------------ 1.1/12.8 MB 4.5 MB/s eta 0:00:03
     ---- ----------------------------------- 1.6/12.8 MB 5.5 MB/s eta 0:00:03
     ------ --------------------------------- 2.0/12.8 MB 6.4 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.8 MB 6.7 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 7.0 MB/s eta 0:00:02
     ---------- ----------------------------- 3


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Pre-processing Function

In [32]:
def text_preprocessing(review):
  
  #changing into lower case

  review_text = review.lower()

  # Removing HTML tags

  review_text = BeautifulSoup(review_text , 'html.parser').get_text()

  # Removing Punctuations

  review_text = ''.join(char for char in review_text if char not in string.punctuation)

  # Removing Whitespaces

  review_text = re.sub(r'\s+' , ' ' , review_text)
  
  # Performing Lemmatization i.e converting words into base words
  nlp = spacy.load('en_core_web_sm')
  doc = nlp(review_text)
  lemmatized_tokens = [token.lemma_ for token in doc]
  
  
  # Removing Stopwords i.e words that add little to no meaning to the review

  review_clean_text = [word for word in lemmatized_tokens if word not in stopwords.words('english') ]

  clean_sentence = ' '.join(char for char in review_clean_text)

  return clean_sentence

In [33]:
def vectorize_text(texts , tokenizer):


   X_test = tokenizer.texts_to_sequences(texts)

   maxlen = 600 

   X_test = pad_sequences(X_test , padding='post' , maxlen= maxlen)

   return X_test




In [34]:
# test_text = ['HI i am here' , 'the apple is red' , 'HI i am here']

In [35]:
# words_for_cloud =[]
# for review in test_text:
#  print(review)
#  words = review.split()
#  for word in words:
#   words_for_cloud.append(word)

Creating function for word-cloud

In [36]:
def word_cloud(df):

 words_for_cloud =[]

 for review in df['review']:
  words = review.split()
  for word in words:
   words_for_cloud.append(word)

 return words_for_cloud

Creating function to handle new dataset

In [37]:
def process_new_dataset(file_path , tokenizer , model):
 
 #Load The new Dataset

 new_data = pd.read_csv(file_path)

 #Preprocess the text 

 new_data['processed_text'] = new_data['review'].progress_apply(text_preprocessing)

 #Vectorize the Reviewss
 X_data = vectorize_text(new_data['processed_text'] , tokenizer)

 #Predict Sentiments
 predictions = model.predict(X_data)

 #Adding Predictions to original dataframe to check validity
 new_data['predictions'] = predictions

 return new_data


In [38]:
def evaluate_value(data):
 if data > 5:
  return "Positive"
 elif data < 5:
  return "Negative"
 else:
  return "Neutral"

In [39]:
def evaluate_results(df):

 df['predictions'] = df['predictions'].apply(lambda value : value * 10 ) # converting score out of 10

 df['sentiment prediction'] = df['predictions'].progress_apply(evaluate_value) # Converting into Textual prediction

 
 #Calculating Percentage for Prediction

 pos_count = 0 
 neg_count = 0  

 for sentiment in df['sentiment prediction']:
  if sentiment == "Positive":
   pos_count = pos_count + 1
  else:
   neg_count = neg_count + 1

 pos_percent = (pos_count /(pos_count + neg_count)) * 100
 neg_percent = (neg_count /(pos_count + neg_count)) * 100
 score = np.mean(df['predictions']) * 10

 print('Positive Sentiments Predicted:', pos_count )
 print('Negative Sentiments Predicted:', neg_count )
 print('Positive Sentiments Percent:', pos_percent.__round__(1) )
 print('Negative Sentiments Percent:', neg_percent.__round__(1) )
 print("The Sentiment Socre is:" , score)





In [40]:
# Initialize a dictionary to keep count of each emotion
emotion_counts = {
    "anger": 0,
    "fear": 0,
    "joy": 0,
    "neutral": 0,
    "sadness": 0,
    "surprise": 0
}

def evaluate_emotions(emotion):
 if emotion in emotion_counts:
  emotion_counts[emotion] += 1
 

def evaluate_predicted_emotions(df):
 
  # Reset emotion counts before evaluation
  global emotion_counts
  emotion_counts = {key: 0 for key in emotion_counts}

  #Calculating numbers for each emotion

  df['emotion'].progress_apply(evaluate_emotions)
  print('Anger:' , emotion_counts['anger'])
  print('fear:' , emotion_counts['fear'])
  print('joy:' , emotion_counts['joy'])
  print('neutral:' , emotion_counts['neutral'])
  print('sadness:' , emotion_counts['sadness'])
  print('surprise:' , emotion_counts['surprise'])
 


Loading our Trained LSTM Model 

In [41]:
with open ('cnn_glove_model' , 'rb') as file:
 reelfeel_model = pickle.load(file)

In [42]:
reelfeel_model.summary()

Loading tokenizer used while training the model 

In [43]:
with open('word_tokenizer.pkl' , 'rb') as handle:
 word_tokenizer = pickle.load(handle)

## Applying RoBerta Model


In [44]:
# Load pre-trained RoBERTa model and tokenizer
model_name = "j-hartmann/emotion-english-distilroberta-base"
roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)
roberta_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=7)  #7 classes for 7 emotions

Creating Function to apply roBerta Model


In [45]:
def predict_emotions(data):
    # Tokenization
    tokenized_inputs = roberta_tokenizer(data, padding=True, return_tensors='pt', truncation=True, max_length=512)
    
    # Model Interface
    with torch.no_grad():
        outputs = roberta_model(**tokenized_inputs)
    logits = outputs.logits
    predicted_probabilities = torch.softmax(logits, dim=1)

    # Post-processing
    emotion_labels = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
    predicted_emotion_index = predicted_probabilities.argmax().item()
    predicted_emotion = emotion_labels[predicted_emotion_index]

    return predicted_emotion


Example Hard Day's Night: Relatively Positive movie

In [46]:
file_path = r"C:\Users\Nitro 5\Desktop\DataSets\1M Dataset\2_reviews_per_movie_raw\A Hard Day's Night 1964.csv"
processed_data = process_new_dataset(file_path , word_tokenizer , reelfeel_model)

100%|██████████| 242/242 [01:53<00:00,  2.13it/s]


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


In [47]:
processed_data

Unnamed: 0,username,rating,helpful,total,date,title,review,processed_text,predictions
0,sryder@judson-il.edu,10,121,135,4 May 2004,"This film ""converted"" me\n",I was in my mid-thirties when the Beatles came...,I midthirtie beatle come america appear shea s...,0.619048
1,caspian1978,8,97,112,22 December 2003,The Age of Innocence\n,This is it. There has never been and never wil...,never never another band like beatle innocence...,0.999324
2,slokes,8,53,63,26 October 2004,Pipers At The Gates Of Dawn\n,What can you say about the film that started i...,say film start popular culture know take shape...,0.899519
3,Johnny Angel,9,52,63,2 December 1999,"""The Best British Musical/Comedy Film Of 1964...","""The first 60's film to ever waken my musical ...",first 60 film ever waken musical interest pop ...,0.990320
4,Pedro_H,8,74,92,8 April 2004,Solid gold record of the Liverpool miracles a...,The Beatles travel down from Liverpool to reco...,beatle travel liverpool record tv showif I mee...,0.983655
...,...,...,...,...,...,...,...,...,...
237,cookie_monster,Null,0,1,11 February 2002,Tasteless Comedy...\n,...but damn its funny.<br/><br/>Over the many ...,damn funnyover many year life 16 total I hear ...,0.936884
238,jon-285,Null,0,3,15 September 2001,"A dreadful film, but with wonderful songs\n","As a long-time fan of the Beatles, and yes, I ...",longtime fan beatle yes I around london start ...,0.094409
239,coverme6,Null,0,2,28 July 2001,A day in the life of the Beatles... through f...,The Fab Four have stormed the world with their...,fab four storm world hit song make even big im...,0.818262
240,gazzo-2,Null,0,10,19 January 2001,Dull.\n,"Dull, dated, slow, hmmmmm black and white, bas...",dull date slow hmmmmm black white basically lo...,0.016855


In [50]:
word_cloud(processed_data)

['I',
 'was',
 'in',
 'my',
 'mid-thirties',
 'when',
 'the',
 'Beatles',
 'came',
 'to',
 'America,',
 'and',
 'appeared',
 'at',
 'Shea',
 'Stadium',
 'and',
 '(famously)',
 'on',
 'the',
 'Ed',
 'Sullivan.',
 'I',
 'saw',
 'their',
 'success,',
 'with',
 'the',
 'screaming',
 'girls,',
 'as',
 'just',
 'another',
 'teen-age',
 'phenomenon.',
 'I',
 'must',
 'have',
 'read',
 'in',
 'some',
 'column',
 'that',
 'this',
 'film',
 'was',
 'interesting',
 'for',
 'its',
 'direction',
 'and',
 'photography.',
 'That',
 'was',
 'true.',
 'What',
 'I',
 'did',
 'not',
 'expect',
 'was',
 'that',
 'I',
 'would',
 'be',
 'caught',
 'up',
 'by',
 'the',
 'Beatles',
 'themselves,',
 'both',
 'as',
 'personalities',
 'and',
 'as',
 'musicians.',
 'Those',
 'who',
 'comment',
 'adversely',
 'on',
 'their',
 'lack',
 'of',
 'acting',
 'ability',
 'are',
 'way',
 'off',
 'base,',
 'because',
 'neither',
 'they',
 'nor',
 'the',
 'director',
 'were',
 'looking',
 'for',
 'dramatic',
 'skill;',
 'on

In [78]:
evaluate_results(processed_data)

100%|██████████| 242/242 [00:00<?, ?it/s]

Positive Sentiments Predicted: 214
Negative Sentiments Predicted: 28
Positive Sentiments Percent: 88.4
Negative Sentiments Percent: 11.6
The Sentiment Socre is: 81.55228747516747





In [79]:
processed_data

Unnamed: 0,username,rating,helpful,total,date,title,review,processed_text,predictions,sentiment prediction
0,sryder@judson-il.edu,10,121,135,4 May 2004,"This film ""converted"" me\n",I was in my mid-thirties when the Beatles came...,I midthirtie beatle come america appear shea s...,6.190479,Positive
1,caspian1978,8,97,112,22 December 2003,The Age of Innocence\n,This is it. There has never been and never wil...,never never another band like beatle innocence...,9.993237,Positive
2,slokes,8,53,63,26 October 2004,Pipers At The Gates Of Dawn\n,What can you say about the film that started i...,say film start popular culture know take shape...,8.995191,Positive
3,Johnny Angel,9,52,63,2 December 1999,"""The Best British Musical/Comedy Film Of 1964...","""The first 60's film to ever waken my musical ...",first 60 film ever waken musical interest pop ...,9.903204,Positive
4,Pedro_H,8,74,92,8 April 2004,Solid gold record of the Liverpool miracles a...,The Beatles travel down from Liverpool to reco...,beatle travel liverpool record tv showif I mee...,9.836551,Positive
...,...,...,...,...,...,...,...,...,...,...
237,cookie_monster,Null,0,1,11 February 2002,Tasteless Comedy...\n,...but damn its funny.<br/><br/>Over the many ...,damn funnyover many year life 16 total I hear ...,9.368840,Positive
238,jon-285,Null,0,3,15 September 2001,"A dreadful film, but with wonderful songs\n","As a long-time fan of the Beatles, and yes, I ...",longtime fan beatle yes I around london start ...,0.944093,Negative
239,coverme6,Null,0,2,28 July 2001,A day in the life of the Beatles... through f...,The Fab Four have stormed the world with their...,fab four storm world hit song make even big im...,8.182622,Positive
240,gazzo-2,Null,0,10,19 January 2001,Dull.\n,"Dull, dated, slow, hmmmmm black and white, bas...",dull date slow hmmmmm black white basically lo...,0.168549,Negative


In [80]:
predicted_emotions = processed_data['review'].progress_apply(predict_emotions)

100%|██████████| 242/242 [00:44<00:00,  5.48it/s]


In [81]:
predicted_df = pd.DataFrame(predicted_emotions.tolist() , columns=['emotion'] )

In [82]:
predicted_df

Unnamed: 0,emotion
0,neutral
1,neutral
2,neutral
3,joy
4,neutral
...,...
237,joy
238,disgust
239,joy
240,neutral


In [83]:
processed_data = pd.concat([processed_data , predicted_df ], axis = 1)

In [84]:
processed_data

Unnamed: 0,username,rating,helpful,total,date,title,review,processed_text,predictions,sentiment prediction,emotion
0,sryder@judson-il.edu,10,121,135,4 May 2004,"This film ""converted"" me\n",I was in my mid-thirties when the Beatles came...,I midthirtie beatle come america appear shea s...,6.190479,Positive,neutral
1,caspian1978,8,97,112,22 December 2003,The Age of Innocence\n,This is it. There has never been and never wil...,never never another band like beatle innocence...,9.993237,Positive,neutral
2,slokes,8,53,63,26 October 2004,Pipers At The Gates Of Dawn\n,What can you say about the film that started i...,say film start popular culture know take shape...,8.995191,Positive,neutral
3,Johnny Angel,9,52,63,2 December 1999,"""The Best British Musical/Comedy Film Of 1964...","""The first 60's film to ever waken my musical ...",first 60 film ever waken musical interest pop ...,9.903204,Positive,joy
4,Pedro_H,8,74,92,8 April 2004,Solid gold record of the Liverpool miracles a...,The Beatles travel down from Liverpool to reco...,beatle travel liverpool record tv showif I mee...,9.836551,Positive,neutral
...,...,...,...,...,...,...,...,...,...,...,...
237,cookie_monster,Null,0,1,11 February 2002,Tasteless Comedy...\n,...but damn its funny.<br/><br/>Over the many ...,damn funnyover many year life 16 total I hear ...,9.368840,Positive,joy
238,jon-285,Null,0,3,15 September 2001,"A dreadful film, but with wonderful songs\n","As a long-time fan of the Beatles, and yes, I ...",longtime fan beatle yes I around london start ...,0.944093,Negative,disgust
239,coverme6,Null,0,2,28 July 2001,A day in the life of the Beatles... through f...,The Fab Four have stormed the world with their...,fab four storm world hit song make even big im...,8.182622,Positive,joy
240,gazzo-2,Null,0,10,19 January 2001,Dull.\n,"Dull, dated, slow, hmmmmm black and white, bas...",dull date slow hmmmmm black white basically lo...,0.168549,Negative,neutral


In [97]:
evaluate_predicted_emotions(processed_data)

100%|██████████| 242/242 [00:00<00:00, 160935.72it/s]

Anger: 1
fear: 7
joy: 115
neutral: 63
sadness: 17
surprise: 23





Example : mostly negative reviews , Rating : 4.77

In [1]:
file_path = r"C:\Users\Nitro 5\Desktop\DataSets\1M Dataset\2_reviews_per_movie_raw\21 2008.csv"
processed_data = process_new_dataset(file_path , word_tokenizer , reelfeel_model)

NameError: name 'process_new_dataset' is not defined

In [None]:
processed_data

Unnamed: 0,username,rating,helpful,total,date,title,review,processed_text,predictions
0,TheRationalist,5,161,192,31 March 2008,Not The Movie It Could Have Been\n,"This movie was based on a true story, and if t...",movie base true story maker stick close true s...,0.322113
1,Smells_Like_Cheese,6,158,216,3 April 2008,"Nothing new, but it's worth the watch\n",21 is definitely the major film for the spring...,21 definitely major film spring time young hot...,0.635948
2,Rogue-32,2,92,125,1 April 2008,Doesn't even work as a fairytale\n,I was intrigued by the preview of 21 because I...,I intrigue preview 21 I sucker film gambling f...,0.017820
3,Lechuguilla,8,214,310,28 March 2008,"""Winner Winner Chicken Dinner""\n",Slick camera work and some good performances r...,slick camera work good performance rev technic...,0.875357
4,Cocacolaguy912-2,7,122,182,21 April 2008,Entertaining but very cliché.\n,21 is worth seeing on a restless Friday or Sat...,21 worth see restless friday saturday night fr...,0.107784
...,...,...,...,...,...,...,...,...,...
306,Ozzy2000,9,0,2,18 May 2008,Unreal but entertaining.\n,Having been to Las Vegas and also loving table...,las vegas also love table game I find film ent...,0.696992
307,jemps918,8,0,1,18 May 2008,Great soundtrack!\n,"The soundtrack is pretty darn good! And yes, t...",soundtrack pretty darn good yes movie entertai...,0.866279
308,mk35,Null,0,2,11 May 2008,finders keepers\n,I watch a myriad of movies and therefore as a ...,I watch myriad movie therefore consequence lot...,0.758137
309,ankurmisra,5,0,2,12 April 2008,Preteen paced movie.\n,"The movie was OK, but don't expect great actin...",movie ok expect great acting great dramait nev...,0.205822


In [None]:
evaluate_results(processed_data)

100%|██████████| 311/311 [00:00<?, ?it/s]

Positive Sentiments Predicted: 155
Negative Sentiments Predicted: 156
Positive Sentiments Percent: 49.8
Negative Sentiments Percent: 50.2
The Sentiment Socre is: 48.98046246235494





In [None]:
processed_data

Unnamed: 0,username,rating,helpful,total,date,title,review,processed_text,predictions,sentiment prediction
0,TheRationalist,5,161,192,31 March 2008,Not The Movie It Could Have Been\n,"This movie was based on a true story, and if t...",movie base true story maker stick close true s...,3.221132,Negative
1,Smells_Like_Cheese,6,158,216,3 April 2008,"Nothing new, but it's worth the watch\n",21 is definitely the major film for the spring...,21 definitely major film spring time young hot...,6.359476,Positive
2,Rogue-32,2,92,125,1 April 2008,Doesn't even work as a fairytale\n,I was intrigued by the preview of 21 because I...,I intrigue preview 21 I sucker film gambling f...,0.178202,Negative
3,Lechuguilla,8,214,310,28 March 2008,"""Winner Winner Chicken Dinner""\n",Slick camera work and some good performances r...,slick camera work good performance rev technic...,8.753568,Positive
4,Cocacolaguy912-2,7,122,182,21 April 2008,Entertaining but very cliché.\n,21 is worth seeing on a restless Friday or Sat...,21 worth see restless friday saturday night fr...,1.077842,Negative
...,...,...,...,...,...,...,...,...,...,...
306,Ozzy2000,9,0,2,18 May 2008,Unreal but entertaining.\n,Having been to Las Vegas and also loving table...,las vegas also love table game I find film ent...,6.969922,Positive
307,jemps918,8,0,1,18 May 2008,Great soundtrack!\n,"The soundtrack is pretty darn good! And yes, t...",soundtrack pretty darn good yes movie entertai...,8.662792,Positive
308,mk35,Null,0,2,11 May 2008,finders keepers\n,I watch a myriad of movies and therefore as a ...,I watch myriad movie therefore consequence lot...,7.581366,Positive
309,ankurmisra,5,0,2,12 April 2008,Preteen paced movie.\n,"The movie was OK, but don't expect great actin...",movie ok expect great acting great dramait nev...,2.058224,Negative
