<a href="https://colab.research.google.com/github/Vsriram01/Sentiment-Analysis---Major-Project/blob/main/Major_Project_Movie_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the tab-separated movie-review dataset (columns: id, sentiment, review).
df = pd.read_table('/content/moviereviews.tsv')

# Basic structure of the frame.
print("Shape:",df.shape)  # (rows, columns)
print("Dimensions:",df.ndim)  # number of axes
print("Size:",df.size)  # total number of cells
print("Count of empty fields:\n",df.isna().sum())  # missing values per column
print("Summary of dataset:")
df.info()  # info() prints the summary itself and returns None, so wrapping it in print() adds a stray "None"
print("Top 5 reviews:\n",df.head())  # first 5 rows
print("Last 5 reviews:\n",df.tail())  # last 5 rows

Shape: (25000, 3)
Dimensions: 2
Size: 75000
Count of empty fields:
 id           0
sentiment    0
review       0
dtype: int64
Summary of dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB
None
Top 5 reviews:
        id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...
Last 5 reviews:
             id  sentiment    

In [None]:
# Class-balance check: both labels occur 12500 times, so the dataset is balanced.
df['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [None]:
import nltk
# Fetch NLTK's stop-word lists; they are read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
# English stop words are very frequent words that carry little sentiment, so
# they are dropped. The set is built once: it never changes between reviews,
# and set membership tests are O(1).
stop_words = set(stopwords.words('english'))

# Matches every character that is neither a word character (\w) nor
# whitespace (\s), i.e. punctuation and symbols; each match becomes a space.
non_word_pattern = re.compile(r'[^\w\s]')

corpus = []
for raw_review in df["review"]:  # every row, instead of a hard-coded count of 25000
  review = non_word_pattern.sub(" ", raw_review).lower()
  review = " ".join(word for word in review.split() if word not in stop_words)
  corpus.append(review)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features with the vocabulary capped at 1500 terms (max_features).
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()  # toarray() converts the result to a dense NumPy array
y = df["sentiment"]  # target labels (0 / 1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
# Train two Naive Bayes variants on the same training split.
# fit() returns the estimator itself, so model1/model2 refer to the same
# objects as GNB/MNB.
GNB = GaussianNB()
model1 = GNB.fit(x_train, y_train)

MNB = MultinomialNB()
model2 = MNB.fit(x_train, y_train)

In [None]:
# Accuracy of each classifier on the held-out test split.
gaussian_accuracy = GNB.score(x_test, y_test)
multinomial_accuracy = MNB.score(x_test, y_test)
print("Gaussian accuracy:", gaussian_accuracy)
print("Multinomial accuracy:", multinomial_accuracy)

Gaussian accuracy: 0.7954
Multinomial accuracy: 0.8484


In [None]:
y_pred = model2.predict(x_test)
# Show predicted (left column) and actual (right column) labels side by side,
# one row per test sample.
print(np.column_stack((y_pred, np.array(y_test))))

[[1 0]
 [0 1]
 [1 0]
 ...
 [1 1]
 [0 0]
 [1 1]]


In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
# Evaluate the MultinomialNB predictions against the true test labels.
CM = confusion_matrix(y_test, y_pred)
cl_report = classification_report(y_test, y_pred)
score = accuracy_score(y_test, y_pred)

print("Confusion matrix:\n", CM)
print("Classification Report:\n", cl_report)
print("Accuracy of MNB: ", score * 100)  # accuracy expressed as a percentage

Confusion matrix:
 [[2146  402]
 [ 356 2096]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85      2548
           1       0.84      0.85      0.85      2452

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

Accuracy of MNB:  84.84


In [None]:
#Save output as pickle file
import pickle

# Persist the fitted CountVectorizer and the trained MultinomialNB model.
# `with` closes each file handle (flushing the data to disk) even if pickling
# raises, instead of leaving the close to garbage collection.
with open('countvectorizer.pkl', 'wb') as cv_file:
  pickle.dump(cv, cv_file)
with open("MNBmodel.pkl", "wb") as model_file:
  pickle.dump(model2, model_file)

# Reload the model from disk and re-evaluate it to confirm the round trip.
with open("MNBmodel.pkl", "rb") as model_file:
  loaded_model = pickle.load(model_file)
y_pred_new = loaded_model.predict(x_test)
print("Accuracy score: ",loaded_model.score(x_test,y_test))

Accuracy score:  0.8484


In [None]:
# Predict output for new review
from nltk.corpus import stopwords
def new_review(new_review):
  """Predict the sentiment label of a single review.

  The text is cleaned the same way as the training corpus (punctuation
  replaced by spaces, lower-cased, English stop words removed), vectorized
  with the pickled CountVectorizer and classified by the pickled model.

  Args:
    new_review: Raw review text.

  Returns:
    Array holding one predicted label (the calling code treats 1 as
    positive and anything else as negative).
  """
  # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
  with open("countvectorizer.pkl","rb") as cv_file:
    cv = pickle.load(cv_file)
  with open("MNBmodel.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

  stop_words = set(stopwords.words('english'))  # built once, not once per word
  words = re.sub(r'[^\w\s]', ' ', new_review).lower().split()
  cleaned_review = ' '.join(word for word in words if word not in stop_words)

  new_X_test = cv.transform([cleaned_review]).toarray()
  return loaded_model.predict(new_X_test)
input_review = input('Enter new review: ')
# Keep the result in its own variable: assigning it to `new_review` would
# replace the function with an array and break any later call to it.
predicted_label = new_review(input_review)
if predicted_label[0]==1:
   print("Positive")
else :
   print("Negative")


Enter new review: Good
Positive


In [None]:
#Deployment
!pip install streamlit
!pip install pyngrok==4.1.1
from pyngrok import ngrok



In [None]:
%%writefile app.py
# Import modules
import pandas as pd 
import pickle
import streamlit as st
import re
from nltk.corpus import stopwords

# Display on webpage: title and two introductory paragraphs
st.title('Movie review')
for intro_line in (
    "This webpage uses the IMDB movie review dataset",
    "Using sentiment analysis, reviews will be classified under a specific category",
):
  st.markdown(intro_line)

# Sidebar: usage instructions, one markdown element per step
st.sidebar.title("Steps: ")
for step in (
    "1. Type your opinion",
    "2. Press enter",
    "3. Wait for the respective emoji to appear",
):
  st.sidebar.markdown(step)

# Load previously created models
# NOTE: pickle.load can execute arbitrary code; only load files produced by the training notebook.
# `with` closes each file handle as soon as the object has been read.
with open("MNBmodel.pkl","rb") as model_file:
  loaded_model = pickle.load(model_file)
with open("countvectorizer.pkl","rb") as cv_file:
  cv = pickle.load(cv_file)

# Predict output for new review
def new_review(new_review):
  """Predict the sentiment label of a single review.

  The text is cleaned (punctuation replaced by spaces, lower-cased, English
  stop words removed), vectorized with the module-level `cv` and classified
  by the module-level `loaded_model`.

  Args:
    new_review: Raw review text.

  Returns:
    Array holding one predicted label (the calling code shows a smile for 1
    and a worried face otherwise).
  """
  stop_words = set(stopwords.words('english'))  # built once, not once per word
  words = re.sub(r'[^\w\s]', ' ', new_review).lower().split()
  cleaned_review = ' '.join(word for word in words if word not in stop_words)

  new_x_test = cv.transform([cleaned_review]).toarray()
  return loaded_model.predict(new_x_test)

# Get user input
input_review = st.text_input('Enter new review:')
# The text box is empty until the user types something; skip the prediction
# then, so no emoji is shown for a review that was never entered.
if input_review.strip():
  # Keep the result in its own variable rather than overwriting the function name.
  predicted_label = new_review(input_review)
  if predicted_label[0]==1:
     st.title(":smile:")
  else :
     st.title(":worried:")

Overwriting app.py


In [None]:
# Start the Streamlit app in the background; nohup appends its output to nohup.out.
!nohup streamlit run app.py &
# Open an ngrok tunnel to port 8501 (presumably the port Streamlit serves on; confirm)
# and display the resulting public URL.
url = ngrok.connect(port='8501')
url

nohup: appending output to 'nohup.out'


'http://cbb050c6a68e.ngrok.io'