In [10]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
import re
import nltk

nltk.download('stopwords')
nltk.download('punkt')


import os
import warnings
import pickle
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
data=pd.read_csv('IMDB Dataset.csv')

In [12]:
print(data.shape)
data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [13]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [14]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [15]:
data.sentiment.replace('positive', 1, inplace=True)
data.sentiment.replace('negative', 0, inplace=True)

In [16]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


In [17]:
def preprocess_input(review):
    review = re.sub(re.compile(r'<.*?>'), '', review)
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    review = review.lower()
    sw = set(stopwords.words('english'))
    words = word_tokenize(review)
    review = [w for w in words if w not in sw]
    stm = SnowballStemmer('english')
    review = [stm.stem(w) for w in review]
    review = ' '.join(review)
    return review

data['review']=data['review'].apply(preprocess_input)

In [18]:
data.review[0]

'one review mention watch oz episod youll hook right exact happen meth first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use wordit call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti surreal couldnt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score

X_review = np.array(data.iloc[:,0].values)
y_sentiment = np.array(data.sentiment.values)
cv = CountVectorizer(max_features = 2000)
X_review = cv.fit_transform(data.review).toarray()

X_train, X_test, y_train, y_test = train_test_split(X_review, y_sentiment, test_size=0.2, random_state=42)

bern = BernoulliNB(alpha=1.0)
gaus = GaussianNB()
multi = MultinomialNB(alpha=1.0)
logReg = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

bern.fit(X_train, y_train)
gaus.fit(X_train, y_train)
multi.fit(X_train, y_train)
logReg.fit(X_train,y_train)

gaus_predict = gaus.predict(X_test)
multi_predict = multi.predict(X_test)
bern_predict = bern.predict(X_test)
logReg_predict = logReg.predict(X_test)

print("Bernoulli: ", accuracy_score(y_test, gaus_predict))
print("Gaussian: ", accuracy_score(y_test, multi_predict))
print("Multinomial: ", accuracy_score(y_test, bern_predict))
print("Logistic Regression :", accuracy_score(y_test, logReg_predict))

Bernoulli:  0.7457
Gaussian:  0.8387
Multinomial:  0.8409
Logistic Regression : 0.8699


In [None]:
user_review = input("Enter your movie review: ")

preprocessed_review = preprocess_input(user_review)

bow = cv.transform([preprocessed_review]).toarray()

prediction = logReg.predict(bow)[0]

# Print the result
if prediction == 1:
    print("The review is positive.")
else:
    print("The review is negative.")

The review is positive.


In [21]:
with open('logReg_model.pkl', 'wb') as file:
    pickle.dump(logReg, file)

with open('count_vectorizer.pkl', 'wb') as file:
    pickle.dump(cv, file)