In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score , accuracy_score
import pickle

In [2]:
df = pd.read_csv("movie_reviews.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df['sentiment'] = df['sentiment'].map({'positive' : 1 , 'negative' : 0})

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df.shape

(50000, 2)

### Using the TFIDF Vectorizer

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Paras
[nltk_data]     Rana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
stopset = stopwords.words('english')

In [10]:
vectoriser = TfidfVectorizer(use_idf=True , lowercase=True , strip_accents='ascii' , stop_words=stopset)

In [11]:
X = vectoriser.fit_transform(df.review)
y = df.sentiment
pickle.dump(vectoriser , open('transform1.pkl' , 'wb'))

### Train-Test Split

In [12]:
X_train , X_test , y_train , y_test = train_test_split(X , y ,  test_size=0.33 , random_state=42)

In [13]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train , y_train)

MultinomialNB()

In [14]:
y_pred = clf.predict(X_test)
score = accuracy_score(y_pred , y_test)
print(score)

0.8644242424242424


In [15]:
clf = naive_bayes.MultinomialNB()
clf.fit(X, y)

MultinomialNB()

In [16]:
y_pred = clf.predict(X_test)
score = accuracy_score(y_pred , y_test)
print(score)

0.9024848484848484


In [17]:
filename = 'nlp_model2.pkl'
pickle.dump(clf , open(filename , 'wb'))