In [13]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder


In [48]:
df = pd.read_csv('IMDB.csv',names = ['review','sentiment'])
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
11967,Sometimes I rest my head and think about the r...,negative
11968,A teenager who seems to have it all commits su...,negative
11969,Low-rent version of Ashley Judd's Double Jeopa...,negative
11970,"""Alexander Nevsky"" marked director Sergei Eise...",positive


In [49]:
df = df.sample(500)
df

Unnamed: 0,review,sentiment
2599,Charles Bronson has given the viewers lots of ...,negative
9757,"I am right now in front of the tv, watching Ca...",positive
6831,I actually found out about Favela Rising via t...,positive
5165,NBC had a chance to make a powerful religious ...,negative
9465,then you will be a big fan of this movie. Its ...,positive
...,...,...
1240,"**Could be considered some mild spoilers, but ...",negative
11416,Terrific film with a slightly slow start - giv...,positive
10449,I just found the IMDb and searched this film a...,positive
10354,"I don't know if I'm just weird, but I thorough...",positive


# Preprocessing
1. Label encoding
2. Text cleaning

In [12]:
#Label encoding

le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])
df

Unnamed: 0,review,sentiment
7659,I had long wanted to watch this romantic drama...,1
1683,The name (Frau) of the main character is the G...,0
3272,"""Nada"" was the most inadequate follow-up to ""L...",0
6313,"Having lived in Ontario my whole life, in the ...",1
4834,I found this movie to be very well-paced. The ...,1
...,...,...
11831,Fires on the plain directed by Kon Ichikawa an...,1
6121,I love this movie. My only disappointment was ...,1
2214,I am the guy who usually keeps opinions to him...,0
1085,"Winchester '73 is a great story, and that's wh...",1


# Model Building
1. Feature extraction using countvectorizer
2. Train Test Split 
3. Importing all the training methods
4. Training the models
5. Testing the models

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC

In [25]:
cv = CountVectorizer()
x = cv.fit_transform(df['review']).toarray()
y = df["sentiment"].values
print(x)
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[1 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 0 1 1 1 1
 0 1 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0
 1 1 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 1 0 0
 1 0 1 1 0 0 1 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 1 1 0
 0 1 1 1 1 0 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 1 1 0
 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 1 0 1 0 0 0 1 0 1 1 1 1 1 0 0
 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 0
 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 1 1 1 0 0 1 0 1 1
 1 1 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 1 1 1 1 1
 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1
 0 0 1 1 1 0 0 0 1 0 0 1 1 0 1 0 1 1 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0
 1 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 1 0 

#### Creating a train test split

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1,random_state=10)

### Training and testing modules

In [64]:
bnb = BernoulliNB()
mnb = MultinomialNB()
log = LogisticRegression()
svc = SVC()
sgd = SGDClassifier()

In [67]:
bnb.fit(x_train,y_train)
y_pred = bnb.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))

0.72
0.7727272727272727


In [70]:
mnb.fit(x_train,y_train)
y_pred = mnb.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))

0.72
0.75


In [71]:
log.fit(x_train,y_train)
y_pred = log.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))

0.78
0.7777777777777778


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [72]:
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))

0.56
0.6666666666666666


In [73]:
sgd.fit(x_train,y_train)
y_pred = sgd.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(precision_score(y_test,y_pred))

0.74
0.7241379310344828
