In [23]:
! python --version

Python 3.7.0


In [24]:
! venv\Scripts\activate.bat

In [25]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split as tts
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import text

In [26]:
data = pd.read_csv("IMDB Dataset.csv")

In [27]:
# top values of the data-set
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [28]:
# count of unique values in the column
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

**Data Pre-Processing**

In [29]:
def clean_text1(text):
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    return text

cleaned1=lambda x:clean_text1(x)

In [30]:
data['review']=pd.DataFrame(data.review.apply(cleaned1))

In [31]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [32]:
# second round of cleaning
def clean_text2(text):
    text=re.sub('[''"",,,]','',text)
    text=re.sub('\n','',text)
    return text

cleaned2=lambda x:clean_text2(x)

In [33]:
data['review']=pd.DataFrame(data.review.apply(cleaned2))
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [34]:
x = data.iloc[0:,0].values
y = data.iloc[0:,1].values
xtrain,xtest,ytrain,ytest = tts(x,y,test_size = 0.25,random_state = 225)

In [35]:
all_stopwords = list(text.ENGLISH_STOP_WORDS)
print(type(all_stopwords))
all_stopwords.remove('not')
print(all_stopwords)

<class 'list'>
['now', 'etc', 'full', 'during', 'please', 'un', 'hereby', 'most', 'yourselves', 'former', 'indeed', 'me', 'same', 'whoever', 'however', 'else', 'before', 'while', 'have', 'three', 'enough', 'by', 'these', 'been', 'must', 'thence', 'none', 'meanwhile', 'a', 'ever', 'take', 'even', 'mostly', 'why', 'system', 'along', 'next', 'everything', 'thru', 'forty', 'formerly', 'whither', 'anyone', 'himself', 'one', 'throughout', 'after', 'was', 'few', 'below', 'her', 'if', 'many', 'already', 'put', 'every', 'twelve', 'that', 'cant', 'well', 'become', 'yours', 'further', 'elsewhere', 'inc', 'except', 'de', 'move', 'bill', 'which', 'since', 'with', 'together', 'perhaps', 'never', 'nor', 'whom', 'seemed', 'thereupon', 'only', 'least', 'rather', 'cannot', 'at', 'among', 'as', 'hasnt', 'around', 'for', 'somewhere', 'those', 'made', 'is', 'nevertheless', 'something', 'him', 'onto', 'this', 'noone', 'six', 'his', 'name', 'each', 'latter', 'always', 'alone', 'against', 'first', 'whereas', 

In [36]:
tf = TfidfVectorizer(stop_words = all_stopwords)
from sklearn.pipeline import Pipeline

**Logistic Regression**

In [37]:

classifier=LogisticRegression()
model=Pipeline([('vectorizer',tf),('classifier',classifier)])

model.fit(xtrain,ytrain)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['now', 'etc', 'full', 'during',
                                             'please', 'un', 'hereby', 'most',
                                             'yourselves', 'former', 'indeed',
                                             'me', 'same', 'whoever', 'however',
                                             'else', 'before', 'while', 'have',
                                             'three', 'enough', 'by', 'these',
                                             'been', 'must', 'thence', 'none',
                                             'meanwhile', 'a', 'ever', ...])),
                ('classifier', LogisticRegression())])

In [38]:
ypred=model.predict(xtest)
ypred

array(['positive', 'positive', 'negative', ..., 'positive', 'negative',
       'positive'], dtype=object)

In [39]:
# model score
accuracy_score(ypred,ytest)

0.89048

In [40]:
predict_this = ["This is really bad", "I fell asleep for the SECOND time watching dune. Not because it’s a bad movie but I just keep watching it when I’m"]
model.predict(predict_this)

array(['positive', 'negative', 'negative'], dtype=object)

In [41]:
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))