### Project : Sentiments classification on movies review dataset

In [1]:
import numpy as np
import re
import pickle
#import nltk
#from nltk.corpus import stopwords
from sklearn.datasets import load_files
import pandas as pd
#load_files:Load text files with categories as subfolder names.

### Load datasets

In [2]:
train = load_files('dataset/train',encoding='utf-8')

In [3]:
train.target_names

['neg', 'pos']

In [4]:
#train.target

In [5]:
len(train.data)

1000

In [6]:
#type(train.data)
train.data[0]

'I am a huge John Denver fan. I have a large collection of his music on vinyl. I saw this Christmas special when it was originally on TV and loved it. I have the original vinyl album and CD. I have the original CD and later release. The later release is missing several songs though. I see that it has been released this year with all original songs. To my surprise I found the original CD for sale at $75.00. WOW - to think that a Christmas Cd would be worth that much. To me no amount is worth selling this treasure. It is my favorite Christmas CD. I have never been able to find it on VHS or DVD. I would love to have either version. If anyone has one available please let me know. Thanks'

In [7]:
train.target[0]

1

In [8]:
np.bincount(train.target)

array([500, 500], dtype=int64)

In [9]:
X_train = train.data
y_train = train.target

In [10]:
len(X_train)

1000

In [11]:
test = load_files('dataset/test/',encoding='utf-8')

In [12]:
X_test = test.data
y_test = test.target

In [13]:
len(X_test)

200

### clean data

In [14]:
train.data[2]

'I almost called HBO and demanded my money back for the month just because they\'ve been airing this movie. I can just see the movie execs sitting around going, "Okay, we need to come up with something that\'s just like Home Alone, only we\'ll add a bunch of cash for the kid, hire cut-rate actors, and oh yeah, we\'ll make it a lot less funny!"<br /><br />Okay, maybe not the last part, but that\'s basically what you\'ve got here. Not even worth seeing if someone else rents it. And as a movie for kids? Forget it. I wouldn\'t let my kids see this, not necessarily because of bad-taste jokes, but because I wouldn\'t want them to say, "What were you thinking showing us that lame piece of garbage, Dad?!?!"'

In [15]:
def clean(x):
    #x=re.sub(r'\W',' ',x)
    #x = re.sub(r'[^a-zA-Z]',' ',x)
    x = re.sub("wouldn\'t",'would not',x)
    x = re.sub("they\ 've",'they have',x)
    
    #to remove html tags
    x = re.sub(r'<.*?>', '', x)
    
    #to remove everything except alpha
    x = re.sub(r'[^a-zA-Z]',' ',x)
    
      
    x = re.sub(r'\s+',' ',x)          #remove extra space's
    return x.lower()
    
#\W:matches any non-alphanumeric character; 
#this is equivalent to the set [^a-zA-Z0-9_].

In [16]:
clean('I wouldn\'t hello  ..!  a 123#hi john <html> ok</html>')

'i would not hello a hi john ok'

In [17]:
df = pd.DataFrame(X_train,columns=['review'])
df['target'] = y_train
df.head()

Unnamed: 0,review,target
0,I am a huge John Denver fan. I have a large co...,1
1,I just read the plot summary and it is the wor...,1
2,I almost called HBO and demanded my money back...,0
3,"Like his earlier film, ""In a Glass Cage"", Agus...",1
4,There are few films that leave me with the fee...,1


In [18]:
df['review']=df.review.apply(clean)

In [19]:
df.head()

Unnamed: 0,review,target
0,i am a huge john denver fan i have a large col...,1
1,i just read the plot summary and it is the wor...,1
2,i almost called hbo and demanded my money back...,0
3,like his earlier film in a glass cage agust vi...,1
4,there are few films that leave me with the fee...,1


### Convert text into numeric

In [20]:
from nltk.corpus import stopwords

In [21]:
words = stopwords.words('english')

In [22]:
if 'not' in words:
    words.remove('not')

In [23]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [24]:
#cv = CountVectorizer(min_df=10,max_df=.6,stop_words=words)
cv = CountVectorizer(stop_words=words)

#min_df=10:exclude any word that comes in 10 or less than 10 documents
#max_df=.6:excude any word that comes more than 60% of the documents,

In [25]:
X_new = cv.fit_transform(df.review.values).toarray()

In [26]:
#cv.get_feature_names()

In [27]:
X_new.shape

(1000, 15239)

In [28]:
cv.get_feature_names()

['aaaarrgh',
 'aachen',
 'aapke',
 'aaron',
 'abandon',
 'abandoned',
 'abandonment',
 'abandons',
 'abbas',
 'abby',
 'abc',
 'abducted',
 'abductor',
 'abe',
 'abiding',
 'abilities',
 'ability',
 'able',
 'ably',
 'abo',
 'aboard',
 'abominable',
 'abominably',
 'abomination',
 'aboriginal',
 'abortion',
 'abound',
 'aboutagirly',
 'abracadabrantesque',
 'abraham',
 'abroad',
 'abruptly',
 'absence',
 'absent',
 'absentminded',
 'absolute',
 'absolutely',
 'absorbed',
 'absorbing',
 'absorbs',
 'absorption',
 'abstain',
 'abstract',
 'abstraction',
 'absurd',
 'absurdism',
 'absurdity',
 'abu',
 'abundance',
 'abuse',
 'abused',
 'abuses',
 'abusive',
 'abysmal',
 'abysmally',
 'academic',
 'academy',
 'accelerated',
 'accent',
 'accents',
 'accentuates',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'accepts',
 'access',
 'accessible',
 'accident',
 'accidental',
 'accidentally',
 'acclaim',
 'acclaimed',
 'accompanied',
 'accompaniment',
 'accompany',
 'acco

In [29]:
### use tfidf vectorizor

In [30]:
#from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [31]:
nb = MultinomialNB()
nb.fit(X_new,y_train)

MultinomialNB()

In [32]:
nb.score(X_new,y_train)

0.983

### prediction on testing data

In [33]:
df_test = pd.DataFrame(X_test,columns=['review'])
df_test['target'] = y_test

In [34]:
df_test.head()

Unnamed: 0,review,target
0,"Formulaic slasher film, only this one stars th...",0
1,"Yes, I am a romantic of sorts who likes musica...",1
2,I went to an advance screening of this movie t...,1
3,Four things intrigued me as to this film - fir...,0
4,Our family (and the entire sold out sneak prev...,1


In [35]:
df_test.shape

(200, 2)

In [36]:
df_test['review'] = df_test.review.apply(clean)

In [37]:
X_test_new = cv.transform(df_test.review.values).toarray()

In [38]:
X_test_new.shape

(200, 15239)

In [39]:
nb.score(X_test_new,y_test)

0.785

In [40]:
test=["I do not like this movie",
      "I would not recommend this movie",
     "I hate this movie",
      "I love this movie"]

In [41]:
f=[]
for i in test:
    s=clean(i)
    f.append(s)

In [42]:
f

['i do not like this movie',
 'i would not recommend this movie',
 'i hate this movie',
 'i love this movie']

In [43]:
t=cv.transform(f).toarray()

In [44]:
t.shape

(4, 15239)

In [45]:
nb.predict(t)

array([0, 0, 0, 1])

### Lets work on 50000 samples

In [46]:
df = pd.read_csv('movie_reviews.csv')

In [47]:
df.head()

Unnamed: 0,review,sentiment
0,This is one of those unfortunate films that su...,1
1,Okay maybe it was because I happen to be in Ya...,1
2,"Although I love this movie, I can barely watch...",1
3,"A man arrives in a strange, beautiful, sterile...",1
4,I'm sitting around going through movie listing...,1


In [48]:
df.shape

(50000, 2)

In [49]:
df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [50]:
df['review'] = df.review.apply(clean)

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
x_train,x_test,y_train,y_test = train_test_split(df.review.values,
                                                 df.sentiment.values,
                                                test_size=10000,
                                                random_state=10)

In [53]:
x_train.shape

(40000,)

In [54]:
np.bincount(y_test)

array([4962, 5038], dtype=int64)

In [55]:
cv1 = CountVectorizer(stop_words=words)


In [56]:
#x_new=cv1.fit_transform(x_train).toarray()
#x_test_new = cv1.transform(x_test).toarray()

x_new=cv1.fit_transform(x_train)
x_test_new = cv1.transform(x_test)

In [57]:
x_new[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [58]:
x_new.shape

(40000, 92358)

In [59]:
x_test_new.shape

(10000, 92358)

In [60]:
nb = MultinomialNB()
nb.fit(x_new,y_train)

MultinomialNB()

In [61]:
nb.score(x_test_new,y_test)

0.862

In [62]:
f

['i do not like this movie',
 'i would not recommend this movie',
 'i hate this movie',
 'i love this movie']

In [63]:
t=cv1.transform(f)

In [64]:
t.shape

(4, 92358)

In [65]:
nb.predict(t)

array([0, 0, 0, 1], dtype=int64)

### save the model

In [66]:
with open('nb_model.pkl','wb') as f1:
    pickle.dump(nb,f1)

In [67]:
### save the vectorizer
with open('cv1.pkl','wb') as f1:
    pickle.dump(cv1,f1)

In [68]:
### load model

In [69]:
with open('nb_model.pkl','rb') as f1:
    clf=pickle.load(f1)

In [70]:
clf.predict(t)

array([0, 0, 0, 1], dtype=int64)