In [68]:
# importing required library
import numpy as np
import pandas as pd
import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report

In [69]:
# Load spacy Model
nlp = spacy.load('en_core_web_sm')

In [70]:
#reading the file
pos_rev = pd.read_csv('pos.txt',sep = '\n',encoding ='latin-1', header=None)
pos_rev['mood'] = 1
pos_rev.rename(columns={0:'review'}, inplace = True)
pos_rev.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [71]:
#reading the file
neg_rev = pd.read_csv('negative.txt',sep = '\n',encoding ='latin-1', header=None)
neg_rev['mood'] = 0
neg_rev.rename(columns={0:'review'}, inplace = True)
neg_rev.head()

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [72]:
# preprocessing steps
# 1. lower
# 2. remove punctuation
# 3. remove stopwords
# 4. lemmatization

In [73]:
# converting lower case
pos_rev['review'] = pos_rev['review'].apply(lambda x : x.lower())
pos_rev.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [74]:
# remove punctuation
pos_rev['review'] = pos_rev['review'].apply(lambda x : ' '.join([ token.text for token in nlp(x) if not token.is_punct]))
pos_rev.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century 's...,1
1,the gorgeously elaborate continuation of the l...,1
2,effective but too tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,emerges as something rare an issue movie that ...,1


In [75]:
# remove stop words and lemmatize
pos_rev['review'] = pos_rev['review'].apply(lambda x : ' '.join([token.lemma_ for token in nlp(x) if not token.is_stop]))
pos_rev.head()

Unnamed: 0,review,mood
0,rock destine 21st century new conan go splash ...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tepid biopic,1
3,like movie fun wasabi good place start,1
4,emerge rare issue movie honest keenly observe ...,1


In [76]:
# for negative sentiment data also
# lower the data
neg_rev['review'] = neg_rev['review'].apply(lambda x : x.lower())
neg_rev.head()

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [77]:
# remove punctuation
neg_rev['review'] = neg_rev['review'].apply(lambda x : ' '.join([token.text for token in nlp(x) if not token.is_punct]))
neg_rev.head()

Unnamed: 0,review,mood
0,simplistic silly and tedious,0
1,it 's so laddish and juvenile only teenage boy...,0
2,exploitative and largely devoid of the depth o...,0
3,garbus discards the potential for pathological...,0
4,a visually flashy but narratively opaque and e...,0


In [78]:
# remove stop word and lemmatize
neg_rev['review'] = neg_rev['review'].apply(lambda x : ' '.join([token.lemma_ for token in nlp(x) if not token.is_stop]))
neg_rev.head()

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boy possibly find funny,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0


In [79]:
# combine positive and negative sentiment data
com_rev = pd.concat([pos_rev, neg_rev] , axis = 0).reset_index()
com_rev.head()

Unnamed: 0,index,review,mood
0,0,rock destine 21st century new conan go splash ...,1
1,1,gorgeously elaborate continuation lord ring tr...,1
2,2,effective tepid biopic,1
3,3,like movie fun wasabi good place start,1
4,4,emerge rare issue movie honest keenly observe ...,1


In [135]:
# split dependent and independent variable
X = com_rev['review'].values
y = com_rev['mood'].values

In [136]:
# train test split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state = 101)

In [137]:
train_data = pd.DataFrame({'review':X_train,'mood':y_train})
test_data = pd.DataFrame({'review':X_test,'mood':y_test})

In [138]:
train_data.head()

Unnamed: 0,review,mood
0,put washington honest work man john q archibal...,0
1,poignant familiar story young person suspend c...,1
2,timely director dream quietly lyrical tale pro...,1
3,film virtually choke self consciousness,0
4,film take inside rhythm subject experience watch,1


In [139]:
test_data.head()

Unnamed: 0,review,mood
0,important movie reminder power film examine value,1
1,' ve see hear like film recommend originality,1
2,ending leave unfulfilled performance enjoy mem...,1
3,surface lover run crime flick lot common piesi...,1
4,walk remember shrewd activate girlish tear duc...,0


In [140]:
# converting text into vector using Tf-idf vectorizer
vector = TfidfVectorizer()
train_vectors = vector.fit_transform(train_data['review'])
test_vectors = vector.transform(test_data['review'])

In [141]:
# model building- here we create Support Vector Machine classifier
from sklearn.svm import SVC
classifier = SVC()

In [142]:
# training model
classifier.fit(train_vectors,train_data['mood'])
pred = classifier.predict(test_vectors)



In [143]:
from sklearn.metrics import classification_report
classification_report(test_data['mood'], pred ,output_dict = True)

  'precision', 'predicted', average, warn_for)


{'0': {'precision': 0.4936708860759494,
  'recall': 1.0,
  'f1-score': 0.6610169491525424,
  'support': 1053},
 '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1080},
 'accuracy': 0.4936708860759494,
 'macro avg': {'precision': 0.2468354430379747,
  'recall': 0.5,
  'f1-score': 0.3305084745762712,
  'support': 2133},
 'weighted avg': {'precision': 0.243710943759013,
  'recall': 0.4936708860759494,
  'f1-score': 0.3263248229993564,
  'support': 2133}}

In [144]:
# using naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
sent_model = MultinomialNB()

In [145]:
sent_model.fit(train_vectors,train_data['mood'])
pred = sent_model.predict(test_vectors)

In [146]:
classification_report(test_data['mood'],pred,output_dict = True)

{'0': {'precision': 0.7610186092066601,
  'recall': 0.7378917378917379,
  'f1-score': 0.7492767598842816,
  'support': 1053},
 '1': {'precision': 0.7517985611510791,
  'recall': 0.774074074074074,
  'f1-score': 0.7627737226277372,
  'support': 1080},
 'accuracy': 0.7562119081106423,
 'macro avg': {'precision': 0.7564085851788696,
  'recall': 0.755982905982906,
  'f1-score': 0.7560252412560093,
  'support': 2133},
 'weighted avg': {'precision': 0.7563502304443407,
  'recall': 0.7562119081106423,
  'f1-score': 0.7561106650708413,
  'support': 2133}}

In [147]:
# saving model
import joblib
filename = 'new_model.pkl'
joblib.dump(sent_model,filename)

['new_model.pkl']

In [148]:
new_model = joblib.load('new_model.pkl')

In [149]:
new_model.predict(test_vectors)

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [158]:
# saving tfidf vector
joblib.dump(vector,'tfidf_model.pkl')

['tfidf_model.pkl']

In [159]:
vec = joblib.load('tfidf_model.pkl')

In [160]:
text = 'i do not like this movie'
text = text.split()
vec1 = vec.transform(text)

In [157]:
vec1.size

3