In [1]:
# importing all the libraries

import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# English stop words from NLTK, kept as a set: the cleaning cells test every
# token with `word not in (sw)`, and a set answers that membership check
# without scanning a list for each token. Membership results are unchanged.
sw = set(stopwords.words('english'))

In [4]:
# Record the installed scikit-learn version in the cell output.
import sklearn
print(sklearn.__version__)

0.23.2


In [5]:
# Load the positive reviews: no header row, newline as the separator, so the
# text of each line lands in the single column labelled 0.
# NOTE(review): the path is an absolute, machine-specific location -- confirm
# it exists before re-running. Newer pandas releases reportedly reject '\n'
# as a separator; verify against the installed version before upgrading.
pos_rev = pd.read_csv(
    '/Users/priyankac/Downloads/pos.txt',
    sep='\n',
    header=None,
    encoding='latin-1',
)
pos_rev.head()


Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."


In [6]:
# Label every positive review with mood = 1.

pos_rev = pos_rev.assign(mood=1)
pos_rev

Unnamed: 0,0,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [8]:
# Give the text column (integer label 0) the name 'review'.

pos_rev = pos_rev.rename(columns={0: 'review'})

In [6]:
# Display pos_rev to check the renamed 'review' column next to 'mood'.
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [13]:
# Load the negative reviews: no header row, newline as the separator, so the
# text of each line lands in the single column labelled 0.
# NOTE(review): the path is an absolute, machine-specific location -- confirm
# it exists before re-running. Newer pandas releases reportedly reject '\n'
# as a separator; verify against the installed version before upgrading.
neg_rev = pd.read_csv(
    '/Users/priyankac/Downloads/negative.txt',
    sep='\n',
    header=None,
    encoding='latin-1',
)
neg_rev.head()

Unnamed: 0,0
0,"simplistic , silly and tedious."
1,"it's so laddish and juvenile , only teenage bo..."
2,exploitative and largely devoid of the depth o...
3,[garbus] discards the potential for pathologic...
4,a visually flashy but narratively opaque and e...


In [14]:
# Label every negative review with mood = 0.

neg_rev = neg_rev.assign(mood=0)
neg_rev

Unnamed: 0,0,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [15]:
# Give the text column (integer label 0) the name 'review'.

neg_rev = neg_rev.rename(columns={0: 'review'})

In [10]:
# Display neg_rev to check the renamed 'review' column next to 'mood'.
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [None]:
# Clean the data- Text Preprocessing
# 1. lowercase
# 2. spaces
# 3. punctuation
# 4. stopwords
# 5. lemmatize

In [12]:
# Text preprocessing for the positive reviews. Each step overwrites
# pos_rev['review']; the neg_rev cell applies the same steps in the same order.

# 1. Lowercase every review.
pos_rev['review'] = pos_rev['review'].apply(lambda x: x.lower())

# 2. Drop "@" together with the run of non-whitespace characters after it.
#    The pattern must be r"@\S+" (capital S), the same one the neg_rev cell
#    uses; r"@\s+" would only match an "@" followed by whitespace, so the two
#    classes would be cleaned differently.
pos_rev['review'] = pos_rev['review'].apply(lambda x: re.sub(r"@\S+", "", x))

# Replace every character that is not an ASCII letter (digits, punctuation,
# ...) with a space.
pos_rev['review'] = pos_rev['review'].apply(lambda x: re.sub(r"[^a-zA-Z]", " ", x))

# 3. Drop tokens that are punctuation. Splitting and re-joining on single
#    spaces also collapses the whitespace runs left by the previous step.
pos_rev['review'] = pos_rev['review'].apply(lambda x: " ".join([word for word in x.split() if word not in string.punctuation]))

# 4. Remove the stop words and lemmatize the remaining tokens.
lm = WordNetLemmatizer()

pos_rev['review'] = pos_rev['review'].apply(lambda x: " ".join([lm.lemmatize(word) for word in x.split() if word not in (sw)]))
pos_rev



Unnamed: 0,review,mood
0,rock destined st century new conan going make ...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerges something rare issue movie honest keen...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family joyous life acting yiddi...,1
5328,standing shadow motown best kind documentary o...,1
5329,nice see piscopo year chaykin headly priceless,1


In [16]:
# Text preprocessing for the negative reviews, written as one chain so the
# order of the steps reads top to bottom.

# Lemmatizer used by the final step of the chain.
lm = WordNetLemmatizer()

neg_rev['review'] = (
    neg_rev['review']
    # 1. Lowercase.
    .apply(lambda text: text.lower())
    # 2. Drop "@" plus the non-whitespace run that follows it.
    .apply(lambda text: re.sub(r"@\S+", "", text))
    #    Replace anything that is not an ASCII letter with a space.
    .apply(lambda text: re.sub(r'[^a-zA-Z]', " ", text))
    # 3. Drop punctuation tokens; re-joining collapses repeated whitespace.
    .apply(lambda text: " ".join(tok for tok in text.split() if tok not in string.punctuation))
    # 4. Remove stop words and lemmatize what is left.
    .apply(lambda text: " ".join(lm.lemmatize(tok) for tok in text.split() if tok not in (sw)))
)
neg_rev


Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boy could possibly fi...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find moving,0
5327,many definition time waster movie must surely one,0
5328,stand crocodile hunter hurried badly cobbled l...,0
5329,thing look like made home video quickie,0


In [17]:
# Stack the negative reviews on top of the positive ones (row-wise) and
# renumber the rows so the combined frame has one continuous index.

com_rev = pd.concat([neg_rev, pos_rev], ignore_index=True)
com_rev

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boy could possibly fi...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
10657,exuberantly romantic serenely melancholy time ...,1
10658,mazel tov film family joyous life acting yiddi...,1
10659,standing shadow motown best kind documentary o...,1
10660,nice see piscopo year chaykin headly priceless,1


In [21]:
# Hold out 20% of the combined reviews for testing; random_state fixes the
# shuffle so the same split is produced on every run.

X_train, X_test, y_train, y_test = train_test_split(
    com_rev['review'].values,
    com_rev['mood'].values,
    test_size=0.2,
    random_state=101,
)

In [12]:
# Inspect the training texts returned by train_test_split.
X_train

array(['visual spectacle full stunning images effects',
       'the sweetest thing leaves a bitter taste . ',
       "hey arnold ! the movie could have been made 40 years ago , and parents' appreciation of it may depend on whether they consider that a good thing . ",
       ..., 'poignant funny',
       'the movie straddles the fence between escapism and social commentary , and on both sides it falls short . ',
       'even with a green mohawk and a sheet of fire-red flame tattoos covering his shoulder , however , kilmer seems to be posing , rather than acting . and that leaves a hole in the center of the salton sea . '],
      dtype=object)

In [13]:
# Inspect the held-out test texts returned by train_test_split.
X_test

array(["too many improbabilities and rose-colored situations temper what could've been an impacting film . ",
       "ihops don't pile on this much syrup . ",
       'those of you who are not an eighth grade girl will most likely doze off during this one . ',
       ..., 'masterpiece',
       "know precisely make steven soderbergh's full frontal though stop enjoying much",
       'if only merchant paid more attention the story . '], dtype=object)

In [22]:
# Put the training texts and labels side by side in a DataFrame so they can
# be inspected as a table.

train_data = pd.DataFrame(dict(review=X_train, mood=y_train))
train_data




Unnamed: 0,review,mood
0,visual spectacle full stunning image effect,1
1,sweetest thing leaf bitter taste,0
2,hey arnold movie could made year ago parent ap...,0
3,one year best film,1
4,mendes still quite know fill frame like hank c...,0
...,...,...
8524,possibly since grumpy old men heard film solid...,0
8525,rare find film dazzle eye challenge brain sati...,1
8526,poignant funny,1
8527,movie straddle fence escapism social commentar...,0


In [23]:
# Same tabular view for the held-out test texts and labels.
test_data = pd.DataFrame(dict(review=X_test, mood=y_test))
test_data

Unnamed: 0,review,mood
0,many improbability rose colored situation temp...,0
1,ihops pile much syrup,0
2,eighth grade girl likely doze one,0
3,director john musker ron clements team behind ...,0
4,post philosophical message personal freedom fi...,1
...,...,...
2128,woody allen write deliver one liner well anybo...,0
2129,tough astringent darkly funny well also generi...,0
2130,masterpiece,1
2131,know precisely make steven soderbergh full fro...,1


In [24]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to encode the test reviews.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['review'])
test_vectors = vectorizer.transform(test_data['review'])

In [25]:
# List the vocabulary terms held by the fitted vectorizer.
# NOTE(review): get_feature_names() is reportedly replaced by
# get_feature_names_out() in newer scikit-learn releases -- confirm before
# upgrading from the 0.23.2 printed earlier in this notebook.
vectorizer.get_feature_names()   # len(vectorizer.get_feature_names()) gives the number of features

['aaliyah',
 'abagnale',
 'abandon',
 'abandone',
 'abandoned',
 'abandono',
 'abbas',
 'abbass',
 'abbott',
 'abbreviated',
 'abc',
 'abderrahmane',
 'abel',
 'aberration',
 'abhorrent',
 'abiding',
 'ability',
 'able',
 'ably',
 'abomination',
 'aboriginal',
 'aborted',
 'aboul',
 'abound',
 'abraham',
 'abrams',
 'abrasive',
 'abridged',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absolutamente',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'absorption',
 'abstract',
 'absurd',
 'absurdist',
 'absurdity',
 'absurdly',
 'abundance',
 'abundant',
 'abundantly',
 'aburrido',
 'abuse',
 'abused',
 'abuser',
 'abysmal',
 'abysmally',
 'abyss',
 'acaba',
 'acabamos',
 'academic',
 'academy',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepting',
 'accepts',
 'access',
 'accessibility',
 'accessible',
 'accident',
 'accidental',
 'acclaim',
 'acclaimed',
 'accommodate',
 'accompanied',
 'accompanies',
 'accompany',
 'accompanying

In [26]:
# SVM
 
from sklearn import svm
from sklearn.metrics import classification_report

In [27]:
# Train a support-vector classifier, constructed with no arguments, on the
# TF-IDF training matrix and the matching mood labels.
classifier = svm.SVC()
classifier.fit(train_vectors, train_data['mood'])

SVC()

In [28]:
# Predict a mood label for every vectorized review in the test set.
pred = classifier.predict(test_vectors)

In [30]:
# Compare the predictions with the true test labels. output_dict=True returns
# the per-class and averaged metrics as a nested dict instead of a string.

report = classification_report(
    y_true=test_data['mood'],
    y_pred=pred,
    output_dict=True,
)
report

{'0': {'precision': 0.7527573529411765,
  'recall': 0.7583333333333333,
  'f1-score': 0.7555350553505534,
  'support': 1080},
 '1': {'precision': 0.7502392344497608,
  'recall': 0.7445394112060779,
  'f1-score': 0.7473784556720686,
  'support': 1053},
 'accuracy': 0.7515236755743084,
 'macro avg': {'precision': 0.7514982936954686,
  'recall': 0.7514363722697056,
  'f1-score': 0.751456755511311,
  'support': 2133},
 'weighted avg': {'precision': 0.7515142311542751,
  'recall': 0.7515236755743084,
  'f1-score': 0.751508379559909,
  'support': 2133}}

In [31]:
# Save the model: the vectorizer and the classifier must both be written to
# disk so that predictions made later use the same vocabulary as training.

import joblib
joblib.dump(classifier , 'netflix_75.pkl')
joblib.dump(vectorizer , 'vectorizer.pkl')


['vectorizer.pkl']

In [25]:
# In the Flask app the following needs to be done: load the saved vectorizer
# and classifier back from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load .pkl files that come from a trusted source.

vector = joblib.load('vectorizer.pkl')
model = joblib.load('netflix_75.pkl')

In [26]:
# Try the reloaded objects on a single hand-written review.
# NOTE(review): this text goes straight to the vectorizer without the
# stop-word removal / lemmatization applied to the training reviews --
# confirm the Flask app cleans its input the same way.
data = ['bad movie']

# Encode with the saved vectorizer (converted to a dense array), then predict.
tfidf = vector.transform(data).toarray()
my_pred = model.predict(tfidf)
print(my_pred)

[0]


In [None]:
# final pipeline

# data--->preprocessing--->split--->vectorizer--->algorithm

In [None]:
# things to do

# stem
# remove -
# regex to remove numbers
# use naive bayes algorithm
# Flask app
