# Problem Statement


Analysis of positive and negative reviews on Netflix data

In [1]:
# Importing all the required libraries

import re
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer


In [7]:
# Defining the stopwords
sw = stopwords.words('english')

# Defining lemmatization
lm = WordNetLemmatizer()

In [8]:
# Reading the positive reviews data

pos_rev = pd.read_csv('/Users/priyankac/Downloads/pos.txt', sep = '\n', header = None, encoding = 'latin-1')
pos_rev.head()

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."


In [9]:
# Creating a column with values as '1'

pos_rev['mood'] = 1
pos_rev

Unnamed: 0,0,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [10]:
# Renaming the column '0' as 'review'

pos_rev.rename(columns = {0: 'review'}, inplace = True)
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [11]:
# Reading the negative reviews column

neg_rev = pd.read_csv('/Users/priyankac/Downloads/negative.txt', sep = '\n', header = None, encoding = 'latin-1')
neg_rev.head()

Unnamed: 0,0
0,"simplistic , silly and tedious."
1,"it's so laddish and juvenile , only teenage bo..."
2,exploitative and largely devoid of the depth o...
3,[garbus] discards the potential for pathologic...
4,a visually flashy but narratively opaque and e...


In [12]:
# Creating a column 'mood' with values as '0'

neg_rev['mood'] = 0
neg_rev

Unnamed: 0,0,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [13]:
# renaming the column '0' as 'reviews'

neg_rev.rename(columns = {0: 'review'}, inplace = True)
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


# Text preprocessing

In [14]:
# Text preprocessing pos_rev Dataframe

# Converting text to lowercase

pos_rev['review'] = pos_rev['review'].apply(lambda x: x.lower())
pos_rev

# Removing all @ characters and any unwanted spaces

pos_rev['review'] = pos_rev['review'].apply(lambda x: re.sub(r'@\S+', "",x))
pos_rev

# Removing any digits and unwanted characters

pos_rev['review'] = pos_rev['review'].apply(lambda x: re.sub(r'[^a-zA-Z]', " ",x))
pos_rev

# Removing punctuation

pos_rev['review'] = pos_rev['review'].apply(lambda x: " ".join([word for word in x.split() if word not in string.punctuation]))
pos_rev

# Removing stopwords and performing lemmatization

pos_rev['review'] = pos_rev['review'].apply(lambda x: " ".join([lm.lemmatize(word) for word in x.split() if word not in (sw)]))
pos_rev

Unnamed: 0,review,mood
0,rock destined st century new conan going make ...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerges something rare issue movie honest keen...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family joyous life acting yiddi...,1
5328,standing shadow motown best kind documentary o...,1
5329,nice see piscopo year chaykin headly priceless,1


In [36]:
# Text preprocessing on neg_rev Dataframe

# Coverting text to lowercase
neg_rev['review'] = neg_rev['review'].apply(lambda x: x.lower())
neg_rev

# Removing any @ and unwanted spaces
neg_rev['review'] = neg_rev['review'].apply(lambda x: re.sub(r'@\S+' ,"", x))
neg_rev

# Removing any digits and unwanted characters
neg_rev['review'] = neg_rev['review'].apply(lambda x: re.sub(r'[^a-zA-Z]' ," ", x))
neg_rev

# Removing punctuation
neg_rev['review'] = neg_rev['review'].apply(lambda x: " ".join([word for word in x.split() if word not in string.punctuation]))
neg_rev

# Removing stopwords and performing lemmatization
neg_rev['review'] = neg_rev['review'].apply(lambda x: " ".join([lm.lemmatize(word) for word in x.split() if word not in (sw)]))
neg_rev

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boy could possibly fi...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find moving,0
5327,many definition time waster movie must surely one,0
5328,stand crocodile hunter hurried badly cobbled l...,0
5329,thing look like made home video quickie,0


In [15]:
# Concatenating neg_rev and pos_rev Dataframes

com_rev = pd.concat([neg_rev, pos_rev], axis = 0, ignore_index = True)
com_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
10657,exuberantly romantic serenely melancholy time ...,1
10658,mazel tov film family joyous life acting yiddi...,1
10659,standing shadow motown best kind documentary o...,1
10660,nice see piscopo year chaykin headly priceless,1


# train_test_split

In [16]:
# Splitting the data

X_train, X_test, y_train, y_test = train_test_split(com_rev['review'].values, com_rev['mood'].values, 
                                                   test_size = 0.2, random_state = 101)

In [17]:
# Checking X_train data
X_train

array(['visual spectacle full stunning image effect',
       'the sweetest thing leaves a bitter taste . ',
       "hey arnold ! the movie could have been made 40 years ago , and parents' appreciation of it may depend on whether they consider that a good thing . ",
       ..., 'poignant funny',
       'the movie straddles the fence between escapism and social commentary , and on both sides it falls short . ',
       'even with a green mohawk and a sheet of fire-red flame tattoos covering his shoulder , however , kilmer seems to be posing , rather than acting . and that leaves a hole in the center of the salton sea . '],
      dtype=object)

In [18]:
# Checking X_test data
X_test

array(["too many improbabilities and rose-colored situations temper what could've been an impacting film . ",
       "ihops don't pile on this much syrup . ",
       'those of you who are not an eighth grade girl will most likely doze off during this one . ',
       ..., 'masterpiece',
       'know precisely make steven soderbergh full frontal though stop enjoying much',
       'if only merchant paid more attention the story . '], dtype=object)

In [19]:
# For proper visualization of data converting the train and test data into DataFrame

train_data = pd.DataFrame({'review': X_train, 'mood': y_train})

test_data = pd.DataFrame({'review': X_test, 'mood': y_test})

In [16]:
train_data

Unnamed: 0,review,mood
0,visual spectacle full stunning image effect,1
1,the sweetest thing leaves a bitter taste .,0
2,hey arnold ! the movie could have been made 40...,0
3,one year best film,1
4,mendes still doesn't quite know how to fill a ...,0
...,...,...
8524,possibly not since grumpy old men have i heard...,0
8525,rare find film dazzle eye challenge brain sati...,1
8526,poignant funny,1
8527,the movie straddles the fence between escapism...,0


In [17]:
test_data

Unnamed: 0,review,mood
0,too many improbabilities and rose-colored situ...,0
1,ihops don't pile on this much syrup .,0
2,those of you who are not an eighth grade girl ...,0
3,"directors john musker and ron clements , the t...",0
4,post philosophical message personal freedom fi...,1
...,...,...
2128,woody allen can write and deliver a one liner ...,0
2129,"it's tough , astringent , darkly funny and . ....",0
2130,masterpiece,1
2131,know precisely make steven soderbergh full fro...,1


In [20]:
# Tf-Idf vectorization

vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data.review)
test_vectors = vectorizer.transform(test_data.review)

In [19]:
print(test_vectors.shape)

(2133, 16095)


In [20]:
print(train_vectors.shape)

(8529, 16095)


In [21]:
# Checking the features

#vectorizer.get_feature_names()

# Number of features
len(vectorizer.get_feature_names())

16095

# Creating model Naive bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(train_vectors, train_data.mood)

MultinomialNB()

In [22]:
# Evaluating predictions
from sklearn import metrics

pred = model.predict(test_vectors)


In [23]:
# Generate the classification report
from sklearn.metrics import classification_report

report = classification_report(test_data.mood , pred , output_dict = True)
report

{'0': {'precision': 0.9771689497716894,
  'recall': 0.9907407407407407,
  'f1-score': 0.9839080459770114,
  'support': 1080},
 '1': {'precision': 0.9903660886319846,
  'recall': 0.976258309591643,
  'f1-score': 0.9832615973218556,
  'support': 1053},
 'accuracy': 0.9835911861228317,
 'macro avg': {'precision': 0.983767519201837,
  'recall': 0.9834995251661918,
  'f1-score': 0.9835848216494335,
  'support': 2133},
 'weighted avg': {'precision': 0.9836839930065187,
  'recall': 0.9835911861228317,
  'f1-score': 0.9835889130966182,
  'support': 2133}}

# Saving the model

In [25]:
import joblib

joblib.dump(model, 'netflix_99.pkl')
joblib.dump(vectorizer, 'vectorizer_netflix.pkl')

['vectorizer_netflix.pkl']