In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from nltk.stem import WordNetLemmatizer 

In [2]:
pos_rev = pd.read_csv('netflix/pos.txt' , sep = '\n' , header= None , encoding='latin-1')
pos_rev['mood'] = 1
pos_rev.rename(columns={0 : 'review'} , inplace = True)
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [3]:
neg_rev = pd.read_csv('netflix/negative.txt' , sep = '\n' , header= None , encoding='latin-1')
neg_rev['mood'] = 0
neg_rev.rename(columns={0 : 'review'} , inplace = True)
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [None]:
#1. lower
#2. regex 
#3. stopwords
#4. punctuations
#5. lemmatize



In [4]:
sw = stopwords.words('english')
lem = WordNetLemmatizer()

In [27]:

pos_rev['review'] = pos_rev.loc[: , 'review'].apply(lambda x : x.lower())
pos_rev['review'] = pos_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , '' , x))
pos_rev['review'] = pos_rev.loc[: , 'review'].apply(lambda x : x.translate(str.maketrans(dict.fromkeys(string.punctuation))))
pos_rev['review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([lem.lemmatize(word , pos='v') for word in x.split() if word not in (sw)]))
pos_rev

Unnamed: 0,review,mood
0,rock destine 21st century new conan go make sp...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tootepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerge something rare issue movie thats honest...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family joyous life act yiddish ...,1
5328,stand shadow motown best kind documentary one ...,1
5329,nice see piscopo year chaykin headly priceless,1


In [28]:
neg_rev['review'] = neg_rev.loc[: , 'review'].apply(lambda x : x.lower())
neg_rev['review'] = neg_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , '' , x))
neg_rev['review'] = neg_rev.loc[: , 'review'].apply(lambda x : x.translate(str.maketrans(dict.fromkeys(string.punctuation))))
neg_rev['review'] = neg_rev.loc[: , 'review'].apply(lambda x : " ".join([lem.lemmatize(word , pos='v') for word in x.split() if word not in (sw)]))
neg_rev

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boys could possibly f...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find move,0
5327,many definitions time waster movie must surely...,0
5328,stand crocodile hunter hurry badly cobble look...,0
5329,thing look like madeforhomevideo quickie,0


In [29]:
com_rev = pd.concat([pos_rev , neg_rev]).reset_index(drop = True)
com_rev

Unnamed: 0,review,mood
0,rock destine 21st century new conan go make sp...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective tootepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerge something rare issue movie thats honest...,1
...,...,...
10657,terrible movie people nevertheless find move,0
10658,many definitions time waster movie must surely...,0
10659,stand crocodile hunter hurry badly cobble look...,0
10660,thing look like madeforhomevideo quickie,0


In [31]:
com_rev['review'].values

array(['rock destine 21st century new conan go make splash even greater arnold schwarzenegger jeanclaud van damme steven segal',
       'gorgeously elaborate continuation lord ring trilogy huge column word cannot adequately describe cowriterdirector peter jackson expand vision j r r tolkien middleearth',
       'effective tootepid biopic', ...,
       'stand crocodile hunter hurry badly cobble look 1959 godzilla combine scenes japanese monster flick can shots raymond burr comment monsters path destruction',
       'thing look like madeforhomevideo quickie',
       'enigma wellmade dry placid'], dtype=object)

In [35]:
X_train , X_test , y_train , y_test = train_test_split(com_rev['review'].values , com_rev['mood'].values , test_size = 0.2, random_state = 101)

In [None]:
# train_data = pd.DataFrame({'review':X_train , 'mood':y_tra})

In [37]:
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [38]:
from sklearn import svm
from sklearn.metrics import classification_report

In [48]:
classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vectors , y_train)

SVC(kernel='linear')

In [49]:
pred = classifier.predict(test_vectors)

In [50]:
report  = classification_report(y_test , pred , output_dict=True)
report

{'0': {'precision': 0.7916666666666666,
  'recall': 0.7758784425451092,
  'f1-score': 0.7836930455635491,
  'support': 1053},
 '1': {'precision': 0.7856494096276113,
  'recall': 0.8009259259259259,
  'f1-score': 0.7932141219624026,
  'support': 1080},
 'accuracy': 0.7885607126113455,
 'macro avg': {'precision': 0.788658038147139,
  'recall': 0.7884021842355176,
  'f1-score': 0.7884535837629758,
  'support': 2133},
 'weighted avg': {'precision': 0.7886199542418285,
  'recall': 0.7885607126113455,
  'f1-score': 0.7885138437401839,
  'support': 2133}}