In [0]:
import numpy as np
import pandas as pd

import re
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
df = pd.read_csv('/content/drive/My Drive/IMDB Dataset.csv')
print(len(df))
df.head()

50000


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [0]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [0]:
# checking the blanks rows in the data
blanks = []

for i,lb,rv in df.itertuples():
  if rv.isspace():
    blanks.append(i)
blanks

[]

In [0]:
# Applying text cleaning techniques

def clean_text(text):
  text = text.lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\n', '', text)
  text = re.sub('\w*\d\w*', '', text)
  return text

df['review'] = df['review'].apply(lambda x: clean_text(x))
df['review'].head()

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where a little boy j...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object

In [0]:
# Tokenizing the dataset
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
df['review'] = df['review'].apply(lambda x: tokenizer.tokenize(x))
df['review'][3]

['basically',
 'theres',
 'a',
 'family',
 'where',
 'a',
 'little',
 'boy',
 'jake',
 'thinks',
 'theres',
 'a',
 'zombie',
 'in',
 'his',
 'closet',
 'his',
 'parents',
 'are',
 'fighting',
 'all',
 'the',
 'timethis',
 'movie',
 'is',
 'slower',
 'than',
 'a',
 'soap',
 'opera',
 'and',
 'suddenly',
 'jake',
 'decides',
 'to',
 'become',
 'rambo',
 'and',
 'kill',
 'the',
 'zombieok',
 'first',
 'of',
 'all',
 'when',
 'youre',
 'going',
 'to',
 'make',
 'a',
 'film',
 'you',
 'must',
 'decide',
 'if',
 'its',
 'a',
 'thriller',
 'or',
 'a',
 'drama',
 'as',
 'a',
 'drama',
 'the',
 'movie',
 'is',
 'watchable',
 'parents',
 'are',
 'divorcing',
 'arguing',
 'like',
 'in',
 'real',
 'life',
 'and',
 'then',
 'we',
 'have',
 'jake',
 'with',
 'his',
 'closet',
 'which',
 'totally',
 'ruins',
 'all',
 'the',
 'film',
 'i',
 'expected',
 'to',
 'see',
 'a',
 'boogeyman',
 'similar',
 'movie',
 'and',
 'instead',
 'i',
 'watched',
 'a',
 'drama',
 'with',
 'some',
 'meaningless',
 'thri

In [0]:
# Removing stopwords belonging to english language
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

df['review'] = df['review'].apply(lambda x: remove_stopwords(x))
df['review'][3]

['basically',
 'theres',
 'family',
 'little',
 'boy',
 'jake',
 'thinks',
 'theres',
 'zombie',
 'closet',
 'parents',
 'fighting',
 'timethis',
 'movie',
 'slower',
 'soap',
 'opera',
 'suddenly',
 'jake',
 'decides',
 'become',
 'rambo',
 'kill',
 'zombieok',
 'first',
 'youre',
 'going',
 'make',
 'film',
 'must',
 'decide',
 'thriller',
 'drama',
 'drama',
 'movie',
 'watchable',
 'parents',
 'divorcing',
 'arguing',
 'like',
 'real',
 'life',
 'jake',
 'closet',
 'totally',
 'ruins',
 'film',
 'expected',
 'see',
 'boogeyman',
 'similar',
 'movie',
 'instead',
 'watched',
 'drama',
 'meaningless',
 'thriller',
 'well',
 'playing',
 'parents',
 'descent',
 'dialogs',
 'shots',
 'jake',
 'ignore']

In [0]:
def combine_text(list_of_text):
    combined_text = ' '.join(list_of_text)
    return combined_text

df['review'] = df['review'].apply(lambda x : combine_text(x))

In [0]:
# Token Normalization
# Lemmatizer

lemmatizer=nltk.stem.WordNetLemmatizer()
df['review'] = df['review'].apply(lambda x: lemmatizer.lemmatize(x))

In [0]:
# setting X and Y
X = df['review']
y = df['sentiment']

In [0]:
# Spliting the data 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=101)

In [0]:
# Importing libray
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 

In [0]:
# creating Pipeline for processing and modeling
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2), norm='l2')

svc_clf = Pipeline([('tfidf', vectorizer),
                      ('svc_clf', LinearSVC())])


lr_clf = Pipeline([('tfidf', vectorizer),
                      ('lr_clf', LogisticRegression())])


In [0]:
# fitting the data into models
svc_clf.fit(X_train,y_train)
lr_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=2, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr_clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scal

In [0]:
svc_pred = svc_clf.predict(X_test)
lr_pred = lr_clf.predict(X_test)

### Evalution of Linear SVC

In [0]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [0]:
print(confusion_matrix(y_test, svc_pred))
print('\n')
print(classification_report(y_test, svc_pred))
print('\n')
print(accuracy_score(y_test, svc_pred))

[[11077  1410]
 [ 1154 11359]]


              precision    recall  f1-score   support

    negative       0.91      0.89      0.90     12487
    positive       0.89      0.91      0.90     12513

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



0.89744


### Evalution of Logistic Regression

In [0]:
print(confusion_matrix(y_test, lr_pred))
print('\n')
print(classification_report(y_test, lr_pred))
print('\n')
print(accuracy_score(y_test, lr_pred))

[[10892  1595]
 [ 1255 11258]]


              precision    recall  f1-score   support

    negative       0.90      0.87      0.88     12487
    positive       0.88      0.90      0.89     12513

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



0.886
