In [1]:
import pandas as pd
data = pd.read_csv('/content/IMDB Dataset.csv')

In [2]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Data Cleaning and Preprocessing

In [3]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
corpus = []
stopword = stopwords.words('english')
stopword.remove('not')
for i in range(0, len(data)):
  review = re.sub('[^a-zA-Z0-9]', ' ', data['review'][i])
  review = review.lower()
  review = review.split()
  a = []
  for word in review:
    if word not in stopword:
      a.append(lemmatizer.lemmatize(word))
  review = ' '.join(a)
  corpus.append(review)

In [6]:
corpus

['one reviewer mentioned watching 1 oz episode hooked right exactly happened br br first thing struck oz brutality unflinching scene violence set right word go trust not show faint hearted timid show pull punch regard drug sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy not high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away br br would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence not violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skil

# Data Transformation Using Word2Vec

In [None]:
!pip install gensim



In [None]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
words = []
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [None]:
words

[['one',
  'reviewer',
  'mentioned',
  'watching',
  'oz',
  'episode',
  'hooked',
  'right',
  'exactly',
  'happened',
  'br',
  'br',
  'first',
  'thing',
  'struck',
  'oz',
  'brutality',
  'unflinching',
  'scene',
  'violence',
  'set',
  'right',
  'word',
  'go',
  'trust',
  'not',
  'show',
  'faint',
  'hearted',
  'timid',
  'show',
  'pull',
  'punch',
  'regard',
  'drug',
  'sex',
  'violence',
  'hardcore',
  'classic',
  'use',
  'word',
  'br',
  'br',
  'called',
  'oz',
  'nickname',
  'given',
  'oswald',
  'maximum',
  'security',
  'state',
  'penitentary',
  'focus',
  'mainly',
  'emerald',
  'city',
  'experimental',
  'section',
  'prison',
  'cell',
  'glass',
  'front',
  'face',
  'inwards',
  'privacy',
  'not',
  'high',
  'agenda',
  'em',
  'city',
  'home',
  'many',
  'aryan',
  'muslim',
  'gangsta',
  'latino',
  'christian',
  'italian',
  'irish',
  'scuffle',
  'death',
  'stare',
  'dodgy',
  'dealing',
  'shady',
  'agreement',
  'never',


In [None]:
import gensim
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [None]:
model.wv.index_to_key

['br',
 'movie',
 'film',
 'not',
 'one',
 'like',
 'time',
 'good',
 'character',
 'story',
 'even',
 'get',
 'would',
 'make',
 'see',
 'really',
 'scene',
 'well',
 'much',
 'bad',
 'people',
 'great',
 'also',
 'first',
 'show',
 'way',
 'thing',
 'made',
 'life',
 'could',
 'think',
 'go',
 'know',
 'watch',
 'love',
 'plot',
 'actor',
 'two',
 'many',
 'seen',
 'year',
 'say',
 'end',
 'never',
 'acting',
 'look',
 'best',
 'little',
 'ever',
 'man',
 'better',
 'take',
 'come',
 'work',
 'still',
 'part',
 'something',
 'director',
 'find',
 'want',
 'back',
 'give',
 'lot',
 'real',
 'guy',
 'watching',
 'performance',
 'woman',
 'play',
 'old',
 'funny',
 'though',
 'another',
 'actually',
 'nothing',
 'role',
 'going',
 'new',
 'every',
 'girl',
 'day',
 'world',
 'point',
 'cast',
 'horror',
 'comedy',
 'minute',
 'thought',
 'fact',
 'feel',
 'quite',
 'pretty',
 'star',
 'action',
 'around',
 'seems',
 'young',
 'big',
 'however',
 'got',
 'enough',
 'right',
 'long',
 'li

In [None]:
model.corpus_count

50000

In [None]:
import numpy as np
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm
X=[]
for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

100%|██████████| 50000/50000 [15:21<00:00, 54.28it/s]


In [None]:
X

[array([ 0.4177651 , -0.23611729, -0.06409317,  0.20174143, -0.27277535,
         0.09812044, -0.01891225,  0.5439603 ,  0.29339045, -0.01998074,
         0.16984859, -0.01060251, -0.6054415 ,  0.253062  , -0.2373514 ,
        -0.31818324, -0.19131668, -0.52378196, -0.3247487 , -0.82038426,
         0.24413663,  0.46627852,  0.2431808 ,  0.11532379, -0.35966533,
        -0.11434291,  0.24104103, -0.09282302, -0.12141979,  0.10063194,
        -0.04143799, -0.56587464,  0.548585  , -0.43844002, -0.08423785,
         0.27156982,  0.1611511 , -0.20485647,  0.30398643,  0.04413575,
         0.01275079, -0.6864303 , -0.08943072, -0.20453264,  0.26952556,
        -0.11259572, -0.04647154,  0.32608318,  0.21132068,  0.01518727,
        -0.06043794,  0.11459266,  0.16240528,  0.26218733,  0.05771039,
        -0.11167977,  0.44804004, -0.0290123 , -0.08055592,  0.33230728,
         0.08178543, -0.13232768, -0.085521  ,  0.11212506, -0.18418488,
        -0.03640856,  0.17522596,  0.35908145, -0.5

In [None]:
import numpy as np
X_new = np.array(X)

In [None]:
X_new[0]

array([ 0.4177651 , -0.23611729, -0.06409317,  0.20174143, -0.27277535,
        0.09812044, -0.01891225,  0.5439603 ,  0.29339045, -0.01998074,
        0.16984859, -0.01060251, -0.6054415 ,  0.253062  , -0.2373514 ,
       -0.31818324, -0.19131668, -0.52378196, -0.3247487 , -0.82038426,
        0.24413663,  0.46627852,  0.2431808 ,  0.11532379, -0.35966533,
       -0.11434291,  0.24104103, -0.09282302, -0.12141979,  0.10063194,
       -0.04143799, -0.56587464,  0.548585  , -0.43844002, -0.08423785,
        0.27156982,  0.1611511 , -0.20485647,  0.30398643,  0.04413575,
        0.01275079, -0.6864303 , -0.08943072, -0.20453264,  0.26952556,
       -0.11259572, -0.04647154,  0.32608318,  0.21132068,  0.01518727,
       -0.06043794,  0.11459266,  0.16240528,  0.26218733,  0.05771039,
       -0.11167977,  0.44804004, -0.0290123 , -0.08055592,  0.33230728,
        0.08178543, -0.13232768, -0.085521  ,  0.11212506, -0.18418488,
       -0.03640856,  0.17522596,  0.35908145, -0.55830765,  0.46

#Model Traning (using Word2Vec data)

In [10]:
y = pd.get_dummies(data['sentiment'])
y = y.iloc[:,1].values

In [None]:
y

array([ True,  True,  True, ..., False, False, False])

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
review_model = RandomForestClassifier().fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = review_model.predict(x_test)
score = accuracy_score(y_test, y_pred)
print(score)
print(classification_report(y_test, y_pred))

0.8282
              precision    recall  f1-score   support

       False       0.84      0.81      0.83      5035
        True       0.82      0.84      0.83      4965

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



# Data Transformation using BOW(Bag of Words)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
X = CountVectorizer(max_features=2500, binary=True).fit_transform(corpus).toarray()

In [11]:
print(X[1])
print(y)

[0 0 0 ... 0 0 0]
[ True  True  True ... False False False]


# Model Traning (using BOW data)

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
from sklearn.ensemble import RandomForestClassifier
review_model_bow = RandomForestClassifier().fit(x_train, y_train)

In [14]:
from sklearn.metrics import accuracy_score, classification_report
y_pred_bow = review_model_bow.predict(x_test)
score = accuracy_score(y_test, y_pred_bow)
print(score)
print(classification_report(y_test,y_pred_bow))

0.8382
              precision    recall  f1-score   support

       False       0.84      0.84      0.84      5035
        True       0.84      0.83      0.84      4965

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



# Data Transformation using TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
X = TfidfVectorizer(max_features=2500).fit_transform(corpus).toarray()

In [18]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Model Traning (using TF-IDF data)

In [19]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
from sklearn.ensemble import RandomForestClassifier
review_model_tfidf = RandomForestClassifier().fit(x_train, y_train)

In [21]:
from sklearn.metrics import accuracy_score, classification_report
y_pred_tfidf = review_model_tfidf.predict(x_test)
score = accuracy_score(y_test, y_pred_tfidf)
print(score)
print(classification_report(y_test, y_pred_tfidf))

0.8445
              precision    recall  f1-score   support

       False       0.84      0.85      0.85      5035
        True       0.85      0.84      0.84      4965

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

