In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [4]:
df = df[:40000]

In [38]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought wonderful way spend time hot summer ...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


In [5]:
df['sentiment'].value_counts()

sentiment
negative    20007
positive    19993
Name: count, dtype: int64

# Data Cleaning

In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df.duplicated().sum()

266

In [8]:
df.drop_duplicates(inplace = True)

In [9]:
df.duplicated().sum()

0

In [10]:
import re

In [11]:
def remove_html(text):
    p = re.compile(r'<.*?>')
    return p.sub("" , text)

In [12]:
df['review'] = df['review'].apply(remove_html)

In [13]:
from nltk.corpus import stopwords
sw_list = stopwords.words('english')

In [14]:
df['review'] = df['review'].apply(lambda x : [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [49]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


In [15]:
df['review'] = df['review'].str.lower()

In [51]:
X = df.iloc[: , 0:1]
Y = df['sentiment']

In [52]:
X.shape

(29853, 1)

In [53]:
from sklearn.preprocessing import LabelEncoder

In [54]:
encode = LabelEncoder()

In [55]:
Y = encode.fit_transform(Y)

In [56]:
Y

array([1, 1, 1, ..., 0, 1, 0])

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y , test_size = 0.2 , random_state=1)

In [59]:
X_train.shape

(23882, 1)

# Using Back of word(BOW) for text vectorization

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
cv = CountVectorizer()

In [62]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [63]:
X_train_bow.shape

(23882, 76610)

In [64]:
from sklearn.naive_bayes import GaussianNB

In [65]:
gnb = GaussianNB()

In [66]:
gnb.fit(X_train_bow , Y_train )

In [67]:
Y_pred = gnb.predict(X_test_bow)

In [68]:
from sklearn.metrics import accuracy_score , confusion_matrix

In [69]:
accuracy_score(Y_test , Y_pred)

0.6488025456372467

In [70]:
confusion_matrix(Y_test , Y_pred)

array([[2395,  587],
       [1510, 1479]])

# Trying with Random Forest Classifier

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
rf = RandomForestClassifier()

In [74]:
rf.fit(X_train_bow , Y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(Y_test,y_pred)

0.8514486685647296

In [80]:
#Maximum 3000 features use kiye jaane wale words ko use kiya hai
cv = CountVectorizer(max_features = 3000)

In [76]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [77]:
rf = RandomForestClassifier()

rf.fit(X_train_bow , Y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(Y_test,y_pred)

0.8372131971194104

In [81]:
cv = CountVectorizer(ngram_range = (1,3) ,  max_features = 5000)

In [82]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [83]:
rf = RandomForestClassifier()

rf.fit(X_train_bow , Y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(Y_test,y_pred)

0.8365432925808073

# Using TfIdf

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [85]:
tf = TfidfVectorizer()

In [88]:
X_train_tfidf = tf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tf.transform(X_test['review']).toarray()

In [90]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf , Y_train)

In [92]:
y_pred = rf.predict(X_test_tfidf)

In [93]:
accuracy_score(Y_test , y_pred)

0.8531234299112377

# Using Word2Vec For text vectorization

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought wonderful way spend time hot summer ...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


In [22]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [24]:
story=[]
for review in df['review']:
    raw_sent = sent_tokenize(review)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [25]:
len(story)

422208

In [27]:
model = gensim.models.Word2Vec(
    window = 10 , 
    min_count = 2
)

In [28]:
model.build_vocab(story)

In [29]:
model.train(story , total_examples = model.corpus_count , epochs = model.epochs)

(24525102, 25996090)

In [32]:
def doct_vect(doc):
    doc = [word for word in  doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc] , axis = 0)

In [31]:
from tqdm import tqdm

### Remove the vocabulary of the word

In [33]:
X = []
for doc in tqdm(df['review'].values):
    X.append(doct_vect(doc))

100%|██████████| 39734/39734 [1:21:37<00:00,  8.11it/s]


In [34]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [35]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [36]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8160312067446835