In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 255)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data import and Checking

In [None]:
#Data importing
Dataset = "IMDB Dataset.csv"
df = pd.read_csv(Dataset)
df.shape

(50000, 2)

In [None]:
df = df.iloc[:10000] #Taking 10000 samples only
df.shape

(10000, 2)

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of v...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen-...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well b...",positive
3,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br...",negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situ...",positive


In [None]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,5028
negative,4972


In [None]:
df.isnull().sum() #checking missing values

Unnamed: 0,0
review,0
sentiment,0


In [None]:
df.duplicated().sum() #checking duplicate values

np.int64(17)

In [None]:
df.drop_duplicates(inplace=True) #dropping duplicate values

In [None]:
df.duplicated().sum()

np.int64(0)

# Basic Pre-processing

In [None]:
# Remove html-tags
# lower case
# remove stop words

def remove_html(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'', text)

In [None]:
df['review'] = df['review'].apply(remove_html)

In [None]:
df['review'] = df['review'].str.lower()  #lower all characters

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence, whi...",positive
1,"a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only ...",positive
2,"i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. the plot is simplistic, but the dialogue is witty and the characters are likable (even the well b...",positive
3,"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of al...",negative
4,"petter mattei's ""love in the time of money"" is a visually stunning film to watch. mr. mattei offers us a vivid portrait about human relations. this is a movie that seems to be telling us what money, power and success do to people in the different situ...",positive


In [None]:
stopwords = stopwords.words('english')

In [None]:
df['review'] = df['review'].apply(lambda x: [i for i in x.split() if i not in stopwords]).apply(lambda x: " ".join(x))

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,"one reviewers mentioned watching 1 oz episode hooked. right, exactly happened me.the first thing struck oz brutality unflinching scenes violence, set right word go. trust me, show faint hearted timid. show pulls punches regards drugs, sex violence. ha...",positive
1,"wonderful little production. filming technique unassuming- old-time-bbc fashion gives comforting, sometimes discomforting, sense realism entire piece. actors extremely well chosen- michael sheen ""has got polari"" voices pat too! truly see seamless edit...",positive
2,"thought wonderful way spend time hot summer weekend, sitting air conditioned theater watching light-hearted comedy. plot simplistic, dialogue witty characters likable (even well bread suspected serial killer). may disappointed realize match point 2: r...",positive
3,"basically there's family little boy (jake) thinks there's zombie closet & parents fighting time.this movie slower soap opera... suddenly, jake decides become rambo kill zombie.ok, first going make film must decide thriller drama! drama movie watchable...",negative
4,"petter mattei's ""love time money"" visually stunning film watch. mr. mattei offers us vivid portrait human relations. movie seems telling us money, power success people different situations encounter. variation arthur schnitzler's play theme, director ...",positive


In [None]:
#split the data and assign the x and y values
X = df.iloc[:,0:1] # or we can use df['review']
y =df['sentiment']

In [None]:
X.head()

Unnamed: 0,review
0,"one reviewers mentioned watching 1 oz episode hooked. right, exactly happened me.the first thing struck oz brutality unflinching scenes violence, set right word go. trust me, show faint hearted timid. show pulls punches regards drugs, sex violence. ha..."
1,"wonderful little production. filming technique unassuming- old-time-bbc fashion gives comforting, sometimes discomforting, sense realism entire piece. actors extremely well chosen- michael sheen ""has got polari"" voices pat too! truly see seamless edit..."
2,"thought wonderful way spend time hot summer weekend, sitting air conditioned theater watching light-hearted comedy. plot simplistic, dialogue witty characters likable (even well bread suspected serial killer). may disappointed realize match point 2: r..."
3,"basically there's family little boy (jake) thinks there's zombie closet & parents fighting time.this movie slower soap opera... suddenly, jake decides become rambo kill zombie.ok, first going make film must decide thriller drama! drama movie watchable..."
4,"petter mattei's ""love time money"" visually stunning film watch. mr. mattei offers us vivid portrait human relations. movie seems telling us money, power success people different situations encounter. variation arthur schnitzler's play theme, director ..."


In [None]:
y.head()

Unnamed: 0,sentiment
0,positive
1,positive
2,positive
3,negative
4,positive


In [None]:
# Here we are changing y to numerical(0 and 1)
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(y)
y

array([1, 1, 1, ..., 0, 0, 1])

In [None]:
# Training split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [None]:
X_train.shape,X_test.shape

((7986, 1), (1997, 1))

In [None]:
# Here applying Bag Of word(BOW)

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [None]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Model

In [None]:
# Feeding the data to model
from sklearn.naive_bayes import GaussianNB

Gnb_model = GaussianNB()
Gnb_model.fit(X_train_bow,y_train)

In [None]:
#Predictions
y_pred = Gnb_model.predict(X_test_bow)

In [None]:
# compare the prediction and actual y by using the metrics
from sklearn.metrics import accuracy_score,confusion_matrix

accuracy_score(y_test,y_pred)

0.6324486730095142

In [46]:
confusion_matrix(y_test,y_pred)

array([[717, 235],
       [499, 546]])

In [47]:
# Here using randomforestclassifier (another model)
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier() #calling the model

rf_model.fit(X_train_bow,y_train)  #fit the data

y_pred_rf = rf_model.predict(X_test_bow) #predict the outcomes from model

accuracy_score(y_test,y_pred_rf) #compare the outcomes

0.8432648973460191

In [49]:
# Here we are taking 3000 features only to predict
cv = CountVectorizer(max_features=3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf_model = RandomForestClassifier()

rf_model.fit(X_train_bow,y_train)
y_pred_f = rf_model.predict(X_test_bow)

accuracy_score(y_test,y_pred_f)

0.8407611417125689

# N- grams

In [50]:
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf_model = RandomForestClassifier()

rf_model.fit(X_train_bow,y_train)
y_pred_n = rf_model.predict(X_test_bow)

accuracy_score(y_test,y_pred_n)

0.7476214321482223

# TFDIF

In [51]:
# Here we are using TFDIF Technique for changing X to numerical
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()

X_train_tf = tf.fit_transform(X_train['review']).toarray()
X_test_tf = tf.transform(X_test['review']).toarray()

rf_model = RandomForestClassifier()

rf_model.fit(X_train_tf,y_train)
y_pred_tf = rf_model.predict(X_test_tf)

accuracy_score(y_test,y_pred_tf)

0.8467701552328493

# Word2Vec

In [59]:
#Here we are using word2vec for cahnging X to numericali
import gensim

word2vec_model = gensim.models.Word2Vec()

In [64]:
X_train_build = word2vec_model.build_vocab(X_train['review'])
X_test_build = word2vec_model.build_vocab(X_test['review'])



In [67]:
X_train_word = word2vec_model.train(X_train['review'],total_examples=len('X_train_review'),epochs= 10)
X_test_word = word2vec_model.train(X_test['review'],total_examples=len('X_test_review'),epochs= 10)



In [69]:
rf_model = RandomForestClassifier()

rf_model.fit(X_train_word,y_train)
y_pred_word = rf_model.predict(X_test_word)

ValueError: Expected 2D array, got 1D array instead:
array=[13839457. 70752880.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.