In [10]:
# Import Libraries

import os
import re
import xgboost
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from numpy import array
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [18]:
#Download NLTK Libraries

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tarunpreet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/tarunpreet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Load IMDB Reviews Dataset

df = pd.read_csv("/home/tarunpreet/IMDB_Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
#Remove HTML Tags

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
     return TAG_RE.sub('', text)

In [14]:
#Data Pre-Processing 

def preprocess_text(sen):
    
    #'''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z onlyin lowercase'''
    sentence = sen.lower()
    
    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.
   
    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence

In [15]:
X = []
sentences = list(df['review'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [16]:
# Converting sentiment labels to 0 & 1

y = df['sentiment']
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))


# 1. Sentiment analysis using Machine Learning Models

In [20]:
# converting sentences or documents into numerical vectors 

vectorizer = TfidfVectorizer(use_idf = True,lowercase = True,strip_accents='ascii')
X_vec = vectorizer.fit_transform(df["review"])

In [21]:
#Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.20, random_state=42)

**Model 1: Multinomial Naive Bayes**

In [22]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_rf = nb.predict(X_test)
print("Accuracy : {} %".format(round(accuracy_score(y_test, y_pred_rf)*100, 4)))

Accuracy : 86.41 %


**Model 2: Random Forest**

In [23]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Accuracy : {} %".format(round(accuracy_score(y_test, y_pred_rf)*100, 4)))

Accuracy : 84.04 %


**Model 3: XGBoost**

In [135]:
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("Accuracy : {} %".format(round(accuracy_score(y_test, y_pred_xgb)*100, 4)))

Accuracy : 85.76 %


# 2. Sentiment Analysis using VADER pre-built lexicon-based sentiment analysis tool

In [141]:
# VADER pre-built, lexicon-based sentiment analysis tool

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/tarunpreet/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [142]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [144]:
# Calculate Polarity Score and add to dataframe

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 'positive' if c >=0 else 'negative')
df.head()

In [145]:
print("Accuracy : {} %".format(round(accuracy_score(df['sentiment'],df['comp_score'])*100, 4)))

Accuracy : 69.212 %


# 3. Sentiment Analysis using Deep Learning Models

In [138]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM, Dropout,Bidirectional
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing.text import one_hot, Tokenizer
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences


In [144]:
# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [145]:
# Convert words to tokens

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)                 

In [146]:
# Padding sentence length

vocab_length = len(word_tokenizer.word_index) + 1
max_length = 100
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)

**Model 1: Simple Neural Network**

In [147]:
snn_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, input_length=max_length , trainable=True)
snn_model.add(embedding_layer)
snn_model.add(Flatten())
snn_model.add(Dense(1, activation='sigmoid'))

# Model compiling
snn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(snn_model.summary())

score = snn_model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 100, 100)          8737200   
                                                                 
 flatten_7 (Flatten)         (None, 10000)             0         
                                                                 
 dense_17 (Dense)            (None, 1)                 10001     
                                                                 
Total params: 8,747,201
Trainable params: 8,747,201
Non-trainable params: 0
_________________________________________________________________
None
Test Score: 0.6931443810462952
Test Accuracy: 0.5078666806221008


**Model 2: Convolutional Neural Network**

In [149]:
from keras.layers import Conv1D

cnn_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, input_length=max_length , trainable=True)
cnn_model.add(embedding_layer)
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(units = 64, activation = 'relu'))
cnn_model.add(Dense(units = 32,activation = 'relu'))
cnn_model.add(Dense(1, activation='sigmoid'))

# Model compiling
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(cnn_model.summary())

cnn_model_history = cnn_model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)
score = cnn_model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, 100, 100)          8737200   
                                                                 
 conv1d_3 (Conv1D)           (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_19 (Dense)            (None, 64)                8256      
                                                                 
 dense_20 (Dense)            (None, 32)                2080      
                                                                 
 dense_21 (Dense)            (None, 1)                 33        
                                                     