In [10]:
# Import Libraries

import os
import re
import xgboost
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from numpy import array
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import warnings
warnings.filterwarnings("ignore")


In [11]:
# Load IMDB Reviews Dataset

df = pd.read_csv("/home/tarunpreet/IMDB_Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# 1. Sentiment Analysis using VADER pre-built lexicon-based sentiment analysis tool

In [3]:
# VADER pre-built, lexicon-based sentiment analysis tool
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/tarunpreet/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [5]:
print(df.iloc[1])
sid.polarity_scores(df['review'][1])

review       A wonderful little production. <br /><br />The...
sentiment                                             positive
Name: 1, dtype: object


{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'compound': 0.9641}

In [6]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 'positive' if c >=0 else 'negative')
df.head()

Unnamed: 0,review,sentiment,scores,compound,comp_score
0,One of the other reviewers has mentioned that ...,positive,"{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'co...",-0.9951,negative
1,A wonderful little production. <br /><br />The...,positive,"{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'co...",0.9641,positive
2,I thought this was a wonderful way to spend ti...,positive,"{'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'co...",0.9605,positive
3,Basically there's a family where a little boy ...,negative,"{'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'co...",-0.9213,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"{'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'co...",0.9744,positive


In [9]:
df['sentiment'] =  np.array(list(map(lambda x: 1 if x=="positive" else 0,df['sentiment'] )))
df['comp_score']= np.array(list(map(lambda x: 1 if x=="positive" else 0, df['comp_score'])))


In [11]:
CR=classification_report(df['sentiment'],df['comp_score'])
print("Accuracy : {} %".format(round(accuracy_score(df['sentiment'],df['comp_score'])*100, 4)))
print("Classification Report",CR)

Accuracy : 69.626 %
Classification Report               precision    recall  f1-score   support

           0       0.79      0.54      0.64     25000
           1       0.65      0.86      0.74     25000

    accuracy                           0.70     50000
   macro avg       0.72      0.70      0.69     50000
weighted avg       0.72      0.70      0.69     50000



# 2. Sentiment analysis using Machine Learning Models

In [12]:
#Download NLTK Libraries

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tarunpreet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/tarunpreet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
#Remove HTML Tags

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
     return TAG_RE.sub('', text)

In [14]:
#Data Pre-Processing 

def preprocess_text(sen):
    
    #'''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z onlyin lowercase'''
    sentence = sen.lower()
    
    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.
   
    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence

In [15]:
X = []
sentences = list(df['review'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [16]:
# Converting sentiment labels to 0 & 1

y = df['sentiment']
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))


In [17]:
# converting sentences or documents into numerical vectors 

vectorizer = TfidfVectorizer(use_idf = True,lowercase = True,strip_accents='ascii')
X_vec = vectorizer.fit_transform(df["review"])

In [18]:
#Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.20, random_state=42)

**Model 1: Random Forest**

In [80]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
CR=classification_report(y_test, y_pred_rf)
print("Classification Report",CR)
print("Accuracy : {} %".format(round(accuracy_score(y_test, y_pred_rf)*100, 4)))

Classification Report               precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Accuracy : 84.75 %


**Model 2: XGBoost**

In [82]:
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
CR=classification_report(y_test, y_pred_xgb)
print("Classification Report",CR)
print("Accuracy : {} %".format(round(accuracy_score(y_test, y_pred_xgb)*100, 4)))

Classification Report               precision    recall  f1-score   support

           0       0.87      0.85      0.86      4961
           1       0.86      0.88      0.87      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Accuracy : 86.42 %


# 3. Sentiment Analysis using Deep Learning Models

In [7]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.layers import Dense,SimpleRNN, LSTM, Dropout,Bidirectional,Activation, Dropout
from keras.layers import Dense, SimpleRNN, LSTM, Dropout,Bidirectional
from keras.preprocessing.text import one_hot, Tokenizer
from keras.models import Sequential
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences


In [19]:
# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [20]:
# Convert words to tokens

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)                 

In [21]:
# Padding sentence length

vocab_length = len(word_tokenizer.word_index) + 1
max_length = 100
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)

**Model 1: Simple Neural Network**

In [24]:
snn_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, input_length=max_length , trainable=True)
snn_model.add(embedding_layer)
snn_model.add(Flatten())
snn_model.add(Dense(1, activation='sigmoid'))

# Model compiling
snn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(snn_model.summary())

score = snn_model.evaluate(X_test, y_test, verbose=1)
y_pred_snn = snn_model.predict(X_test)
y_pred_labels = (y_pred_snn > 0.5).astype(int)
CR=classification_report(y_test, y_pred_labels)
print("Classification Report",CR)
print("Test Accuracy:", score[1])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          8737200   
                                                                 
 flatten_2 (Flatten)         (None, 10000)             0         
                                                                 
 dense_2 (Dense)             (None, 1)                 10001     
                                                                 
Total params: 8747201 (33.37 MB)
Trainable params: 8747201 (33.37 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Classification Report               precision    recall  f1-score   support

           0       0.49      0.63      0.55      7411
           1       0.49      0.35      0.41      7589

    accuracy                           0.49     15000
   macro avg       0.49      0.49

**Model 2: Convolutional Neural Network**

In [25]:
from keras.layers import Conv1D

cnn_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, input_length=max_length , trainable=True)
cnn_model.add(embedding_layer)
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(units = 64, activation = 'relu'))
cnn_model.add(Dense(units = 32,activation = 'relu'))
cnn_model.add(Dense(1, activation='sigmoid'))

# Model compiling
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(cnn_model.summary())

cnn_model_history = cnn_model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)
score = cnn_model.evaluate(X_test, y_test, verbose=1)
y_pred_cnn = cnn_model.predict(X_test)
y_pred_labels = (y_pred_cnn > 0.5).astype(int)
CR=classification_report(y_test, y_pred_labels)
print("Classification Report",CR)
print("Test Accuracy:", score[1])

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 100)          8737200   
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                      