In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Location of the IMDB reviews CSV -- change this to your own dataset path.
# Google Drive example: PATH = '/content/drive/MyDrive/imdb_nlp/IMDB Dataset.csv'
PATH = 'IMDB Dataset.csv'

## **1. Background**

![Natural language processing](https://landbot.io/wp-content/uploads/2019/11/natural-language-processing-chatbot.jpg)

**What is Natural Language Processing?**

From wikipedia, Natural language processing (NLP) is a subfield of linguistics, computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.

**What is Sentiment Classification?**

Sentiment analysis (also known as opinion mining or emotion AI) refers to the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information. Sentiment analysis is widely applied to voice of the customer materials such as reviews and survey responses, online and social media, and healthcare materials for applications that range from marketing to customer service to clinical medicine.

**What is Tokenizer?**

Tokenization is a necessary first step in many natural language processing tasks, such as word counting, parsing, spell checking, corpus generation, and statistical analysis of text.

Tokenizer is a compact pure-Python (2 and 3) executable program and module for tokenizing Icelandic text. It converts input text to streams of tokens, where each token is a separate word, punctuation sign, number/amount, date, e-mail, URL/URI, etc. It also segments the token stream into sentences, considering corner cases such as abbreviations and dates in the middle of sentences. (Source: [Tokenizer](https://pypi.org/project/tokenizer/))

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Serfati/imdb_sentiment_analysis)

In [None]:
# !pip install -r requirements.txt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter

In [None]:
import re
import os
import pandas as pd
import numpy as np

In [None]:
# visualization
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import homogeneity_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

## **2. Data exploratory analysis**


### **2.1 Data overview**

![IMDB 50 review datasets](https://o.aolcdn.com/images/dims?quality=85&image_uri=https%3A%2F%2Fo.aolcdn.com%2Fimages%2Fdims%3Fcrop%3D908%252C537%252C0%252C0%26quality%3D85%26format%3Djpg%26resize%3D1600%252C947%26image_uri%3Dhttps%253A%252F%252Fs.yimg.com%252Fos%252Fcreatr-uploaded-images%252F2019-08%252F560e5d20-c833-11e9-bf26-36635805fe83%26client%3Da1acac3e1b3290917d92%26signature%3D639a4965c41ca6cec13652498f65cfc97170ea5d&client=amp-blogside-v2&signature=765e155477177a69b93eac5611145d4241be6071)

This dataset contains movie reviews along with their associated binary sentiment polarity labels. It is intended to serve as a benchmark for sentiment classification. This document outlines how the dataset was gathered, and how to use the files provided.

**Dataset**

The core dataset contains 50,000 reviews. The overall distribution of labels is balanced (25k pos and 25k neg). We also include an additional 50,000 unlabeled documents for unsupervised learning.

### **2.2 Data Exploration**

The first step is to load the data to global environment.

In [None]:
# Load the dataset; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv(PATH)

The reviews contain leftover HTML markup such as `<br /><br />`, so we should replace it with an empty string or a space.

In [None]:
# Histogram of review lengths, measured in whitespace-separated words.
# plt.hist does the binning itself, so it must receive the per-review lengths;
# feeding it value_counts() would histogram how often each length occurs
# instead of the lengths themselves.
review_lengths = df['review'].str.split().apply(len)

plt.figure()
plt.hist(review_lengths)
plt.xlabel('number of words in sentence')
plt.ylabel('frequency')
plt.title('Words occurrence frequency')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# First 250 characters of the first review, still in raw form.
df["review"][0][:250]

In [None]:
# Sentiment label of the first review.
df["sentiment"][0]

In [None]:
# Number of reviews in each sentiment class.
df["review"].groupby(df["sentiment"]).count()

In [None]:
# Share of each sentiment class, in percent.
s = df['sentiment'].value_counts()
s = (s/s.sum())*100

# Derive tick labels and colors from the index itself: value_counts() does not
# promise that 'positive' comes first, so hard-coded lists could mislabel bars.
palette = {'positive': 'green', 'negative': 'red'}
bar_colors = [palette.get(str(label), 'grey') for label in s.index]
bar_labels = [str(label).capitalize() for label in s.index]

plt.figure()
bars = plt.bar(s.index, s.values, color = bar_colors, alpha = .6)
plt.xticks(s.index, bar_labels, fontsize = 15)
plt.tick_params(bottom = False, top = False, left = False, right = False, labelleft = False)
for spine in plt.gca().spines.values():
    spine.set_visible(False)
for bar in bars:
    # Format the height as a whole percentage; slicing str(height)[:2] would
    # print e.g. '5.' for 5.3 or '10' for 100.0.
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() - 5, s = '{:.0f}%'.format(bar.get_height()), ha = 'center', fontsize = 15)
plt.title('Reviews polarity', fontsize = 17)

### **2.3 Data pre-processing**

**Text Cleaning**

0.Label Encoder

1.Remove html tags

2.Remove special characters

3.Converting everything to lower case

4.Removing Stop words

5.Stemming

6.Remove extra spaces

7.Lemmatization

In [None]:
# Turn the sentiment strings into integer class ids. Later cells treat
# 0 as negative and 1 as positive.
label_encoder = LabelEncoder().fit(df['sentiment'])
df['sentiment'] = label_encoder.transform(df['sentiment'])
df.head()

In [None]:
# Replace HTML line breaks with a space rather than an empty string, so the
# words on either side of a tag are not glued together ("good<br />bad" must
# not become "goodbad"). regex=False makes the tag match literally.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [None]:
# Replace every character that is not an ASCII letter or digit with a space.
non_alnum = re.compile("[^0-9a-zA-Z]")
df["review"] = df["review"].apply(lambda text: non_alnum.sub(' ', text))

In [None]:
# Lower-case every review.
df['review'] = df['review'].str.lower()

In [None]:
# Fetch the NLTK resources the stop-word removal needs: the stop-word list
# itself and the 'punkt' models used by word_tokenize(). remove_sw() calls
# word_tokenize() before the later cell that downloads 'punkt' runs.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead -- confirm
# against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words("english"))

**Stop Words Removal**

We'll remove the stop words for better prediction.

In [None]:
from tqdm import tqdm
# Progress bar with one tick per review; remove_sw advances it.
pbar = tqdm(total=df.shape[0], leave=True, position=0)


def remove_sw(review):
    """Return ``review`` with English stop words dropped.

    The text is split with NLTK's ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are discarded and the remaining tokens
    are joined with single spaces. Advances the global ``pbar`` by one.
    """
    kept = []
    for token in word_tokenize(review):
        if token not in stop_words:
            kept.append(token)
    pbar.update(1)
    return " ".join(kept)


df["review"] = df["review"].apply(remove_sw)
pbar.close()

**Porter Stemmer**

For this particular dataset the PorterStemmer does not bring better performance, so it is better to skip this step.

In [None]:
ps = PorterStemmer()
# Fresh progress bar for the stemming pass; stem advances it.
pbar = tqdm(total=df.shape[0], leave=True, position=0)


def stem(text):
    """Return ``text`` with every whitespace-separated word Porter-stemmed.

    The stemmed words are re-joined with single spaces. Advances the global
    ``pbar`` by one per call.
    """
    pbar.update(1)
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(ps.stem(word))
    return ' '.join(stemmed_words)


df["review"] = df["review"].apply(stem)
pbar.close()

In [None]:
# word_tokenize() needs the 'punkt' models and WordNetLemmatizer needs the
# 'wordnet' corpus; without the latter the first lemmatize() call raises a
# LookupError.
# NOTE(review): some NLTK versions also require 'omw-1.4' -- confirm.
nltk.download('punkt')
nltk.download('wordnet')


class LemmaTokenizer:
    """Callable tokenizer that lemmatizes every token of a document.

    Intended to be passed as the ``tokenizer`` argument of the vectorizers
    used later in this notebook.
    """

    def __init__(self):
        # One lemmatizer instance is reused for all documents.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Split ``doc`` with ``word_tokenize`` and return the lemmatized tokens."""
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [None]:
# Collapse every run of consecutive spaces into a single space.
df["review"] = df["review"].apply(lambda x: re.sub(" +"," ", x))

In [None]:
# The first review after all pre-processing steps above.
df['review'][0]

In [None]:
# Split the frame by encoded label: 0 -> negative reviews, 1 -> positive reviews.
sentiment_col = df['sentiment']
neg = df[sentiment_col.eq(0)]
pos = df[sentiment_col.eq(1)]

In [None]:
import plotly.express as px
def get_top_text_ngrams(corpus, n, g):
    """Return the ``n`` most frequent ``g``-grams of ``corpus``.

    Args:
        corpus: iterable of documents (strings).
        n: number of top entries to return.
        g: n-gram size; only n-grams of exactly ``g`` words are counted.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count,
        holding at most ``n`` entries.
    """
    vec = CountVectorizer(ngram_range=(g, g))
    # Learn the vocabulary and build the count matrix in one pass instead of
    # tokenizing the whole corpus twice with fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# Ten most frequent single words in the negative reviews, as a horizontal bar chart.
most_common_uni = dict(get_top_text_ngrams(neg.review, 10, 1))
temp = pd.DataFrame({
    "Common_words": list(most_common_uni.keys()),
    "Count": list(most_common_uni.values()),
})
fig = px.bar(
    temp,
    x="Count",
    y="Common_words",
    title='Unigram - Commmon Words in Negative Text',
    orientation='h',
    width=700,
    height=700,
    color='Common_words',
)
fig.show()

In [None]:
# Ten most frequent single words in the positive reviews, as a horizontal bar chart.
most_common_uni = dict(get_top_text_ngrams(pos.review, 10, 1))
temp = pd.DataFrame({
    "Common_words": list(most_common_uni.keys()),
    "Count": list(most_common_uni.values()),
})
fig = px.bar(
    temp,
    x="Count",
    y="Common_words",
    title='Unigram - Commmon Words in Positive Text',
    orientation='h',
    width=700,
    height=700,
    color='Common_words',
)
fig.show()

In [None]:
# Ten most frequent two-word sequences in the positive reviews, as a horizontal bar chart.
most_common_uni = dict(get_top_text_ngrams(pos.review, 10, 2))
temp = pd.DataFrame({
    "Common_words": list(most_common_uni.keys()),
    "Count": list(most_common_uni.values()),
})
fig = px.bar(
    temp,
    x="Count",
    y="Common_words",
    title='Bigram - Commmon Words in Positive Text',
    orientation='h',
    width=700,
    height=700,
    color='Common_words',
)
fig.show()

In [None]:
# Ten most frequent two-word sequences in the NEGATIVE reviews. This cell was
# an exact copy of the positive-bigram cell, which left the negative bigram
# plot missing; it now reads from `neg` and is titled accordingly.
most_common_uni = get_top_text_ngrams(neg.review,10,2)
most_common_uni = dict(most_common_uni)
temp = pd.DataFrame(columns = ["Common_words" , 'Count'])
temp["Common_words"] = list(most_common_uni.keys())
temp["Count"] = list(most_common_uni.values())
fig = px.bar(temp, x="Count", y="Common_words", title='Bigram - Commmon Words in Negative Text', orientation='h', 
             width=700, height=700,color='Common_words')
fig.show()

In [None]:
# Model inputs are the cleaned review texts; targets are the encoded sentiments.
X, y = df['review'], df['sentiment']

Split data to train and test for modeling and performance evaluation.

In [None]:
# Hold out half of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=42,
)

n_train_reviews = X_train.shape[0]
n_test_reviews = X_test.shape[0]
print('Training dataset : {} reviews'.format(n_train_reviews))
print('Testing dataset : {} reviews'.format(n_test_reviews))

In [None]:
# Class distribution of the held-out test labels.
y_test.value_counts()

## **3. Modeling**

### 3.1 Feature Extraction using TF-IDF algorithm

![TFIDF](https://miro.medium.com/max/532/0*bHkPdhgfnyTs4un_)

In scikit-learn, the TF-IDF algorithm is implemented using **TfidfTransformer**. This transformer needs the count matrix which it will transform later. Hence, we use **CountVectorizer** first.
Alternatively, one can use **TfidfVectorizer**, which is the equivalent of CountVectorizer followed by TfidfTransformer

In [None]:
# TF-IDF features over lemmatized unigrams and bigrams.
tfidfVect = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),   # custom lemmatizing tokenizer
    ngram_range=(1, 2),           # unigrams and bigrams
    lowercase=True,
    strip_accents='ascii',
    stop_words='english',
    max_df=0.5,                   # skip terms found in more than half of the documents
    min_df=1,
    use_idf=True,
    sublinear_tf=True,            # sub-linear term-frequency scaling
    # max_features=1000,          # optional vocabulary cap, currently disabled
)

In [None]:
# Learn the TF-IDF vocabulary on the training reviews only and vectorize them (timed).
%time features = tfidfVect.fit_transform(X_train)
# (number of training reviews, vocabulary size)
features.shape

## Unsupervised Learning Approach

Now, all that’s left to do is use a machine learning algorithm. We can summarize all that we have done so far using a scikit-learn pipeline.

In [None]:
# Two clusters, matching the two sentiment classes we hope to recover.
model = KMeans(n_clusters=2, random_state=42)

# Fit the clustering on the TF-IDF training features (no labels are used).
model.fit(features)

In [None]:
# Vectorize the test reviews with the vocabulary already fitted on the training set.
features_test = tfidfVect.transform(X_test)

We can find predictions using the predict() method.

In [None]:
# KMeans cluster ids are arbitrary: cluster 0 is not guaranteed to be the
# "negative" group. Comparing raw ids to the labels can therefore invert the
# accuracy and the confusion matrix. Map each cluster to the majority sentiment
# of its training members before evaluating.
cluster_ids = model.predict(features_test)
train_labels = np.asarray(y_train)
cluster_to_label = {}
for cluster in np.unique(model.labels_):
    members = train_labels[model.labels_ == cluster]
    cluster_to_label[cluster] = np.bincount(members).argmax()

# Fall back to the raw id for a cluster that had no training members.
pred = pd.DataFrame([cluster_to_label.get(c, c) for c in cluster_ids])

To evaluate the predictions, we use different classification metrics.

In [None]:
# Share of test reviews whose predicted value equals the true label.
print("test accuracy score: {0:.3f}%".format(accuracy_score(y_test, pred)*100))
# recorded result (presumably from an earlier run): accuracy_score ==> 74.352%

In [None]:
# Confusion matrix of the KMeans predictions: rows are actual classes, columns predicted.
class_names = ['Negative', 'Positive']
cm = confusion_matrix(y_test, pred)
sn.heatmap(
    cm,
    annot=True,
    fmt='g',
    cmap="icefire",
    xticklabels=class_names,
    yticklabels=class_names,
)

# NOTE(review): the returned palette is unused; this call presumably has no
# effect on the heatmap above -- confirm and drop it.
sn.color_palette("pastel")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('KMeans Confusion Matrix')

We have obtained about 74.4% accuracy in predicting whether a review is positive or negative, and we have performed feature extraction from the raw text in the process.

In [None]:
# pca = PCA(n_components=2)
# reduced_features = pca.fit_transform(features.toarray()[:1000])
# reduced_cluster_centers = pca.transform(model.cluster_centers_)

# plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=model.predict(features), s=10)
# plt.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:, 1], marker='x', s=150, c='b')

## Supervised Learning Approach

## TfIdfVectorizer Feature Extraction 
**Naive Bayes Classifier for Multinomial**

In [None]:
#instantiate the model (with the default parameters)
mnb = MultinomialNB()

#fit the model with data (occurs in-place)
mnb.fit(features, y_train)

In [None]:
# Accuracy on the data the model was trained on (not a generalization estimate).
mnb_train_acc = accuracy_score(y_train, mnb.predict(features))
print("Training accuracy of MultinomialNB using TfIdfVectorizer: {0:.3f}%".format(mnb_train_acc*100))
# Recorded result: Training accuracy of MultinomialNB using TfIdfVectorizer: 98.848%

**Stochastic Gradient Descent Classifier**

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent, default parameters.
sgd = SGDClassifier()

# Train on the TF-IDF training features and their sentiment labels.
sgd.fit(features, y_train)

In [None]:
# Evaluate the SGD classifier fitted in the previous cell. The original used
# `rfc`, which is never defined in this notebook and raises a NameError.
print("Training accuracy of SGDClassifier using TfIdfVectorizer: {0:.3f}%".format(accuracy_score(y_train, sgd.predict(features))*100))

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default parameters.
lr = LogisticRegression()

# Train on the TF-IDF training features and their sentiment labels.
lr.fit(features, y_train)

In [None]:
# Accuracy on the data the model was trained on (not a generalization estimate).
lr_train_acc = accuracy_score(y_train, lr.predict(features))
print("Training accuracy of Logistic Regression using TfIdfVectorizer: {0:.3f}%".format(lr_train_acc*100))
# Recorded result: Training accuracy of Logistic Regression using TfIdfVectorizer: 96.088%

### 3.2 Feature Extraction using Count Vectorizer

In [None]:
# Bag-of-words counts over lemmatized unigrams and bigrams.
cv = CountVectorizer(
    tokenizer=LemmaTokenizer(),   # custom lemmatizing tokenizer
    ngram_range=(1, 2),           # unigrams and bigrams
    lowercase=True,
    stop_words='english',
    max_df=0.5,                   # skip terms found in more than half of the documents
    min_df=1,
)

In [None]:
# Learn the bag-of-words vocabulary on the training reviews and vectorize them (timed).
%time bow_features = cv.fit_transform(X_train)
# (number of training reviews, vocabulary size)
bow_features.shape

In [None]:
%time bow_features_test = bow.transform(X_test)

**MultinomialNB**

In [None]:
#instantiate the model (with the default parameters)
mnb2 = MultinomialNB()

#fit the model with data (occurs in-place)
mnb2.fit(bow_features, y_train)

In [None]:
# Accuracy on the data the model was trained on (not a generalization estimate).
mnb2_train_acc = accuracy_score(y_train, mnb2.predict(bow_features))
print("Training accuracy of MultinomialNB using CountVectorizer: {0:.3f}%".format(mnb2_train_acc*100))

**LogisticRegression**

In [None]:
#instantiate the model (with the default parameters)
lr2 = LogisticRegression()

#fit the model with data (occurs in-place)
lr2.fit(bow_features, y_train)

In [None]:
# Accuracy on the data the model was trained on (not a generalization estimate).
lr2_train_acc = accuracy_score(y_train, lr2.predict(bow_features))
print("Training accuracy of LogisticRegression using CountVectorizer: {0:.3f}%".format(lr2_train_acc*100))

**SGDClassifier**

In [None]:
#instantiate the model (with the default parameters)
sgd2 = SGDClassifier()

#fit the model with data (occurs in-place)
sgd2.fit(bow_features, y_train)

In [None]:
# Accuracy on the data the model was trained on (not a generalization estimate).
sgd2_train_acc = accuracy_score(y_train, sgd2.predict(bow_features))
print("Training accuracy of SGDClassifier using CountVectorizer: {0:.3f}%".format(sgd2_train_acc*100))

## Evaluation

We'll take the model with the highest training accuracy score and evaluate it on the test set.
In our case the **MultinomialNB** model provided the highest score.

In [None]:
# Vectorize the held-out reviews and collect the MultinomialNB predictions
# in a one-column DataFrame.
features_test = tfidfVect.transform(X_test)
pred = pd.DataFrame(mnb.predict(features_test))

In [None]:
print("Test accuracy score of MultinomialNB using TfIdfVectorizer: {0:.3f}%".format(accuracy_score(y_test, pred)*100))
# accuracy score: 83.516%

# ROC AUC measures how well the model ranks reviews, so it needs continuous
# scores. Feeding it the hard 0/1 predictions collapses the curve to a single
# threshold. Use the predicted probability of the positive class (column 1).
positive_scores = mnb.predict_proba(features_test)[:, 1]
print("Test AUC score of MultinomialNB using TfIdfVectorizer: {0:.3f} %".format(roc_auc_score(y_test, positive_scores)*100))