In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Location of the IMDB reviews CSV, relative to the notebook's working directory.
csv_path = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(csv_path)
df.head()

In [6]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [7]:
# Count how many rows carry each distinct value of the 'sentiment' column.
df['sentiment'].value_counts()

#### So we can see there are 2 columns - review and sentiment. sentiment is the target column that we need to predict. The dataset is completely balanced: it has an equal number of positive and negative sentiments.

#### Let's take one review as sample and understand why we need to clean the text.

In [24]:
# Pick the review stored under index label 1 as the worked example
# for the cleaning steps that follow.
review = df.loc[1, 'review']
review

#### Normally any NLP task involves following text cleaning techniques -

1. Removal of HTML contents like "< br>".
2. Removal of punctuation and special characters like '\'.
3. Removal of stopwords like is, the which do not offer much insight.
4. Stemming/Lemmatization to bring back multiple forms of same word to their common root like 'coming', 'comes' into 'come'.
5. Vectorization - Encode the cleaned text as numeric values.
6. Fit the data to the ML model.

#### We will apply all these techniques on this sample review and understand how it works.

#### First of all we will remove HTML contents.

#### Removal of HTML contents like "< br>".

In [25]:
from bs4 import BeautifulSoup

# Parse the sample review with the "html.parser" backend and replace
# `review` with the text content only, so the markup is dropped.
soup = BeautifulSoup(review, "html.parser")
review = soup.get_text()
review

#### We can see HTML tags are removed; so in the next step we will remove everything except lower/upper case letters using Regular Expressions.

In [26]:
import re

# Raw strings keep the backslashes for the regex engine; the previous plain
# string literal contained the invalid escape sequences "\[" and "\]", which
# Python flags with a DeprecationWarning / SyntaxWarning.
# 1) Replace any "[...]" bracketed span with a space.
review = re.sub(r'\[[^]]*\]', ' ', review)
# 2) Replace every character that is not an ASCII letter with a space.
review = re.sub(r'[^a-zA-Z]', ' ', review)
review

#### Next we will bring everything into lowercase.

In [27]:
# Lower-case the whole text so that later token comparisons are case-insensitive,
# then print the result.
review = review.lower()
print(review)

#### Stopwords removal - since stopwords removal works on every word in your text we need to split the text.

In [28]:
# Break the text into whitespace-separated tokens; `review` is a list from here on.
review = review.split()

# Show the token count on the first line and the tokens themselves on the second.
print(len(review), review, sep="\n")

In [21]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# Print every distinct English stop word, each one followed by " | ",
# all on a single line with no trailing newline.
print("".join(f"{word} | " for word in set(stopwords.words('english'))), end="")

In [29]:
# Build the stop-word set once. The previous comprehension re-created
# set(stopwords.words('english')) for every single token.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not English stop words.
review = [word for word in review if word not in english_stopwords]
print(len(review))
print(review)

#### Stemming/Lemmatization - we will apply both and see the difference.

In [30]:
from nltk.stem.porter import PorterStemmer

# Run the Porter stemmer over every remaining token and print the stems.
ps = PorterStemmer()
review_s = list(map(ps.stem, review))
print(review_s)

In [31]:
from nltk.stem import WordNetLemmatizer

# Lemmatize every remaining token with WordNet and print the result.
# NOTE(review): no nltk.download('wordnet') call is visible in this notebook;
# confirm the WordNet corpus is already available in the environment.
lem = WordNetLemmatizer()
review_lemmat = list(map(lem.lemmatize, review))
print(review_lemmat)

#### We can see that 'little' has become 'littl' after Stemming but remained 'little' after Lemmatization. We will use Lemmatization.

#### Merge the words to form cleaned up version of the text.

In [33]:
# Re-join the lemmatized tokens with single spaces; `review` becomes a string again.
review = ' '.join(review_lemmat)
review

#### Our next step will be to bring this text in mathematical forms and to do so we will create a Corpus first.

In [35]:
# One-document corpus holding just the cleaned sample review.
corpus = [review]

#### To vectorize the text we will apply -

1. CountVectorizer (Bag of Words Model)
2. TfidfVectorizer (Bag of Words Model)
3. Keras Tokenizer (Embedding)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the one-document corpus, then encode that same
# corpus as a sparse matrix of per-term counts.
count_vec = CountVectorizer()
count_vec.fit(corpus)
review_count_vec = count_vec.transform(corpus)

# Dense view of the counts for display.
review_count_vec.toarray()

#### So we can see the data has become numeric with 1,2 and 3s based on the number of times they appear in the text.

#### There is another variation of CountVectorizer with binary=True; in that case all non-zero counts are set to 1.

In [37]:
# Same bag-of-words encoding, but binary=True records only whether a term
# is present instead of how often it occurs.
count_vec_bin = CountVectorizer(binary=True)
count_vec_bin.fit(corpus)
review_count_vec_bin = count_vec_bin.transform(corpus)

# Dense view of the presence flags for display.
review_count_vec_bin.toarray()

#### So there are no 2s and 3s in the vector.

#### We will now explore TF-IDF - TF stands for Term Frequency, which means how many times a word (term) appears in a text (document). IDF means Inverse Document Frequency and is calculated as log(# of documents in corpus / # of documents containing the term).

#### Finally TF-IDF score is calculated as TF * IDF.

#### IDF acts as a balancing factor and diminishes the weight of terms that occur very frequently in the document set and increases the weight of terms that occur rarely.

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn vocabulary and IDF weights from the one-document corpus, then encode
# that same corpus as TF-IDF scores.
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(corpus)
review_tfidf_vec = tfidf_vec.transform(corpus)

# Dense view of the scores for display.
review_tfidf_vec.toarray()

In [39]:
# Display the first rows of the full DataFrame again.
df.head()

In [40]:
# Encode the text labels as integers: positive -> 1, negative -> 0.
# An explicit lookup plus astype("int64") is used instead of Series.replace,
# which relied on implicit object -> int downcasting (deprecated in pandas 2.2).
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on the already-encoded column is a no-op.
sentiment_codes = {"positive": 1, "negative": 0}
df["sentiment"] = (
    df["sentiment"]
    .map(lambda label: sentiment_codes.get(label, label))
    .astype("int64")  # raises if an unexpected, non-numeric label is present
)
df["sentiment"].value_counts()

In [41]:
# X is the 'review' column (model input), y is the 'sentiment' column (target).
X = df["review"]
y = df["sentiment"]

In [None]:
# Loop invariants, built once instead of once per token / per review:
#   - compiled regexes (raw strings, so "\[" is not an invalid string escape),
#   - the English stop-word set (previously rebuilt for every single token),
#   - the lemmatizer (previously re-created for every review).
bracket_pattern = re.compile(r'\[[^]]*\]')
non_letter_pattern = re.compile(r'[^a-zA-Z]')
english_stopwords = set(stopwords.words('english'))
lem = WordNetLemmatizer()

# Cleaned text of every review, in the same order as X.
data_corpus = []

for i in range(X.shape[0]):
    # Strip HTML markup and keep only the text content.
    soup = BeautifulSoup(X.iloc[i], "html.parser")
    review = soup.get_text()
    # Replace "[...]" spans, then every non-letter character, with a space.
    review = bracket_pattern.sub(' ', review)
    review = non_letter_pattern.sub(' ', review)
    # Lower-case and tokenize on whitespace.
    review = review.lower()
    review = review.split()
    # Drop stop words, then lemmatize what is left.
    review = [word for word in review if word not in english_stopwords]
    review = [lem.lemmatize(word) for word in review]
    # Store the cleaned review as a single space-separated string.
    review = ' '.join(review)
    data_corpus.append(review)

#### Let's validate one sample entry.

In [None]:
# Display the cleaned text stored at position 1 of data_corpus.
data_corpus[1]

## We will now fit the data to Naive Bayes classifier. 

#### A Bayesian model uses prior probabilities to predict posterior probabilities, which is helpful for classification with discrete features like text classification.

#### Here we are using Multinomial Naive Bayes; you can also try other Naive Bayes variants.

#### The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work. We will now use CountVectorizer as our Bag-of-Words model before applying MultinomialNB model.

In [None]:
# Bag-of-words over unigrams, bigrams and trigrams (ngram_range=(1, 3));
# binary=False keeps the actual occurrence counts.
count_vec_NB = CountVectorizer(ngram_range=(1, 3), binary=False)
# Learn the vocabulary from the whole cleaned corpus and encode it in one pass.
count_vec_data_NB = count_vec_NB.fit_transform(data_corpus)

#### So there are 900000 terms in the corpus and we will use a *Chi-Square* test to select top 5000 features.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 5000 features with the highest chi-square score against the labels.
# The result overwrites count_vec_data_NB, so the unreduced matrix is no longer
# available under that name after this cell.
# NOTE(review): the selector is fitted on every row and on all of y, before the
# train/test split in a later cell, so held-out rows influence which features
# are kept. Confirm this is intended.
ch2 = SelectKBest(chi2, k=5000)
count_vec_data_NB = ch2.fit_transform(count_vec_data_NB, y)

#### Let's fit the data to Multinomial Naive Bayes model.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(count_vec_data_NB, y, test_size=0.30, random_state=42)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a Multinomial Naive Bayes classifier on the training split
# (fit returns the estimator itself, so the call can be chained).
multi_clf = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out rows.
predict_NB = multi_clf.predict(X_test)

#### Let's measure its performance.

In [None]:
# classification_report was never imported anywhere in the notebook, so this
# cell raised NameError on a fresh kernel; import it where it is used.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out rows; label 0 is shown as
# 'Negative' and label 1 as 'Positive'.
print("Classification Report: \n", classification_report(y_test, predict_NB,target_names=['Negative','Positive']))

In [None]:
# confusion_matrix was never imported anywhere in the notebook, so this cell
# raised NameError on a fresh kernel; import it where it is used.
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
print("Confusion Matrix: \n", confusion_matrix(y_test, predict_NB))

In [None]:
# accuracy_score was never imported anywhere in the notebook, so this cell
# raised NameError on a fresh kernel; import it where it is used.
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
print("Accuracy: \n", accuracy_score(y_test, predict_NB))