In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
print(os.listdir("input"))
import warnings
warnings.filterwarnings('ignore')

['IMDB Dataset.csv']


In [49]:
imdb_data=pd.read_csv("input/IMDB Dataset.csv")
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [50]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [51]:
#sentiment count
imdb_data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [52]:
train_reviews=imdb_data.review[:40000]
train_sentiment=imdb_data.sentiment[:40000]
test_reviews=imdb_data.review[40000:]
test_sentiment=imdb_data.sentiment[40000:]
print(train_reviews.shape)
print(train_sentiment.shape)
print(test_reviews.shape)
print(test_sentiment.shape)

(40000,)
(40000,)
(10000,)
(10000,)


Text Normalization

In [53]:
#Tokenization of words
tokenizer=ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

Remove HTML strips and noise text

In [54]:
def strip_html(text):
    soup=BeautifulSoup(text,'html.parser')
    return soup.get_text()

#removing square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text=strip_html(text)
    text=remove_between_square_brackets(text)
    return text

imdb_data['review']=imdb_data['review'].apply(denoise_text)

Remove Special Characters

In [55]:
def remove_special_characters(text):
    pattern=r'[^a-zA-Z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

Lemmatization


In [56]:
def simple_lemmatization(text):
    lm=WordNetLemmatizer()
    text=' '.join(lm.lemmatize(word) for word in text.split())
    return text

imdb_data['review']=imdb_data['review'].apply(simple_lemmatization)

Stop Word Removal

In [57]:
stop=list(stopwords.words('english'))
print(stop)

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize ( text )
    if is_lower_case:
        filtered_tokens=[token for token in tokens if token not in stopword_list ]
    else:
        filtered_tokens=[token for token in tokens if token.lower() not in stopword_list ]  
    filtered_tokens=' '.join(filtered_tokens)      
    return filtered_tokens

imdb_data['review']=imdb_data['review'].apply(remove_stopwords)    

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

BoW creates a vocabulary of all unique words in your dataset, and then represents each document (review) as a vector:

Each position in the vector = a word in the vocabulary

Value = how many times that word appears in the document

In [58]:
cv=CountVectorizer(max_df=0.9,min_df=2,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(test_reviews)
print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (40000, 1370393)
BOW_cv_test: (10000, 1370393)


TF-IDF stands for:

TF = Term Frequency → How often a word appears in a document.

IDF = Inverse Document Frequency → How rare the word is across all documents.

In [59]:
tv=TfidfVectorizer(max_df=0.9,min_df=2,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=cv.fit_transform(train_reviews)
#transformed test reviews
tv_test_reviews=cv.transform(test_reviews)
print('TF_IDF_tv_train:',tv_train_reviews.shape)
print('TF_IDF_tv_test:',tv_test_reviews.shape)

TF_IDF_tv_train: (40000, 1370393)
TF_IDF_tv_test: (10000, 1370393)


Conversion of sentiment data to vector

In [60]:
#LabelBinarizer
lb=LabelBinarizer()
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)


(50000, 1)


Split the sentiment data

In [61]:
train_v_sentiments=sentiment_data[:40000]
test_v_sentiments=sentiment_data[40000:]
print(train_v_sentiments)
print(test_v_sentiments)

[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


Modelling the dataset
1) Logistic Regression --> Regularization(Adding Penalty to avoid overfitting)

In [62]:
lr=LogisticRegression(penalty='l2',C=1,max_iter=500,random_state=42)
#BOW MODEL
lr_Bow=lr.fit(cv_train_reviews,train_v_sentiments)
print(lr_Bow)
#tf-idf 
lr_tf_idf=lr.fit(tv_train_reviews,train_v_sentiments)
print(lr_tf_idf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


Prediction of LR model

In [65]:
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[0 0 0 ... 1 0 0]
[0 0 0 ... 1 0 0]


Accuracy


In [68]:
lr_bow_accuracy=accuracy_score(test_v_sentiments,lr_bow_predict)
print(lr_bow_accuracy)
lr_tfidf_accuracy=accuracy_score(test_v_sentiments,lr_tfidf_predict)
print(lr_tfidf_accuracy)

0.907
0.907
