IMDB dataset having 50K movie reviews for natural language processing or text analytics. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. This provides a set of 25,000 highly polar reviews for training and 25,000 fr testing. 

## Initial exploration

In [61]:
import numpy as np
import pandas as pd

In [62]:
data = pd.read_csv("IMDB Dataset.csv")

In [63]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [64]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [65]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [66]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [67]:
data.shape

(50000, 2)

In [68]:
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [69]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Text Normalization

In [70]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [71]:
import re
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup

In [72]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [73]:
#Removing the noisy text
def noiseremoval_text(text):
    soup = BeautifulSoup(text, "html.parser") #remove the html tags
    text = soup.get_text()
    text = re.sub('\[[^]]*\]','', text)
    return text

In [74]:
#Apply the function on the review column of the dataframe and store back
data['review'] = data['review'].apply(noiseremoval_text)

In [75]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Stemming

In [76]:
#Stemming the text
def stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [77]:
#Apply the function on review column
data['review'] = data['review'].apply(stemmer)

In [78]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


In [79]:
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  

In [80]:
stop_wr=set(stopwords.words('english'))
print(stop_wr)

{'how', "mightn't", 'itself', 'yourself', 'had', "didn't", 'here', 'against', 'them', 'we', 'll', 'before', 've', 'once', 'not', 'm', 'wouldn', "aren't", 'their', 'few', 'doesn', 'who', 'be', 'what', 'me', 'in', 'after', 'same', 'should', 'does', 'nor', 'do', 'own', 'herself', "shan't", 'these', 'shan', 'from', 'been', 'under', 'weren', 's', 'doing', 'ours', 'off', 'whom', "should've", "mustn't", 'mightn', 'will', 'the', 'he', 'hers', 'its', 'themselves', 'if', 'yours', "you've", 'his', 'while', 'having', 'below', 'didn', 'can', 'at', 'is', 'and', 'further', 'were', 'or', 'now', 'has', 'where', 'to', 'mustn', 'that', 'ain', 'it', 'd', 'of', 'her', 'our', 'they', 'so', 'into', 'she', 'as', 'about', 'are', 'myself', 'any', 'until', 'o', 'but', "wouldn't", 'through', 'theirs', "don't", 'because', "you're", 'for', 'there', 'above', 'all', 'couldn', 'with', 'an', 'ourselves', 'did', "couldn't", 't', 'than', 'up', "that'll", 'most', 'by', 'over', 'only', 'needn', 'him', 'am', 'too', 'yoursel

In [81]:
#Removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    #tokenization of text
    tokenizers = ToktokTokenizer()
    #Set english stopwords
    tokens = tokenizers.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filltokens = [token for token in tokens if token not in stop_wr]
    else:
        filltokens = [token for token in tokens if token.lower() not in stop_wr]
    filtered_texts = ' '.join(filltokens)
    return filtered_texts

In [82]:
data['review'] = data['review'].apply(remove_stopwords)

In [83]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


## Train test split

In [84]:
train_reviews_data = data.review[:30000]
test_reviews_data = data.review[30000:]

## Bag of words

In [85]:
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range = (1,3))
cv_train = cv.fit_transform(train_reviews_data)
cv_test = cv.transform(test_reviews_data)

print("BOW_cv_train: ", cv_train.shape)
print("BOW_cv_test: ", cv_test.shape)

BOW_cv_train:  (30000, 4954557)
BOW_cv_test:  (20000, 4954557)


## TF_IDF

In [86]:
tf = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))
tf_train = tf.fit_transform(train_reviews_data)
tf_test = tf.transform(test_reviews_data)

print("Tfidf_train: ", tf_train.shape)
print("Tfidf_test: ", tf_test.shape)

Tfidf_train:  (30000, 4954557)
Tfidf_test:  (20000, 4954557)


## Lable encoding 

In [87]:
#labeling the sentient data
label=LabelBinarizer()
#transformed sentiment data
sentiment_data=label.fit_transform(data['sentiment'])
# print(sentiment_data[0:5])
print("Sentiment data shape :", sentiment_data.shape)

Sentiment data shape : (50000, 1)


In [88]:
# train_test_split for target variable 
train_data=data.sentiment[:30000]
test_data=data.sentiment[30000:]

## Building logistic regression model

In [89]:
#training the model
logistic=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
                            
#Fitting the model for Bag of words
LogReg_bow=logistic.fit(cv_train,train_data)
print(LogReg_bow)
#Fitting the model for tfidf features
LogReg_tfidf=logistic.fit(tf_train,train_data)
print(LogReg_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [90]:
#Predicting the model for bag of words model
LogReg_bow_predict=logistic.predict(cv_test)
print("Predicted sentiments for bag of words: ", LogReg_bow_predict)

Predicted sentiments for bag of words:  ['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [91]:
#Predicting the model for TF_IDF
LogReg_tfidf_predict=logistic.predict(tf_test)
print("Predicted sentiments for TF_IDF: ", LogReg_tfidf_predict)

Predicted sentiments for TF_IDF:  ['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


## Accuracy for bag of words and TF_IDF

In [92]:
#Accuracy score for bag of words features
LogReg_bow_score=accuracy_score(test_data,LogReg_bow_predict)
print("Accuracy for logistic regression using bag of words  :",LogReg_bow_score)

Accuracy for logistic regression using bag of words  : 0.74255


In [93]:
#Accuracy score for TF_IDF features
LogReg_tfidf_score=accuracy_score(test_data,LogReg_tfidf_predict)
print("Accuracy for logistic regression using TF_IDF  :",LogReg_tfidf_score)

Accuracy for logistic regression using TF_IDF  : 0.7426
