In [59]:
import numpy as np
import pandas as pd

In [60]:
data = pd.read_csv("IMDB Dataset.csv")

In [61]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Text normalization
## tokenization

In [62]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [63]:
import spacy
import re, string, unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from bs4 import BeautifulSoup

In [64]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
tokenizers = ToktokTokenizer()

stopwords = nltk.corpus.stopwords.words('english')

In [66]:
#Removing the noisy text
def noiseremoval_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [67]:
data['review'] = data['review'].apply(noiseremoval_text)

In [68]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Stemming

In [69]:
def stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [70]:
data['review'] = data['review'].apply(stemmer)

In [71]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


# Removing stop words

In [72]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [73]:
stop_wr = set(stopwords.words('english'))
print(stop_wr)

{'very', 'wouldn', 'such', "don't", 'in', 'was', "aren't", "doesn't", 'herself', 'won', "mustn't", 'doesn', "weren't", 'doing', 'will', 'didn', 'this', 'with', 'am', 'at', "wouldn't", 'have', 'm', "didn't", 'most', 'how', 'its', 'below', 'and', 'these', 'further', 'did', 'yourself', "that'll", 'having', 'after', 'needn', "isn't", 'them', 'are', 'where', 'only', 'those', 'don', 'your', 'he', 'so', 'our', 'that', 'ma', "hadn't", 'for', 'couldn', 'ourselves', 'yourselves', 'any', "you're", 'ain', 'do', 'who', 'it', 'if', 'above', 'few', 'o', 'all', "shouldn't", 'wasn', 'other', 'down', 'why', 'own', 'same', 'she', 'itself', 'on', 'there', 'of', 'is', 'through', 'y', 'an', "needn't", 'weren', "she's", 'nor', 'yours', 'hers', 'they', 'not', 'i', 'over', 'but', 'once', 're', 'd', 'mightn', 'about', "you'd", 'the', "mightn't", 'here', 'himself', 'from', 'out', "you've", 'because', 'me', 'his', 'being', "you'll", 'between', 'no', "wasn't", "won't", 'a', 'themselves', 'too', 'which', 'each', 's

In [74]:
def removing_stopwords(text, is_lower_case = False):
    tokenizers = ToktokTokenizer()
    tokens = tokenizers.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filter_tokens = [token for token in tokens if token not in stop_wr]
    else:
        filter_tokens = [token for token in tokens if token.lower() not in stop_wr]
        filtered_text = ' '.join(filter_tokens)
        return filtered_text

In [75]:
data['review'] = data['review'].apply(removing_stopwords)

In [76]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


# Split training & test datasets

In [78]:
#train dataset
train_review_data = data.review[:30000]

#test dataset
test_review_data = data.review[30000:]

# Bag of words

In [79]:
#count vectorizer for bag of words
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))

#transformed train reviews
cv_train = cv.fit_transform(train_review_data)

#transformed test reviews
cv_test = cv.transform(test_review_data)

print('BOW_cv_train:',cv_train.shape)
print('BOW_cv_test:',cv_test.shape)

BOW_cv_train: (30000, 4954557)
BOW_cv_test: (20000, 4954557)


# TF_IDF

In [86]:
#TF_IDF vectorizer
tf = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))

#transformed trian reviews
tf_train = tf.fit_transform(train_review_data)

#transformed test reviews
tf_test = tf.transform(test_review_data)

print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (30000, 4954557)
Tfidf_test: (20000, 4954557)


# Label encoding

In [87]:
#labeling the sentient data
label = LabelBinarizer()

#transformed sentiment data
sentiment_data = label.fit_transform(data['sentiment'])

print(sentiment_data.shape)

(50000, 1)


In [88]:
train_sentiment_data = data.sentiment[:30000]

test_sentiment_data = data.sentiment[30000:]

# Training the mode

In [89]:
logistic = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

lr_bow = logistic.fit(cv_train, train_sentiment_data)
print(lr_bow)

lr_tfidf = logistic.fit(tf_train, train_sentiment_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [90]:
#Predicting the model for bag of words
lr_bow_predict = logistic.predict(cv_test)
print(lr_bow_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [91]:
#Predicting the model for tfidf fetures
lr_tfidf_predict = logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [92]:
#Accuracy score for bag of words
lr_bow_score = accuracy_score(test_sentiment_data, lr_bow_predict)

print("lr_bow_score: ", lr_bow_score)

lr_bow_score:  0.74255


In [94]:
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_sentiment_data,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7426
