**IMDB NLTK Sentiment Analysis**

In [13]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


In [5]:
imdb_data=pd.read_csv(r'C:\Users\swast\OneDrive\Documents\NLP HANDS ON\IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [6]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [7]:
imdb_data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [8]:
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]
test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


In [9]:
tokenizer=ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

In [14]:
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

In [15]:
stop=set(stopwords.words('english'))
print(stop)
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

{'who', 'our', 'hers', "she'll", 'by', 'won', 'too', "they'd", 'my', 'your', "it's", "don't", 'while', 'wasn', 'been', 'but', "that'll", "mustn't", 'from', "hadn't", 'was', 'some', "couldn't", 'hasn', 'o', 'can', 'where', 'whom', 'a', 'only', 'themselves', 'their', 'what', "we've", "we're", 'herself', 'few', 'him', 't', "we'll", 'before', "won't", 'then', 'that', 'to', 'isn', 'is', "she's", 'had', 'yours', 'having', 'd', 's', 'or', 'weren', 'doesn', 'same', "doesn't", 'he', 'how', "didn't", "needn't", 'are', "we'd", 'out', 'just', 'm', 'own', 'until', 'didn', "i'd", 'about', 'they', 'being', 'again', 'this', 're', 'needn', 'them', 'up', 'its', "he's", "she'd", 'did', "shan't", "it'll", 'very', 'below', 'for', 'any', 'theirs', 'after', "hasn't", 'why', "wasn't", 'at', 'with', "aren't", 'mightn', "it'd", 'off', 'above', 'on', "wouldn't", 'of', 'yourself', 'an', "i've", "they've", 'shan', 'if', 'has', 'when', 'most', 'do', 'does', "isn't", 'myself', 'she', 'those', "they're", 'between', "

In [16]:
norm_train_reviews=imdb_data.review[:40000]
norm_train_reviews[0]

"one review ha mention watch 1 oz episod ' hooked. right , thi exactli happen me.<br / ><br / >the first thing struck oz wa brutal unflinch scene violence , set right word go. trust , thi show faint heart timid. thi show pull punch regard drugs , sex violence. hardcore , classic use word.<br / ><br / >it call oz nicknam given oswald maximum secur state penitentary. focus mainli emerald city , experiment section prison cell glass front face inwards , privaci high agenda. em citi home many .. aryans , muslims , gangstas , latinos , christians , italians , irish .... scuffles , death stares , dodgi deal shadi agreement never far away.<br / ><br / >i would say main appeal show due fact goe show ' dare. forget pretti pictur paint mainstream audiences , forget charm , forget romance ... oz ' mess around. first episod ever saw struck nasti wa surreal , ' say wa readi , watch , develop tast oz , got accustom high level graphic violence. violence , injustic ( crook guard ' sold nickel , inmat '

In [17]:
norm_test_reviews=imdb_data.review[40000:]
norm_test_reviews[45005]

"read review watch thi piec cinemat garbag took least 2 page find somebodi els ' think thi appallingli unfunni montag ' acm humour 70 inde ani era ! thi ' least funni set sketch *comedy* ' v ever seen ' till come along. half skit alreadi done ( infinit better ) act monti python woodi allen ... wa say nice piec anim last 90 second highlight thi film would still get close sum mindless drivel-ridden thi wast 75 minut is. semin comedy ? onli world semin realli doe mean semen. scatolog humour ? onli world scat actual feces. precursor jokes ? onli mean thi handbook comedy. tit bum odd beaver. nice ... pubesc boy least one hand free ' found playboy exists. give break becaus wa earli 70s ? way. sketch comedi go back least ten year prior. onli way could even forgiv thi film even made wa gunpoint. retro ? hardly. sketch clown subtli pervert children may cut edg circl ( could actual funny ) come realli quit sad. kept go throughout entir 75 minutes ? sheer belief may save genuin funni skit end. ga

In [18]:
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
cv_train_reviews=cv.fit_transform(norm_train_reviews)
cv_test_reviews=cv.transform(norm_test_reviews)
print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (40000, 6307341)
BOW_cv_test: (10000, 6307341)


In [19]:
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
tv_train_reviews=tv.fit_transform(norm_train_reviews)
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6307341)
Tfidf_test: (10000, 6307341)


In [20]:
lb=LabelBinarizer()
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [21]:
train_sentiments=sentiment_data[:40000]
test_sentiments=sentiment_data[40000:]
print(train_sentiments)
print(test_sentiments)

[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [22]:
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments)
print(lr_tfidf)

  return f(*args, **kwargs)


LogisticRegression(C=1, max_iter=500, random_state=42)


  return f(*args, **kwargs)


LogisticRegression(C=1, max_iter=500, random_state=42)


In [23]:
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]


In [24]:
lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.7529
lr_tfidf_score : 0.7523


In [25]:
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.75      0.75      0.75      4993
    Negative       0.75      0.76      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

              precision    recall  f1-score   support

    Positive       0.74      0.77      0.76      4993
    Negative       0.76      0.73      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



In [26]:
cm_bow=confusion_matrix(test_sentiments,lr_bow_predict,labels=[1,0])
print(cm_bow)
cm_tfidf=confusion_matrix(test_sentiments,lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[3782 1225]
 [1246 3747]]
[[3680 1327]
 [1150 3843]]
