In [13]:
import pandas as pd
import numpy as np

In [14]:
data = pd.read_csv("https://github.com/Ankit152/IMDB-sentiment-analysis/raw/master/IMDB-Dataset.csv")

In [15]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [16]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [17]:
data.describe

<bound method NDFrame.describe of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [18]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [19]:
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [20]:
data.shape

(50000, 2)

In [21]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [22]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## **Text Normalization**

## tokenization

In [23]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize


In [24]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup

In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
#Tokenization of text
tokenizers=ToktokTokenizer()
#setting English stopwords
stopwords=nltk.corpus.stopwords.words('english')

In [27]:
def noiseremoval_text(text):
  soup=BeautifulSoup(text,"html.parser")
  text=soup.get_text()
  text=re.sub('\[[^]]*\]','',text)
  return text

In [28]:
#Apply function on review column
data['review']=data['review'].apply(noiseremoval_text)

  soup=BeautifulSoup(text,"html.parser")


In [29]:
data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Stemming

In [30]:
#Stemming the text
def stemmer(text):
  ps=nltk.porter.PorterStemmer()
  text=' '.join([ps.stem(word) for word in text.split()])
  return text

In [32]:
#Apply function on review column
data['review']=data['review'].apply(stemmer)

In [33]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


## Removing stop words

In [34]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [36]:
stop_wr=set(stopwords.words('english'))

In [35]:
#removing stopwords
def removing_stopwords(text,is_lower_case=False):
  #Tokenization of text
  tokenizers=ToktokTokenizer()
  #Setting English stopwords
  tokens=tokenizers.tokenize(text)
  tokens=[i.strip() for i in tokens]
  if is_lower_case:
    filtokens=[i for i in tokens if tokens not in stop_wr]
  else:
    filtokens=[i for i in tokens if i.lower() not in stop_wr]
  filtered_texts=' '.join(filtokens)
  return filtered_texts

In [37]:
#Apply function on review column
data['review']=data['review'].apply(removing_stopwords)

In [38]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


# Train test split

In [39]:
#split the dataset
#train dataset
train_reviews_data=data.review[:30000]

In [40]:
#test dataset
test_reviews_data=data.review[30000:]

## Bag of words

In [41]:
#count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train=cv.fit_transform(train_reviews_data)
#transformed test reviews
cv_test=cv.transform(test_reviews_data)
print('BOW_cv_train:',cv_train.shape)
print('BOW_cv_test:',cv_test.shape)

BOW_cv_train: (30000, 4954557)
BOW_cv_test: (20000, 4954557)


# TF_IDF

In [42]:
#Tfidf vectorizer
tf=TfidfVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
tf_train=tf.fit_transform(train_reviews_data)
#transformed test reviews
tf_test=tf.transform(test_reviews_data)
print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (30000, 4954557)
Tfidf_test: (20000, 4954557)


# Lable Encoding

In [43]:
#labeling the sentient data
label=LabelBinarizer()
#transformed sentiment data
sentiment_data=label.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [44]:
sentiment_data

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [45]:
#train dataset
train_data=data.sentiment[:30000]
# test dataset
test_data=data.sentiment[30000:]

In [46]:
#training the model
logistic=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#fitting the model for bag of words
lr_bow=logistic.fit(cv_train,train_data)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=logistic.fit(tf_train,train_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [47]:
#Predicting the model of bag of words
bow_predict=logistic.predict(cv_test)
print(bow_predict)


['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [55]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_data,bow_predict)
print("lr_bow_score: ",lr_bow_score)

lr_bow_score:  0.74255


In [52]:
#Fitting the model for tfidf featurex
lr_tfidf=logistic.fit(tf_train,train_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [53]:
#Predicting the model for tfidf featurex
lr_tfidf_predict=logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [56]:
#Accuracy score for tfidf
lr_tfidf_score=accuracy_score(test_data,lr_tfidf_predict)
print("lr_tfidf_score: ",lr_tfidf_score)

lr_tfidf_score:  0.7426
