IMDB Movies Reviews SENTIMENT ANALYSIS

In [1]:
import numpy as np
import pandas as pd

In [2]:
#loading dataset
data= pd.read_csv("dataset/IMDB Dataset.csv")

In [3]:
#print first five rows of dataset
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# names of columns
data.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
#check for null values
data.isnull().any()

review       False
sentiment    False
dtype: bool

## Text Normalization
### Tokenization

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [7]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
#tokenization
tokens= ToktokTokenizer()
stopwords= nltk.corpus.stopwords.words('english')

#### noise removal
using BeautifulSoup for removing any html tags that might be present

In [10]:
# noise removal
def noise_removal(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub('\[[^]]*\]', '', text)
    return text

  text = re.sub('\[[^]]*\]', '', text)


In [11]:
data['review'] = data['review'].apply(noise_removal)

  soup = BeautifulSoup(text, "html.parser")


### Stemming

In [12]:
def stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

In [13]:
data['review'] = data['review'].apply(stemmer)

In [14]:
# check if stemming is done
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


### Removing Stopwords

In [15]:
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize 

In [16]:
# set stop words to English
stop_wr=set(stopwords.words('english'))
print(stop_wr)

{'been', 'him', 'which', 'so', "couldn't", 'being', 'under', 'into', "you've", "she's", 'not', 'whom', 'having', 'when', 'few', 'it', 'further', 't', 'why', "you'd", 'was', 'should', 'for', 'your', 'herself', 'their', 'those', 'i', 'yours', 'during', 'after', "didn't", 'hasn', "hadn't", 'isn', 'a', 'am', 'mightn', 'only', "should've", 'nor', 'my', "isn't", 'mustn', 'needn', 'y', 'on', 'she', 'at', 'doing', 'what', 'out', 'but', 'ain', "you'll", 'this', 'didn', 'no', 'the', "aren't", 'ma', 'again', 'own', 'couldn', 'her', 'now', "needn't", 've', 'once', 'some', 'do', 'yourself', 'has', 'of', 'were', 'if', 'most', 'haven', 'down', "hasn't", 'theirs', 'our', 'and', 'while', 'how', 'up', "weren't", 'can', 'be', "doesn't", "mustn't", 'doesn', 'wouldn', 'himself', 'each', 'hadn', 'll', 'from', 'won', 'have', 'aren', 'until', 'its', 'ourselves', 'we', 'between', 'they', 'here', 'm', 'shan', 'weren', 'does', 'd', "wasn't", 'against', 'all', "don't", 'where', 'such', 'wasn', "mightn't", 'me', "

In [17]:
#removing the stopwords
def removing_stopwords(text, is_lower_case=False):  #set lower case flag to false 
    #Tokenization of text
    tokenizers=ToktokTokenizer()
    #Setting English stopwords
    tokens = tokenizers.tokenize(text)
    tokens = [token.strip() for token in tokens]    #remove whitespace
    if is_lower_case:
        filter_tokens = [token for token in tokens if token not in stop_wr] #if text in lowercase then compare to stopwords
    else:
        filter_tokens = [token for token in tokens if token.lower() not in stop_wr] #if not in lowercase then convert and compare
    filtered_text = ' '.join(filter_tokens)    
    return filtered_text

In [18]:
data['review'] = data['review'].apply(removing_stopwords)

In [19]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


### Split dataset
splitting dataset into training and testing sets

In [40]:
#trainig dataset
training_data = data.review[:30000]

In [41]:
#testing dataset
testing_data= data.review[30000:]

### Bag of Words

In [42]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=1,max_df=1,binary=False,ngram_range=(1,3))

#transformed train reviews
cv_train=cv.fit_transform(training_data)

#transformed test reviews
cv_test=cv.transform(testing_data)

print('BOW_cv_train:',cv_train.shape)
print('BOW_cv_test:',cv_test.shape)

BOW_cv_train: (30000, 4954557)
BOW_cv_test: (20000, 4954557)


##### cv=CountVectorizer(min_df=1,max_df=1,binary=False,ngram_range=(1,3))
min_df=1 means the term must appear in at least 1 document to be included.
max_df=1 means the term must appear in at most 1 document to be included.
binary=False: This indicates that the vectorizer will count the frequency of words, rather than marking presence/absence of words with 1 and 0.
ngram_range=(1,3) means that the vectorizer will consider unigrams, bigrams and trigrams.
##### cv_train=cv.fit_transform(training_data)
fit: The vectorizer learns the vocabulary from the training_data (i.e., the words and their counts).
transform: Transforms the training_data into a matrix of token counts (Bag of Words matrix).
##### cv_test=cv.transform(testing_data)
This applies the vocabulary learned from the training data to the testing_data

### TF_IDF
This vectorizer converts text documents into a matrix of TF-IDF features. TF-IDF is a weighting method that combines two factors:
Term Frequency (TF): How frequently a word appears in a document.
Inverse Document Frequency (IDF): Measures how rare a word is across all documents (downweights common words).

In [43]:
tf=TfidfVectorizer(min_df=1,max_df=1,use_idf=True,ngram_range=(1,3))

#transformed train reviews
tf_train=tf.fit_transform(training_data)

#transformed test reviews
tf_test=tf.transform(testing_data)

print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (30000, 4954557)
Tfidf_test: (20000, 4954557)


### Labelling
convert categorical sentiment data ("positive", "negative") into a binary numerical format, where each label is represented as 0 or 1.

In [44]:
label=LabelBinarizer()

#transformed sentiment data
sentiment_data=label.fit_transform(data['sentiment'])

print(sentiment_data.shape)

(50000, 1)


In [48]:
train_data= data.sentiment[:30000]

In [49]:
test_data= data.sentiment[30000:]

In [50]:
#training the model
logistic=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=logistic.fit(cv_train,train_data)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=logistic.fit(tf_train,train_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [51]:
#Predicting the model for bag of words
lr_bow_predict=logistic.predict(cv_test)
print(lr_bow_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [52]:
##Predicting the model for tfidf features
lr_tfidf_predict=logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [53]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_data,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)


lr_bow_score : 0.7428


In [54]:
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_data,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.743
