In [0]:
#Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
print(os.listdir("../ML/DATASET/"))
import warnings
warnings.filterwarnings('ignore')

['aclImdb', 'bigramtfidf.output.txt', 'AI-Sentiment-Analysis-on-IMDB-Dataset-master', 'unigramtfidf.output.txt', 'imdb_tr.csv', 'bigram.output.txt', 'IMDB_DATASET.csv', 'unigram.output.txt']


In [0]:
imdb_data=pd.read_csv('../ML/DATASET/IMDB_DATASET.csv',encoding='latin-1')
print(imdb_data.shape)
imdb_data.head(10)  

(25000, 3)


Unnamed: 0,row_Number,text,polarity
0,4933,idea smart title film serious tongue cheek fee...,1
1,2036,De Grot good film. great plot comes novel Tim ...,1
2,3838,first watched film part festival new Argentine...,1
3,6910,"vote 10 10 rare chance happens see review, tak...",1
4,15829,"rented dubbed-English version Lensman, hoping ...",0
5,23573,"Please, someone stop Ben Stiller acting movie....",0
6,7276,Stargate best show ever. actors absolutely per...,1
7,24556,"Okay, film festival crowd probably loved it. a...",0
8,12353,"i, too, loved series kid. 1952 5 family always...",1
9,5341,"father Who's alcoholic drummer, Keith Moon, na...",1


In [0]:
train_reviews=imdb_data.text[:15000]  #train dataset
train_sentiments=imdb_data.polarity[:15000]
#test dataset
test_reviews=imdb_data.text[15000:]
test_sentiments=imdb_data.polarity[15000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(15000,) (15000,)
(10000,) (10000,)


In [0]:
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [0]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_data['text']=imdb_data['text'].apply(denoise_text)

In [0]:
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['text']=imdb_data['text'].apply(remove_special_characters)

In [0]:
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['text']=imdb_data['text'].apply(simple_stemmer)

In [0]:
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
imdb_data['text']=imdb_data['text'].apply(remove_stopwords)

{'to', 'no', 'yourself', 'theirs', "hadn't", 'weren', "haven't", "you'd", 'against', 'we', "shouldn't", 'when', 'ourselves', 'than', 'him', 'where', 'under', 'should', 'that', 'do', 'me', 'am', "won't", 'above', 'the', 'its', 'most', 'out', "you've", 'i', 'y', "mightn't", 'ours', 'why', "wasn't", 'because', 'was', 'of', 'be', 'during', 'couldn', 'didn', 'does', "hasn't", 'own', 'too', 'her', 'this', 'from', 'our', 'ain', 'off', 'very', 'will', 'at', 'those', 'until', 'while', 'aren', 'yourselves', 'mightn', 'they', 'each', 'themselves', 'about', 'o', "aren't", 'did', 'myself', "didn't", 'by', 'all', 'as', 'had', "isn't", 's', 'for', 'only', 'on', "should've", 'whom', 'what', 'my', 'doesn', 'then', 'once', 'if', 'now', 'hers', 'before', 'being', 'm', "she's", 'in', 're', 've', 'yours', "doesn't", 'again', 'hadn', 'she', "mustn't", 'but', 'been', 'few', 'just', 'shouldn', 'nor', 'it', 'itself', 'and', 'how', "needn't", 'his', 'wouldn', 'other', 'your', 'have', 'don', 'won', 'a', 'd', "sh

In [0]:
#normalized train reviews
norm_train_reviews=imdb_data.text[:15000]
norm_train_reviews[0]

'idea smart titl film seriou tongu cheek feel subtl dont know read guy full blown comedi someth els go littl dialogu film isnt delic add power film sound switch film actual wouldnt take anyth away film physic action art show strong carri entireti storyi bless bless emot follow shame first say film part black film festiv watch film impress work ask next someon els see work common associ art without purpos isnt found see intent design start finish usag african music style cast everyth seem plan reason charact develop amaz cast think strongest aspect film charact easili defin within 2 minut 6 minut actual filmw need see kind film realli need greater support develop short film across board'

In [0]:
#Normalized test reviews
norm_test_reviews=imdb_data.text[15000:]

In [0]:
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)
#priinting the shape of bow(bag of words) cv test and train
print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (15000, 2603730)
BOW_cv_test: (10000, 2603730)


In [0]:
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (15000, 2603730)
Tfidf_test: (10000, 2603730)


In [0]:
lb=LabelBinarizer() #binarize data 
#transformed sentiment data
sentiment_data=lb.fit_transform(imdb_data['polarity'])
print(sentiment_data.shape)

(25000, 1)


In [0]:
train_sentiments=sentiment_data[:15000]
test_sentiments=sentiment_data[15000:]
print(train_sentiments)
print(test_sentiments)

[[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [1]]
[[1]
 [1]
 [0]
 ...
 [1]
 [0]
 [1]]


In [0]:
svm=SGDClassifier(loss='hinge',random_state=42)
#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_reviews,train_sentiments)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments)
print(svm_tfidf)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)


In [0]:
svm_bow_predict=svm.predict(cv_test_reviews)
print(svm_bow_predict)
#Predicting the model for tfidf features
svm_tfidf_predict=svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

[1 1 1 ... 1 1 1]
[1 1 1 ... 1 1 1]


In [0]:
svm_bow_score=accuracy_score(test_sentiments,svm_bow_predict)
print("svm_bow_score :",svm_bow_score)
#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(test_sentiments,svm_tfidf_predict)
print("svm_tfidf_score :",svm_tfidf_score)

svm_bow_score : 0.5968
svm_tfidf_score : 0.5013


In [0]:
svm_bow_report=classification_report(test_sentiments,svm_bow_predict,target_names=['Positive','Negative'])
print(svm_bow_report)
#Classification report for tfidf features
svm_tfidf_report=classification_report(test_sentiments,svm_tfidf_predict,target_names=['Positive','Negative'])
print(svm_tfidf_report)

             precision    recall  f1-score   support

   Positive       0.90      0.22      0.36      5028
   Negative       0.55      0.98      0.71      4972

avg / total       0.73      0.60      0.53     10000

             precision    recall  f1-score   support

   Positive       0.96      0.01      0.02      5028
   Negative       0.50      1.00      0.67      4972

avg / total       0.73      0.50      0.34     10000



In [0]:
cm_bow=confusion_matrix(test_sentiments,svm_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,svm_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[4855  117]
 [3915 1113]]
[[4970    2]
 [4985   43]]
