In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
imdb_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IMDB Dataset.csv")

# Dataset training
train_reviews = imdb_data.review[:40000]
train_sentiment = imdb_data.sentiment[:40000]

# Dataset testing
test_reviews = imdb_data.review[40000:]
test_sentiment = imdb_data.sentiment[40000:]

print({
    "train reviews shape": train_reviews.shape,
    "train sentiment shape": train_sentiment.shape,
    "test reviews shape": test_reviews.shape,
    "test sentiment shape": test_sentiment.shape
    })

{'train reviews shape': (40000,), 'train sentiment shape': (40000,), 'test reviews shape': (10000,), 'test sentiment shape': (10000,)}


In [None]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Normalization of text
tokenizer=ToktokTokenizer()

stopword_list=nltk.corpus.stopwords.words('english')

In [None]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Define function for removing special characters
def remove_special_and_noisy_characters(text, remove_digits=True):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text


#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    text = remove_special_and_noisy_characters(text)
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

stop=set(stopwords.words('english'))
print(stop)

imdb_data['review'] = imdb_data['review'].apply(remove_stopwords)

{'those', 'own', 'isn', "wasn't", 'did', 'is', 't', 'other', 'no', 'o', "shouldn't", 'very', 'weren', 'can', 'his', 'any', 'don', "mightn't", 'ourselves', 'me', 'before', 'more', 'most', "won't", 'after', "aren't", 'on', "needn't", 'himself', 'was', 'out', 'by', "didn't", 'does', 'an', 'from', 'during', 'down', 'than', 'these', 'd', 'ours', 'y', 'as', 'we', 'up', 've', 'nor', 'myself', "haven't", "doesn't", 'will', 'where', 'here', 'shouldn', 'there', 'only', 'doing', 'they', 're', 'the', 'mustn', 'how', "isn't", 'my', 'then', "you've", 'do', 'too', 'just', 'who', 'to', 'should', 'so', 'has', "hasn't", 'each', 'through', 'were', 'and', "couldn't", 'it', 'won', "mustn't", 'same', 'not', "weren't", 'wouldn', "shan't", "that'll", 'theirs', 's', 'shan', 'their', 'herself', "she's", 'having', 'above', 'll', 'yourself', 'ma', 'she', 'haven', 'themselves', 'but', 'doesn', 'all', "it's", 'into', 'whom', "you'd", 'that', 'about', 'he', 'some', 'i', 'him', 'its', 'have', 'under', 'your', 'hasn',

In [None]:
#normalized train reviews
norm_train_reviews=imdb_data.review[:40000]
norm_train_reviews[0]

#Normalized test reviews
norm_test_reviews=imdb_data.review[40000:]
norm_test_reviews[45054]

'really loved movie spent several years trying get available TV many many years enjoyed songs something different say made think every person looks something different prespectives Also often dont appreciate something till longer thereMy 12 year old daughter discoverd music entranced songs Someday hope get copy film opportunity view Oh would love see'

In [None]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))

#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)

#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

In [None]:
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_test: (10000, 6675873)


In [None]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)

In [None]:
print('tv_test_reviews:', tv_test_reviews.shape)

tv_test_reviews: (10000, 6675873)


In [None]:
#labeling the sentient data
lb=LabelBinarizer()

#transformed sentiment data
sentiment_data=lb.fit_transform(imdb_data['sentiment'])

#Spliting the sentiment data
train_sentiments=sentiment_data[:40000]
test_sentiments=sentiment_data[40000:]

# **Logistic Regression**

In [None]:
#training the model using L2
lr2=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

#training the model using L1
lr1=LogisticRegression(penalty='none',max_iter=500,C=1,random_state=42)

#Fitting the model for Bag of words
lr_bow2=lr2.fit(cv_train_reviews,train_sentiments)
lr_bow1=lr1.fit(cv_train_reviews,train_sentiments)

print(lr_bow2)
print(lr_bow1)

#Fitting the model for tfidf features
lr_tfidf2=lr2.fit(tv_train_reviews,train_sentiments)
lr_tfidf1=lr1.fit(tv_train_reviews,train_sentiments)

print(lr_tfidf2)
print(lr_tfidf1)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, penalty='none', random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, penalty='none', random_state=42)


In [None]:
#Predicting the model for bag of words
lr_bow_predict2=lr2.predict(cv_test_reviews)
print(lr_bow_predict2)

lr_bow_predict1=lr1.predict(cv_test_reviews)
print(lr_bow_predict1)

##Predicting the model for tfidf features
lr_tfidf_predict2=lr2.predict(tv_test_reviews)
lr_tfidf_predict1=lr1.predict(tv_test_reviews)

[0 0 0 ... 0 1 0]
[0 0 0 ... 0 0 0]


In [None]:
#Accuracy score for bag of words
lr_bow_score1=accuracy_score(test_sentiments,lr_bow_predict1)
lr_bow_score2=accuracy_score(test_sentiments,lr_bow_predict2)

print({
    "lr_bow_score1": lr_bow_score1,
    "lr_bow_score2": lr_bow_score2
    })

#Accuracy score for tfidf features
lr_tfidf_score1=accuracy_score(test_sentiments,lr_tfidf_predict1)
lr_tfidf_score2=accuracy_score(test_sentiments,lr_tfidf_predict2)
print({
    "lr_tfidf_score1":lr_tfidf_score1,
    "lr_tfidf_score2":lr_tfidf_score2
    })

{'lr_bow_score1': 0.7451, 'lr_bow_score2': 0.743}
{'lr_tfidf_score1': 0.7256, 'lr_tfidf_score2': 0.7438}


# **Support Vector Machine**

In [None]:
#training the linear svm
svm=SGDClassifier(loss='hinge',max_iter=500,random_state=42)

#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_reviews,train_sentiments)
print(svm_bow)

#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments)
print(svm_tfidf)

SGDClassifier(max_iter=500, random_state=42)
SGDClassifier(max_iter=500, random_state=42)


In [None]:
#Predicting the model for bag of words
svm_bow_predict=svm.predict(cv_test_reviews)
print(svm_bow_predict)

#Predicting the model for tfidf features
svm_tfidf_predict=svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

[1 1 1 ... 0 1 1]
[1 1 1 ... 1 1 1]


In [None]:
#Accuracy score for bag of words
svm_bow_score=accuracy_score(test_sentiments,svm_bow_predict)
print("svm_bow_score :",svm_bow_score)

#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(test_sentiments,svm_tfidf_predict)
print("svm_tfidf_score :",svm_tfidf_score)

svm_bow_score : 0.5722
svm_tfidf_score : 0.5111


# **Multinomial Naive Bayes**

In [None]:
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(cv_train_reviews,train_sentiments)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf=mnb.fit(tv_train_reviews,train_sentiments)
print(mnb_tfidf)

MultinomialNB()
MultinomialNB()


In [None]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)

#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]


In [None]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(test_sentiments,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)

#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.744
mnb_tfidf_score : 0.7442


# **MLP Classifier**

In [None]:
#training the model
mlp1=MLPClassifier(hidden_layer_sizes=(50,), random_state=1, max_iter=300).fit(cv_train_reviews, train_sentiments)
print(mlp1)

mlp2=MLPClassifier(hidden_layer_sizes=(50,), random_state=1, max_iter=300).fit(tv_train_reviews,train_sentiments)
print(mlp2)

In [None]:
# clf.predict(X_test[:5, :])
# clf.score(X_test, y_test)

mlp1_bow_predict=mlp1.predict_proba(cv_test_reviews)
print(mlp1_bow_predict)

#Predicting the model for tfidf features
mlp2_tfidf_predict=mlp2.predict(tv_test_reviews)
print(mlp2_tfidf_predict)

In [None]:
#Accuracy score for bag of words
mlp1_bow_score=mlp1.score(test_sentiments,mlp1_bow_predict)
print("mlp1_bow_score :",mlp1_bow_score)

#Accuracy score for tfidf features
mlp2_tfidf_score=mlp2.score(test_sentiments,mlp2_tfidf_predict)
print("mlp2_tfidf_score :",mlp2_tfidf_score)