# SENTIMENT ANALYSIS OF IMDB MOVIE REVIEWS

## IMPORTING DATASET

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
count=CountVectorizer()
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Raw string: the Windows path contains "\d", "\s" and "\I", which are invalid
# escape sequences in a normal string literal (SyntaxWarning on Python 3.12+).
# The raw literal yields exactly the same path at runtime.
df=pd.read_csv(r"D:\downloads\sentiment analysis\IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

## TEXT CLEANING

In [4]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [5]:
# Work on a 10,000-row subsample to keep preprocessing and training fast.
# A fixed seed makes the subsample (and every downstream score) reproducible.
df=df.sample(10000, random_state=42)

In [6]:
df.shape

(10000, 2)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 31042 to 26746
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [8]:
# Encode labels as integers: positive -> 1, negative -> 0.
# Assign the result back instead of using inplace=True on a column selection:
# with pandas Copy-on-Write the chained in-place form modifies a temporary and
# leaves `df` unchanged. astype(int) fails loudly if an unexpected label remains.
df['sentiment']=df['sentiment'].replace({'positive':1, 'negative':0}).astype(int)

In [9]:
df.head()

Unnamed: 0,review,sentiment
31042,Has anyone ever read or heard comments by Scor...,1
38806,"This movie can be described in those 2 words ""...",1
10107,"As someone who's never been into sports, it se...",1
47091,I remember seeing this movie a long time ago o...,0
38982,Slackers is just another teen movie that's not...,0


### REMOVING HTML TAGS

In [10]:
import re
clean=re.compile('<.*?>')
re.sub(clean,'',df.iloc[2].review)

'As someone who\'s never been into sports, it seems like it would be hard for me to get into the football (or as we Americans inexplicably call it, soccer)-themed "Bend It Like Beckham". But I gotta say, this was one cool movie! Anglo-Indian Jesminder Bhamra (Parminder Nagra) and her WASP friend Juliette Paxton (Keira Knightley) love to play football (yes, I\'m going to say it the British - and international - way) and just adore football player David Beckham. But Jesminder\'s traditional Sikh parents don\'t approve (her mother offers a really whacked-out description of football early in the movie). Okay, so maybe it was sort of a cliché in that sense, but you gotta love this movie! And if like me, you go to this movie not knowing the definition of "bend" in football...don\'t worry, the movie explains it (I\'d also never heard of David Beckham prior to this movie). And we all know that Keira Knightley hit it big: a few months after "BILB" came out in the States, she starred in the equa

In [11]:
# Compiled once here instead of on every call to clean_html.
_HTML_TAG_PATTERN = re.compile('<.*?>')


def clean_html(text):
    """Remove HTML tags such as ``<br />`` from a review.

    Each tag is replaced by a single space rather than an empty string, so
    words separated only by a tag ("great<br />movie") are not fused into a
    single bogus token ("greatmovie"). Extra spaces are harmless because the
    later stop-word step tokenises with ``str.split()``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The text with every ``<...>`` span replaced by a space.
    """
    return _HTML_TAG_PATTERN.sub(' ', text)


In [12]:
df['review']=df['review'].apply(clean_html)

### TEXT PREPROCESSING

In [13]:
def convert_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

In [14]:
df['review']=df['review'].apply(convert_lower)

In [15]:
def remove_special(text):
    """Return ``text`` with every non-alphanumeric character replaced by a space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept as-is, so
    the result always has the same length as the input.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [16]:
df['review']=df['review'].apply(remove_special)

In [17]:
import nltk
from nltk.corpus import stopwords

In [18]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
df

Unnamed: 0,review,sentiment
31042,has anyone ever read or heard comments by scor...,1
38806,this movie can be described in those 2 words ...,1
10107,as someone who s never been into sports it se...,1
47091,i remember seeing this movie a long time ago o...,0
38982,slackers is just another teen movie that s not...,0
...,...,...
47567,special sneak previews are always a good time ...,0
6225,wow who ever said that edward d wood jr nev...,0
29494,this movie is the absolutely perfect way to ex...,1
33325,this is just one of the hundred million movies...,0


In [None]:
# Cache for the NLTK stop-word list so it is loaded once, not once per token.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop stop words.

    Parameters
    ----------
    text : str
        Text to tokenise; it is split with ``str.split()``.
    stop_words : collection of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is used.
        It is read once and cached as a frozenset, so the membership test is a
        hash lookup instead of re-reading and scanning the list for each token.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return [token for token in text.split() if token not in stop_words]
df['review']=df['review'].apply(remove_stopwords)


In [None]:
df

In [None]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
def stem_words(text, stemmer=None):
    """Stem every token in ``text``.

    The result is built in a local list. The earlier version accumulated into a
    module-level ``y``, which this notebook later rebinds to the label array;
    re-running the stemming cell after that point raised ``AttributeError``.

    Parameters
    ----------
    text : iterable of str
        Tokens to stem (the list produced by ``remove_stopwords``).
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. Defaults to the notebook-level ``ps`` instance.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    if stemmer is None:
        stemmer = ps
    return [stemmer.stem(token) for token in text]

In [None]:
df['review']=df['review'].apply(stem_words)

In [None]:
df

In [None]:
def join_back(list_input):
    """Join a list of tokens into one space-separated string."""
    separator = " "
    joined = separator.join(list_input)
    return joined
df['review']=df['review'].apply(join_back)

df['review']

In [None]:
X=df.iloc[:,0:1].values

In [None]:
X.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1000)

In [None]:
X=cv.fit_transform(df['review']).toarray()
X.shape

In [None]:
y=df.iloc[:,-1].values

In [None]:
X[0]

In [None]:
X[0].max()

In [None]:
X[0].mean()

In [None]:
y=df.iloc[:,-1].values

In [None]:
y.shape

## SPLITTING OF DATA

In [None]:
from  sklearn.model_selection import train_test_split
# Fixed seed so the split, and every metric reported below, is reproducible.
# stratify=y keeps the positive/negative ratio equal in the train and test sets.
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
 X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

## MACHINE LEARNING MODELS

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV

clf=LogisticRegressionCV(cv=6,scoring='accuracy',random_state=0,n_jobs=-1,verbose=3,max_iter=500).fit(X_train,y_train)

y_pred = clf.predict(X_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


In [None]:
pd.DataFrame(confusion_matrix(y_test,y_pred),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

In [None]:
print(classification_report(y_test,y_pred))

### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# shuffle=True reorders the training data each epoch; pin random_state so the
# fitted model and its scores are the same on every run.
clf= SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=42, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
pd.DataFrame(confusion_matrix(y_test,y_pred),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

In [None]:
print(classification_report(y_test,y_pred))

### Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

In [None]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [None]:
y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
print("Gausssian",accuracy_score(y_test,y_pred1))
print(classification_report(y_test,y_pred1))
# The confusion matrix must be the cell's last expression: Jupyter only renders
# the final bare expression, so placing it mid-cell silently discarded it.
pd.DataFrame(confusion_matrix(y_test,y_pred1),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

In [None]:
print("Multinomial",accuracy_score(y_test,y_pred2))
print(classification_report(y_test,y_pred2))
# The confusion matrix must be the cell's last expression: Jupyter only renders
# the final bare expression, so placing it mid-cell silently discarded it.
pd.DataFrame(confusion_matrix(y_test,y_pred2),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

In [None]:
print("Bernaulli",accuracy_score(y_test,y_pred3))
print(classification_report(y_test,y_pred3))
# Use y_pred3 (the BernoulliNB predictions); this cell previously built the
# matrix from y_pred2, i.e. the Multinomial model's predictions.
# Kept as the last expression so Jupyter actually renders it.
pd.DataFrame(confusion_matrix(y_test,y_pred3),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

### K-Nearest Neighbor Classifier 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier()
knn.fit(X_train,y_train)

In [None]:
y_pred=knn.predict(X_test)
metrics.accuracy_score(y_test,y_pred)

In [None]:
pd.DataFrame(confusion_matrix(y_test,y_pred),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

In [None]:
print(classification_report(y_test,y_pred))

### Random Forest Classifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bootstrap sampling and per-split feature selection are random; pin the seed
# so the forest and its reported accuracy are reproducible.
rfc= RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train,y_train)

In [None]:
y_pred=rfc.predict(X_test)

metrics.accuracy_score(y_test,y_pred)

In [None]:
pd.DataFrame(confusion_matrix(y_test,y_pred),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

In [None]:
print(classification_report(y_test,y_pred))

### Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Features are randomly permuted at each split, so ties can resolve differently
# between runs; pin the seed for a reproducible tree.
dtc= DecisionTreeClassifier(random_state=42)
dtc=dtc.fit(X_train,y_train)

In [None]:
y_pred=dtc.predict(X_test)

metrics.accuracy_score(y_test,y_pred)

In [None]:
pd.DataFrame(confusion_matrix(y_test,y_pred),columns=["Predicted No","Predicted Yes"], index=["Actual No","Actual Yes"])

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
import seaborn as sns
sns.set()