In [20]:
# importing the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split as tts

In [21]:
# importing the dataset
data = pd.read_csv("IMDB-Dataset.csv")

In [22]:
# top values of the data-set
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [23]:
# shape of the data
data.shape

(50000, 2)

In [24]:
# column names 
data.columns

Index(['review', 'sentiment'], dtype='object')

In [25]:
# count of unique values in the column
data['sentiment'].value_counts()

negative    25915
positive    24085
Name: sentiment, dtype: int64

<hr>

## Processing the data

In [26]:
def clean_text1(text):
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    return text

cleaned1=lambda x:clean_text1(x)

In [27]:
data['review']=pd.DataFrame(data.review.apply(cleaned1))

In [28]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [29]:
# second round of cleaning
def clean_text2(text):
    text=re.sub('[''"",,,]','',text)
    text=re.sub('\n','',text)
    return text

cleaned2=lambda x:clean_text2(x)

In [30]:
data['review']=pd.DataFrame(data.review.apply(cleaned2))
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


<hr>

## Splitting the Data

In [31]:
x = data.iloc[0:,0].values
y = data.iloc[0:,1].values

In [32]:
xtrain,xtest,ytrain,ytest = tts(x,y,test_size = 0.25,random_state = 225)

<hr>

## Extracting the features

In [33]:
tf = TfidfVectorizer()
from sklearn.pipeline import Pipeline

<hr>

## Building the Model

In [34]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
model=Pipeline([('vectorizer',tf),('classifier',classifier)])

model.fit(xtrain,ytrain)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [35]:
ypred=model.predict(xtest)

In [36]:
# model score
accuracy_score(ypred,ytest)

0.81256

In [None]:


conf_matrix = confusion_matrix(y_true=ytest, y_pred=ypred)
#
# Print the confusion matrix using Matplotlib
#
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
 
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [37]:
# confusion matrix
A=confusion_matrix(ytest,ypred)
print(A)

[[5252 1230]
 [1113 4905]]


In [38]:
# f1 score
recall=A[0][0]/(A[0][0]+A[1][0])
precision=A[0][0]/(A[0][0]+A[0][1])
F1=2*recall*precision/(recall+precision)
print(F1)

0.817622791313147
