In [1]:
import pandas as pd

In [2]:
# Load the IMDB reviews CSV from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

In [3]:
# Preview five random rows. The fixed seed (the same value used for the
# train/test split) keeps the displayed sample identical on every
# Restart & Run All instead of changing with each execution.
data.sample(n=5, random_state=42)

Unnamed: 0,review,sentiment
27887,"""Purple Rain"" has never been a critic's darlin...",positive
6819,"If you make it through the opening credits, th...",negative
33427,This mini-series is iconic of the Australian s...,positive
16026,"Most critics seem to have dismissed this film,...",positive
30124,i expected this movie to be absolutely god awf...,negative


In [4]:
# Count missing values per column (`isna` is the canonical name of `isnull`).
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# Print a summary of the frame: index range, columns, non-null counts,
# dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
# Target: the sentiment label column as a Series.
Y = data["sentiment"]
# Features: kept as a one-column DataFrame (double brackets) so that the
# ColumnTransformer defined later can select the column by name.
X = data[["review"]]

In [9]:
# Show the first five feature rows (five is also the default for head()).
X.head(n=5)

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."


In [8]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Encode the string sentiment labels as integers. The encoder learns its
# classes from the training labels only and is then reused on the test labels.
# NOTE(review): despite the "_scaled" suffix these arrays are label-encoded,
# not scaled; the names are kept because later cells reference them.
LB = LabelEncoder()
LB.fit(Y_train)
Y_train_scaled = LB.transform(Y_train)
Y_test_scaled = LB.transform(Y_test)

In [14]:
# Bag-of-words step: CountVectorizer is applied to the 'review' column.
# The column is given as a plain string so the vectorizer receives the
# column's text values rather than a one-column frame; any other column
# is dropped.
trf1 = ColumnTransformer(
    transformers=[("Vectorization", CountVectorizer(), "review")],
    remainder="drop",
)

In [15]:
# Classifier step: Bernoulli Naive Bayes with its default hyper-parameters
# (these are tuned later in the notebook via GridSearchCV).
trf2=BernoulliNB()

In [16]:
# Chain vectorization and the classifier into one estimator. The step names
# "trf1"/"trf2" are the prefixes used when addressing step parameters
# (e.g. "trf2__alpha").
pipe = Pipeline(
    steps=[
        ("trf1", trf1),
        ("trf2", trf2),
    ]
)

In [17]:
# Fit the whole pipeline (vectorizer + classifier) on the training split.
pipe.fit(X_train, y=Y_train_scaled)

In [18]:
# Predict encoded sentiment labels for the held-out reviews.
Y_pred = pipe.predict(X=X_test)

In [20]:
# Evaluate the baseline pipeline on the test split: accuracy first, then the
# confusion matrix, then the per-class precision/recall/F1 report.
score = accuracy_score(y_true=Y_test_scaled, y_pred=Y_pred)
print("Score is ", score)

matrix = confusion_matrix(y_true=Y_test_scaled, y_pred=Y_pred)
print("Confusion Matrix is \n", matrix)

report = classification_report(y_true=Y_test_scaled, y_pred=Y_pred)
print("Classification report is \n", report)

Score is  0.8561333333333333
Confusion Matrix is 
 [[6521  890]
 [1268 6321]]
Classification report is 
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      7411
           1       0.88      0.83      0.85      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



In [21]:
# Hyper-parameter grid for the BernoulliNB step ("trf2") of the pipeline.
#
# alpha    : additive smoothing strength.
# binarize : threshold t; a feature counts as "present" when its value > t.
#
# The features are integer word counts from CountVectorizer, so every
# threshold in [0, 1) produces exactly the same binarized matrix. The previous
# grid listed both 0.0 and 0.5, which made a third of the candidates duplicates
# of each other. Each threshold below falls in a different integer interval
# (count > 0, count > 1, count > 2), so every candidate is a distinct model.
param_grid = {
    'trf2__alpha': [0.01, 0.1, 0.5, 1, 2, 5],
    'trf2__binarize': [0.0, 1.0, 2.0],
}

In [23]:
# Exhaustive search over param_grid with 5-fold cross-validation on the
# pipeline. Candidates are ranked by F1 (not accuracy); fits run in a single
# process and progress is logged at verbosity level 1.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="f1",
    cv=5,
    n_jobs=1,
    verbose=1,
)
    

In [24]:
# Run the grid search on the training split only; the test split stays unseen.
grid.fit(X_train, y=Y_train_scaled)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [25]:
# Pipeline with the best cross-validated parameter combination found by the
# grid search.
best_model = grid.best_estimator_

In [26]:
# Show which parameter combination achieved the best cross-validated score.
print("Best parameters:", grid.best_params_)

Best parameters: {'trf2__alpha': 0.5, 'trf2__binarize': 0.0}


In [29]:
# Predict encoded sentiment labels for the test split with the tuned pipeline.
Y_pred1 = best_model.predict(X=X_test)

In [30]:
# Evaluate the tuned pipeline on the test split with the same three metrics
# used for the baseline, so the two runs can be compared directly.
score1 = accuracy_score(y_true=Y_test_scaled, y_pred=Y_pred1)
print("Score is ", score1)

matrix1 = confusion_matrix(y_true=Y_test_scaled, y_pred=Y_pred1)
print("Confusion Matrix is \n", matrix1)

report1 = classification_report(y_true=Y_test_scaled, y_pred=Y_pred1)
print("Classification report is \n", report1)

Score is  0.856
Confusion Matrix is 
 [[6503  908]
 [1252 6337]]
Classification report is 
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      7411
           1       0.87      0.84      0.85      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000

