# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import spacy

# Uploading the dataset

In [2]:
path = "D:\\Work\\My work\\Imdb review analysis\\archive\\IMDB Dataset.csv"

In [3]:
dataset = pd.read_csv(path)

# Preliminary viewing of the dataset

In [4]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
dataset['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# Checking for null values and empty reviews

In [7]:
dataset.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
blank = []
for i, rv, s in dataset.itertuples():
    if rv.isspace()==True:
        blank.append(i)

In [9]:
blank

[]

# Splitting the dataset

### Into matrix of features and target variable

In [10]:
x = dataset.iloc[:, 0]
y = dataset.iloc[:, 1]

### Applying Spacy preprocessing

In [11]:
nlp = spacy.load("en_core_web_lg")

In [12]:
def preprocessing(text):
    doc = nlp(text)
    lemma = []
    for token in doc:
        if token.is_stop==False:
            lemma.append(token.lemma_)
    processed_text = " ".join(lemma)
    return processed_text

In [13]:
x = x.apply(preprocessing)

### Dividing into training and testing dataset

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.40)

# Applying transformations


In [15]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
y_train_encoded = lb.fit_transform(y_train)
y_test_encoded = lb.transform(y_test)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x_train_encoded = vectorizer.fit_transform(x_train)
x_test_encoded = vectorizer.transform(x_test)

# Building the models

In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report

### Decision Tree Classifier

In [18]:
from sklearn.tree import DecisionTreeClassifier
classifier1 = DecisionTreeClassifier()
classifier1.fit(x_train_encoded, y_train_encoded)

y_pred = classifier1.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[7278 2728]
 [2913 7081]]
The f1 score is 0.7151441700752412
The accuracy score is 0.71795
The classification report is: 
              precision    recall  f1-score   support

           0       0.71      0.73      0.72     10006
           1       0.72      0.71      0.72      9994

    accuracy                           0.72     20000
   macro avg       0.72      0.72      0.72     20000
weighted avg       0.72      0.72      0.72     20000



### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
classifier2 = LogisticRegression()
classifier2.fit(x_train_encoded, y_train_encoded)

y_pred = classifier2.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8728 1278]
 [1061 8933]]
The f1 score is 0.8842365751051721
The accuracy score is 0.88305
The classification report is: 
              precision    recall  f1-score   support

           0       0.89      0.87      0.88     10006
           1       0.87      0.89      0.88      9994

    accuracy                           0.88     20000
   macro avg       0.88      0.88      0.88     20000
weighted avg       0.88      0.88      0.88     20000



### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
classifier3 = RandomForestClassifier(n_jobs=-1)
classifier3.fit(x_train_encoded, y_train_encoded)

y_pred = classifier3.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8523 1483]
 [1589 8405]]
The f1 score is 0.8454883814505584
The accuracy score is 0.8464
The classification report is: 
              precision    recall  f1-score   support

           0       0.84      0.85      0.85     10006
           1       0.85      0.84      0.85      9994

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000



### Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB
classifier4 = MultinomialNB()
classifier4.fit(x_train_encoded, y_train_encoded)

y_pred = classifier4.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8697 1309]
 [1601 8393]]
The f1 score is 0.8522542648253453
The accuracy score is 0.8545
The classification report is: 
              precision    recall  f1-score   support

           0       0.84      0.87      0.86     10006
           1       0.87      0.84      0.85      9994

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000



### K-NN

In [22]:
from sklearn.neighbors import KNeighborsClassifier
classifier5 = KNeighborsClassifier()
classifier5.fit(x_train_encoded, y_train_encoded)

y_pred = classifier5.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


The confusion matrix is given by: 
[[7394 2612]
 [2162 7832]]
The f1 score is 0.7664155005382131
The accuracy score is 0.7613
The classification report is: 
              precision    recall  f1-score   support

           0       0.77      0.74      0.76     10006
           1       0.75      0.78      0.77      9994

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000



### SVM

### Linear support vector

In [23]:
from sklearn.svm import LinearSVC
classifier6 = LinearSVC()
classifier6.fit(x_train_encoded, y_train_encoded)

y_pred = classifier6.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8817 1189]
 [1127 8867]]
The f1 score is 0.8844887780548629
The accuracy score is 0.8842
The classification report is: 
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     10006
           1       0.88      0.89      0.88      9994

    accuracy                           0.88     20000
   macro avg       0.88      0.88      0.88     20000
weighted avg       0.88      0.88      0.88     20000



### Sigmoid SVC

In [24]:
from sklearn.svm import SVC
classifier7 = SVC(kernel = 'sigmoid')
classifier7.fit(x_train_encoded, y_train_encoded)

y_pred = classifier7.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8773 1233]
 [1085 8909]]
The f1 score is 0.8848827969805324
The accuracy score is 0.8841
The classification report is: 
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     10006
           1       0.88      0.89      0.88      9994

    accuracy                           0.88     20000
   macro avg       0.88      0.88      0.88     20000
weighted avg       0.88      0.88      0.88     20000



### Voting Method

##### Soft Voting 

In [27]:
from sklearn.ensemble import VotingClassifier
classifier8 = VotingClassifier(
    estimators=[('lr', LogisticRegression()), ('svc', SVC(kernel = 'sigmoid', probability = True)), ('rf', RandomForestClassifier())],
    voting='soft')
classifier8.fit(x_train_encoded, y_train_encoded)

y_pred = classifier8.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8830 1176]
 [1059 8935]]
The f1 score is 0.888833623476747
The accuracy score is 0.88825
The classification report is: 
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     10006
           1       0.88      0.89      0.89      9994

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000



##### Hard Voting

In [29]:
from sklearn.ensemble import VotingClassifier
classifier9 = VotingClassifier(
 estimators=[('lr', LogisticRegression()), ('svc', LinearSVC()), ('rf', RandomForestClassifier())],
    voting='hard')
classifier9.fit(x_train_encoded, y_train_encoded)

y_pred = classifier9.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8813 1193]
 [1081 8913]]
The f1 score is 0.886865671641791
The accuracy score is 0.8863
The classification report is: 
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     10006
           1       0.88      0.89      0.89      9994

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000



### Stacking

In [30]:
from sklearn.ensemble import StackingClassifier
level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('svm', LinearSVC()))
level0.append(('bayes', MultinomialNB()))
level0.append(('random', RandomForestClassifier()))

level1 = LogisticRegression()

classifier10 = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
classifier10.fit(x_train_encoded, y_train_encoded)

y_pred = classifier10.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[8905 1101]
 [1101 8893]]
The f1 score is 0.889833900340204
The accuracy score is 0.8899
The classification report is: 
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     10006
           1       0.89      0.89      0.89      9994

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000

