In [1]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [2]:
sample_df = pd.read_csv('IMDB_reviews_Sample(1000).csv')

In [3]:
sample_df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,501
0,499


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sample_df['clean_review'], sample_df['sentiment'], test_size=0.3, random_state=42)


In [23]:
tfidf_vectorizer = TfidfVectorizer(max_features=1500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#### Random Forest Model

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix,f1_score,recall_score,precision_score


rf = RandomForestClassifier(random_state=42)

rf.fit(X_train_tfidf, y_train)



y_pred_rf = rf.predict(X_test_tfidf)

print("Random Forest Performance with TF-IDF:")
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("Precision:",precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))

Random Forest Performance with TF-IDF:
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.80      0.78       151
           1       0.79      0.74      0.77       149

    accuracy                           0.77       300
   macro avg       0.77      0.77      0.77       300
weighted avg       0.77      0.77      0.77       300

Accuracy: 0.7733333333333333
AUC-ROC: 0.7731454731321393
Confusion Matrix:
 [[121  30]
 [ 38 111]]
F1 Score: 0.7655172413793103
Precision: 0.7872340425531915
Recall: 0.7449664429530202


#### NAIVE BAYES Model

In [7]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)


y_pred_nb = nb.predict(X_test_tfidf)

print("Naive Bayes Performance with TF-IDF:")

print("Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("F1 Score:", f1_score(y_test, y_pred_nb))
print("Precision:",precision_score(y_test, y_pred_nb))
print("Recall:", recall_score(y_test, y_pred_nb))

Naive Bayes Performance with TF-IDF:
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.78      0.77       151
           1       0.77      0.75      0.76       149

    accuracy                           0.77       300
   macro avg       0.77      0.77      0.77       300
weighted avg       0.77      0.77      0.77       300

Accuracy: 0.7666666666666667
AUC-ROC: 0.7665674029956887
Confusion Matrix:
 [[118  33]
 [ 37 112]]
F1 Score: 0.7619047619047619
Precision: 0.7724137931034483
Recall: 0.7516778523489933


#### Support Vector Model

In [8]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix


svm = SVC(random_state=42, probability=True)

svm.fit(X_train_tfidf, y_train)



y_pred_svm = svm.predict(X_test_tfidf)
y_pred_svm_proba = svm.predict_proba(X_test_tfidf)[:, 1]

print("SVM Performance with TF-IDF:")
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_svm_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print("Precision:",precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))

SVM Performance with TF-IDF:
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.83      0.79       151
           1       0.81      0.72      0.76       149

    accuracy                           0.78       300
   macro avg       0.78      0.78      0.78       300
weighted avg       0.78      0.78      0.78       300

Accuracy: 0.7766666666666666
AUC-ROC: 0.8558158140361795
Confusion Matrix:
 [[126  25]
 [ 42 107]]
F1 Score: 0.7615658362989324
Precision: 0.8106060606060606
Recall: 0.7181208053691275


#### Logistic Regression Model

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix

LR = LogisticRegression(random_state=42, max_iter=1000)

LR.fit(X_train_tfidf, y_train)



y_pred_logistic = LR.predict(X_test_tfidf)

print("Logistic Regression Performance with TF-IDF:")

print("Classification Report:\n", classification_report(y_test, y_pred_logistic))
print("Accuracy:", accuracy_score(y_test, y_pred_logistic))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_logistic))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logistic))
print("F1 Score:", f1_score(y_test, y_pred_logistic))
print("Precision:",precision_score(y_test, y_pred_logistic))
print("Recall:", recall_score(y_test, y_pred_logistic))

Logistic Regression Performance with TF-IDF:
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.83      0.79       151
           1       0.81      0.74      0.77       149

    accuracy                           0.78       300
   macro avg       0.79      0.78      0.78       300
weighted avg       0.79      0.78      0.78       300

Accuracy: 0.7833333333333333
AUC-ROC: 0.7830348015467354
Confusion Matrix:
 [[125  26]
 [ 39 110]]
F1 Score: 0.7719298245614035
Precision: 0.8088235294117647
Recall: 0.738255033557047


#### Layered Neural Network

In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout

X_train_tfidf_dense = X_train_tfidf.toarray()
X_test_tfidf_dense = X_test_tfidf.toarray()

model = Sequential()
model.add(Dense(units=512, activation='relu', input_dim=X_train_tfidf_dense.shape[1]))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(units=1, activation='sigmoid'))



model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train_tfidf_dense, y_train, epochs=15, batch_size=10, validation_data=(X_test_tfidf_dense, y_test))

test_loss, test_accuracy = model.evaluate(X_test_tfidf_dense, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

y_pred_dl = model.predict(X_test_tfidf_dense)
y_pred_classes = (y_pred > 0.5).astype(int)

print(classification_report(y_test, y_pred_classes))
print("Classification Report:\n", classification_report(y_test, y_pred_classes))
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_dl))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("F1 Score:", f1_score(y_test, y_pred_classes))
print("Precision:",precision_score(y_test, y_pred_classes))
print("Recall:", recall_score(y_test, y_pred_classes))


Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_72 (Dense)            (None, 512)               768512    
                                                                 
 dense_73 (Dense)            (None, 256)               131328    
                                                                 
 dense_74 (Dense)            (None, 128)               32896     
                                                                 
 dropout_28 (Dropout)        (None, 128)               0         
                                                                 
 dense_75 (Dense)            (None, 64)                8256      
                                                                 
 dropout_29 (Dropout)        (None, 64)                0         
                                                                 
 dense_76 (Dense)            (None, 1)               

#### Bidirectional LSTM Model

In [55]:

X_train_tfidf_reshaped = X_train_tfidf_dense.reshape(X_train_tfidf_dense.shape[0], X_train_tfidf_dense.shape[1], 1)
X_test_tfidf_reshaped = X_test_tfidf_dense.reshape(X_test_tfidf_dense.shape[0], X_test_tfidf_dense.shape[1], 1)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout


model = Sequential()
model.add(Bidirectional(LSTM(512, input_shape=(X_train_tfidf_reshaped.shape[1], X_train_tfidf_reshaped.shape[2]))))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


history = model.fit(X_train_tfidf_reshaped, y_train, epochs=6, batch_size=10, validation_data=(X_test_tfidf_reshaped, y_test))

test_loss, test_accuracy = model.evaluate(X_test_tfidf_reshaped, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

y_pred_dl = model.predict(X_test_tfidf_reshaped)
y_pred_classes = (y_pred > 0.5).astype(int)

print(classification_report(y_test, y_pred_classes))
print("Classification Report:\n", classification_report(y_test, y_pred_classes))
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_dl))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("F1 Score:", f1_score(y_test, y_pred_classes))
print("Precision:",precision_score(y_test, y_pred_classes))
print("Recall:", recall_score(y_test, y_pred_classes))


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test Accuracy: 0.4967
              precision    recall  f1-score   support

           0       0.65      0.69      0.67       151
           1       0.67      0.63      0.65       149

    accuracy                           0.66       300
   macro avg       0.66      0.66      0.66       300
weighted avg       0.66      0.66      0.66       300

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.69      0.67       151
           1       0.67      0.63      0.65       149

    accuracy                           0.66       300
   macro avg       0.66      0.66      0.66       300
weighted avg       0.66      0.66      0.66       300

Accuracy: 0.66
AUC-ROC: 0.5330681363616161
Confusion Matrix:
 [[104  47]
 [ 55  94]]
F1 Score: 0.6482758620689655
Precision: 0.6666666666666666
Recall: 0.6308724832214765


### Conclusion:
The SVM model and Logistic Regression Model show the best results bu since we are more focused on the precision and recall we are gonna move ahed with The SVM model.

#### Hyperparameter tuning the SVM model

In [60]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10, 100, 1000,5000],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4,8,16]
}

svm = SVC(random_state=42, probability=True)

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
print("Best Parameters found by GridSearchCV:", best_params)

best_svm = grid_search.best_estimator_

y_pred_svm = best_svm.predict(X_test_tfidf)
y_pred_svm_proba = best_svm.predict_proba(X_test_tfidf)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_svm_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print("Precision:",precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))


Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Best Parameters found by GridSearchCV: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.82      0.79       151
           1       0.80      0.74      0.77       149

    accuracy                           0.78       300
   macro avg       0.78      0.78      0.78       300
weighted avg       0.78      0.78      0.78       300

Accuracy: 0.7833333333333333
AUC-ROC: 0.8584381528067914
Confusion Matrix:
 [[124  27]
 [ 38 111]]
F1 Score: 0.7735191637630662
Precision: 0.8043478260869565
Recall: 0.7449664429530202
