In [2]:
import re
import numpy as np
import pandas as pd
import gensim
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from lazypredict.Supervised import LazyClassifier

In [3]:
# Raw IMDB reviews CSV; later cells read its 'review' and 'sentiment' columns.
# NOTE(review): absolute, machine-specific path - point this at your own copy
# of the dataset before running the notebook elsewhere.
IMDB_CSV_PATH = r'D:\CODING\Python\NLP\END_SEM\IMDB Dataset.csv'
data = pd.read_csv(IMDB_CSV_PATH)

In [4]:
_ENGLISH_STOP_WORDS = None  # populated on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Turn one raw review string into a list of cleaned tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is neither an ASCII letter nor whitespace
         (digits and punctuation are removed outright, not replaced by a space);
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop NLTK English stop words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining tokens in their original order; may be empty.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # The stop-word set is shared across calls instead of being rebuilt for
    # every review, which matters when this is applied to a whole column.
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(text) if word not in stop_words]

# Replace each raw review string with its token list.
# NOTE: this overwrites the 'review' column in place, so the cell is not
# re-runnable without reloading `data` (preprocess_text expects a string).
data['review'] = data['review'].map(preprocess_text)

# Encode the sentiment as a binary target: positive -> 1, negative -> 0.
sentiment_to_label = {'positive': 1, 'negative': 0}
data['label'] = data['sentiment'].map(sentiment_to_label)


In [5]:
def generate_embedding_vector(word_vector, word2vec_model):
    """Build a fixed-size document vector by summing its word embeddings.

    Parameters
    ----------
    word_vector : iterable of str
        Tokens of one document.
    word2vec_model : object
        Trained model exposing ``.wv``, which must support ``token in wv``,
        ``wv[token]`` and ``wv.vector_size`` (as gensim's KeyedVectors does).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``: the element-wise sum of the
        embeddings of all tokens found in the vocabulary. If no token is in
        the vocabulary (including an empty document) a zero vector is returned.
    """
    keyed_vectors = word2vec_model.wv
    word_embeddings = [
        keyed_vectors[word] for word in word_vector if word in keyed_vectors
    ]

    if not word_embeddings:
        # np.sum([], axis=0) would yield the scalar 0.0, which makes the
        # feature matrix ragged once all documents are stacked. Return an
        # explicit zero vector instead (float32 is gensim's vector dtype).
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

    return np.sum(word_embeddings, axis=0)


In [6]:
# One token list per review, as a plain Python list (the Word2Vec training input).
tokenized_corpus = data['review'].tolist()

In [7]:
# Train word embeddings on the review corpus itself.
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,  # embedding length -> 100 features per review downstream
    window=5,
    min_count=1,      # no frequency cut-off: tokens seen only once are kept
    workers=4,        # NOTE(review): multi-threaded training may not be bit-for-bit reproducible - confirm if that matters
)

In [8]:
# One document vector per review; the trained model is forwarded to the helper
# as its second positional argument.
data['embedding_vector'] = data['review'].apply(
    generate_embedding_vector, args=(word2vec_model,)
)

In [9]:
# Stack the per-review vectors into a 2-D feature matrix and pull out the labels.
features = np.array(data['embedding_vector'].tolist())
labels = np.array(data['label'].tolist())

# Hold out 20% of the reviews for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Release the large intermediates; only the four split arrays are needed from here on.
# NOTE: after this, earlier cells must be re-run before this cell can run again.
del data, tokenized_corpus, features, labels

In [10]:
# Benchmark a batch of off-the-shelf classifiers on the embedding features.
# The recorded run below went through 29 models in roughly 12 minutes.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
# NOTE(review): the classifier is created without predictions=True, so the
# second return value may not contain per-model predictions - confirm before
# relying on `predictions`.
models, predictions = clf.fit(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

print(models)

 97%|█████████▋| 28/29 [12:14<00:24, 24.72s/it]

[LightGBM] [Info] Number of positive: 19961, number of negative: 20039
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499025 -> initscore=-0.003900
[LightGBM] [Info] Start training from score -0.003900


100%|██████████| 29/29 [12:15<00:00, 25.35s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
SVC                                0.87               0.87     0.87      0.87   
LogisticRegression                 0.86               0.86     0.86      0.86   
CalibratedClassifierCV             0.86               0.86     0.86      0.86   
LinearDiscriminantAnalysis         0.86               0.86     0.86      0.86   
RidgeClassifier                    0.86               0.86     0.86      0.86   
RidgeClassifierCV                  0.86               0.86     0.86      0.86   
SGDClassifier                      0.85               0.85     0.85      0.85   
LinearSVC                          0.85               0.85     0.85      0.85   
XGBClassifier                      0.85               0.85     0.85      0.85   
LGBMClassifier                     0.84               0.84     0.84      0.84   
NuSVC                       




In [12]:
# Take the first row of the results table as the best model; in the recorded
# output that table lists the highest-scoring model (SVC) first.
best_model = models.iloc[0]
# Bare last expression so the notebook displays the selected row.
best_model

Accuracy              0.87
Balanced Accuracy     0.87
ROC AUC               0.87
F1 Score              0.87
Time Taken          119.46
Name: SVC, dtype: float64