# Walmart Reviews Sentiment Analysis

This notebook performs sentiment analysis on Walmart product reviews using a combination of:
- TextBlob for basic sentiment scoring
- VADER for context-aware sentiment analysis
- Random Forest for final classification

The dataset was obtained from https://www.kaggle.com/datasets/promptcloud/walmart-product-reviews-dataset on January 6, 2025

## 1. Import required packages and install the required libraries

In [1]:
# Install required packages
# nltk==3.9b1 was used here as other versions had some bugs
!pip install -q textblob vaderSentiment nltk==3.9b1 scikit-learn pandas numpy matplotlib seaborn swifter

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.2 MB[0m [31m12.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for swifter (setup.py) ... [?25l[?25hdone


In [2]:
!pip -qq install catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import joblib
from datetime import datetime
import swifter #for paralel processing
from google.colab import files

In [4]:
# Download required NLTK resources
nltk_resources = ['punkt', 'punkt_tab','stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger']
for resource in nltk_resources:
    try:
        nltk.data.find(f'tokenizers/{resource}' if 'punkt' in resource else f'corpora/{resource}')
    except LookupError:
        nltk.download(resource, quiet=True)

## 2. Load the dataset and make basic preprocessing

In [5]:
uploaded = files.upload()

Saving marketing_sample_for_walmart_com-walmart_product_reviews__20200401_20200630__30k_data.csv to marketing_sample_for_walmart_com-walmart_product_reviews__20200401_20200630__30k_data.csv


In [6]:
# Load your dataset
file_path = '/content/marketing_sample_for_walmart_com-walmart_product_reviews__20200401_20200630__30k_data.csv'
df = pd.read_csv(file_path)

# Print dataset statistics
print("\nDataset Statistics:")
print(f"Total reviews: {len(df)}")
print("\nRating distribution:")
print(df['Rating'].value_counts().sort_index())
print(f"\nAverage rating: {df['Rating'].mean():.2f}")


Dataset Statistics:
Total reviews: 29997

Rating distribution:
Rating
1.0     3508
2.0     1244
3.0     1707
4.0     4513
5.0    19025
Name: count, dtype: int64

Average rating: 4.14


In [7]:
# Basic preprocessing
df = df.dropna(subset=['Review', 'Rating'])
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df = df.dropna(subset=['Rating'])
df = df[df['Rating'].between(1, 5)]

# Create boolean columns
df['is_verified'] = df['Verified Purchaser'].map({'Yes': 1, 'No': 0})
df['is_recommended'] = df['Recommended Purchase'].map({'Yes': 1, 'No': 0})

# Handle numeric features
df['upvotes'] = pd.to_numeric(df['Review Upvotes'], errors='coerce').fillna(0)
df['downvotes'] = pd.to_numeric(df['Review Downvotes'], errors='coerce').fillna(0)
df['helpfulness_score'] = np.where(df['upvotes'] + df['downvotes'] > 0,
                                 df['upvotes'] / (df['upvotes'] + df['downvotes']),
                                 0)

## 3. Further preprocessing before model training

### 3.1. Text preprocessing of the reviews

In [8]:
# Text preprocessing functions
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return ' '.join(text.split())

def remove_stopwords(text):
    if not isinstance(text, str):
        return ""
    stop_words = set(stopwords.words('english'))
    words = text.split()
    return ' '.join([w for w in words if w not in stop_words])

def lemmatize_text(text):
    try:
        lemmatizer = WordNetLemmatizer()
        words = word_tokenize(text)
        pos_tags = nltk.pos_tag(words)

        def get_wordnet_pos(tag):
            tag = tag[0].upper()
            tag_dict = {
                "J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
            }
            return tag_dict.get(tag, wordnet.NOUN)

        lemmatized_words = []
        for word, tag in pos_tags:
            pos = get_wordnet_pos(tag)
            lemma = lemmatizer.lemmatize(word.lower(), pos)
            lemmatized_words.append(lemma)

        return ' '.join(lemmatized_words)
    except Exception as e:
        words = word_tokenize(text)
        return ' '.join([lemmatizer.lemmatize(word.lower(), wordnet.VERB) for word in words])

def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

In [9]:
# Process reviews using swifter to save time and have progress bar
df['processed_review'] = df['Review'].swifter.apply(preprocess_text)

Pandas Apply:   0%|          | 0/24617 [00:00<?, ?it/s]

Saving the cleaned dataset

In [10]:
df.to_csv("walmart_reviews_cleaned.csv", index=False)
print("Cleaned data saved to walmart_reviews_cleaned.csv")

Cleaned data saved to walmart_reviews_cleaned.csv


### 3.2. Feature extraction

In [11]:
# Feature extraction functions
def extract_text_features(text):
    blob = TextBlob(str(text))
    vader = SentimentIntensityAnalyzer()
    vader_scores = vader.polarity_scores(str(text))

    return {
        'polarity': blob.sentiment.polarity,
        'subjectivity': blob.sentiment.subjectivity,
        'compound_score': vader_scores['compound'],
        'review_length': len(text),
        'caps_ratio': sum(1 for c in text if c.isupper()) / len(text),
        'exclamation_count': text.count('!'),
        'question_count': text.count('?')
    }

def extract_topic_features(text):
    text = text.lower()
    return {
        'has_customer_service': int(any(word in text for word in ['service', 'support', 'customer'])),
        'has_delivery_issue': int(any(word in text for word in ['delivery', 'shipping', 'late'])),
        'has_account_issue': int(any(word in text for word in ['account', 'login', 'password'])),
        'has_refund_issue': int(any(word in text for word in ['refund', 'return', 'money']))
    }

In [12]:
# Extract features
text_features = df['Review'].swifter.apply(extract_text_features).apply(pd.Series)
topic_features = df['processed_review'].swifter.apply(extract_topic_features).apply(pd.Series)

Pandas Apply:   0%|          | 0/24617 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/24617 [00:00<?, ?it/s]

In [13]:
# Combine features
df = pd.concat([df, text_features, topic_features], axis=1)

## 4. Prepare data for modeling

#### 4.1. Create sentiment labels

* pd.cut() categorizes the continuous compound scores into discrete bins

* The bins are defined as:

-1 to -0.1: Negative sentiment (label: 0)

-0.1 to 0.1: Neutral sentiment (label: 1)

0.1 to 1: Positive sentiment (label: 2)

In [14]:
df['sentiment'] = pd.cut(
    df['compound_score'],
    bins=[-1, -0.1, 0.1, 1],
    labels=[0, 1, 2]
)

In [28]:
df

Unnamed: 0,Uniq Id,Crawl Timestamp,Pageurl,Website,Title,Rating,Review,Reviewer Name,Review Upvotes,Review Downvotes,...,compound_score,review_length,caps_ratio,exclamation_count,question_count,has_customer_service,has_delivery_issue,has_account_issue,has_refund_issue,sentiment
0,1800878dcfa5255c691f2a87ce8b2869,2020-05-16 17:58:33 +0000,https://www.walmart.com/reviews/product/36907838,walmart.com,,1.0,One star for looking nice. That’s it. After tr...,PJ10,0,0,...,-0.6988,188.0,0.031915,1.0,0.0,0,0,0,1,0
1,ac06ca1b77310d539c9a74eed7a8a8e2,2020-04-17 01:05:43 +0000,https://www.walmart.com/reviews/product/708236785,walmart.com,,4.0,Love this phone so far have had it almost a mo...,KelJo,9,3,...,0.9390,190.0,0.021053,4.0,0.0,0,0,0,0,2
2,9c4b10fff4e5e2cc7e2e1bbed76e2220,2020-06-08 10:35:05 +0000,https://www.walmart.com/reviews/product/831078728,walmart.com,,4.0,This TV is absolutely fantastic. This is my th...,TheBigChalupa11,0,0,...,0.9624,1669.0,0.032355,0.0,0.0,0,0,0,0,2
3,b5a5dc8de9fc0468aab961f097b56091,2020-04-13 03:57:29 +0000,https://www.walmart.com/reviews/product/40712755,walmart.com,,5.0,"Refurb, good shape, good price, does what I wa...",Micalic,0,0,...,0.7269,58.0,0.034483,0.0,0.0,0,0,0,0,2
4,430433b52b882e540c16363d448e012b,2020-05-19 03:52:42 +0000,https://www.walmart.com/reviews/product/46597970,walmart.com,,5.0,Very nice tablet! Looks brand new. Fired right...,jcan74,0,0,...,0.8789,116.0,0.043103,1.0,0.0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29991,8d751eb9c8ed7cebcd0451861b1abe09,2020-04-13 01:22:43 +0000,https://www.walmart.com/reviews/product/46573964,walmart.com,,5.0,I'm pleased to say that the overall style fit ...,Quanzo,0,0,...,0.9558,235.0,0.017021,0.0,0.0,0,0,0,0,2
29992,355d86361ed106ae2e3168c75f501f34,2020-06-11 09:59:04 +0000,https://www.walmart.com/reviews/product/27299919,walmart.com,Roku 1 Streaming Player,5.0,My son got it for me for christmas i love it i...,jksbell19,0,0,...,0.6369,64.0,0.015625,0.0,0.0,0,0,0,0,2
29994,ed0fc14441b566c17cdf6a7d52ba1959,2020-05-18 00:56:22 +0000,https://www.walmart.com/reviews/product/680155633,walmart.com,,1.0,Ive had my t.v. since xmas. First thing that w...,Erin,0,0,...,-0.8682,575.0,0.019130,0.0,0.0,0,0,0,0,0
29995,5a19ee1daaf4c1b4d9459afaafe61e7e,2020-06-06 08:28:55 +0000,https://www.walmart.com/reviews/product/786202247,walmart.com,,5.0,"Good value, quality product, easy to setup, no...",TX17,0,0,...,0.8576,58.0,0.017241,0.0,0.0,0,0,0,0,2


In [29]:
df.to_csv("walmart_reviews_cleaned_2.csv", index=False)
print("Cleaned data saved to walmart_reviews_cleaned_2.csv")

Cleaned data saved to walmart_reviews_cleaned_2.csv


In [17]:
oh yeah

SyntaxError: invalid syntax (ipython-input-17-1749835563.py, line 1)

#### 4.2. Initialize vectorizer

* TF-IDF Vectorizer is a technique used in natural language processing (NLP) and information retrieval to convert text data into numerical form, which can then be used for machine learning models. TF-IDF stands for Term Frequency-Inverse Document Frequency, and it is a statistical measure that reflects the importance of a word in a document relative to a collection of documents (corpus).

* How TF-IDF Vectorizer Works:
    * The TF-IDF Vectorizer takes a corpus of text documents and converts them into a matrix of TF-IDF features.
    * Each row in the matrix represents a document, and each column represents a term (word) from the corpus.
    * The value in each cell is the TF-IDF score of the term for that document.

In [30]:
vectorizer = TfidfVectorizer(
    max_features=3500,      # Keep only top 3500 most frequent words
    ngram_range=(1, 3),     # Use unigrams, bigrams, and trigrams
    min_df=2,               # Ignore terms that appear in less than 2 documents
    max_df=0.95,           # Ignore terms that appear in more than 95% of documents
    strip_accents='unicode', # Remove accents
    use_idf=True,          # Enable inverse-document-frequency reweighting
    smooth_idf=True,       # Add 1 to document frequencies to prevent division by zero
    sublinear_tf=True      # Apply sublinear scaling to term frequencies (1 + log(tf))
)

In [31]:
text_features = vectorizer.fit_transform(df['processed_review']).toarray()

In [32]:
# Combine with your existing numeric/topic features
numeric_features = df[
    [
        'has_customer_service',
        'has_delivery_issue',
        'has_account_issue',
        'has_refund_issue',
        'review_length',
        'caps_ratio',
        'exclamation_count',
        'question_count'
    ]
].values

In [33]:
# Final feature matrix and labels
X = np.hstack([text_features, numeric_features])
y = df['sentiment']

In [34]:
print(df['sentiment'].value_counts(normalize=True))

sentiment
2    0.804647
0    0.121908
1    0.073445
Name: proportion, dtype: float64


In [35]:
# 4.3 Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [36]:
from sklearn.preprocessing import StandardScaler

# 4.3 Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Assume TF-IDF is first part of X, numeric_features second
tfidf_dim = text_features.shape[1]

scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[:, tfidf_dim:])
X_test_num  = scaler.transform(X_test[:, tfidf_dim:])

# Reassemble feature matrices
X_train = np.hstack([X_train[:, :tfidf_dim], X_train_num])
X_test  = np.hstack([X_test[:, :tfidf_dim], X_test_num])

In [25]:
import joblib

# Save vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
print("✅ Saved TF‑IDF vectorizer → tfidf_vectorizer.joblib")

# Save scaler
joblib.dump(scaler, "numeric_scaler.joblib")
print("✅ Saved numeric scaler → numeric_scaler.joblib")

✅ Saved TF‑IDF vectorizer → tfidf_vectorizer.joblib
✅ Saved numeric scaler → numeric_scaler.joblib


In [37]:
# 4.4 Train and evaluate multiple classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(
    solver="saga",           # better for large sparse data
    penalty="l2",
    C=1.0,
    max_iter=2000,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)


classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

print("Logistic Regression Performance")
print(f"Accuracy: {accuracy:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

In [None]:
joblib.dump(classifier, "logistic_regression.joblib")
print("✅ Saved Logistic Regression → logistic_regression.joblib")

In [None]:
from sklearn.ensemble import RandomForestClassifier

classifier2 = RandomForestClassifier(
    n_estimators=300,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)


classifier2.fit(X_train, y_train)

y_pred = classifier2.predict(X_test)
y_pred_proba = classifier2.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

print("Random Forest Performance")
print(f"Accuracy: {accuracy:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

In [None]:
joblib.dump(classifier2, "random_forest.joblib")
print("✅ Saved Random Forest → random_forest.joblib")


In [38]:
from xgboost import XGBClassifier

classifier3 = XGBClassifier(
    tree_method="hist",        # fast histogram grower
    n_estimators=200,          # boosting rounds
    max_depth=6,               # tree depth
    learning_rate=0.1,         # shrinkage
    subsample=0.8,             # row sampling
    colsample_bytree=0.8,      # feature sampling
    scale_pos_weight=(y_train == 0).sum() / (y_train == 2).sum(),
    # balances pos/neg if binary; for multiclass can omit or tune per-class
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42
)
classifier3.fit(X_train, y_train)


y_pred = classifier3.predict(X_test)
y_pred_proba = classifier3.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

print("XGBoost Performance")
print(f"Accuracy: {accuracy:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



XGBoost Performance
Accuracy: 0.879
F1 Score: 0.869
ROC AUC: 0.930


In [39]:
joblib.dump(classifier3, "xgboost_classifier.joblib")
print("✅ Saved XGBoost → xgboost_classifier.joblib")


✅ Saved XGBoost → xgboost_classifier.joblib


In [None]:
from catboost import CatBoostClassifier

classifier4 = CatBoostClassifier(
    iterations=300,            # boosting rounds
    depth=6,                   # tree depth
    learning_rate=0.1,
    l2_leaf_reg=3,             # L2 regularization
    class_weights=[(y_train == 0).sum(), (y_train == 1).sum(), (y_train == 2).sum()],
    # balances all three classes
    verbose=False,
    random_seed=42
)
classifier4.fit(X_train, y_train)


y_pred = classifier4.predict(X_test)
y_pred_proba = classifier4.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

print("CatBoost Performance")
print(f"Accuracy: {accuracy:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

In [None]:
joblib.dump(classifier4, "catboost_classifier.joblib")
print("✅ Saved CatBoost → catboost_classifier.joblib")


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

classifier5 = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    min_samples_split=5,
    random_state=42
    # no class_weight parameter—use oversampling or sample_weight if needed
)
classifier5.fit(X_train, y_train)


y_pred = classifier5.predict(X_test)
y_pred_proba = classifier5.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

print("Gradient Boosting Performance")
print(f"Accuracy: {accuracy:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

In [None]:
joblib.dump(classifier5, "gradient_boosting.joblib")
print("✅ Saved Gradient Boosting → gradient_boosting.joblib")

In [None]:
from sklearn.ensemble import AdaBoostClassifier

classifier6 = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.5,
    random_state=42
)
classifier6.fit(X_train, y_train)


y_pred  = classifier6.predict(X_test)
y_proba = classifier6.predict_proba(X_test)

print("AdaBoost")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score:  {f1_score(y_test, y_pred, average='weighted'):.3f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba, multi_class='ovr'):.3f}")

In [None]:
joblib.dump(classifier6, "adaboost_classifier.joblib")
print("✅ Saved AdaBoost → adaboost_classifier.joblib")

In [None]:
from sklearn.svm import SVC

classifier7 = SVC(
    kernel="rbf",
    C=1.0,
    gamma="scale",
    probability=True,
    class_weight="balanced",  # handle imbalance
    max_iter=2000,
    random_state=42
)
classifier7.fit(X_train, y_train)


y_pred  = classifier7.predict(X_test)
y_proba = classifier7.predict_proba(X_test)

print("SVC")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score:  {f1_score(y_test, y_pred, average='weighted'):.3f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba, multi_class='ovr'):.3f}")

In [None]:
joblib.dump(classifier7, "svc_classifier.joblib")
print("✅ Saved SVC → svc_classifier.joblib")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

classifier8 = KNeighborsClassifier(
    n_neighbors=5,
    weights="distance",
    leaf_size=30,
    metric="minkowski",
    p=2
)
classifier8.fit(X_train, y_train)


y_pred  = classifier8.predict(X_test)
y_proba = classifier8.predict_proba(X_test)

print("KNN Classifier")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score:  {f1_score(y_test, y_pred, average='weighted'):.3f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba, multi_class='ovr'):.3f}")

In [None]:
joblib.dump(classifier8, "knn_classifier.joblib")
print("✅ Saved KNN → knn_classifier.joblib")

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

classifier9 = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced",  # handle imbalance
    random_state=42,
    n_jobs=-1
)
classifier9.fit(X_train, y_train)


y_pred  = classifier9.predict(X_test)
y_proba = classifier9.predict_proba(X_test)

print("🔹 Extra Trees")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score:  {f1_score(y_test, y_pred, average='weighted'):.3f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba, multi_class='ovr'):.3f}")

In [None]:
joblib.dump(classifier9, "extra_trees_classifier.joblib")
print("✅ Saved Extra Trees → extra_trees_classifier.joblib")

In [None]:
!pip -q install lightgbm

from lightgbm import LGBMClassifier

classifier10 = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",  # auto‑adjust weights
    random_state=42,
    n_jobs=-1
)
classifier10.fit(X_train, y_train)

y_pred  = classifier10.predict(X_test)
y_proba = classifier10.predict_proba(X_test)

print("🔹 LightGBM")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score:  {f1_score(y_test, y_pred, average='weighted'):.3f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba, multi_class='ovr'):.3f}")

In [None]:
joblib.dump(classifier10, "lightgbm_classifier.joblib")
print("✅ Saved LightGBM → lightgbm_classifier.joblib")

# The Best Model which is ot be used is XGBoost Classifier