## Here we will do the Feature Engineering(Data Preprocessing) and Model Training Experiments

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot 

In [32]:
# load the data
df = pd.read_csv("wine.csv")

In [33]:
df.head()

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,8
1,white,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,6
2,red,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,5
3,white,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,6
4,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7


- **convert the quality into 3 different groups**
    - `quality > 7` => `Premium(2)`
    - `5 <= quality <= 6` => `Standard(1)`
    - otherwise `quality` => `Basic(0)`

In [34]:
def wine_label(q):
    if q > 7:
        return "premium" # label encode => 2 
    elif 5 <= q <= 6:
        return "standard" # label encode => 1 
    else:
        return "basic" # label encode => 0 

df["quality"] = df["quality"].apply(wine_label)

In [35]:
df.head()

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,premium
1,white,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,standard
2,red,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,standard
3,white,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,standard
4,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,basic


In [36]:
#  wine type is not related to quality. Drop this column
df.drop(columns = ['wine type'] , axis = 1 , inplace = True)

In [37]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,premium
1,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,standard
2,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,standard
3,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,standard
4,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,basic


In [38]:
# column citric acid values with greater than 0.95 is outlier remove all where value crosses the thresold 
df[df['citric acid'] > 0.95].shape

(9, 12)

In [39]:
# column citric acid values with greater than 0.95 is outlier remove all where value crosses the thresold 
df = df[df['citric acid'] <= 0.95]

In [40]:
# residual sugar column has outlier, remove all rows where values are greater than 25
df[df['residual sugar'] > 25]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3671,7.8,0.965,0.6,65.8,0.074,8.0,160.0,1.03898,3.39,0.69,11.7,standard
4701,7.9,0.33,0.28,31.6,0.053,35.0,176.0,1.0103,3.15,0.38,8.8,standard
5479,6.8,0.45,0.28,26.05,0.031,27.0,122.0,1.00295,3.06,0.42,10.6,standard
5857,7.9,0.33,0.28,31.6,0.053,35.0,176.0,1.0103,3.15,0.38,8.8,standard
6180,6.8,0.45,0.28,26.05,0.031,27.0,122.0,1.00295,3.06,0.42,10.6,standard


In [41]:
# residual sugar column has outlier, remove all rows where values are greater than 25
df = df[df['residual sugar'] <= 25]

In [42]:
df.shape

(6483, 12)

In [43]:
# total sulfur dioxide has outlier 
df = df[df['total sulfur dioxide'] <= 280]

In [44]:
# fixed acidity is almost same for all quality , also correlation is -0.08 not really correlated 
df = df.drop(columns=["fixed acidity"])

In [45]:
df.head()

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,premium
1,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,standard
2,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,standard
3,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,standard
4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,basic


In [46]:
df.shape

(6475, 11)

In [48]:
# `density` is not changing for `quality` not an usefull column , drop this column
df = df.drop(columns = ['density'])

In [49]:
# pH is not changing for quality not an usefull column drop this column 
df = df.drop(columns = ['pH'])

In [51]:
# quality is not really depending on sulphates, drop this column 
df = df.drop(columns = ['sulphates'])

In [52]:
df.shape

(6475, 8)

In [53]:
df.shape

(6475, 8)

In [54]:
# seprate X and y 
X = df.drop("quality" , axis = 1)
y = df['quality']

In [55]:
X.shape

(6475, 7)

In [56]:
len(y)

6475

### Now we have to do the transformation

In [77]:
X.columns

Index(['volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
       'free sulfur dioxide', 'total sulfur dioxide', 'alcohol'],
      dtype='object')

In [86]:
log_cols = ['volatile acidity' , 'residual sugar', 'chlorides']
sqrt_cols = ['free sulfur dioxide']
other_cols = [col for col in X.columns if col not in log_cols + sqrt_cols] 

In [87]:
other_cols

['citric acid', 'total sulfur dioxide', 'alcohol']

In [88]:
from sklearn.preprocessing import FunctionTransformer , StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [89]:
# define log transformers  
log_pipeline = Pipeline(steps = [
    ("log" , FunctionTransformer(np.log1p , validate = False)) , 
    ("scaler" , StandardScaler())
])

In [90]:
# define sqrt transformer with scaling 
sqrt_pipeline = Pipeline(steps = [
    ("sqrt" , FunctionTransformer(np.sqrt , validate = False)), 
    ("scaler" , StandardScaler())
])

In [91]:
# define a scaler for others column
scaler_pipeline = Pipeline(steps = [
    ("scaler" , StandardScaler())
])

In [92]:
# define the column transformer 
preprocessor = ColumnTransformer(
    transformers = [
        ("log" , log_pipeline , log_cols),
        ("sqrt" , sqrt_pipeline , sqrt_cols),
        ("others" , scaler_pipeline , other_cols)
    ]
)

In [93]:
# transformed X 
X_transformed = preprocessor.fit_transform(X)

In [94]:
X_transformed

array([[-1.04508324, -1.09330029, -0.61256543, ..., -0.05248694,
        -0.7213929 ,  0.51033366],
       [ 0.41355607, -1.15593226, -0.64438931, ..., -1.73979984,
        -0.23928402,  0.09093582],
       [ 1.38776663, -0.91996732,  8.54797192, ..., -0.26340105,
        -1.0606547 , -0.99949859],
       ...,
       [ 0.31934393,  0.71483978, -0.2645162 , ..., -0.54461987,
         0.63565432, -1.08337816],
       [-0.5428721 , -0.76491569, -0.45390539, ...,  0.50995069,
        -1.88202538,  1.68464764],
       [-1.11924998,  1.19971103, -0.54900971, ...,  0.72086481,
         1.18918673, -1.33501687]])

In [95]:
set(y)

{'basic', 'premium', 'standard'}

In [96]:
# now encode the target y 
mapping = {"basic": 0, "standard": 1, "premium": 2}
y_encoded = y.map(mapping)

In [98]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X_transformed , y_encoded , test_size = 0.2 , random_state = 42)
X_train.shape , X_test.shape

((5180, 7), (1295, 7))

In [99]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [100]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def evaluate_clf(true, predicted, probs = None, n_classes = 3): # n_classes = 3 cause we have 3 classes
    """
    Evaluate multi-class classification metrics.
    
    Parameters:
    - true: true labels
    - predicted: predicted labels
    - probs: predicted probabilities for roc_auc (optional)
    - n_classes: number of classes
    """
    
    acc = accuracy_score(true , predicted)
    f1 = f1_score(true , predicted , average = "macro")
    precision = precision_score(true , predicted , average = "macro")
    recall = recall_score(true , predicted , average = "macro")
    
    # ROC-AUC for multi-class requires probability estimates
    if probs is not None:
        roc_auc = roc_auc_score(true , probs , multi_class = "ovr", average = "macro")
    else:
        roc_auc = None
    
    return acc, f1, precision, recall, roc_auc

In [101]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
     "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(), 
     "CatBoosting Classifier": CatBoostClassifier(verbose=False),
     "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [104]:
def evaluate_models(X_train , y_train , X_test , y_test , models, n_classes = 3):
    
    models_list = []
    accuracy_list = []
    auc_list = []

    for model_name, model in models.items():
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Predicted probabilities for ROC-AUC (if available)
        try:
            y_train_probs = model.predict_proba(X_train)
            y_test_probs = model.predict_proba(X_test)
        except:
            y_train_probs = None
            y_test_probs = None

        # Evaluate training set
        model_train_accuracy, model_train_f1, model_train_precision, model_train_recall, model_train_rocauc_score = \
            evaluate_clf(y_train, y_train_pred, probs=y_train_probs, n_classes=n_classes)
        
        # Evaluate test set
        model_test_accuracy, model_test_f1, model_test_precision, model_test_recall, model_test_rocauc_score = \
            evaluate_clf(y_test, y_test_pred, probs=y_test_probs, n_classes=n_classes)

        # Print report
        print(f"Model: {model_name}")
        models_list.append(model_name)
        
        print('Model performance for Training set')
        print(f"- Accuracy: {model_train_accuracy:.4f}")
        print(f"- F1 score: {model_train_f1:.4f}")
        print(f"- Precision: {model_train_precision:.4f}")
        print(f"- Recall: {model_train_recall:.4f}")
        print(f"- ROC AUC Score: {model_train_rocauc_score}")
        print('----------------------------------')
        
        print('Model performance for Test set')
        print(f"- Accuracy: {model_test_accuracy:.4f}")
        accuracy_list.append(model_test_accuracy)
        print(f"- F1 score: {model_test_f1:.4f}")
        print(f"- Precision: {model_test_precision:.4f}")
        print(f"- Recall: {model_test_recall:.4f}")
        print(f"- ROC AUC Score: {model_test_rocauc_score}")
        auc_list.append(model_test_rocauc_score)
        print('='*35 + '\n')

    # Summary report
    report = pd.DataFrame({
        'Model Name': models_list,
        'Accuracy': accuracy_list,
        'ROC-AUC': auc_list
    }).sort_values(by='Accuracy', ascending=False)
    
    return report


In [105]:
report = evaluate_models(X_train = X_train , X_test = X_test , y_train = y_train , y_test = y_test , models = models)

Model: Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.8463
- F1 score: 0.6557
- Precision: 0.8189
- Recall: 0.5883
- ROC AUC Score: 0.8686243638961395

Model: Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.7900
- F1 score: 0.6097
- Precision: 0.5969
- Recall: 0.6249
- ROC AUC Score: 0.725138118406055

Model: Gradient Boosting
Model performance for Training set
- Accuracy: 0.8172
- F1 score: 0.5819
- Precision: 0.8477
- Recall: 0.5166
- ROC AUC Score: 0.8850304254636812
----------------------------------
Model performance for Test set
- Accuracy: 0.8046
- F1 score: 0.4903
- Precision: 0.7347
- Recall: 0.4508
- ROC AUC Score: 0.8029230110

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Model: K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8429
- F1 score: 0.6435
- Precision: 0.7774
- Recall: 0.5912
- ROC AUC Score: 0.9303720778456751
----------------------------------
Model performance for Test set
- Accuracy: 0.7730
- F1 score: 0.4757
- Precision: 0.5222
- Recall: 0.4607
- ROC AUC Score: 0.7582322477066953

Model: XGBClassifier
Model performance for Training set
- Accuracy: 0.9815
- F1 score: 0.9805
- Precision: 0.9898
- Recall: 0.9722
- ROC AUC Score: 0.9991225391317289
----------------------------------
Model performance for Test set
- Accuracy: 0.8278
- F1 score: 0.6443
- Precision: 0.7700
- Recall: 0.5853
- ROC AUC Score: 0.8435895614889937

Model: CatBoosting Classifier
Model performance for Training set
- Accuracy: 0.9465
- F1 score: 0.9192
- Precision: 0.9721
- Recall: 0.8781
- ROC AUC Score: 0.9922694792316715
----------------------------------
Model performance for Test set
- Accuracy: 0.8309
- F1 score: 0.6421
- Precision: 0.8158
-

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Model: Support Vector Classifier
Model performance for Training set
- Accuracy: 0.7774
- F1 score: 0.3563
- Precision: 0.4549
- Recall: 0.3678
- ROC AUC Score: None
----------------------------------
Model performance for Test set
- Accuracy: 0.7938
- F1 score: 0.3694
- Precision: 0.4951
- Recall: 0.3752
- ROC AUC Score: None

Model: AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.7689
- F1 score: 0.3910
- Precision: 0.4219
- Recall: 0.3908
- ROC AUC Score: 0.7511587508360301
----------------------------------
Model performance for Test set
- Accuracy: 0.7861
- F1 score: 0.4072
- Precision: 0.4343
- Recall: 0.4044
- ROC AUC Score: 0.7361654570157702



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [106]:
report

Unnamed: 0,Model Name,Accuracy,ROC-AUC
0,Random Forest,0.846332,0.868624
6,CatBoosting Classifier,0.830888,0.847122
5,XGBClassifier,0.827799,0.84359
2,Gradient Boosting,0.804633,0.802923
7,Support Vector Classifier,0.793822,
1,Decision Tree,0.789961,0.725138
8,AdaBoost Classifier,0.7861,0.736165
3,Logistic Regression,0.780695,0.73495
4,K-Neighbors Classifier,0.772973,0.758232


In [107]:
y_train.value_counts()

quality
1    3954
0    1073
2     153
Name: count, dtype: int64

In [108]:
# doing stratified sampling 
X_train, X_test, y_train, y_test = train_test_split(X_transformed , y_encoded , test_size = 0.2 , random_state = 42 , stratify = y)

In [109]:
def evaluate_clf(true, predicted, probs = None , n_classes = 3):
    """
    Evaluate multi-class classification metrics with class imbalance handling.
    """
    acc = accuracy_score(true, predicted)
    f1 = f1_score(true , predicted , average = "weighted" , zero_division = 0)
    precision = precision_score(true, predicted, average = "weighted" , zero_division = 0)
    recall = recall_score(true, predicted, average = "weighted", zero_division = 0)
    
    if probs is not None:
        roc_auc = roc_auc_score(true, probs, multi_class = "ovr", average = "weighted")
    else:
        roc_auc = None
    
    return acc, f1, precision, recall, roc_auc

In [110]:
def evaluate_models(X_train, y_train, X_test, y_test, models, n_classes=3):
    models_list = []
    accuracy_list = []
    auc_list = []

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Probabilities for ROC-AUC (if available)
        try:
            y_train_probs = model.predict_proba(X_train)
            y_test_probs = model.predict_proba(X_test)
        except AttributeError:
            y_train_probs = None
            y_test_probs = None

        # Evaluate training set
        model_train_accuracy, model_train_f1, model_train_precision, model_train_recall, model_train_rocauc_score = \
            evaluate_clf(y_train, y_train_pred, probs=y_train_probs, n_classes=n_classes)
        
        # Evaluate test set
        model_test_accuracy, model_test_f1, model_test_precision, model_test_recall, model_test_rocauc_score = \
            evaluate_clf(y_test, y_test_pred, probs=y_test_probs, n_classes=n_classes)

        # Print report
        print(f"Model: {model_name}")
        models_list.append(model_name)
        
        print('Model performance for Training set')
        print(f"- Accuracy: {model_train_accuracy:.4f}")
        print(f"- F1 score: {model_train_f1:.4f}")
        print(f"- Precision: {model_train_precision:.4f}")
        print(f"- Recall: {model_train_recall:.4f}")
        print(f"- ROC AUC Score: {model_train_rocauc_score}")
        print('----------------------------------')
        
        print('Model performance for Test set')
        print(f"- Accuracy: {model_test_accuracy:.4f}")
        accuracy_list.append(model_test_accuracy)
        print(f"- F1 score: {model_test_f1:.4f}")
        print(f"- Precision: {model_test_precision:.4f}")
        print(f"- Recall: {model_test_recall:.4f}")
        print(f"- ROC AUC Score: {model_test_rocauc_score}")
        auc_list.append(model_test_rocauc_score)
        print('='*35 + '\n')

    # Summary report
    report = pd.DataFrame({
        'Model Name': models_list,
        'Accuracy': accuracy_list,
        'ROC-AUC': auc_list
    }).sort_values(by='Accuracy', ascending=False)
    
    return report

In [113]:
report2 = evaluate_models(X_train = X_train , X_test = X_test , y_train = y_train , y_test = y_test , models = models , n_classes = 3)

Model: Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.8541
- F1 score: 0.8410
- Precision: 0.8476
- Recall: 0.8541
- ROC AUC Score: 0.8817346353248153

Model: Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.7792
- F1 score: 0.7797
- Precision: 0.7804
- Recall: 0.7792
- ROC AUC Score: 0.7117068468684861

Model: Gradient Boosting
Model performance for Training set
- Accuracy: 0.8180
- F1 score: 0.7827
- Precision: 0.8118
- Recall: 0.8180
- ROC AUC Score: 0.8605840497672708
----------------------------------
Model performance for Test set
- Accuracy: 0.7815
- F1 score: 0.7399
- Precision: 0.7471
- Recall: 0.7815
- ROC AUC Score: 0.789352217

In [114]:
report2

Unnamed: 0,Model Name,Accuracy,ROC-AUC
0,Random Forest,0.854054,0.881735
5,XGBClassifier,0.836293,0.845883
6,CatBoosting Classifier,0.827027,0.848516
2,Gradient Boosting,0.781467,0.789352
1,Decision Tree,0.779151,0.711707
7,Support Vector Classifier,0.773745,
3,Logistic Regression,0.766795,0.720589
4,K-Neighbors Classifier,0.765251,0.755894
8,AdaBoost Classifier,0.760618,0.702743


## Take top-3 model and do hyper-parameter tuning

In [115]:
#Initialize few parameter for Hyperparamter tuning

# XGBoost parameters
xgboost_params = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200, 300]
}

# Random Forest parameters
rf_params = {
    "max_depth": [10, 12, None, 15, 20],
    "max_features": ['sqrt', 'log2', None],
    "n_estimators": [50, 100, 200, 300],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# CatBoost parameters
catboost_params = {
    "depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "iterations": [100, 200, 300],
    "l2_leaf_reg": [1, 3, 5, 7]
} 

In [123]:
# Models list for Hyperparameter tuning
randomcv_models = [
    ('XGBoost', XGBClassifier(use_label_encoder = False , eval_metric = 'mlogloss' , random_state = 42), xgboost_params),
    ('RandomForest', RandomForestClassifier(random_state = 42), rf_params),
    ('CatBoost', CatBoostClassifier(verbose = False, random_state = 42), catboost_params)
]

In [124]:
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train) 
class_weights = compute_class_weight(class_weight = "balanced", classes = classes, y = y_train)
class_weights_dict = {cls: weight for cls, weight in zip(classes, class_weights)}

In [125]:
class_weights_dict

{0: 1.6351010101010102, 1: 0.4353672886199361, 2: 10.928270042194093}

In [126]:
# sample weights for XGBoost 
sample_weights = np.array([class_weights_dict[y] for y in y_train])

## Do the tuning

In [127]:
from sklearn.model_selection import RandomizedSearchCV 

best_models = {}  # to store the best estimators
best_params = {}  # to store the best parameters

for model_name, model_obj, params in randomcv_models:
    print(f"\nRunning RandomizedSearchCV for {model_name}...")
    
    random_search = RandomizedSearchCV(
        estimator = model_obj,
        param_distributions = params,
        n_iter = 20,        # number of parameter combinations to try
        cv = 5,
        verbose = 3,
        n_jobs = -1,
        random_state = 42
    )

    # Handle class imbalance
    if model_name == "XGBoost":
        random_search.fit(X_train, y_train, **{'sample_weight': sample_weights})
    elif model_name == "RandomForest":
        random_search.estimator.set_params(class_weight='balanced')
        random_search.fit(X_train, y_train)
    else:  # CatBoost
        random_search.fit(X_train, y_train)

    best_models[model_name] = random_search.best_estimator_
    best_params[model_name] = random_search.best_params_
    print(f"Best parameters for {model_name}: {random_search.best_params_}")



Running RandomizedSearchCV for XGBoost...
Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters for XGBoost: {'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.1}

Running RandomizedSearchCV for RandomForest...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters for RandomForest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 20}

Running RandomizedSearchCV for CatBoost...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters for CatBoost: {'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 10}


In [129]:
tuned_report = evaluate_models(X_train = X_train , X_test = X_test , y_train = y_train , y_test = y_test , models = best_models , n_classes = 3)

Unnamed: 0,Model Name,Accuracy,ROC-AUC
1,RandomForest,0.849421,0.879195
2,CatBoost,0.827027,0.847939
0,XGBoost,0.802317,0.822229


In [133]:
from sklearn.model_selection import GridSearchCV 

best_models = {}  # to store the best estimators
best_params = {}  # to store the best parameters

for model_name, model_obj, params in randomcv_models:
    print(f"\nRunning GridSearchCV for {model_name}...")
    
    grid = GridSearchCV(
        estimator = model_obj,
        param_grid = params,
        cv = 3,
        n_jobs = -1,
        verbose = 5
    )

    if model_name == "XGBoost":
        # Pass sample_weight for XGBoost
        grid.fit(X_train, y_train, **{'sample_weight': sample_weights})
    elif model_name == "RandomForest":
        # Pass class_weight for RF automatically
        grid.estimator.set_params(class_weight='balanced')
        grid.fit(X_train, y_train)
    else:
        # CatBoost
        grid.fit(X_train, y_train)

    best_models[model_name] = random_search.best_estimator_
    best_params[model_name] = random_search.best_params_
    print(f"Best parameters for {model_name}: {random_search.best_params_}")



Running GridSearchCV for XGBoost...
Fitting 3 folds for each of 192 candidates, totalling 576 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters for XGBoost: {'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 10}

Running GridSearchCV for RandomForest...
Fitting 3 folds for each of 540 candidates, totalling 1620 fits
Best parameters for RandomForest: {'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 10}

Running GridSearchCV for CatBoost...
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Best parameters for CatBoost: {'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 10}


In [134]:
tuned_report2 = evaluate_models(X_train = X_train , X_test = X_test , y_train = y_train , y_test = y_test , models = best_models , n_classes = 3)

Model: XGBoost
Model performance for Training set
- Accuracy: 0.9392
- F1 score: 0.9361
- Precision: 0.9416
- Recall: 0.9392
- ROC AUC Score: 0.9856259908491014
----------------------------------
Model performance for Test set
- Accuracy: 0.8270
- F1 score: 0.8064
- Precision: 0.8164
- Recall: 0.8270
- ROC AUC Score: 0.847938790632712

Model: RandomForest
Model performance for Training set
- Accuracy: 0.9392
- F1 score: 0.9361
- Precision: 0.9416
- Recall: 0.9392
- ROC AUC Score: 0.9856259908491014
----------------------------------
Model performance for Test set
- Accuracy: 0.8270
- F1 score: 0.8064
- Precision: 0.8164
- Recall: 0.8270
- ROC AUC Score: 0.847938790632712

Model: CatBoost
Model performance for Training set
- Accuracy: 0.9392
- F1 score: 0.9361
- Precision: 0.9416
- Recall: 0.9392
- ROC AUC Score: 0.9856259908491014
----------------------------------
Model performance for Test set
- Accuracy: 0.8270
- F1 score: 0.8064
- Precision: 0.8164
- Recall: 0.8270
- ROC AUC Score:

In [135]:
tuned_report2

Unnamed: 0,Model Name,Accuracy,ROC-AUC
0,XGBoost,0.827027,0.847939
1,RandomForest,0.827027,0.847939
2,CatBoost,0.827027,0.847939


## For Multi-Class we got best model as Random Forest Classifier with best accuracy 85%