## Here we will test for multi-class but just with random forest.(Extend of EX-01)

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot 

In [2]:
# load the data
df = pd.read_csv("wine.csv")

In [3]:
df.head()

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,8
1,white,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,6
2,red,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,5
3,white,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,6
4,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7


In [4]:
def wine_label(q):
    if q > 7:
        return "premium" # label encode => 2 
    elif 5 <= q <= 6:
        return "standard" # label encode => 1 
    else:
        return "basic" # label encode => 0 

df["quality"] = df["quality"].apply(wine_label)

In [5]:
#  wine type is not related to quality. Drop this column
df.drop(columns = ['wine type'] , axis = 1 , inplace = True)

In [6]:
# column citric acid values with greater than 0.95 is outlier remove all where value crosses the thresold 
df[df['citric acid'] > 0.95].shape

(9, 12)

In [7]:
# column citric acid values with greater than 0.95 is outlier remove all where value crosses the thresold 
df = df[df['citric acid'] <= 0.95]

In [8]:
# residual sugar column has outlier, remove all rows where values are greater than 25
df[df['residual sugar'] > 25]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3671,7.8,0.965,0.6,65.8,0.074,8.0,160.0,1.03898,3.39,0.69,11.7,standard
4701,7.9,0.33,0.28,31.6,0.053,35.0,176.0,1.0103,3.15,0.38,8.8,standard
5479,6.8,0.45,0.28,26.05,0.031,27.0,122.0,1.00295,3.06,0.42,10.6,standard
5857,7.9,0.33,0.28,31.6,0.053,35.0,176.0,1.0103,3.15,0.38,8.8,standard
6180,6.8,0.45,0.28,26.05,0.031,27.0,122.0,1.00295,3.06,0.42,10.6,standard


In [9]:
# residual sugar column has outlier, remove all rows where values are greater than 25
df = df[df['residual sugar'] <= 25]

In [10]:
# total sulfur dioxide has outlier 
df = df[df['total sulfur dioxide'] <= 280]

In [11]:
# fixed acidity is almost same for all quality , also correlation is -0.08 not really correlated 
df = df.drop(columns=["fixed acidity"])

In [12]:
# `density` is not changing for `quality` not an usefull column , drop this column
df = df.drop(columns = ['density'])

In [13]:
# pH is not changing for quality not an usefull column drop this column 
df = df.drop(columns = ['pH'])

In [14]:
# quality is not really depending on sulphates, drop this column 
df = df.drop(columns = ['sulphates'])

In [15]:
# seprate X and y 
X = df.drop("quality" , axis = 1)
y = df['quality']

## Now we have to do the transformation

In [16]:
log_cols = ['volatile acidity' , 'residual sugar', 'chlorides']
sqrt_cols = ['free sulfur dioxide']
other_cols = [col for col in X.columns if col not in log_cols + sqrt_cols] 

In [17]:
from sklearn.preprocessing import FunctionTransformer , StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [18]:
# define log transformers  
log_pipeline = Pipeline(steps = [
    ("log" , FunctionTransformer(np.log1p , validate = False)) , 
    ("scaler" , StandardScaler())
])

In [19]:
# define sqrt transformer with scaling 
sqrt_pipeline = Pipeline(steps = [
    ("sqrt" , FunctionTransformer(np.sqrt , validate = False)), 
    ("scaler" , StandardScaler())
])

In [20]:
# define a scaler for others column
scaler_pipeline = Pipeline(steps = [
    ("scaler" , StandardScaler())
])

In [21]:
# define the column transformer 
preprocessor = ColumnTransformer(
    transformers = [
        ("log" , log_pipeline , log_cols),
        ("sqrt" , sqrt_pipeline , sqrt_cols),
        ("others" , scaler_pipeline , other_cols)
    ]
)

In [22]:
# transformed X 
X_transformed = preprocessor.fit_transform(X)

In [23]:
# now encode the target y 
mapping = {"basic": 0, "standard": 1, "premium": 2}
y_encoded = y.map(mapping)

In [24]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X_transformed , y_encoded , test_size = 0.2 , random_state = 42)
X_train.shape , X_test.shape

((5180, 7), (1295, 7))

### Handle Imbalance Issue

In [28]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from collections import Counter

In [29]:
smote = SMOTE(random_state = 42)
X_train_res , y_train_res = smote.fit_resample(X_train , y_train)

In [30]:
print("Before:", Counter(y_train))
print("After :", Counter(y_train_res))

Before: Counter({1: 3954, 0: 1073, 2: 153})
After : Counter({1: 3954, 0: 3954, 2: 3954})


In [31]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def evaluate_clf(true, predicted, probs = None, n_classes = 3): # n_classes = 3 cause we have 3 classes
    """
    Evaluate multi-class classification metrics.
    
    Parameters:
    - true: true labels
    - predicted: predicted labels
    - probs: predicted probabilities for roc_auc (optional)
    - n_classes: number of classes
    """
    
    acc = accuracy_score(true , predicted)
    f1 = f1_score(true , predicted , average = "macro")
    precision = precision_score(true , predicted , average = "macro")
    recall = recall_score(true , predicted , average = "macro")
    
    # ROC-AUC for multi-class requires probability estimates
    if probs is not None:
        roc_auc = roc_auc_score(true , probs , multi_class = "ovr", average = "macro")
    else:
        roc_auc = None
    
    return acc, f1, precision, recall, roc_auc

In [33]:
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [34]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBClassifier": XGBClassifier(), 
     "CatBoosting Classifier": CatBoostClassifier(verbose = False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [39]:
def evaluate_models(X_train , y_train , X_test , y_test , models  , n_classes = 3): 
    # store model and metrices
    models_list = []
    accuracy_list = []
    f1_list = []
    auc_list = []

    # go to each base model 
    for model_name , model_obj in models.items(): 
        # train the model 
        model_obj.fit(X_train , y_train) 
        
        # predict 
        y_train_pred = model_obj.predict(X_train)
        y_test_pred = model_obj.predict(X_test)
        
        # Predicted probabilities for ROC-AUC (if available)
        try:
            y_train_probs = model_obj.predict_proba(X_train)
            y_test_probs = model_obj.predict_proba(X_test)
        except:
            y_train_probs = None
            y_test_probs = None
        
        # Evaluate training set
        model_train_accuracy, model_train_f1, model_train_precision, model_train_recall, model_train_rocauc_score = \
            evaluate_clf(y_train, y_train_pred, probs=y_train_probs, n_classes = n_classes)
       
        # Evaluate test set
        model_test_accuracy, model_test_f1, model_test_precision, model_test_recall, model_test_rocauc_score = \
            evaluate_clf(y_test, y_test_pred, probs = y_test_probs, n_classes = n_classes)
        
        # Print report
        print(f"Model: {model_name}")
        models_list.append(model_name)
        
        print('Model performance for Training set')
        print(f"- Accuracy: {model_train_accuracy:.4f}")
        print(f"- F1 score: {model_train_f1:.4f}")
        print(f"- Precision: {model_train_precision:.4f}")
        print(f"- Recall: {model_train_recall:.4f}")
        print(f"- ROC AUC Score: {model_train_rocauc_score}")
        print('----------------------------------')
        
        print('Model performance for Test set')
        print(f"- Accuracy: {model_test_accuracy:.4f}")
        
        print(f"- F1 score: {model_test_f1:.4f}")
        print(f"- Precision: {model_test_precision:.4f}")
        print(f"- Recall: {model_test_recall:.4f}")
        print(f"- ROC AUC Score: {model_test_rocauc_score}")
        
        print('='*35 + '\n')

        # store metrices 
        accuracy_list.append(model_test_accuracy) 
        auc_list.append(model_test_rocauc_score)
    # report summary
    # Summary report
    report = pd.DataFrame({
        'Model Name': models_list,
        'Accuracy': accuracy_list,
        'ROC-AUC': auc_list
    }).sort_values(by='Accuracy', ascending=False)
    
    return report

In [40]:
base_model_report = evaluate_models(
    X_train = X_train_res , y_train = y_train_res , X_test = X_test , y_test = y_test , models = models , 
)

Model: Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.8046
- F1 score: 0.6276
- Precision: 0.6137
- Recall: 0.6473
- ROC AUC Score: 0.8752543161238963

Model: Gradient Boosting
Model performance for Training set
- Accuracy: 0.7685
- F1 score: 0.7647
- Precision: 0.7666
- Recall: 0.7685
- ROC AUC Score: 0.9160381906679812
----------------------------------
Model performance for Test set
- Accuracy: 0.6934
- F1 score: 0.5036
- Precision: 0.4833
- Recall: 0.5893
- ROC AUC Score: 0.7880048886031089

Model: XGBClassifier
Model performance for Training set
- Accuracy: 0.9864
- F1 score: 0.9864
- Precision: 0.9865
- Recall: 0.9864
- ROC AUC Score: 0.9994446440080144
----------------------------------
Model performance for Test set
- Accuracy: 0.8000
- F1 score: 0.6068
- Precision: 0.6103
- Recall: 0.6068
- ROC AUC Sco

## Not Looking Good, Skip