In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv("wine.csv")
df.head()

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,8
1,white,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,6
2,red,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,5
3,white,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,6
4,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7


In [3]:
df.drop(columns = ['wine type' , 'fixed acidity' , 'sulphates' , "pH"] , axis = 1 , inplace = True)

In [4]:
df.head()

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,alcohol,quality
0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,11.1,8
1,0.395,0.07,1.3,0.035,26.0,102.0,0.992,10.6,6
2,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,9.3,5
3,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,11.7,6
4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,12.0,7


In [5]:
def wine_label(q):
    if q > 5:
        return 1 
    else: 
        return 0
df["quality"] = df["quality"].apply(wine_label)

In [7]:
# column citric acid values with greater than 0.95 is outlier remove all where value crosses the thresold 
df = df[df['citric acid'] <= 0.95]

In [8]:
# residual sugar column has outlier, remove all rows where values are greater than 25
df = df[df['residual sugar'] <= 25]

In [9]:
# total sulfur dioxide has outlier 
df = df[df['total sulfur dioxide'] <= 280]

In [10]:
# `density` is not changing for `quality` not an usefull column , drop this column
df.drop(columns = ['density'] , axis = 1 , inplace = True)

In [11]:
# seprate X and y 
X = df.drop("quality" , axis = 1)
y = df['quality']

In [12]:
from collections import Counter

Counter(y)

Counter({1: 4100, 0: 2375})

In [13]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X , y , stratify = y , test_size = 0.2 , random_state = 42)

In [14]:
X_train.shape

(5180, 7)

In [15]:
X_test.shape

(1295, 7)

## Data preprocessing

In [16]:
log_cols = ['volatile acidity' , 'residual sugar', 'chlorides']
sqrt_cols = ['free sulfur dioxide']
other_cols = [col for col in X.columns if col not in log_cols + sqrt_cols] 

In [17]:
from sklearn.preprocessing import FunctionTransformer , StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [18]:
# define log transformers  
log_pipeline = Pipeline(steps = [
    ("log" , FunctionTransformer(np.log1p , validate = False)) , 
    ("scaler" , StandardScaler())
])

In [19]:
# define sqrt transformer with scaling 
sqrt_pipeline = Pipeline(steps = [
    ("sqrt" , FunctionTransformer(np.sqrt , validate = False)), 
    ("scaler" , StandardScaler())
])

In [20]:
# define a scaler for others column
scaler_pipeline = Pipeline(steps = [
    ("scaler" , StandardScaler())
])

In [21]:
# define the column transformer 
preprocessor = ColumnTransformer(
    transformers = [
        ("log" , log_pipeline , log_cols),
        ("sqrt" , sqrt_pipeline , sqrt_cols),
        ("others" , scaler_pipeline , other_cols)
    ]
)

In [22]:
# check the imbalance class issue 
Counter(y_train)

Counter({1: 3280, 0: 1900})

In [23]:
# do the preprocessing for train data 
X_train_transformed = preprocessor.fit_transform(X_train)

In [27]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [25]:
def evaluate_clf(true, predicted):
    acc = accuracy_score(true, predicted) # Calculate Accuracy
    f1 = f1_score(true, predicted) # Calculate F1-score
    precision = precision_score(true, predicted) # Calculate Precision
    recall = recall_score(true, predicted)  # Calculate Recall
    roc_auc = roc_auc_score(true, predicted) #Calculate Roc
    return acc, f1 , precision, recall, roc_auc

In [28]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
     "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(), 
     "CatBoosting Classifier": CatBoostClassifier(verbose=False),
     "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [38]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import StratifiedKFold , cross_val_score
from imblearn.pipeline import Pipeline

def model_evaluation(X_train , y_train , model_dict): 
    results = [] 
    
    for model_name , model_obj in model_dict.items(): 
        print(f"--------------------------- model name: {model_name} ----------------------")
        # make the pipeline 
        pipeline = Pipeline(steps = [
            ("smoteen" , SMOTEENN(random_state = 42 , sampling_strategy = "minority")), 
            (model_name , model_obj)
        ])

        cv = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
        scores = cross_val_score(
            pipeline , X_train , y_train , cv = cv , scoring = 'accuracy' , n_jobs = -1
        )
        
        # store mean and std of CV scores
        results.append({
            "Model Name": model_name,
            "Mean CV (Balanced Accuracy)": scores.mean(),
            "Std CV": scores.std()
        })

        print(results[-1])
    # make dataframe
    report = pd.DataFrame(results).sort_values(
        by=["Mean CV (Balanced Accuracy)"], ascending=False
    ).reset_index(drop=True)

    return report

In [39]:
base_model_report = model_evaluation(X_train = X_train_transformed , y_train = y_train , model_dict = models)

--------------------------- model name: Random Forest ----------------------
{'Model Name': 'Random Forest', 'Mean CV (Balanced Accuracy)': 0.7420849420849421, 'Std CV': 0.01511966351289674}
--------------------------- model name: Decision Tree ----------------------
{'Model Name': 'Decision Tree', 'Mean CV (Balanced Accuracy)': 0.7162162162162163, 'Std CV': 0.006603332577794238}
--------------------------- model name: Gradient Boosting ----------------------
{'Model Name': 'Gradient Boosting', 'Mean CV (Balanced Accuracy)': 0.7305019305019306, 'Std CV': 0.016600071680853812}
--------------------------- model name: Logistic Regression ----------------------
{'Model Name': 'Logistic Regression', 'Mean CV (Balanced Accuracy)': 0.7027027027027029, 'Std CV': 0.013107525248469457}
--------------------------- model name: K-Neighbors Classifier ----------------------
{'Model Name': 'K-Neighbors Classifier', 'Mean CV (Balanced Accuracy)': 0.726061776061776, 'Std CV': 0.007755717925328462}
----



{'Model Name': 'CatBoosting Classifier', 'Mean CV (Balanced Accuracy)': 0.7467181467181467, 'Std CV': 0.012490213219785192}
--------------------------- model name: Support Vector Classifier ----------------------
{'Model Name': 'Support Vector Classifier', 'Mean CV (Balanced Accuracy)': 0.7212355212355213, 'Std CV': 0.011508771089412253}
--------------------------- model name: AdaBoost Classifier ----------------------
{'Model Name': 'AdaBoost Classifier', 'Mean CV (Balanced Accuracy)': 0.7142857142857143, 'Std CV': 0.011206892103243228}


In [40]:
base_model_report

Unnamed: 0,Model Name,Mean CV (Balanced Accuracy),Std CV
0,CatBoosting Classifier,0.746718,0.01249
1,XGBClassifier,0.746332,0.0157
2,Random Forest,0.742085,0.01512
3,Gradient Boosting,0.730502,0.0166
4,K-Neighbors Classifier,0.726062,0.007756
5,Support Vector Classifier,0.721236,0.011509
6,Decision Tree,0.716216,0.006603
7,AdaBoost Classifier,0.714286,0.011207
8,Logistic Regression,0.702703,0.013108


In [41]:
base_model_report2 = model_evaluation(X_train = X_train , y_train = y_train , model_dict = models)

--------------------------- model name: Random Forest ----------------------
{'Model Name': 'Random Forest', 'Mean CV (Balanced Accuracy)': 0.7355212355212355, 'Std CV': 0.010287963393416404}
--------------------------- model name: Decision Tree ----------------------
{'Model Name': 'Decision Tree', 'Mean CV (Balanced Accuracy)': 0.694980694980695, 'Std CV': 0.011711004546797778}
--------------------------- model name: Gradient Boosting ----------------------
{'Model Name': 'Gradient Boosting', 'Mean CV (Balanced Accuracy)': 0.7247104247104248, 'Std CV': 0.011062972032350902}
--------------------------- model name: Logistic Regression ----------------------
{'Model Name': 'Logistic Regression', 'Mean CV (Balanced Accuracy)': 0.6897683397683397, 'Std CV': 0.01664715145738232}
--------------------------- model name: K-Neighbors Classifier ----------------------
{'Model Name': 'K-Neighbors Classifier', 'Mean CV (Balanced Accuracy)': 0.6183397683397683, 'Std CV': 0.01053141443782693}
-----

In [42]:
base_model_report2

Unnamed: 0,Model Name,Mean CV (Balanced Accuracy),Std CV
0,CatBoosting Classifier,0.737259,0.011573
1,Random Forest,0.735521,0.010288
2,XGBClassifier,0.734556,0.009398
3,Gradient Boosting,0.72471,0.011063
4,AdaBoost Classifier,0.710039,0.019484
5,Decision Tree,0.694981,0.011711
6,Logistic Regression,0.689768,0.016647
7,K-Neighbors Classifier,0.61834,0.010531
8,Support Vector Classifier,0.583977,0.011454


In [48]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [44]:
# Define pipelines
xgb_pipeline = Pipeline(steps=[
    ("smoteen", SMOTEENN(random_state=42, sampling_strategy="minority")),
    ("clf", XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42))
])

rf_pipeline = Pipeline(steps=[
    ("smoteen", SMOTEENN(random_state=42, sampling_strategy="minority")),
    ("clf", RandomForestClassifier(random_state=42))
])

In [45]:
# Parameter grids (note the "clf__" prefix for pipeline parameters)
xgboost_params = {
    'clf__max_depth': range(3, 10, 2),
    'clf__min_child_weight': range(1, 6, 2),
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'clf__n_estimators': [50, 100, 200, 300]
}

rf_params = {
    "clf__max_depth": [10, 12, None, 15, 20],
    "clf__max_features": ['sqrt', 'log2', None],
    "clf__n_estimators": [50, 100, 200, 300],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4]
}


In [46]:
# Cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [50]:
# RandomizedSearchCV for Random Forest
rf_random_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions={
        "max_depth": [10, 12, None, 15, 20],
        "max_features": ['sqrt', 'log2', None],
        "n_estimators": [50, 100, 200, 300],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    },
    n_iter=20,  # number of random combinations to try
    cv=cv,
    scoring="balanced_accuracy",  # better for imbalanced data
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit search
rf_random_search.fit(X_train, y_train)

# Results
print("Best RF params:", rf_random_search.best_params_)
print("Best RF CV Score:", rf_random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


ValueError: Invalid parameter 'n_estimators' for estimator Pipeline(steps=[('smoteen',
                 SMOTEENN(random_state=42, sampling_strategy='minority')),
                ('clf', RandomForestClassifier(random_state=42))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [None]:
# GridSearch for XGBoost
xgb_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgboost_params,
    cv=cv,
    scoring='balanced_accuracy',  
    n_jobs=-1,
    verbose=2
)

xgb_search.fit(X_train, y_train)
print("Best XGBoost params:", xgb_search.best_params_)
print("Best XGBoost CV Score:", xgb_search.best_score_)