In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv("wine.csv")
df.head()

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,8
1,white,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,6
2,red,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,5
3,white,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,6
4,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7


In [3]:
df.drop(columns = ['wine type' , 'fixed acidity' , 'sulphates' , "pH"] , axis = 1 , inplace = True)

In [4]:
# column citric acid values with greater than 0.95 is outlier remove all where value crosses the thresold 
df = df[df['citric acid'] <= 0.95]

In [5]:
# residual sugar column has outlier, remove all rows where values are greater than 25
df = df[df['residual sugar'] <= 25]

In [6]:
# total sulfur dioxide has outlier 
df = df[df['total sulfur dioxide'] <= 280]

In [7]:
# `density` is not changing for `quality` not an usefull column , drop this column
df.drop(columns = ['density'] , axis = 1 , inplace = True)

In [10]:
drop_quality = [3 , 9]
# drop these rows where quality is in drop_quality
df = df[~df['quality'].isin(drop_quality)]

In [28]:
# [0 1 2 3 4] ->  [4 5 6 7 8]
mapper = {
    4 : 0 , 5 : 1 , 6 : 2 , 7 : 3 , 8 : 4
}
df['quality'] = df['quality'].map(mapper)

In [29]:
# seprate X and y 
X = df.drop("quality" , axis = 1)
y = df['quality']

In [30]:
from collections import Counter

Counter(y)

Counter({2: 2824, 1: 2133, 3: 1078, 0: 215, 4: 193})

In [31]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X , y , stratify = y , test_size = 0.2 , random_state = 42)

In [32]:
log_cols = ['volatile acidity' , 'residual sugar', 'chlorides']
sqrt_cols = ['free sulfur dioxide']
other_cols = [col for col in X.columns if col not in log_cols + sqrt_cols] 

In [33]:
from sklearn.preprocessing import FunctionTransformer , StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [34]:
# define log transformers  
log_pipeline = Pipeline(steps = [
    ("log" , FunctionTransformer(np.log1p , validate = False)) , 
    ("scaler" , StandardScaler())
])

In [35]:
# define sqrt transformer with scaling 
sqrt_pipeline = Pipeline(steps = [
    ("sqrt" , FunctionTransformer(np.sqrt , validate = False)), 
    ("scaler" , StandardScaler())
])

In [36]:
# define a scaler for others column
scaler_pipeline = Pipeline(steps = [
    ("scaler" , StandardScaler())
])

In [37]:
# define the column transformer 
preprocessor = ColumnTransformer(
    transformers = [
        ("log" , log_pipeline , log_cols),
        ("sqrt" , sqrt_pipeline , sqrt_cols),
        ("others" , scaler_pipeline , other_cols)
    ]
)

In [38]:
# do the preprocessing for train data 
X_train_transformed = preprocessor.fit_transform(X_train)

In [39]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [40]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(multi_class = "multinomial", solver = "lbfgs", max_iter = 500),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(eval_metric="mlogloss", use_label_encoder=False),
    "CatBoosting Classifier": CatBoostClassifier(verbose = False),
    "Support Vector Classifier": SVC(probability = True),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [41]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline

In [42]:
def evaluate_clf(true, predicted, predicted_proba=None):
    acc = accuracy_score(true, predicted)  
    f1 = f1_score(true, predicted, average="weighted")  
    precision = precision_score(true, predicted, average="weighted")
    recall = recall_score(true, predicted, average="weighted")
    
    # ROC-AUC only works with probability scores + needs `multi_class` flag
    roc_auc = None
    if predicted_proba is not None:
        try:
            roc_auc = roc_auc_score(true, predicted_proba, multi_class="ovr", average="weighted")
        except Exception:
            roc_auc = None  # In case some models don’t support probability outputs
    
    return acc, f1, precision, recall, roc_auc


In [43]:
def model_evaluation(X_train, y_train, model_dict): 
    results = [] 
    
    for model_name, model_obj in model_dict.items(): 
        print(f"--------------------------- Model: {model_name} ----------------------")

        # Pipeline with SMOTEENN + model
        pipeline = Pipeline(steps=[
            ("smoteen", SMOTEENN(random_state=42, sampling_strategy="minority")),
            (model_name, model_obj)
        ])

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        scores = cross_val_score(
            pipeline, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1
        )

        results.append({
            "Model Name": model_name,
            "Mean CV (Accuracy)": scores.mean(),
            "Std CV": scores.std()
        })

        print(results[-1])

    report = pd.DataFrame(results).sort_values(
        by=["Mean CV (Balanced Accuracy)"], ascending=False
    ).reset_index(drop=True)

    return report

In [None]:
base_model_report = model_evaluation(X_train = X_train_transformed , y_train = y_train , model_dict = models)

--------------------------- Model: Random Forest ----------------------
{'Model Name': 'Random Forest', 'Mean CV (Accuracy)': 0.4635200060267627, 'Std CV': 0.012736823569282608}
--------------------------- Model: Decision Tree ----------------------
{'Model Name': 'Decision Tree', 'Mean CV (Accuracy)': 0.4528513178834763, 'Std CV': 0.011383011226525203}
--------------------------- Model: Gradient Boosting ----------------------
{'Model Name': 'Gradient Boosting', 'Mean CV (Accuracy)': 0.4375205522021225, 'Std CV': 0.017648512155452434}
--------------------------- Model: Logistic Regression ----------------------
{'Model Name': 'Logistic Regression', 'Mean CV (Accuracy)': 0.2574698897290782, 'Std CV': 0.003952925653408192}
--------------------------- Model: K-Neighbors Classifier ----------------------
{'Model Name': 'K-Neighbors Classifier', 'Mean CV (Accuracy)': 0.4062847833661352, 'Std CV': 0.008587278812715182}
--------------------------- Model: XGBClassifier ----------------------


1 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "E:\end-to-end-machine-learning-project\End-To-End-Wine-Quality-Prediction\wineEnv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\end-to-end-machine-learning-project\End-To-End-Wine-Quality-Prediction\wineEnv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\end-to-end-machine-learning-project\End-To-End-Wine-Quality-Prediction\wineEnv\Lib\site-packages\imblea

{'Model Name': 'XGBClassifier', 'Mean CV (Accuracy)': nan, 'Std CV': nan}
--------------------------- Model: CatBoosting Classifier ----------------------
