In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [3]:
df = pd.read_csv('raw/data.csv',sep="\t")

In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:


# Create a copy to work on
df_clean = df.copy()

# Remove outliers column by column using IQR
for col in df_clean.select_dtypes(include=['float64', 'int64']).columns:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df_clean.shape}")


Original shape: (1599, 12)
Cleaned shape: (1124, 12)


In [6]:
X = df_clean.drop(columns=['quality'],axis=1)
y=df_clean['quality']

In [7]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [8]:
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1124, dtype: int64

In [9]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((899, 11), (225, 11))

In [10]:
sc = StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)
X_train
X_test

array([[-0.59865106,  1.3013592 , -1.36390779, ...,  0.65544819,
        -0.2527394 , -1.02087587],
       [-0.39136362,  1.63976256, -0.18948521, ..., -1.58739073,
        -1.67782896, -1.23492824],
       [-0.11498036,  0.59378855,  0.48161341, ..., -0.11794454,
        -0.43087559,  0.37046458],
       ...,
       [-0.87503432, -0.75982486, -0.02171055, ..., -0.50464091,
        -0.43087559, -0.37871874],
       [ 0.50688198, -1.86732675,  1.32048669, ...,  0.50076964,
        -0.87621608, -1.02087587],
       [-0.04588454, -1.19052004,  1.09678715, ...,  0.19141255,
        -0.1636713 ,  0.37046458]], shape=(225, 11))

In [11]:
from sklearn.preprocessing import LabelEncoder

# Fit LabelEncoder on full y set
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define classification models
class_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "SVC": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    
    "CatBoost": CatBoostClassifier(verbose=False)
}

# Lists to store results
class_model_names = []
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
confusion_matrices = []

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report


def evaluate_classification_basic(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)
    return acc, f1, precision, recall, cm




for name, model in class_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc,f1,precision,recall,cm= evaluate_classification_basic(y_test, y_pred)
    
    class_model_names.append(name)
    precision_scores.append(precision)
    recall_scores.append(recall)
    accuracy_scores.append(acc)
    f1_scores.append(f1)
    confusion_matrices.append(cm)

    
    print(f"📌 {name}")
    print("- Accuracy: {:.4f}".format(acc))
    print("- F1 Score: {:.4f}".format(f1))
    print("- precision_Score: {:.4f}".format(precision))
    print("- recall_score: {:.4f}".format(recall))
    print("- Confusion Matrix:\n", cm)
    print("- Classification Report:\n", classification_report(y_test, y_pred))
    print("="*40)


    

📌 Logistic Regression
- Accuracy: 0.6133
- F1 Score: 0.5968
- precision_Score: 0.5922
- recall_score: 0.6133
- Confusion Matrix:
 [[ 0  3  2  0]
 [ 0 81 21  0]
 [ 1 39 51  6]
 [ 0  0 15  6]]
- Classification Report:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         5
           5       0.66      0.79      0.72       102
           6       0.57      0.53      0.55        97
           7       0.50      0.29      0.36        21

    accuracy                           0.61       225
   macro avg       0.43      0.40      0.41       225
weighted avg       0.59      0.61      0.60       225

📌 Decision Tree
- Accuracy: 0.6044
- F1 Score: 0.6104
- precision_Score: 0.6225
- recall_score: 0.6044
- Confusion Matrix:
 [[ 2  2  1  0]
 [ 7 65 25  5]
 [ 0 28 56 13]
 [ 0  0  8 13]]
- Classification Report:
               precision    recall  f1-score   support

           4       0.22      0.40      0.29         5
           5       0.68     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📌 Random Forest
- Accuracy: 0.6933
- F1 Score: 0.6837
- precision_Score: 0.6752
- recall_score: 0.6933
- Confusion Matrix:
 [[ 0  3  2  0]
 [ 0 82 20  0]
 [ 0 25 63  9]
 [ 0  0 10 11]]
- Classification Report:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         5
           5       0.75      0.80      0.77       102
           6       0.66      0.65      0.66        97
           7       0.55      0.52      0.54        21

    accuracy                           0.69       225
   macro avg       0.49      0.49      0.49       225
weighted avg       0.68      0.69      0.68       225

📌 Gradient Boosting
- Accuracy: 0.6889
- F1 Score: 0.6812
- precision_Score: 0.6787
- recall_score: 0.6889
- Confusion Matrix:
 [[ 0  3  2  0]
 [ 1 83 18  0]
 [ 1 28 58 10]
 [ 0  0  7 14]]
- Classification Report:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         5
           5       0.73      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📌 AdaBoost
- Accuracy: 0.6311
- F1 Score: 0.5907
- precision_Score: 0.5572
- recall_score: 0.6311
- Confusion Matrix:
 [[ 0  3  2  0]
 [ 0 84 18  0]
 [ 0 38 58  1]
 [ 0  0 21  0]]
- Classification Report:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         5
           5       0.67      0.82      0.74       102
           6       0.59      0.60      0.59        97
           7       0.00      0.00      0.00        21

    accuracy                           0.63       225
   macro avg       0.31      0.36      0.33       225
weighted avg       0.56      0.63      0.59       225

📌 SVC
- Accuracy: 0.6400
- F1 Score: 0.6220
- precision_Score: 0.6160
- recall_score: 0.6400
- Confusion Matrix:
 [[ 0  3  2  0]
 [ 0 83 19  0]
 [ 0 36 55  6]
 [ 0  1 14  6]]
- Classification Report:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         5
           5       0.67      0.81      0.74      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Define regression models

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    def evaluate_model(true, predicted):
        mae = mean_absolute_error(true, predicted)
        mse = mean_squared_error(true, predicted)
        rmse = np.sqrt(mean_squared_error(true, predicted))
        r2_square = r2_score(true, predicted)
        return mae, rmse, r2_square




    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.5820
- Mean Absolute Error: 0.4650
- R2 Score: 0.3677
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5325
- Mean Absolute Error: 0.4344
- R2 Score: 0.3992


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.7320
- Mean Absolute Error: 0.6413
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.6879
- Mean Absolute Error: 0.6092
- R2 Score: -0.0025


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.5820
- Mean Absolute Error: 0.4650
- R2 Score: 0.3677
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5325
- Mean Absolute Error: 0.4344
- R2 Score: 0.3993


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.4876
- Mean Absolute Error: 0.3691
- R2 Score: 0.5563
----------------------