# Machine Learning Assignment 2


In [24]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# XGBoost 
from xgboost import XGBClassifier


# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

# Visualization style
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")


Libraries imported successfully!


---

## Loading the Dataset
## Exploratory Data Analysis (EDA)
## Preprocessing


In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ==============================
# Load Wine Quality dataset (red wine)
# ==============================
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=';')

# ==============================
# Q1: Dataset size, missing values, and feature types
# ==============================
print("=" * 60)
print("Q1: Dataset Examination")
print("=" * 60)

# 1. Dataset Size
print("\n1. Dataset Size:")
print(f"   Total rows: {data.shape[0]}  Total columns: {data.shape[1]}")

# 2. Missing Values
print("\n2. Missing Values:")
missing_values = data.isnull().sum()
if missing_values.sum() == 0:
    print("    No missing values found")
else:
    print(missing_values[missing_values > 0])

# 3. Feature Names (instead of float64 types)
print("\n3. Feature Names:")
feature_names = list(data.drop("quality", axis=1).columns)
print("    Features:", feature_names)
print("    Target column: 'quality'")

# 4. First few rows of data
print("\n4. First few rows of dataset:")
print(data.head())

# ==============================
# Preprocessing: Features, Target, Split, Scale
# ==============================
X = data.drop('quality', axis=1)   # Features
y = data['quality']                # Target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("\nPreprocessing complete:")
print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")
print(f"Features: {X_train.shape[1]}")


Q1: Dataset Examination

1. Dataset Size:
   Total rows: 1599  Total columns: 12

2. Missing Values:
    No missing values found

3. Feature Names:
    Features: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
    Target column: 'quality'

4. First few rows of dataset:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51    

---

## Regression Model




In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import train_test_split
import pandas as pd

# ==============================
# Logistic Regression Model
# ==============================
print("=" * 60)
print("Logistic Regression Model")
print("=" * 60)

# Split into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f"\nTraining set: {X_train_split.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Features: {X_train_split.shape[1]}")

# Build Logistic Regression model
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_split, y_train_split)

# Predictions
y_train_pred = log_reg.predict(X_train_split)
y_val_pred = log_reg.predict(X_val)
y_val_proba = log_reg.predict_proba(X_val)

# Metrics
train_acc = accuracy_score(y_train_split, y_train_pred)
val_acc = accuracy_score(y_val, y_val_pred)
val_auc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')
val_precision = precision_score(y_val, y_val_pred, average='weighted')
val_recall = recall_score(y_val, y_val_pred, average='weighted')
val_f1 = f1_score(y_val, y_val_pred, average='weighted')
val_mcc = matthews_corrcoef(y_val, y_val_pred)

print("\n" + "="*60)
print("Logistic Regression Results:")
print("="*60)
print(f"Training Accuracy : {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Validation AUC     : {val_auc:.4f}")
print(f"Validation Precision: {val_precision:.4f}")
print(f"Validation Recall   : {val_recall:.4f}")
print(f"Validation F1       : {val_f1:.4f}")
print(f"Validation MCC      : {val_mcc:.4f}")

# Store results for comparison
results_log_reg = pd.DataFrame({
    'Model': ['Logistic Regression'],
    'Accuracy': [val_acc],
    'AUC': [val_auc],
    'Precision': [val_precision],
    'Recall': [val_recall],
    'F1 Score': [val_f1],
    'MCC': [val_mcc]
})

print("\nLogistic Regression model trained and evaluated")
print(results_log_reg)


Logistic Regression Model

Training set: 1023 samples
Validation set: 256 samples
Features: 11

Logistic Regression Results:
Training Accuracy : 0.6109
Validation Accuracy: 0.5820
Validation AUC     : 0.8012
Validation Precision: 0.5651
Validation Recall   : 0.5820
Validation F1       : 0.5670
Validation MCC      : 0.3255

Logistic Regression model trained and evaluated
                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.582031  0.801206   0.565149  0.582031  0.567013   

        MCC  
0  0.325498  


---

## Desicion Tree Classifier


In [43]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
import pandas as pd

# ==============================
# Decision Tree Classifier
# ==============================
print("=" * 60)
print("Decision Tree Classifier")
print("=" * 60)

# Build and train model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predictions
y_test_pred_dt = dt_model.predict(X_test)
y_test_proba_dt = dt_model.predict_proba(X_test)

# Metrics (evaluated on test data)
acc_dt = accuracy_score(y_test, y_test_pred_dt)
auc_dt = roc_auc_score(y_test, y_test_proba_dt, multi_class='ovr')
precision_dt = precision_score(y_test, y_test_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_test_pred_dt, average='weighted')
f1_dt = f1_score(y_test, y_test_pred_dt, average='weighted')
mcc_dt = matthews_corrcoef(y_test, y_test_pred_dt)

print("\n" + "="*60)
print("Decision Tree Results:")
print("="*60)
print(f"Accuracy   : {acc_dt:.4f}")
print(f"AUC        : {auc_dt:.4f}")
print(f"Precision  : {precision_dt:.4f}")
print(f"Recall     : {recall_dt:.4f}")
print(f"F1 Score   : {f1_dt:.4f}")
print(f"MCC        : {mcc_dt:.4f}")

# Store results (standardized schema)
results_dt = pd.DataFrame({
    'Model': ['Decision Tree'],
    'Accuracy': [acc_dt],
    'AUC': [auc_dt],
    'Precision': [precision_dt],
    'Recall': [recall_dt],
    'F1 Score': [f1_dt],
    'MCC': [mcc_dt]
})

print("\nDecision Tree model trained and evaluated")
print(results_dt)


Decision Tree Classifier

Decision Tree Results:
Accuracy   : 0.6094
AUC        : 0.6584
Precision  : 0.6121
Recall     : 0.6094
F1 Score   : 0.6095
MCC        : 0.3982

Decision Tree model trained and evaluated
           Model  Accuracy       AUC  Precision    Recall  F1 Score       MCC
0  Decision Tree  0.609375  0.658352   0.612092  0.609375  0.609477  0.398241


---

## K-Nearest Neighbor Classifier


In [44]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
import pandas as pd

# ==============================
# KNN Classifier
# ==============================
print("=" * 60)
print("KNN Classifier")
print("=" * 60)

# Build and train model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predictions
y_test_pred_knn = knn_model.predict(X_test)
y_test_proba_knn = knn_model.predict_proba(X_test)

# Metrics (evaluated on test data)
acc_knn = accuracy_score(y_test, y_test_pred_knn)
auc_knn = roc_auc_score(y_test, y_test_proba_knn, multi_class='ovr')
precision_knn = precision_score(y_test, y_test_pred_knn, average='weighted')
recall_knn = recall_score(y_test, y_test_pred_knn, average='weighted')
f1_knn = f1_score(y_test, y_test_pred_knn, average='weighted')
mcc_knn = matthews_corrcoef(y_test, y_test_pred_knn)

print("\n" + "="*60)
print("KNN Results:")
print("="*60)
print(f"Accuracy   : {acc_knn:.4f}")
print(f"AUC        : {auc_knn:.4f}")
print(f"Precision  : {precision_knn:.4f}")
print(f"Recall     : {recall_knn:.4f}")
print(f"F1 Score   : {f1_knn:.4f}")
print(f"MCC        : {mcc_knn:.4f}")

# Store results (standardized schema)
results_knn = pd.DataFrame({
    'Model': ['KNN'],
    'Accuracy': [acc_knn],
    'AUC': [auc_knn],
    'Precision': [precision_knn],
    'Recall': [recall_knn],
    'F1 Score': [f1_knn],
    'MCC': [mcc_knn]
})

print("\nKNN model trained and evaluated")
print(results_knn)


KNN Classifier

KNN Results:
Accuracy   : 0.6094
AUC        : 0.6983
Precision  : 0.5841
Recall     : 0.6094
F1 Score   : 0.5959
MCC        : 0.3733

KNN model trained and evaluated
  Model  Accuracy       AUC  Precision    Recall  F1 Score       MCC
0   KNN  0.609375  0.698329   0.584116  0.609375  0.595887  0.373313


---

## Naive Bayes Classifier - Gaussian or Multinomial


In [45]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
import pandas as pd

# ==============================
# Naive Bayes Classifier
# ==============================
print("=" * 60)
print("Naive Bayes Classifier")
print("=" * 60)

# Build and train model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predictions
y_test_pred_nb = nb_model.predict(X_test)
y_test_proba_nb = nb_model.predict_proba(X_test)

# Metrics (evaluated on test data)
acc_nb = accuracy_score(y_test, y_test_pred_nb)
auc_nb = roc_auc_score(y_test, y_test_proba_nb, multi_class='ovr')
precision_nb = precision_score(y_test, y_test_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_test_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_test_pred_nb, average='weighted')
mcc_nb = matthews_corrcoef(y_test, y_test_pred_nb)

print("\n" + "="*60)
print("Naive Bayes Results:")
print("="*60)
print(f"Accuracy   : {acc_nb:.4f}")
print(f"AUC        : {auc_nb:.4f}")
print(f"Precision  : {precision_nb:.4f}")
print(f"Recall     : {recall_nb:.4f}")
print(f"F1 Score   : {f1_nb:.4f}")
print(f"MCC        : {mcc_nb:.4f}")

# Store results (standardized schema)
results_nb = pd.DataFrame({
    'Model': ['Naive Bayes (GaussianNB)'],
    'Accuracy': [acc_nb],
    'AUC': [auc_nb],
    'Precision': [precision_nb],
    'Recall': [recall_nb],
    'F1 Score': [f1_nb],
    'MCC': [mcc_nb]
})

print("\nNaive Bayes model trained and evaluated")
print(results_nb)


Naive Bayes Classifier

Naive Bayes Results:
Accuracy   : 0.5625
AUC        : 0.6838
Precision  : 0.5745
Recall     : 0.5625
F1 Score   : 0.5681
MCC        : 0.3299

Naive Bayes model trained and evaluated
                      Model  Accuracy       AUC  Precision  Recall  F1 Score  \
0  Naive Bayes (GaussianNB)    0.5625  0.683783   0.574461  0.5625  0.568067   

        MCC  
0  0.329911  


---

## Ensemble Model - Random Forest


In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
import pandas as pd

# ==============================
# Random Forest Classifier
# ==============================
print("=" * 60)
print("Random Forest Classifier")
print("=" * 60)

# Build and train model
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_test_pred_rf = rf_model.predict(X_test)
y_test_proba_rf = rf_model.predict_proba(X_test)

# Metrics (evaluated on test data)
acc_rf = accuracy_score(y_test, y_test_pred_rf)
auc_rf = roc_auc_score(y_test, y_test_proba_rf, multi_class='ovr')
precision_rf = precision_score(y_test, y_test_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_test_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_test_pred_rf, average='weighted')
mcc_rf = matthews_corrcoef(y_test, y_test_pred_rf)

print("\n" + "="*60)
print("Random Forest Results:")
print("="*60)
print(f"Accuracy   : {acc_rf:.4f}")
print(f"AUC        : {auc_rf:.4f}")
print(f"Precision  : {precision_rf:.4f}")
print(f"Recall     : {recall_rf:.4f}")
print(f"F1 Score   : {f1_rf:.4f}")
print(f"MCC        : {mcc_rf:.4f}")

# Store results (standardized schema)
results_rf = pd.DataFrame({
    'Model': ['Random Forest'],
    'Accuracy': [acc_rf],
    'AUC': [auc_rf],
    'Precision': [precision_rf],
    'Recall': [recall_rf],
    'F1 Score': [f1_rf],
    'MCC': [mcc_rf]
})

print("\nRandom Forest model trained and evaluated")
print(results_rf)


Random Forest Classifier

Random Forest Results:
Accuracy   : 0.6750
AUC        : 0.7907
Precision  : 0.6539
Recall     : 0.6750
F1 Score   : 0.6599
MCC        : 0.4746

Random Forest model trained and evaluated
           Model  Accuracy       AUC  Precision  Recall  F1 Score       MCC
0  Random Forest     0.675  0.790705   0.653858   0.675  0.659933  0.474554


---

## Ensemble Model - XGBoost

In [47]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
import pandas as pd

# ==============================
# XGBoost Classifier
# ==============================
print("=" * 60)
print("XGBoost Classifier")
print("=" * 60)

# Encode target labels to start from 0
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Build and train model
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)
xgb_model.fit(X_train, y_train_enc)

# Predictions
y_test_pred_xgb = xgb_model.predict(X_test)
y_test_proba_xgb = xgb_model.predict_proba(X_test)

# Metrics (evaluated on test data)
acc_xgb = accuracy_score(y_test_enc, y_test_pred_xgb)
auc_xgb = roc_auc_score(y_test_enc, y_test_proba_xgb, multi_class='ovr')
precision_xgb = precision_score(y_test_enc, y_test_pred_xgb, average='weighted')
recall_xgb = recall_score(y_test_enc, y_test_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test_enc, y_test_pred_xgb, average='weighted')
mcc_xgb = matthews_corrcoef(y_test_enc, y_test_pred_xgb)

print("\n" + "="*60)
print("XGBoost Results:")
print("="*60)
print(f"Accuracy   : {acc_xgb:.4f}")
print(f"AUC        : {auc_xgb:.4f}")
print(f"Precision  : {precision_xgb:.4f}")
print(f"Recall     : {recall_xgb:.4f}")
print(f"F1 Score   : {f1_xgb:.4f}")
print(f"MCC        : {mcc_xgb:.4f}")

# Store results (standardized schema)
results_xgb = pd.DataFrame({
    'Model': ['XGBoost'],
    'Accuracy': [acc_xgb],
    'AUC': [auc_xgb],
    'Precision': [precision_xgb],
    'Recall': [recall_xgb],
    'F1 Score': [f1_xgb],
    'MCC': [mcc_xgb]
})

print("\nXGBoost model trained and evaluated")
print(results_xgb)


XGBoost Classifier

XGBoost Results:
Accuracy   : 0.6594
AUC        : 0.8374
Precision  : 0.6520
Recall     : 0.6594
F1 Score   : 0.6488
MCC        : 0.4535

XGBoost model trained and evaluated
     Model  Accuracy       AUC  Precision    Recall  F1 Score       MCC
0  XGBoost  0.659375  0.837422   0.651973  0.659375  0.648799  0.453534


---

## Comparision Table

In [49]:
# ==============================
# Final Comparison Table
# ==============================
print("=" * 60)
print("Final Comparison Table")
print("=" * 60)

# Concatenate all model results
comparison = pd.concat([
    results_log_reg,
    results_dt,
    results_knn,
    results_nb,
    results_rf,
    results_xgb
], ignore_index=True)

# Print full table in one block
print("\nComparison of all models:")
print(comparison.to_string(index=False))

# Save to CSV
comparison.to_csv("results_summary.csv", index=False)

print("\nFinal comparison table created and saved as 'results_summary.csv'")


Final Comparison Table

Comparison of all models:
                   Model  Accuracy      AUC  Precision   Recall  F1 Score      MCC
     Logistic Regression  0.582031 0.801206   0.565149 0.582031  0.567013 0.325498
           Decision Tree  0.609375 0.658352   0.612092 0.609375  0.609477 0.398241
                     KNN  0.609375 0.698329   0.584116 0.609375  0.595887 0.373313
Naive Bayes (GaussianNB)  0.562500 0.683783   0.574461 0.562500  0.568067 0.329911
           Random Forest  0.675000 0.790705   0.653858 0.675000  0.659933 0.474554
                 XGBoost  0.659375 0.837422   0.651973 0.659375  0.648799 0.453534

Final comparison table created and saved as 'results_summary.csv'
