In [1]:
# Q-3)
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

# Load the CSV file
file_path = "/content/winequality-red.csv"
data = pd.read_csv(file_path, delimiter=';')

data['quality'] = (data['quality'] > 6).astype(int)
X = data.drop(columns=['quality']).values
y = data['quality'].values

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:
# K-Fold Cross Validation to find optimal hyperparameters
def find_best_hyperparameters():
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Decision Tree Grid Search
    dt_params = {'max_depth': [3, 5, 10], 'min_samples_leaf': [1, 5, 10]}
    dt_grid = GridSearchCV(DecisionTreeClassifier(), dt_params, scoring='roc_auc', cv=kf)
    dt_grid.fit(X_train, y_train)
    best_dt_params = dt_grid.best_params_

    # k-NN Grid Search
    knn_params = {'n_neighbors': [3, 5, 7, 10]}
    knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, scoring='roc_auc', cv=kf)
    knn_grid.fit(X_train, y_train)
    best_knn_params = knn_grid.best_params_

    return best_dt_params, best_knn_params

best_dt_params, best_knn_params = find_best_hyperparameters()

def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_pred_proba)
    acc = accuracy_score(y_test, y_pred)
    return auc, acc
# Evaluate models with data reduction
removal_rates = [0.01, 0.05, 0.1]
kNN_results = []
dt_results = []


In [3]:
# Train original models
knn_model = KNeighborsClassifier(**best_knn_params)
dt_model = DecisionTreeClassifier(**best_dt_params)

original_knn_auc, original_knn_acc = evaluate_model(knn_model, X_train, y_train, X_test, y_test)
original_dt_auc, original_dt_acc = evaluate_model(dt_model, X_train, y_train, X_test, y_test)
kNN_results.append(("Original", original_knn_auc, original_knn_acc))
dt_results.append(("Original", original_dt_auc, original_dt_acc))

for rate in removal_rates:
    X_subset, _, y_subset, _ = train_test_split(X_train, y_train, test_size=rate, random_state=42)

    knn_model = KNeighborsClassifier(**best_knn_params)
    dt_model = DecisionTreeClassifier(**best_dt_params)

    knn_auc, knn_acc = evaluate_model(knn_model, X_subset, y_subset, X_test, y_test)
    dt_auc, dt_acc = evaluate_model(dt_model, X_subset, y_subset, X_test, y_test)

    kNN_results.append((f"{int(rate*100)}% Removed", knn_auc, knn_acc))
    dt_results.append((f"{int(rate*100)}% Removed", dt_auc, dt_acc))

# Print results
print("\nK-NN Results:")
print("Dataset | AUC | Accuracy")
for result in kNN_results:
    print(f"{result[0]} | {result[1]:.4f} | {result[2]:.4f}")

print("\nDecision Tree Results:")
print("Dataset | AUC | Accuracy")
for result in dt_results:
    print(f"{result[0]} | {result[1]:.4f} | {result[2]:.4f}")


K-NN Results:
Dataset | AUC | Accuracy
Original | 0.7925 | 0.8562
1% Removed | 0.7925 | 0.8562
5% Removed | 0.7930 | 0.8531
10% Removed | 0.7974 | 0.8531

Decision Tree Results:
Dataset | AUC | Accuracy
Original | 0.8645 | 0.8594
1% Removed | 0.8676 | 0.8594
5% Removed | 0.8500 | 0.8594
10% Removed | 0.8573 | 0.8594


For K-NN, performance remains largely stable across all levels of data removal,
with only a slight increase in AUC at higher removal rates. This suggests that
K-NN is relatively insensitive to small amounts of noisy or redundant data, but
overall its discriminative power is limited compared to tree-based models.

The Decision Tree consistently outperforms K-NN in terms of AUC, indicating
better class separation. Minor data removal (1%) slightly improves AUC,
suggesting reduced noise, while larger removals (5â€“10%) lead to fluctuating AUC
without affecting accuracy. The stable accuracy across datasets implies that
class predictions are robust, though ranking quality (AUC) is more sensitive
to data composition.