In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder


In [3]:
df = pd.read_csv("drug_200.csv")

print("Dataset shape:", df.shape)
print("First few rows:")
display(df.head())

Dataset shape: (200, 6)
First few rows:


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [4]:
# Encode categorical features
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

X = df.drop("Drug", axis=1).values
y = df["Drug"].values



In [5]:
def std_scaling(df):
    return (df-df.mean())/(df.std() + 1e-8)


In [6]:
X=std_scaling(X)

# healper function

In [7]:
def euclidean_distance(x1, x2):
    """Compute Euclidean distance between two vectors"""
    return np.sqrt(np.sum((x1 - x2) ** 2))

def predict_knn(X_train, y_train, x_test, k):
    """Predict label for one test sample"""
    distances = np.sqrt(np.sum((X_train - x_test) ** 2, axis=1))
    k_indices = np.argsort(distances)[:k]
    k_nearest_labels = y_train[k_indices]
    # Return the most common class among neighbors
    values, counts = np.unique(k_nearest_labels, return_counts=True)
    return values[np.argmax(counts)]


## Evaluation metrics

In [15]:

def evaluate(y_true, y_pred, name="Model"):
    accuracy = np.mean(y_true == y_pred)
    precision_list, recall_list, f1_list = [], [], []

    for c in np.unique(y_true):
        tp = np.sum((y_pred == c) & (y_true == c))
        fp = np.sum((y_pred == c) & (y_true != c))
        fn = np.sum((y_pred != c) & (y_true == c))

        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        f1 = 2 * precision * recall / (precision + recall + 1e-9)

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    return {
        "accuracy": accuracy,
        "precision": np.mean(precision_list),
        "recall": np.mean(recall_list),
        "f1": np.mean(f1_list),
    }


## Cross-validation

In [19]:

def cross_validate_knn(X, y, k_neighbors):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        y_pred = np.array([predict_knn(X_train, y_train, x, k_neighbors) for x in X_test])
        results = evaluate(y_test, y_pred)

        for key in metrics:
            metrics[key].append(results[key])

    return {m: np.mean(v) for m, v in metrics.items()}


# Evaluate Regularization

In [20]:

results = {}
for k in [1, 3, 5]:
    results[f"K={k}"] = cross_validate_knn(X, y, k)

In [21]:
result_df = pd.DataFrame(results).T
display(result_df)

print("\nAverage metrics across 5 folds:")
print(result_df)

Unnamed: 0,accuracy,precision,recall,f1
K=1,0.735,0.603139,0.607629,0.579246
K=3,0.72,0.649609,0.617166,0.576896
K=5,0.655,0.55547,0.543975,0.518259



Average metrics across 5 folds:
     accuracy  precision    recall        f1
K=1     0.735   0.603139  0.607629  0.579246
K=3     0.720   0.649609  0.617166  0.576896
K=5     0.655   0.555470  0.543975  0.518259
