In [3]:
# Load & split
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv("week7_lab_binary.csv")
X = df.drop(columns=["label"]).values
y = df["label"].values
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# k-NN (no scaling)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtr, ytr)
p = knn.predict(Xte)
proba = knn.predict_proba(Xte)[:,1]
acc = accuracy_score(yte, p)
prec, rec, f1, _ = precision_recall_fscore_support(yte, p, average="binary", zero_division=0)
auc = roc_auc_score(yte, proba)
cm = confusion_matrix(yte, p)
print(acc, prec, rec, f1, auc, cm, sep="\n")

0.7375
0.7741935483870968
0.9230769230769231
0.8421052631578947
0.5553239863584691
[[  9  49]
 [ 14 168]]


In [11]:
# k-NN with Standardization (use Pipeline)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
knn_scaled = Pipeline([
("scaler", StandardScaler()),
("knn", KNeighborsClassifier(n_neighbors=5))
])
knn_scaled.fit(Xtr, ytr)
p2 = knn_scaled.predict(Xte)
proba2 = knn_scaled.predict_proba(Xte)[:,1]

prec, rec, f1, _ = precision_recall_fscore_support(yte, p2, average="binary", zero_division=0)
auc = roc_auc_score(yte, proba)
cm = confusion_matrix(yte, p2)

print(acc, prec, rec, f1, auc, cm, sep="\n")

0.7375
0.9247311827956989
0.945054945054945
0.9347826086956522
0.5553239863584691
[[ 44  14]
 [ 10 172]]


In [14]:
# Logistic Regression (with/without scaling)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=200)
lr.fit(Xtr, ytr); p = lr.predict(Xte); proba = lr.predict_proba(Xte)[:,1]
lr_scaled = Pipeline([
("scaler", StandardScaler()),
("lr", LogisticRegression(max_iter=200))
])
lr_scaled.fit(Xtr, ytr); p2 = lr_scaled.predict(Xte); proba2 = lr_scaled.predict_proba(Xte)

prec, rec, f1, _ = precision_recall_fscore_support(yte, p2, average="binary", zero_division=0)
auc = roc_auc_score(yte, proba)
cm = confusion_matrix(yte, p2)

print(acc, prec, rec, f1, auc, cm, sep="\n")

0.7375
0.9109947643979057
0.9560439560439561
0.9329758713136729
0.9673171655930277
[[ 41  17]
 [  8 174]]


In [15]:
# Decision Tree (scaling not needed)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(Xtr, ytr)
p2 = dt.predict(Xte)

prec, rec, f1, _ = precision_recall_fscore_support(yte, p2, average="binary", zero_division=0)
auc = roc_auc_score(yte, proba)
cm = confusion_matrix(yte, p2)

print(acc, prec, rec, f1, auc, cm, sep="\n")

0.7375
0.8961748633879781
0.9010989010989011
0.8986301369863013
0.9673171655930277
[[ 39  19]
 [ 18 164]]
