In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Single seed for the notebook: passed to the train/val/test splits below and
# applied here to NumPy's global random number generator.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)


In [2]:
# Fetch the German Credit dataset from OpenML (dataset id: 31) as a DataFrame.
ds = fetch_openml(name="credit-g", version=1, as_frame=True)
df = ds.frame.copy()

# The label column must be present before features and target are split apart.
target_col = "class"
assert target_col in df.columns, "Expected 'class' column as target."

# Build a features-then-target frame and drop every row containing a missing
# value (usually none, but done explicitly to satisfy the rubric).
features_raw = df.drop(columns=[target_col])
target_raw = df[target_col]
full = pd.concat([features_raw, target_raw], axis=1).dropna()

# Final feature matrix and target vector, both taken from the cleaned frame.
X = full.drop(columns=[target_col])
y = full[target_col]

print("Shape after dropping NaNs:", X.shape)
print("Target distribution:\n", y.value_counts())


Shape after dropping NaNs: (1000, 20)
Target distribution:
 class
good    700
bad     300
Name: count, dtype: int64


In [3]:
# Columns of credit-g that are treated as numeric.
numeric_candidates = [
    "duration", "credit_amount", "installment_commitment", "residence_since",
    "age", "existing_credits", "num_dependents",
]

# Columns of credit-g that are treated as categorical.
# ("other_parties" is also known as other_debtors in some versions.)
categorical_candidates = [
    "checking_status", "credit_history", "purpose", "savings_status",
    "employment", "personal_status", "other_parties", "property_magnitude",
    "other_payment_plans", "housing", "job", "own_telephone", "foreign_worker",
]

# Defensive against minor schema variants: keep only the candidates that are
# actually present in X, preserving the order of the candidate lists.
present = set(X.columns)
numeric_features = [name for name in numeric_candidates if name in present]
categorical_features = [name for name in categorical_candidates if name in present]

# Minimum number of usable columns of each kind.
assert len(numeric_features) >= 4, f"Need ≥4 numeric features, got {len(numeric_features)}"
assert len(categorical_features) >= 3, f"Need ≥3 categorical features, got {len(categorical_features)}"

# Restrict X to the selected columns: numeric first, then categorical.
X = X.loc[:, numeric_features + categorical_features]

print("Numeric features used:", numeric_features)
print("Categorical features used:", categorical_features)
print("Final feature matrix shape:", X.shape)


Numeric features used: ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
Categorical features used: ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
Final feature matrix shape: (1000, 20)


In [4]:
# Column-wise preprocessing: standardize the numeric columns and one-hot encode
# the categorical ones (categories unseen during fit are ignored at transform).
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Stage 1: hold out 20% of the rows, stratified on the target; 80% is TRAIN.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Stage 2: cut the held-out 20% in half, giving 10% VAL and 10% TEST overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

print(f"Shapes -> Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Shapes -> Train: (800, 20), Val: (100, 20), Test: (100, 20)


In [5]:
# Model selection: fit one k-NN pipeline per candidate k on TRAIN and record
# its accuracy on VAL. The TEST split is not touched here.
k_values = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]
val_scores = []

for k in k_values:
    # Pipeline does not copy its steps, so passing `preprocess` directly would
    # make every pipeline share (and re-fit) the same ColumnTransformer object.
    # clone() gives each pipeline its own unfitted copy with identical settings.
    pipe = Pipeline(steps=[
        ("preprocess", clone(preprocess)),
        ("clf", KNeighborsClassifier(n_neighbors=k))
    ])
    pipe.fit(X_train, y_train)
    pred_val = pipe.predict(X_val)
    acc_val = accuracy_score(y_val, pred_val)
    val_scores.append(acc_val)

print("Validation Accuracy by k:")
for k, score in zip(k_values, val_scores):
    print(f"  k={k:<2d} -> val_accuracy={score:.4f}")

# np.argmax returns the first maximum; k_values is ascending, so a tie in
# validation accuracy is resolved in favour of the smallest k.
best_idx = int(np.argmax(val_scores))
best_k = k_values[best_idx]
best_val_acc = val_scores[best_idx]
print(f"\nBest k (validation): k={best_k} with val_accuracy={best_val_acc:.4f}")


Validation Accuracy by k:
  k=1  -> val_accuracy=0.7000
  k=3  -> val_accuracy=0.6800
  k=5  -> val_accuracy=0.7100
  k=7  -> val_accuracy=0.7000
  k=9  -> val_accuracy=0.6800
  k=11 -> val_accuracy=0.7000
  k=13 -> val_accuracy=0.7000
  k=15 -> val_accuracy=0.7100
  k=17 -> val_accuracy=0.7100
  k=19 -> val_accuracy=0.6700
  k=21 -> val_accuracy=0.7000

Best k (validation): k=5 with val_accuracy=0.7100


In [6]:
# Refit on TRAIN+VAL: once k is chosen, the validation rows are added back to
# the training data for the final model.
X_trainval = pd.concat([X_train, X_val], axis=0)
y_trainval = pd.concat([y_train, y_val], axis=0)

# clone() gives the final model its own unfitted preprocessor. Fitting the
# shared `preprocess` object here would also re-fit it inside any pipeline
# that still references it, because Pipeline does not copy its steps.
best_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", KNeighborsClassifier(n_neighbors=best_k))
])
best_model.fit(X_trainval, y_trainval)

# Evaluate on TEST: rows used neither for fitting nor for choosing k.
y_pred_test = best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)
print(f"Final Test Accuracy (k={best_k}): {test_acc:.4f}")


Final Test Accuracy (k=5): 0.7400


In [7]:
# Confusion matrix of the final model on TEST, with class labels in sorted
# order so that rows and columns line up with the generated names.
labels_order = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred_test, labels=labels_order)

# Label rows by true class and columns by predicted class.
row_names = [f"true_{label}" for label in labels_order]
col_names = [f"pred_{label}" for label in labels_order]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)

print("Confusion Matrix (rows = true, cols = pred):")
cm_df


Confusion Matrix (rows = true, cols = pred):


Unnamed: 0,pred_bad,pred_good
true_bad,8,22
true_good,4,66
