In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from process_bank_churn import preprocess_data, preprocess_new_data
import time

# Read the three competition CSV files from the working directory.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# `preprocess_data` is a project helper; it hands back the train/validation
# features and targets plus the column list, scaler and encoder that are
# reused further down when the test frame is transformed.
(
    X_train,
    y_train,
    X_val,
    y_val,
    input_cols,
    scaler,
    encoder,
) = preprocess_data(train_df, scale_numeric=True)



In [3]:
# Baseline: k-nearest neighbours with scikit-learn's default hyperparameters.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class in
# `knn.classes_` (presumably the positive "Exited" label — confirm against
# the target encoding).
train_preds, val_preds = (
    knn.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

# AUROC on both splits; the train/validation gap shows how much the model overfits.
train_auc, val_auc = (
    roc_auc_score(y_train, train_preds),
    roc_auc_score(y_val, val_preds),
)

print(f"Train AUROC: {train_auc:.4f}")
print(f"Validation AUROC: {val_auc:.4f}")


Train AUROC: 0.9617
Validation AUROC: 0.8679


**On the training set the AUROC is very high (0.9617), but on the validation set it is noticeably lower (0.8679). The model does not suffer from high bias: it fits the training data well. The gap between training and validation performance indicates overfitting (high variance). This is expected for kNN with a small neighborhood (the default is n_neighbors=5), since the method is sensitive to local data patterns and to the scale of the features.**

In [4]:
# Candidate neighbourhood sizes: 2, 5, 8, ..., 29.
param_grid = {"n_neighbors": [*range(2, 31, 3)]}

# Exhaustive 5-fold cross-validated search, scored by ROC AUC, on all CPU cores.
grid_knn = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# With the default refit=True the best estimator has already been refitted
# on the whole of X_train, so it can be used for prediction directly.
knn_best = grid_knn.best_estimator_

print("Best parameter n_neighbors:", grid_knn.best_params_)

# Second predict_proba column = probability of the second class in `classes_`.
train_preds_best, val_preds_best = (
    knn_best.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

print("Train AUROC:", roc_auc_score(y_train, train_preds_best))
print("Validation AUROC:", roc_auc_score(y_val, val_preds_best))


Best parameter n_neighbors: {'n_neighbors': 29}
Train AUROC: 0.9343208861545823
Validation AUROC: 0.9101219089434536


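**The gap between training and validation AUROC is now much smaller (0.9343 vs. 0.9101), which indicates reduced variance; the model has become more balanced. Note that the selected n_neighbors=29 is the largest value in the grid, so even larger values may be worth testing. The decision tree still generalizes slightly better (0.9153 on validation).**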

In [5]:
# Decision-tree grid: odd depths 1..19 crossed with 2..10 leaf nodes
# (10 x 9 = 90 candidate combinations).
param_grid_dt = {
    "max_depth": list(range(1, 21, 2)),
    "max_leaf_nodes": list(range(2, 11, 1))
}

# Fixed seed so the fitted tree is reproducible between runs.
dt = DecisionTreeClassifier(random_state=42)

# Time the search with perf_counter(): it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can jump if the system
# clock is adjusted while the search is running.
start = time.perf_counter()
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1
)
grid_dt.fit(X_train, y_train)
end = time.perf_counter()

# With the default refit=True this estimator is already refitted on all of X_train.
dt_best = grid_dt.best_estimator_

print("Time:", end - start, "sec")
print("Best parameters:", grid_dt.best_params_)


# Second predict_proba column = probability of the second class in `classes_`.
train_preds_dt = dt_best.predict_proba(X_train)[:,1]
val_preds_dt = dt_best.predict_proba(X_val)[:,1]

print("Train AUROC:", roc_auc_score(y_train, train_preds_dt))
print("Validation AUROC:", roc_auc_score(y_val, val_preds_dt))


Time: 1.032602071762085 sec
Best parameters: {'max_depth': 5, 'max_leaf_nodes': 10}
Train AUROC: 0.9073235544075688
Validation AUROC: 0.8880524407604071


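**GridSearchCV, restricted to the specified parameter grid, selected a simpler tree (max_depth=5, max_leaf_nodes=10) with lower validation performance (AUROC 0.8881). The model became less complex and more stable, but it lost predictive power: it performs worse than the tree tuned manually. Note that max_leaf_nodes=10 is the upper bound of the grid, so the search was probably limited by the grid itself.**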

In [6]:
# Search space for the randomized search. Depth covers 1..19 and the leaf
# budget 2..19; the remaining entries vary the split criterion, the splitter
# strategy, the minimum sample constraints and the feature subsampling.
params_dt = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": np.arange(1, 20),
    "max_leaf_nodes": np.arange(2, 20),
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": [None, "sqrt", "log2"]
}

# Fixed seed so the fitted tree is reproducible between runs.
dt = DecisionTreeClassifier(random_state=42)

# Time the search with perf_counter(): it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can jump if the system
# clock is adjusted while the search is running.
start = time.perf_counter()
# Only n_iter=40 combinations are sampled from the space above (seeded for
# reproducibility), each evaluated with 3-fold cross-validation on ROC AUC.
random_dt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=params_dt,
    n_iter=40,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1
)
random_dt.fit(X_train, y_train)
end = time.perf_counter()

# With the default refit=True this estimator is already refitted on all of X_train.
dt_random_search_best = random_dt.best_estimator_

print("Time:", end - start, "sec")
print("Best parameters:", random_dt.best_params_)


# Second predict_proba column = probability of the second class in `classes_`.
train_preds_dt_rand = dt_random_search_best.predict_proba(X_train)[:,1]
val_preds_dt_rand = dt_random_search_best.predict_proba(X_val)[:,1]

print("Train AUROC:", roc_auc_score(y_train, train_preds_dt_rand))
print("Validation AUROC:", roc_auc_score(y_val, val_preds_dt_rand))


Time: 0.4089169502258301 sec
Best parameters: {'splitter': 'best', 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_leaf_nodes': 14, 'max_features': None, 'max_depth': 16, 'criterion': 'entropy'}
Train AUROC: 0.9183779916036456
Validation AUROC: 0.9025238054687034


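**RandomizedSearchCV selected a more complex yet well-balanced model that outperforms the one found by GridSearchCV (validation AUROC 0.9025 vs. 0.8881). The key differences in parameters are that it allows a deeper tree (max_depth=16 vs. 5) with more leaves (max_leaf_nodes=14 vs. 10), uses the entropy criterion, and requires min_samples_split=20; with this combination of tree depth and constraints, RandomizedSearchCV achieved a better balance between bias and variance.**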

In [7]:
# Transform the test frame with the column list and transformers obtained
# from preprocess_data.
# NOTE(review): preprocess_data returned (..., scaler, encoder) but this call
# passes (encoder, scaler) positionally — confirm the order matches the
# signature of preprocess_new_data.
X_test = preprocess_new_data(test_df, input_cols, encoder, scaler)

# prediction with the best kNN model
test_preds = knn_best.predict_proba(X_test)[:, 1]

# assign() returns a new frame, so `sample_submission` itself stays untouched.
submission = sample_submission.assign(Exited=test_preds)
submission.to_csv("submission_knn_best.csv", index=False)