In [1]:
import pandas as pd

In [2]:
# Load the raw table; '?' entries in the file are parsed as missing values (NaN).
csv_path = "forestCover.csv"
df = pd.read_csv(csv_path, na_values=['?'])

In [3]:
# Discard four columns from the frame (rebinding df instead of mutating in place).
dropped_columns = ['Water_Level', 'Observation_ID', "Inclination", "Facet"]
df = df.drop(columns=dropped_columns)

In [4]:
# Encode the Soil_Type1 labels as integers: 'positive' -> 0, 'negative' -> 1.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would replace
# all of it with NaN. The dtype guard makes the cell safe to re-run.
# NOTE(review): 'positive' -> 0 / 'negative' -> 1 looks inverted relative to the
# usual "1 = present" convention — confirm this is intended.
soil_type1_codes = {'positive': 0, 'negative': 1}
if not pd.api.types.is_numeric_dtype(df['Soil_Type1']):
    df['Soil_Type1'] = df['Soil_Type1'].map(soil_type1_codes)

# Cap extreme outliers

In [5]:
# An extreme outlier can distort distance calculations, making points appear misleadingly closer together or farther apart.

In [6]:
# Cap Horizontal_Distance_To_Hydrology at its 99.9th percentile: values above
# the threshold are overwritten with the threshold, no rows are removed.
# NOTE(review): the threshold is computed on the whole frame, i.e. before the
# train/test split further down — confirm that including test rows is acceptable.
hydro_col = "Horizontal_Distance_To_Hydrology"
cap = df[hydro_col].quantile(0.999)
above_cap = df[hydro_col] > cap
df.loc[above_cap, hydro_col] = cap

In [7]:
import numpy as np
import pandas as pd

# Quantitative feature columns to summarise.
quant_features = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Keep only the listed columns that exist in df and report the ones that do not.
cols = [name for name in quant_features if name in df.columns]
missing_cols = sorted(set(quant_features).difference(cols))
if missing_cols:
    print("Not found in df:", missing_cols)

# Force numeric dtypes: '?' placeholders and unparseable entries become NaN.
# Note that this writes the converted columns back into df.
df[cols] = df[cols].replace('?', np.nan)
for name in cols:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# One row per feature: basic statistics plus missing-value count and value range.
stat_names = ['count', 'min', 'max', 'mean', 'std']
summary = df[cols].agg(stat_names).T.assign(
    missing=lambda stats: len(df) - stats['count'],
    range=lambda stats: stats['max'] - stats['min'],
)

# Reorder the columns for reading and list the widest-ranging features first.
column_order = ['min', 'max', 'range', 'mean', 'std', 'count', 'missing']
summary = summary[column_order].sort_values('range', ascending=False)

summary.round(3)


Unnamed: 0,min,max,range,mean,std,count,missing
Elevation,2054195.0,4263090.0,2208895.0,3270098.657,309383.131,581012.0,0.0
Horizontal_Distance_To_Fire_Points,0.0,7173.0,7173.0,1980.291,1324.195,581012.0,0.0
Horizontal_Distance_To_Roadways,0.0,7117.0,7117.0,2350.147,1559.255,581012.0,0.0
Horizontal_Distance_To_Hydrology,0.0,1189.0,1189.0,269.411,212.369,581012.0,0.0
Vertical_Distance_To_Hydrology,-173.0,601.0,774.0,46.419,58.295,581012.0,0.0
Aspect,0.0,360.0,360.0,155.657,111.914,581012.0,0.0
Hillshade_9am,0.0,254.0,254.0,212.146,26.77,581012.0,0.0
Hillshade_Noon,0.0,254.0,254.0,223.319,19.769,581012.0,0.0
Hillshade_3pm,0.0,254.0,254.0,142.528,38.275,581012.0,0.0
Slope,0.0,66.0,66.0,14.104,7.488,580714.0,298.0


In [21]:
#df.to_csv("df_knn_clean.csv", index=False)

# Train/test split

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate the label column from the feature matrix.
target_col = "Cover_Type"
X = df.drop(columns=[target_col])
y = df[target_col]

# 80/20 hold-out split, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.20, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Cell output: the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((464809, 54), (116203, 54), (464809,), (116203,))

# Missing values

In [9]:
# The fill value is learned from the training rows only, so the test set has
# no influence on it.
slope_median = X_train["Slope"].median()

# The same training-set median fills the missing Slope entries of both splits.
for split_frame in (X_train, X_test):
    split_frame["Slope"] = split_frame["Slope"].fillna(slope_median)

# Scale numeric features (min-max)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

# Columns that are rescaled; every column not listed here bypasses the scaler.
quant_features = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Preprocessing step shared by the pipelines below: min-max scaling applied to
# the quantitative columns only.
numeric_step = ("num", MinMaxScaler(), quant_features)
pre = ColumnTransformer(
    transformers=[numeric_step],
    remainder="passthrough",  # unlisted columns are passed through unchanged
)

In [12]:
import time
import numpy as np
import pandas as pd
import joblib

from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import (
    StratifiedKFold,
    HalvingRandomSearchCV,
    cross_validate,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, matthews_corrcoef
from scipy.stats import randint

# k-NN classifier behind the shared preprocessing step.
knn = KNeighborsClassifier(algorithm="auto", n_jobs=-1)
pipe = Pipeline(steps=[("pre", pre), ("knn", knn)])

# Search space; the "knn__" prefix routes each entry to the classifier step.
# randint(low, high) excludes `high`, so n_neighbors is drawn from 3..59 and
# leaf_size from 20..49.
# NOTE(review): leaf_size is documented as a speed/memory setting of the tree
# back-ends rather than something that changes which neighbours are found —
# confirm it is worth a search dimension.
param_distributions = {
    "knn__n_neighbors": randint(3, 60),
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["minkowski"],  # the distance itself is varied through p
    "knn__p": [1, 2],
    "knn__leaf_size": randint(20, 50),
}

# Shuffled, seeded 5-fold stratified CV used inside the search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Successive-halving random search: the sample count is the resource, and each
# round keeps roughly 1/factor of the candidates.
# NOTE(review): with min_resources="smallest" the recorded run started at 70
# samples; 5-fold CV then leaves 56 training rows, fewer than the largest
# sampled n_neighbors (59). Those candidates presumably fail and receive
# error_score (NaN) instead of being compared on merit — verify.
halving_options = dict(
    scoring="balanced_accuracy",
    refit=True,
    cv=cv,
    factor=3,
    resource="n_samples",
    min_resources="smallest",
    aggressive_elimination=True,
    n_candidates="exhaust",
    random_state=0,
    n_jobs=-1,
    verbose=2,
    error_score=np.nan,
    return_train_score=False,
)
search = HalvingRandomSearchCV(pipe, param_distributions, **halving_options)

# Run the search on the training split and report wall-clock time and the winner.
# (The messages say "Hyperband"; the estimator used is HalvingRandomSearchCV.)
print(">>> Starting Hyperband search (balanced_accuracy)…")
t0 = time.time()
search.fit(X_train, y_train)
t1 = time.time()

print("\n=== Hyperband DONE ===")
print(f"Elapsed: {t1 - t0:.1f}s")
print("Best params:", search.best_params_)
print(f"Best CV balanced_accuracy: {search.best_score_:.4f}")


>>> Starting Hyperband search (balanced_accuracy)…
n_iterations: 9
n_required_iterations: 9
n_possible_iterations: 9
min_resources_: 70
max_resources_: 464809
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 6640
n_resources: 70
Fitting 5 folds for each of 6640 candidates, totalling 33200 fits




----------
iter: 1
n_candidates: 2214
n_resources: 210
Fitting 5 folds for each of 2214 candidates, totalling 11070 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 2
n_candidates: 738
n_resources: 630
Fitting 5 folds for each of 738 candidates, totalling 3690 fits




----------
iter: 3
n_candidates: 246
n_resources: 1890
Fitting 5 folds for each of 246 candidates, totalling 1230 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 4
n_candidates: 82
n_resources: 5670
Fitting 5 folds for each of 82 candidates, totalling 410 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 5
n_candidates: 28
n_resources: 17010
Fitting 5 folds for each of 28 candidates, totalling 140 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 6
n_candidates: 10
n_resources: 51030
Fitting 5 folds for each of 10 candidates, totalling 50 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 7
n_candidates: 4
n_resources: 153090
Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 8
n_candidates: 2
n_resources: 459270
Fitting 5 folds for each of 2 candidates, totalling 10 fits


  _data = np.array(data, dtype=dtype, copy=copy,



=== Hyperband DONE ===
Elapsed: 4110.9s
Best params: {'knn__leaf_size': 43, 'knn__metric': 'minkowski', 'knn__n_neighbors': 4, 'knn__p': 1, 'knn__weights': 'distance'}
Best CV balanced_accuracy: 0.8941


In [13]:
# Bare expression: shows the fitted search object as this cell's output.
search

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [16]:
# --- Helpers for the multi-metric re-evaluation ------------------------------
# multi_scoring, cv_post and mean_std_str are used below but are not defined in
# any cell of this notebook, so a fresh "Restart & Run All" stopped here with a
# NameError. They are defined in this cell to make it self-contained.
# NOTE(review): the exact settings are reconstructed from the result keys and
# the "mean ± std" format of the recorded output — confirm they match the
# definitions that were originally used.
multi_scoring = {
    "balanced_accuracy": "balanced_accuracy",
    "macro_f1": make_scorer(f1_score, average="macro"),
    "weighted_f1": make_scorer(f1_score, average="weighted"),
    "mcc": make_scorer(matthews_corrcoef),
}

# Shuffled, seeded 5-fold stratified CV for the re-evaluation.
cv_post = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


def mean_std_str(scores):
    """Format per-fold scores as 'mean ± std' with three decimals."""
    scores = np.asarray(scores, dtype=float)
    return f"{scores.mean():.3f} ± {scores.std():.3f}"


def acc_key(row):
    """Return the parameter tuple identifying a candidate, leaving out leaf_size."""
    p = row["params"]
    return (p["knn__n_neighbors"], p["knn__p"], p["knn__weights"], p["knn__metric"])


# --- Pick the five best-ranked candidates that differ in more than leaf_size --
cv_df = pd.DataFrame(search.cv_results_).copy()
cv_df["acc_key"] = cv_df.apply(acc_key, axis=1)

# Sort by rank (balanced_accuracy) and keep the best-ranked row per key.
# NOTE(review): cv_results_ of a halving search holds rows from every iteration,
# i.e. scores obtained with different sample counts — confirm that ranking
# across iterations is the intended selection rule.
cv_df_unique = (cv_df.sort_values("rank_test_score")
                   .drop_duplicates(subset="acc_key", keep="first")
                   .reset_index(drop=True))

topk_unique = cv_df_unique.head(5).reset_index(drop=True)

# --- Re-score each selected candidate on the training split with all metrics --
rows = []
for _, row in topk_unique.iterrows():
    params = row["params"]

    # Fresh pipeline configured with this candidate's parameters.
    pipe_i = Pipeline([("pre", pre), ("knn", KNeighborsClassifier(n_jobs=-1))])
    pipe_i.set_params(**params)

    # With a dict of scorers the result keys are "test_<scorer name>".
    res = cross_validate(
        pipe_i, X_train, y_train,
        scoring=multi_scoring,
        cv=cv_post,
        n_jobs=-1,
        return_train_score=False,
    )

    rows.append({
        "rank": int(row["rank_test_score"]),
        "params": params,
        "balanced_accuracy": mean_std_str(res["test_balanced_accuracy"]),
        "macro_f1":           mean_std_str(res["test_macro_f1"]),
        "weighted_f1":        mean_std_str(res["test_weighted_f1"]),
        "mcc":                mean_std_str(res["test_mcc"]),
    })

# One row per candidate, ordered by the rank it had in the search.
top5_multi = pd.DataFrame(rows).sort_values("rank").reset_index(drop=True)
with pd.option_context("display.max_colwidth", None):
    print("\n=== Top-5 UNIQUE (ignoring leaf_size) | Multi-metric CV (mean ± std) ===")
    print(top5_multi.to_string(index=False))



=== Top-5 UNIQUE (ignoring leaf_size) | Multi-metric CV (mean ± std) ===
 rank                                                                                                             params balanced_accuracy      macro_f1   weighted_f1           mcc
    1 {'knn__leaf_size': 49, 'knn__metric': 'minkowski', 'knn__n_neighbors': 4, 'knn__p': 1, 'knn__weights': 'distance'}     0.895 ± 0.002 0.901 ± 0.001 0.942 ± 0.001 0.906 ± 0.001
   32 {'knn__leaf_size': 30, 'knn__metric': 'minkowski', 'knn__n_neighbors': 5, 'knn__p': 1, 'knn__weights': 'distance'}     0.890 ± 0.002 0.898 ± 0.001 0.940 ± 0.001 0.903 ± 0.001
   45 {'knn__leaf_size': 25, 'knn__metric': 'minkowski', 'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'distance'}     0.885 ± 0.002 0.896 ± 0.002 0.938 ± 0.000 0.901 ± 0.001
   71 {'knn__leaf_size': 39, 'knn__metric': 'minkowski', 'knn__n_neighbors': 4, 'knn__p': 2, 'knn__weights': 'distance'}     0.890 ± 0.002 0.897 ± 0.002 0.939 ± 0.001 0.902 ± 0.001
  103 {'knn__leaf_siz

In [17]:
top5_multi

Unnamed: 0,rank,params,balanced_accuracy,macro_f1,weighted_f1,mcc
0,1,"{'knn__leaf_size': 49, 'knn__metric': 'minkows...",0.895 ± 0.002,0.901 ± 0.001,0.942 ± 0.001,0.906 ± 0.001
1,32,"{'knn__leaf_size': 30, 'knn__metric': 'minkows...",0.890 ± 0.002,0.898 ± 0.001,0.940 ± 0.001,0.903 ± 0.001
2,45,"{'knn__leaf_size': 25, 'knn__metric': 'minkows...",0.885 ± 0.002,0.896 ± 0.002,0.938 ± 0.000,0.901 ± 0.001
3,71,"{'knn__leaf_size': 39, 'knn__metric': 'minkows...",0.890 ± 0.002,0.897 ± 0.002,0.939 ± 0.001,0.902 ± 0.001
4,103,"{'knn__leaf_size': 48, 'knn__metric': 'minkows...",0.882 ± 0.002,0.891 ± 0.002,0.936 ± 0.001,0.897 ± 0.001


In [19]:
# {'knn__leaf_size': 49, 'knn__metric': 'minkowski', 'knn__n_neighbors': 4, 'knn__p': 1, 'knn__weights': 'distance'} is the best configuration on all four metrics

# Save model

In [20]:
import joblib, json, pandas as pd
from pathlib import Path

# Directory that collects the persisted artifacts; created if it does not exist yet.
out_dir = Path("knn_artifacts")
out_dir.mkdir(exist_ok=True)

# Persist the pipeline that the search refit with its best parameters
# (refit=True; the search was fitted on X_train, i.e. the training split).
best_pipe = search.best_estimator_
joblib.dump(best_pipe, out_dir / "knn_best_pipeline.pkl", compress=3)

# Store the winning hyper-parameters next to the model as readable JSON.
best_params = search.best_params_
with (out_dir / "knn_best_params.json").open("w") as f:
    json.dump(best_params, f, indent=2)