# K-Nearest Neighbors Classifier Method

In [258]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [259]:
#Load data
df = pd.read_csv(os.path.abspath("../data/normalized_labeled_training_data.csv"))
df_not_normalized = pd.read_csv(os.path.abspath("../data/labeled_training_data.csv"))

In [260]:
# Adding features

# Day or night
df['day_or_night'] = df_not_normalized['hour_of_day'].apply(lambda x: 1 if 8 <= x < 21 else 0)

df['cold'] = df_not_normalized['temp'].apply(lambda x: 1 if x <= 8 else 0)

df['opt_wind'] = df_not_normalized['windspeed'].apply(lambda x: 1 if 5 < x <= 31 else 0)

#df['atemp'] = (243.04 * (np.log(df_not_normalized['humidity']/100)
#                        + (17.625 * df_not_normalized['dew']) / (243.04 + df_not_normalized['dew']))) / (17.625 - np.log(df_not_normalized['humidity']/100)
#                        - (17.625 * df_not_normalized['dew']) / (243.04 + df_not_normalized['dew']))

In [261]:
#df = df.drop(["summertime", "snow", "snowdepth", "precip", "holiday", "visibility"], axis=1)

#df = df.drop(["snow", "snowdepth", "holiday", "visibility", "precip", "dew"], axis=1)
#df = df.drop(["snow", "snowdepth", "holiday", "visibility", "precip", "dew"], axis=1)
#df = df.drop(["snow", "snowdepth", "holiday", "visibility", "precip", "dew", "cloudcover"], axis=1)
df = df.drop(["snow", "snowdepth", "holiday", "visibility", "precip", "dew", "cloudcover"], axis=1)
#df = df.drop(["snow", "snowdepth", "holiday", "visibility", "precip", "dew", "temp", "cloudcover", "windspeed"], axis=1)

In [262]:
#Select all columns except the last
X = df.drop(columns=["increase_stock"])
#Select label column
y = df['increase_stock']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [263]:
X.head()

Unnamed: 0,hour_of_day,day_of_week,month,weekday,summertime,temp,humidity,windspeed,day_or_night,cold,opt_wind
0,0.217391,0.833333,0.0,0.0,0.0,0.042506,0.450143,0.372146,0,1,1
1,0.913043,0.666667,0.0,1.0,0.0,0.174497,0.298905,0.545662,0,1,1
2,0.913043,0.5,0.636364,1.0,1.0,0.805369,0.684674,0.0,0,0,0
3,0.043478,1.0,0.0,0.0,0.0,0.272931,0.522251,0.438356,0,1,1
4,0.73913,0.0,0.181818,1.0,0.0,0.465324,0.034031,0.239726,1,0,1


In [264]:
# RANDOM SEARCH
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

param_distributions = {
    "rf__n_neighbors": np.arange(1, 50),
    "rf__weights": ["uniform", "distance"],
    "rf__metric": ["euclidean", "manhattan", "chebyshev", "minkowski"],
    "rf__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "rf__leaf_size": np.arange(10, 100, 10),
    "rf__p": [1, 2, 3],
    "rf__n_jobs": [-1],
}


model = KNeighborsClassifier(
    #n_neigbors = 25
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', model)])

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    scoring='f1_weighted',
    n_iter=3000,  # Number of random samples to draw
    cv=skf,
    random_state=42,
    n_jobs=-1,
)

# Fit
random_search.fit(X_train, y_train)

tuned_model = random_search.best_estimator_

print("Tuned model: ", tuned_model)
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation F1 Score:", random_search.best_score_)

Tuned model:  Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 KNeighborsClassifier(algorithm='brute', leaf_size=70,
                                      metric='manhattan', n_jobs=-1,
                                      n_neighbors=22, p=3,
                                      weights='distance'))])
Best Parameters: {'rf__weights': 'distance', 'rf__p': 3, 'rf__n_neighbors': 22, 'rf__n_jobs': -1, 'rf__metric': 'manhattan', 'rf__leaf_size': 70, 'rf__algorithm': 'brute'}
Best Cross-Validation F1 Score: 0.8901267926197949


In [265]:
y_pred  = tuned_model.predict(X_test)
y_proba = tuned_model.predict_proba(X_test)[:, 1]

In [266]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Test F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Recall-Weighted: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Precision-Weighted: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92       402
         1.0       0.60      0.59      0.59        78

    accuracy                           0.87       480
   macro avg       0.76      0.76      0.76       480
weighted avg       0.87      0.87      0.87       480

Test Accuracy: 0.8688
Test F1-Weighted: 0.8684
Test Recall-Weighted: 0.8688
Test Precision-Weighted: 0.8681
Test ROC AUC: 0.8922


In [267]:
from sklearn.inspection import permutation_importance

# Perform permutation importance on the final model
perm_importance = permutation_importance(tuned_model, X, y, scoring='f1', n_repeats=10, random_state=42)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': perm_importance.importances_mean
}).sort_values(by='importance', ascending=False)

print(feature_importance)


         feature  importance
8   day_or_night    0.304075
6       humidity    0.286822
0    hour_of_day    0.240677
1    day_of_week    0.170449
3        weekday    0.166162
5           temp    0.153906
7      windspeed    0.135266
2          month    0.134167
4     summertime    0.102938
9           cold    0.062699
10      opt_wind    0.035036
