In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score

In [65]:
#Load data
df = pd.read_csv(os.path.abspath("../data/normalized_labeled_training_data.csv"))
df_not_normalized = pd.read_csv(os.path.abspath("../data/labeled_training_data.csv"))

In [66]:
# Adding features
df['day_or_night'] = df_not_normalized['hour_of_day'].apply(lambda x: 1 if 6 <= x < 18 else 0)

In [67]:
# Dropping features
df = df.drop(["snow", "snowdepth", "holiday"], axis=1)

In [68]:
#Select all columns except the last
X = df.iloc[:, :-1]
#Select label column
y = df['increase_stock']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [69]:
# Paramaters
param_grid = {
    "n_neighbors": np.arange(1, 50),
    "weights"    : ["uniform", "distance"], 
    "metric"     : ["euclidean", "manhattan"],
    "algorithm"  : ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size"  : np.arange(10, 50, 10),
    "p"          : [1, 2],
    "n_jobs"     : [-1],
}

# Grid search with KNeighborsClassifier using our param_grid. 
# Optimizes on F1-score (because of our group decision) and does 10-fold CV.
grid_search = GridSearchCV(
    estimator  = KNeighborsClassifier(),
    param_grid = param_grid,
    scoring    = "f1",
    cv         = 10,
    n_jobs     = -1
)

# Fit
grid_search.fit(X_train, y_train)

tuned_model = grid_search.best_estimator_

print("Tuned model: ", tuned_model)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation F1 Score:", grid_search.best_score_)

Tuned model:  KNeighborsClassifier(leaf_size=10, metric='euclidean', n_jobs=-1, n_neighbors=1,
                     p=1)
Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_jobs': -1, 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
Best Cross-Validation F1 Score: 1.0


In [73]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Paramaters
param_grid = {
    "n_neighbors": np.arange(1, 50),
    "weights"    : ["uniform", "distance"], 
    "metric"     : ["euclidean", "manhattan"],
    "algorithm"  : ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size"  : np.arange(10, 50, 10),
    "p"          : [1, 2],
    "n_jobs"     : [-1],
}

# Grid search with KNeighborsClassifier using our param_grid. 
# Optimizes on F1-score (because of our group decision) and does 10-fold CV.
grid_search = GridSearchCV(
    estimator  = KNeighborsClassifier(),
    param_grid = param_grid,
    scoring    = "f1",
    cv         = skf,
    n_jobs     = -1
)

# Fit
grid_search.fit(X_train, y_train)

tuned_model = grid_search.best_estimator_

print("Tuned model: ", tuned_model)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation F1 Score:", grid_search.best_score_)

Tuned model:  KNeighborsClassifier(leaf_size=10, metric='euclidean', n_jobs=-1, n_neighbors=1,
                     p=1)
Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_jobs': -1, 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
Best Cross-Validation F1 Score: 1.0


In [74]:
y_pred  = tuned_model.predict(X_test)
y_proba = tuned_model.predict_proba(X_test)[:, 1]

In [75]:
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Test F1-Weighted: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Recall-Weighted: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test Precision-Weighted: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Test ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

Test Accuracy: 1.0000
Test F1-Weighted: 1.0000
Test Recall-Weighted: 1.0000
Test Precision-Weighted: 1.0000
Test ROC AUC: 1.0000


In [None]:
# ToDo test
# Reducera antal features
#      T.ex använd regn, fuktighet osv
# Utöka antal parametrar