In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Load Training Data

In [2]:
# Load the dataset
file_path = 'training_data.csv'
data = pd.read_csv(file_path)
data['increase_stock_binary'] = data['increase_stock'].apply(lambda x: 1 if x == 'high_bike_demand' else 0)

# Display the first few rows of the dataframe
data.head()

Unnamed: 0,hour_of_day,day_of_week,month,holiday,weekday,summertime,temp,dew,humidity,precip,snow,snowdepth,windspeed,cloudcover,visibility,increase_stock,increase_stock_binary
0,5,5,1,0,0,0,-7.2,-15.0,53.68,0.0,0,0.0,16.3,31.6,16.0,low_bike_demand,0
1,21,4,1,0,1,0,-1.3,-12.8,40.97,0.0,0,0.0,23.9,85.7,16.0,low_bike_demand,0
2,21,3,8,0,1,1,26.9,21.8,73.39,0.0,0,0.0,0.0,81.1,16.0,low_bike_demand,0
3,1,6,1,0,0,0,3.1,-4.0,59.74,0.0,0,0.0,19.2,0.0,16.0,low_bike_demand,0
4,17,0,3,0,1,0,11.7,-11.4,18.71,0.0,0,0.0,10.5,44.6,16.0,low_bike_demand,0


In [26]:
# Separating features and the target variable
X = data.drop([
    'increase_stock', 
    'increase_stock_binary', 
    'snow'
], axis=1)
print(X.keys())
y = data['increase_stock_binary']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Index(['hour_of_day', 'day_of_week', 'month', 'holiday', 'weekday',
       'summertime', 'temp', 'dew', 'humidity', 'precip', 'snowdepth',
       'windspeed', 'cloudcover', 'visibility'],
      dtype='object')


In [12]:
# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Creating a KNN classifier
knn = KNeighborsClassifier(n_neighbors=2)

# Fitting the classifier to the training data
knn.fit(X_train, y_train)

# Making predictions on the test data
y_pred = knn.predict(X_test)

# Generating a classification report and confusion matrix
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)
accuracy_score_data = accuracy_score(y_test,y_pred)
# Printing the results
print("Classification Report:\n", classification_report_result)
#print("Confusion Matrix:\n", confusion_matrix_result)

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       262
           1       0.63      0.33      0.43        58

    accuracy                           0.84       320
   macro avg       0.75      0.64      0.67       320
weighted avg       0.82      0.84      0.82       320



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [9]:
accuracy_score_data

0.84375

In [132]:
corr_coeffs = data.drop(['increase_stock', 'snow'], axis=1).corr().iloc[-1].sort_values()
print(corr_coeffs.round(2))
important_features = corr_coeffs.iloc[-7:-1].keys().tolist() + corr_coeffs.iloc[:2].keys().tolist()
important_features

humidity                -0.31
weekday                 -0.12
precip                  -0.06
snowdepth               -0.05
cloudcover              -0.05
holiday                 -0.00
month                    0.04
day_of_week              0.08
windspeed                0.10
visibility               0.11
dew                      0.13
summertime               0.22
hour_of_day              0.24
temp                     0.34
increase_stock_binary    1.00
Name: increase_stock_binary, dtype: float64


['windspeed',
 'visibility',
 'dew',
 'summertime',
 'hour_of_day',
 'temp',
 'humidity',
 'weekday']

In [133]:
X = data[important_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Creating a KNN classifier
knn = KNeighborsClassifier(n_neighbors=2)

# Fitting the classifier to the training data
knn.fit(X_train, y_train)

# Making predictions on the test data
y_pred = knn.predict(X_test)

# Generating a classification report and confusion matrix
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)
accuracy_score_data = accuracy_score(y_test,y_pred)
# Printing the results
print("Classification Report:\n", classification_report_result)
#print("Confusion Matrix:\n", confusion_matrix_result)

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       262
           1       0.66      0.33      0.44        58

    accuracy                           0.85       320
   macro avg       0.76      0.64      0.67       320
weighted avg       0.83      0.85      0.83       320



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [134]:
from sklearn.model_selection import KFold, cross_val_score

# Assuming X and y are already defined with your dataset and labels
# Also assuming important_features is defined

X = data[important_features]

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Creating a KNN classifier
knn = KNeighborsClassifier(n_neighbors=2)

# Setting up k-fold cross-validation
kf = KFold(n_splits=5, random_state=0, shuffle=True)

# Evaluating the classifier using cross-validation
scores = cross_val_score(knn, X_scaled, y, cv=kf)

# The scores for each fold
print("Scores for each fold:", scores)

# Average score over all folds
print("Average score:", scores.mean())


Scores for each fold: [0.84375  0.85625  0.85     0.853125 0.8875  ]
Average score: 0.8581249999999999


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
