In [None]:
from urllib.parse import quote_plus

import MySQLdb
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors

In [None]:
# Database connection settings (the real values are redacted in this notebook).
URI = "..."
PORT = 3306
NAME = "..."
USER = "..."
PASS = "..."

# Percent-encode the credentials so that characters such as '@', ':' or '/' in
# the user name or password cannot be mistaken for URI delimiters.
CONNECTION_STRING = f"mysql://{quote_plus(USER)}:{quote_plus(PASS)}@{URI}:{PORT}/{NAME}"

In [None]:
# The URI string itself is what later cells hand to pd.read_sql as the connection.
connection = CONNECTION_STRING

In [None]:
# Weather query: one row per dublin_weather record with derived calendar fields.
# `day` is MySQL DAYOFWEEK() (1 = Sunday .. 7 = Saturday); `hour` is HOUR(), i.e. 0-23.
# NOTE(review): `feels_like - 270` looks like a Kelvin-to-Celsius conversion, whose
# exact offset would be 273.15 — confirm the unit of `feels_like`.
# The f-prefix is inert here because the query contains no placeholders.
sql = f"""
SELECT DATE(date_time) as date, DAYOFWEEK(date_time) as day, CAST(date_time AS time) as time, HOUR(date_time) as hour, CAST(sunrise AS time) as sunrise, CAST(sunset as time) as sunset, main_description, wind_speed, ROUND(feels_like - 270) as temp
FROM dublin_weather;
"""

In [None]:
# Run the weather query and keep the result as the base frame for feature engineering.
weather_data = pd.read_sql(sql=sql, con=connection)
weather_data

In [None]:
# `day` is MySQL DAYOFWEEK(): 1 = Sunday .. 7 = Saturday, so 2..6 is Monday-Friday.
# between() is inclusive on both ends and avoids the `&` / `*` precedence trap of
# combining comparisons with arithmetic in one expression.
workday = weather_data['day'].between(2, 6)
weather_data['is_workday'] = workday.astype('int')

In [None]:
# 1 when the weather description mentions rain (case-insensitive), else 0.
# Vectorised string ops replace the Python loop; na=False makes a missing
# description count as "no rain" instead of raising on None.lower().
rain_yn = (
    weather_data['main_description']
    .str.lower()
    .str.contains('rain', regex=False, na=False)
)
weather_data['rain_yn'] = rain_yn.astype('int')

In [None]:
# True for rows whose time of day lies strictly between sunrise and sunset.
after_sunrise = weather_data['time'].gt(weather_data['sunrise'])
before_sunset = weather_data['time'].lt(weather_data['sunset'])
weather_data['daytime'] = after_sunrise & before_sunset

In [None]:
# Display the frame to inspect the derived is_workday, rain_yn and daytime columns.
weather_data

In [None]:
# Morning rush: hours 8 and 9 on workdays only.
morning_hours = weather_data['hour'].ge(8) & weather_data['hour'].lt(10)
weather_data['morning-rush'] = morning_hours & weather_data['is_workday'].eq(1)

In [None]:
# Evening rush: 16:00-18:59 on workdays. `hour` comes from MySQL HOUR(), which
# is on a 24-hour clock, so a 4-7 window would flag the early morning instead.
weather_data['evening-rush'] = (weather_data.hour >= 16) & (weather_data.hour < 19) & (weather_data.is_workday == 1)

In [None]:
# Copy of the weather frame without the raw columns that were only needed to
# derive the engineered features.
raw_only_columns = ['sunrise', 'sunset', 'time', 'day', 'main_description']
for_model = weather_data.drop(columns=raw_only_columns)

In [None]:
# Fetch every station number, in ascending order, to drive the per-station loops below.
station_numbers_sql = """
SELECT number
FROM stations
ORDER BY number ASC"""
station_numbers = pd.read_sql(station_numbers_sql, connection)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# station number -> weather features joined with that station's hourly mean availability.
data_for_models2 = {}

# One query per station; each result is merged onto the weather rows by (date, hour).
for station_number in station_numbers['number']:
    # station_number is interpolated straight into the SQL text; it comes from the
    # `stations` table queried earlier, not from user input.
    # Note that this rebinds the module-level `sql` used for the weather query.
    sql = f"""
SELECT DATE(retrieved) as date, HOUR(retrieved) as hour, ROUND(avg(available_bikes)) as avg_available_bikes
FROM station_update
WHERE number = {station_number} 
GROUP BY HOUR(retrieved), DATE(retrieved)"""
    station_target = pd.read_sql(sql, connection)
    # pd.merge defaults to an inner join, so hours missing on either side are dropped.
    station_weather = pd.merge(weather_data, station_target, on=['date','hour'])
    data_for_models2[station_number] = station_weather

    

In [None]:
# Fit one un-normalised k=2 KNN regressor per station on all available rows
# (no hold-out), using the weather/calendar features without `hour`.
k2nn_unnormed_models = {}
for station_number, df in data_for_models2.items():
    features = df[['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']]
    target = df['avg_available_bikes']
    neigh = KNeighborsRegressor(n_neighbors=2)
    k2nn_unnormed_models[station_number] = neigh.fit(features, target)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
# Hold-out evaluation of the un-normalised k=2 KNN regressor, one model per station.
# Metrics are called as (y_true, y_pred): R squared is not symmetric, so passing
# the predictions first would score the targets against the predictions.
feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_2knn_model = {}
for station_number, df in data_for_models2.items():
    X = df[feature_cols]
    y = df['avg_available_bikes']
    # Fixed random_state keeps the 70/30 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train, y_train)
    predicted = neigh.predict(X_test)
    test_2knn_model[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }

test_2knn_model

In [None]:
# Hold-out evaluation of the min-max normalised k=2 KNN regressor, one model per station.
# The scaler is fitted on the training split only, so the test rows do not leak
# their min/max into the preprocessing. Metrics are called as (y_true, y_pred)
# because R squared is not symmetric in its arguments.
feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
results_test_2knn_normed_model = {}
for station_number, df in data_for_models2.items():
    X = df[feature_cols]
    y = df['avg_available_bikes']

    # Same seed and test size as the un-normalised evaluation, so both see the same rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_normed = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)
    X_test_normed = pd.DataFrame(weather_scaler.transform(X_test), columns=X_test.columns)

    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train_normed, y_train)
    predicted = neigh.predict(X_test_normed)
    results_test_2knn_normed_model[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }
results_test_2knn_normed_model

In [None]:
# Side-by-side metrics per station: normalised model vs. un-normalised model.
# `value` holds the un-normalised results (test_2knn_model); the normalised ones
# are looked up under the same station key.
for key, value in test_2knn_model.items():
    normed = results_test_2knn_normed_model[key]
    print(f"""Station {key}
    Mean AE with normalisation: {normed['Mean absolute error']}\tMean AE without: {value['Mean absolute error']}
    Median AE with normalisation: {normed['Median absolute error']}\tMedian AE without: {value['Median absolute error']}
    R squared score with normalisation: {normed['R squared score']}\tR squared score without: {value['R squared score']}""")

In [None]:
# Fit one un-normalised k=2 KNN regressor per station on all available rows
# (no hold-out), this time including `hour` among the features.
k2nn_unnormed_models_with_hour = {}
for station_number, df in data_for_models2.items():
    features = df[['wind_speed', 'hour', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']]
    target = df['avg_available_bikes']
    neigh = KNeighborsRegressor(n_neighbors=2)
    k2nn_unnormed_models_with_hour[station_number] = neigh.fit(features, target)

In [None]:
# Hold-out evaluation of the un-normalised k=2 KNN regressor with `hour` as a feature.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'hour', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_2knn_models_unnormed_with_hour = {}
for station_number, df in data_for_models2.items():
    X = df[feature_cols]
    y = df['avg_available_bikes']
    # Fixed random_state keeps the 70/30 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train, y_train)
    predicted = neigh.predict(X_test)
    test_2knn_models_unnormed_with_hour[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }

test_2knn_models_unnormed_with_hour

In [None]:
# Stations whose R squared score is below -1 (un-normalised, with hour, random split).
count = sum(
    1
    for item in test_2knn_models_unnormed_with_hour.values()
    if item['R squared score'] < -1
)
count

In [None]:
# Stations whose R squared score is below -1 (un-normalised, without hour, random split).
count = sum(
    1
    for item in test_2knn_model.values()
    if item['R squared score'] < -1
)
count

In [None]:
# Stations whose R squared score is below -1 (normalised, without hour, random split).
count = sum(
    1
    for item in results_test_2knn_normed_model.values()
    if item['R squared score'] < -1
)
count

Conclusion: the normalised model without the `hour` feature is the most effective. Let's test it using everything up to and including April 4th as training data and the last week's data as test data.

In [None]:
# Per-station train/test frames for the date-based split; filled in by the next cell.
training_data, testing_data = {}, {}

In [None]:
import datetime
split_date_str = '2021-04-04 23:00:00'
split_date = datetime.datetime.strptime(split_date_str, '%Y-%m-%d %H:%M:%S')

# Only the calendar date takes part in the comparison; the time of day is dropped.
cutoff = split_date.date()
for key, df in data_for_models2.items():
    on_or_before_cutoff = df['date'] <= cutoff
    after_cutoff = df['date'] > cutoff
    training_data[key] = df.loc[on_or_before_cutoff]
    testing_data[key] = df.loc[after_cutoff]

In [None]:
# Date-split evaluation: normalised k=2 KNN without the `hour` feature.
# The scaler is fitted on the training rows only and reused for the test rows.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_2knn_models_normed_without_hour = {}
for station_number in data_for_models2:
    X_train = training_data[station_number][feature_cols]
    y_train = training_data[station_number]['avg_available_bikes']

    X_test = testing_data[station_number][feature_cols]
    y_test = testing_data[station_number]['avg_available_bikes']

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(weather_scaler.transform(X_test), columns=X_test.columns)

    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train_scaled, y_train)
    predicted = neigh.predict(X_test_scaled)
    test_2knn_models_normed_without_hour[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }

test_2knn_models_normed_without_hour

In [None]:
# Stations whose R squared score is below -1 (normalised, without hour, date split).
count = sum(
    1
    for item in test_2knn_models_normed_without_hour.values()
    if item['R squared score'] < -1
)
count

In [None]:
# Date-split evaluation: normalised k=2 KNN including the `hour` feature.
# The scaler is fitted on the training rows only and reused for the test rows.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'hour', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_2knn_models_normed_with_hour = {}
for station_number in data_for_models2:
    X_train = training_data[station_number][feature_cols]
    y_train = training_data[station_number]['avg_available_bikes']

    X_test = testing_data[station_number][feature_cols]
    y_test = testing_data[station_number]['avg_available_bikes']

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(weather_scaler.transform(X_test), columns=X_test.columns)

    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train_scaled, y_train)
    predicted = neigh.predict(X_test_scaled)
    test_2knn_models_normed_with_hour[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }

test_2knn_models_normed_with_hour

In [None]:
# Stations whose R squared score is below -1 (normalised, with hour, date split).
count = sum(
    1
    for item in test_2knn_models_normed_with_hour.values()
    if item['R squared score'] < -1
)
count

In [None]:
# Date-split evaluation: un-normalised k=2 KNN including the `hour` feature.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'hour', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_2knn_models_not_normed_with_hour = {}
for station_number in data_for_models2:
    X_train = training_data[station_number][feature_cols]
    y_train = training_data[station_number]['avg_available_bikes']

    X_test = testing_data[station_number][feature_cols]
    y_test = testing_data[station_number]['avg_available_bikes']

    # No scaling step: this variant deliberately feeds the raw feature values to KNN.
    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train, y_train)
    predicted = neigh.predict(X_test)
    test_2knn_models_not_normed_with_hour[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }
    


In [None]:
# Stations whose R squared score is below -1 (un-normalised, with hour, date split).
count = sum(
    1
    for item in test_2knn_models_not_normed_with_hour.values()
    if item['R squared score'] < -1
)
count

In [None]:
# Date-split evaluation: un-normalised k=2 KNN without the `hour` feature.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_2knn_models_not_normed_without_hour = {}
for station_number in data_for_models2:
    X_train = training_data[station_number][feature_cols]
    y_train = training_data[station_number]['avg_available_bikes']

    X_test = testing_data[station_number][feature_cols]
    y_test = testing_data[station_number]['avg_available_bikes']

    # No scaling step: this variant deliberately feeds the raw feature values to KNN.
    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train, y_train)
    predicted = neigh.predict(X_test)
    test_2knn_models_not_normed_without_hour[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }
    


In [None]:
# Stations whose R squared score is below -1 (un-normalised, without hour, date split).
count = sum(
    1
    for item in test_2knn_models_not_normed_without_hour.values()
    if item['R squared score'] < -1
)
count

In [None]:
# Stations with a median absolute error of at most 3 bikes (un-normalised, without hour).
count = len([
    station
    for station, item in test_2knn_models_not_normed_without_hour.items()
    if item['Median absolute error'] <= 3
])
count

In [None]:
# Stations with a median absolute error of at least 5 bikes (un-normalised, without hour).
count = len([
    station
    for station, item in test_2knn_models_not_normed_without_hour.items()
    if item['Median absolute error'] >= 5
])
count

In [None]:
# Stations with a median absolute error of at most 3 bikes (un-normalised, with hour).
count = len([
    station
    for station, item in test_2knn_models_not_normed_with_hour.items()
    if item['Median absolute error'] <= 3
])
count

In [None]:
# Stations with a median absolute error of at least 5 bikes (un-normalised, with hour).
count = len([
    station
    for station, item in test_2knn_models_not_normed_with_hour.items()
    if item['Median absolute error'] >= 5
])
count

In [None]:
# Stations with a median absolute error of at least 5 bikes (normalised, without hour).
count = len([
    station
    for station, item in test_2knn_models_normed_without_hour.items()
    if item['Median absolute error'] >= 5
])
count

In [None]:
# Stations with a median absolute error of at most 3 bikes (normalised, without hour).
count = len([
    station
    for station, item in test_2knn_models_normed_without_hour.items()
    if item['Median absolute error'] <= 3
])
count

In [None]:
# Stations with a median absolute error of at most 3 bikes (normalised, with hour).
count = len([
    station
    for station, item in test_2knn_models_normed_with_hour.items()
    if item['Median absolute error'] <= 3
])
count

In [None]:
# Stations with a median absolute error of at least 5 bikes (normalised, with hour).
count = len([
    station
    for station, item in test_2knn_models_normed_with_hour.items()
    if item['Median absolute error'] >= 5
])
count

In [None]:
# Per-station train/test frames for the second date-based split; filled in by the next cell.
training_data2, testing_data2 = {}, {}

In [None]:
split_date_str = '2021-04-07 23:00:00'
split_date = datetime.datetime.strptime(split_date_str, '%Y-%m-%d %H:%M:%S')

# Only the calendar date takes part in the comparison; the time of day is dropped.
cutoff = split_date.date()
for key, df in data_for_models2.items():
    on_or_before_cutoff = df['date'] <= cutoff
    after_cutoff = df['date'] > cutoff
    training_data2[key] = df.loc[on_or_before_cutoff]
    testing_data2[key] = df.loc[after_cutoff]

In [None]:
# Second date split: normalised k=2 KNN without the `hour` feature.
# The scaler is fitted on the training rows only and reused for the test rows.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_2knn_models_normed_without_hour2 = {}
for station_number in data_for_models2.keys():
    X_train = training_data2[station_number][feature_cols]
    y_train = training_data2[station_number]['avg_available_bikes']

    X_test = testing_data2[station_number][feature_cols]
    y_test = testing_data2[station_number]['avg_available_bikes']

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(weather_scaler.transform(X_test), columns=X_test.columns)

    neigh = KNeighborsRegressor(n_neighbors=2)
    neigh.fit(X_train_scaled, y_train)
    predicted = neigh.predict(X_test_scaled)
    test_2knn_models_normed_without_hour2[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }

test_2knn_models_normed_without_hour2

In [None]:
# Stations with a median absolute error of at least 5 bikes (second date split).
count = len([
    station
    for station, item in test_2knn_models_normed_without_hour2.items()
    if item['Median absolute error'] >= 5
])
count

In [None]:
# Stations with a median absolute error of at most 3 bikes (second date split).
count = len([
    station
    for station, item in test_2knn_models_normed_without_hour2.items()
    if item['Median absolute error'] <= 3
])
count

In [None]:
# Station with the lowest median absolute error. min() over the dict keys removes
# the need to seed the search with a hard-coded station number that must exist;
# on ties the first station in the dict's iteration order wins.
best_station = min(
    test_2knn_models_normed_without_hour2,
    key=lambda station: test_2knn_models_normed_without_hour2[station]['Median absolute error'],
)

best_station

In [None]:
# Station with the highest median absolute error. max() over the dict keys removes
# the need to seed the search with a hard-coded station number that must exist;
# on ties the first station in the dict's iteration order wins.
worst_station = max(
    test_2knn_models_normed_without_hour2,
    key=lambda station: test_2knn_models_normed_without_hour2[station]['Median absolute error'],
)

worst_station

In [None]:
# Demo models (k=2) for stations 6 and 62: each entry is [fitted model, fitted scaler]
# so the same scaling can be re-applied at prediction time.
demo_feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
models_v2_demo = {}
for station_number in (6, 62):
    station_training = training_data2[station_number]
    X_train = station_training[demo_feature_cols]
    y_train = station_training['avg_available_bikes']

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)

    model = KNeighborsRegressor(n_neighbors=2)
    model.fit(X_train_scaled, y_train)

    models_v2_demo[station_number] = [model, weather_scaler]

In [None]:
# Observed availability for station 6 over the test period.
demo_6 = {'actual': testing_data2[6]['avg_available_bikes'].values}

In [None]:
# Predict station 6's test period with its k=2 model, applying the scaler that
# was fitted on that station's training rows.
station_6_model, station_6_scaler = models_v2_demo[6]
x_test = testing_data2[6][['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']]
demo_6['predicted'] = station_6_model.predict(station_6_scaler.transform(x_test))

In [None]:
# Display the actual and predicted arrays for station 6 side by side.
demo_6

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One timestamp per test row: calendar date plus the time-of-day offset.
# NOTE(review): assumes 'time' holds timedelta values, as adding it to a
# datetime64 requires — confirm.
date_and_time = list(zip(testing_data2[6]['date'].values, testing_data2[6]['time'].values))
x = [np.datetime64(day) + offset for day, offset in date_and_time]

fig2, ax2 = plt.subplots()
plt.plot(x, demo_6['actual'], label='Actual')
plt.plot(x, demo_6['predicted'], label='Predicted')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.ylabel('number of bikes')
plt.title('Actual and predicted bike availability with k=2 for station 6, April 8 - 11 (inclusive)')
plt.xticks(rotation=45)
plt.legend()
fig2.tight_layout()

# Save before show(): after show() the figure may already have been released by
# the backend, in which case a later savefig would write an empty canvas.
fig2.savefig('station6k2.png')
plt.show()

Repeat for the worst station (station 62).

In [None]:
# Observed availability for station 62 over the test period.
demo_62 = {'actual': testing_data2[62]['avg_available_bikes'].values}

In [None]:
# Predict station 62's test period from station 62's own feature rows; the
# station-6 frame `x_test` must not be reused here.
x_test_62 = testing_data2[62][['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']]
station_62_model, station_62_scaler = models_v2_demo[62]
demo_62['predicted'] = station_62_model.predict(station_62_scaler.transform(x_test_62))

In [None]:
# Keep the hour of each station-62 test row alongside the actual/predicted series.
demo_62.update(time=testing_data2[62]['hour'].values)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Timestamps built from station 62's own test rows, so the x-axis does not
# depend on the list computed for station 6 in an earlier cell.
# NOTE(review): assumes 'time' holds timedelta values, as adding it to a
# datetime64 requires — confirm.
x_62 = [
    np.datetime64(day) + offset
    for day, offset in zip(testing_data2[62]['date'].values, testing_data2[62]['time'].values)
]

fig2, ax2 = plt.subplots()
plt.plot(x_62, demo_62['actual'], label='Actual')
plt.plot(x_62, demo_62['predicted'], label='Predicted')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.ylabel('number of bikes')
plt.xlabel('hour')
plt.title('Actual and predicted bike availability with k=2 for station 62, April 8 - 11 (inclusive)')
plt.xticks(rotation=45)
plt.legend()
fig2.tight_layout()

# Save before show(): after show() the figure may already have been released by
# the backend, in which case a later savefig would write an empty canvas.
fig2.savefig('station62k2.png')
plt.show()

In [None]:
# Demo models for stations 6 and 62 using KNeighborsRegressor's default
# n_neighbors: each entry is [fitted model, fitted scaler].
demo_feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
models_v3_demo = {}
for station_number in (6, 62):
    station_training = training_data2[station_number]
    X_train = station_training[demo_feature_cols]
    y_train = station_training['avg_available_bikes']

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)

    model = KNeighborsRegressor()
    model.fit(X_train_scaled, y_train)

    models_v3_demo[station_number] = [model, weather_scaler]

In [None]:
# Observed availability for station 6 over the test period (for the default-k demo).
demo_6_v2 = {'actual': testing_data2[6]['avg_available_bikes'].values}

In [None]:
# Predict station 6's test period with the default-k model, using the frame
# built in this cell rather than `x_test` left over from an earlier cell.
x_test_2 = testing_data2[6][['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']]
station_6_model_v3, station_6_scaler_v3 = models_v3_demo[6]
demo_6_v2['predicted'] = station_6_model_v3.predict(station_6_scaler_v3.transform(x_test_2))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import datetime

# One timestamp per test row: calendar date plus the time-of-day offset.
# NOTE(review): assumes 'time' holds timedelta values, as adding it to a
# datetime64 requires — confirm.
date_and_time = list(zip(testing_data2[6]['date'].values, testing_data2[6]['time'].values))
x = [np.datetime64(day) + offset for day, offset in date_and_time]

# Create this plot's own figure; without it `fig2` would still refer to the
# figure drawn by a previous cell.
fig2, ax2 = plt.subplots()
plt.plot(x, demo_6_v2['actual'], label='Actual')
plt.plot(x, demo_6_v2['predicted'], label='Predicted')

# Add some text for labels, title and custom x-axis tick labels, etc.
plt.ylabel('number of bikes')
plt.xlabel('date')
plt.title('Actual and predicted bike availability for station 6 with k=5, April 8 - 11 (inclusive)')
plt.xticks(rotation=45)
plt.legend()
fig2.tight_layout()

# Save before show(): after show() the figure may already have been released by
# the backend, in which case a later savefig would write an empty canvas.
fig2.savefig('station6k5.png')
plt.show()

In [None]:
# Actual vs. predicted availability for station 62 with the default-k model.
# The prediction uses station 62's own feature rows, not the station-6 `x_test`.
demo_62_v2 = {}
demo_62_v2['actual'] = testing_data2[62]['avg_available_bikes'].values
x_test_62_v2 = testing_data2[62][['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']]
station_62_model_v3, station_62_scaler_v3 = models_v3_demo[62]
demo_62_v2['predicted'] = station_62_model_v3.predict(station_62_scaler_v3.transform(x_test_62_v2))

In [None]:
# Timestamps built from station 62's own test rows, so the x-axis does not
# depend on the list computed for station 6 in an earlier cell.
# NOTE(review): assumes 'time' holds timedelta values, as adding it to a
# datetime64 requires — confirm.
x_62 = [
    np.datetime64(day) + offset
    for day, offset in zip(testing_data2[62]['date'].values, testing_data2[62]['time'].values)
]

fig2, ax2 = plt.subplots()
plt.plot(x_62, demo_62_v2['actual'], label='Actual')
plt.plot(x_62, demo_62_v2['predicted'], label='Predicted')

plt.ylabel('number of bikes')
plt.title('Actual and predicted bike availability with k=5 for station 62, April 8 - 11 (inclusive)')
plt.legend()
plt.xticks(rotation=45)
fig2.tight_layout()

# Save before show(): after show() the figure may already have been released by
# the backend, in which case a later savefig would write an empty canvas.
fig2.savefig('station62k5.png')
plt.show()

In [None]:
# Second date split: normalised KNN with the default n_neighbors, without `hour`.
# The scaler is fitted on the training rows only and reused for the test rows.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_5knn_models_normed_without_hour = {}
for station_number in data_for_models2.keys():
    X_train = training_data2[station_number][feature_cols]
    y_train = training_data2[station_number]['avg_available_bikes']

    X_test = testing_data2[station_number][feature_cols]
    y_test = testing_data2[station_number]['avg_available_bikes']

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(weather_scaler.transform(X_test), columns=X_test.columns)

    neigh = KNeighborsRegressor()
    neigh.fit(X_train_scaled, y_train)
    predicted = neigh.predict(X_test_scaled)
    test_5knn_models_normed_without_hour[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }

test_5knn_models_normed_without_hour

In [None]:
# Stations with a median absolute error of at most 3 bikes (default-k model, date split).
count = len([
    station
    for station, item in test_5knn_models_normed_without_hour.items()
    if item['Median absolute error'] <= 3
])
count

In [None]:
# Stations with a median absolute error of at least 5 bikes (default-k model, date split).
count = len([
    station
    for station, item in test_5knn_models_normed_without_hour.items()
    if item['Median absolute error'] >= 5
])
count

In [None]:
# Average of the per-station median absolute errors (date split). The divisor is
# the number of stations actually evaluated rather than a hard-coded count.
station_errors = [
    results['Median absolute error']
    for results in test_5knn_models_normed_without_hour.values()
]
total = sum(station_errors)
# NaN instead of a ZeroDivisionError when no station has been evaluated.
av = total / len(station_errors) if station_errors else float('nan')
av

In [None]:
# Random 70/30 split: normalised KNN with the default n_neighbors, without `hour`.
# The scaler is fitted on the training rows only and reused for the test rows.
# Metrics are called as (y_true, y_pred) because R squared is not symmetric.
feature_cols = ['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']
test_train_split_5knn_models_normed_without_hour = {}
for station_number, df in data_for_models2.items():
    X = df[feature_cols]
    y = df['avg_available_bikes']
    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    weather_scaler = preprocessing.MinMaxScaler()
    weather_scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(weather_scaler.transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(weather_scaler.transform(X_test), columns=X_test.columns)

    neigh = KNeighborsRegressor()
    neigh.fit(X_train_scaled, y_train)
    predicted = neigh.predict(X_test_scaled)
    test_train_split_5knn_models_normed_without_hour[station_number] = {
        "Mean absolute error": mean_absolute_error(y_test, predicted),
        "Median absolute error": median_absolute_error(y_test, predicted),
        "R squared score": r2_score(y_test, predicted),
    }

test_train_split_5knn_models_normed_without_hour

In [None]:
# Average of the per-station median absolute errors (random split). The divisor
# is the number of stations actually evaluated rather than a hard-coded count.
station_errors = [
    results['Median absolute error']
    for results in test_train_split_5knn_models_normed_without_hour.values()
]
total = sum(station_errors)
# NaN instead of a ZeroDivisionError when no station has been evaluated.
av = total / len(station_errors) if station_errors else float('nan')
av