In [23]:
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
import pandas as pd
import MySQLdb

In [24]:
URI = "..."
PORT = 3306
NAME = "..."
USER = "..."
PASS = "..."

CONNECTION_STRING = f"mysql://{USER}:{PASS}@{URI}:{PORT}/{NAME}"

In [25]:
connection = CONNECTION_STRING

In [26]:
sql = f"""
SELECT DATE(date_time) as date, DAYOFWEEK(date_time) as day, CAST(date_time AS time) as time, HOUR(date_time) as hour, CAST(sunrise AS time) as sunrise, CAST(sunset as time) as sunset, main_description, wind_speed, ROUND(feels_like - 270) as temp
FROM dublin_weather;
"""

In [27]:
#pull in all historical data

weather_data = pd.read_sql(sql, connection)
weather_data

Unnamed: 0,date,day,time,hour,sunrise,sunset,main_description,wind_speed,temp
0,2021-02-26,6,0 days 19:45:51,19,0 days 07:19:29,0 days 17:56:46,Clouds,3.09,8.0
1,2021-03-02,3,0 days 08:11:07,8,0 days 07:10:20,0 days 18:04:23,Fog,1.54,0.0
2,2021-03-02,3,0 days 09:16:07,9,0 days 07:10:20,0 days 18:04:23,Mist,1.54,2.0
3,2021-03-02,3,0 days 10:15:34,10,0 days 07:10:20,0 days 18:04:23,Mist,1.54,5.0
4,2021-03-02,3,0 days 11:14:47,11,0 days 07:10:20,0 days 18:04:23,Mist,2.06,5.0
...,...,...,...,...,...,...,...,...,...
991,2021-04-12,2,0 days 14:20:50,14,0 days 05:32:00,0 days 19:19:11,Rain,4.12,8.0
992,2021-04-12,2,0 days 15:17:14,15,0 days 05:32:00,0 days 19:19:11,Clouds,4.63,8.0
993,2021-04-12,2,0 days 16:15:19,16,0 days 05:32:00,0 days 19:19:11,Clouds,4.63,8.0
994,2021-04-12,2,0 days 17:20:02,17,0 days 05:32:00,0 days 19:19:11,Rain,4.12,8.0


In [28]:
#transfrom features

In [29]:
workday = (1 < weather_data.day) & (weather_data.day < 7) * 1.0
weather_data['is_workday'] = workday.astype('int')

In [30]:
rain_yn = []
for description in weather_data['main_description']:
    rain_yn.append('rain' in description.lower())
weather_data['rain_yn'] = rain_yn
weather_data['rain_yn'] = weather_data['rain_yn'].astype('int')

In [31]:
weather_data['daytime'] = (weather_data.time > weather_data.sunrise) & (weather_data.time < weather_data.sunset)

In [32]:
weather_data['daytime'] = weather_data['daytime'].astype('int')

In [33]:
weather_data['morning-rush'] = (weather_data.hour >= 8) & (weather_data.hour < 10) & (weather_data.is_workday == 1)

In [34]:
weather_data['morning-rush'] = weather_data['morning-rush'].astype('int')

In [35]:
weather_data['evening-rush'] = (weather_data.hour >= 4) & (weather_data.hour < 7) & (weather_data.is_workday == 1)

In [36]:
weather_data['evening-rush'] = weather_data['evening-rush'].astype('int')

In [37]:
for_scaler = weather_data.drop(['date', 'hour', 'sunrise', 'sunset', 'time', 'day', 'main_description'], axis='columns')

In [38]:
# create scaler
x = for_scaler.values
weather_scaler = preprocessing.MinMaxScaler()
weather_scaler.fit(x) 

MinMaxScaler()

In [39]:
for_scaler

Unnamed: 0,wind_speed,temp,is_workday,rain_yn,daytime,morning-rush,evening-rush
0,3.09,8.0,1,0,0,0,0
1,1.54,0.0,1,0,1,1,0
2,1.54,2.0,1,0,1,1,0
3,1.54,5.0,1,0,1,0,0
4,2.06,5.0,1,0,1,0,0
...,...,...,...,...,...,...,...
991,4.12,8.0,1,1,1,0,0
992,4.63,8.0,1,0,1,0,0
993,4.63,8.0,1,0,1,0,0
994,4.12,8.0,1,1,1,0,0


In [40]:
weather_scaler.transform(x)

array([[0.17292225, 0.54545455, 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06903485, 0.18181818, 1.        , ..., 1.        , 1.        ,
        0.        ],
       [0.06903485, 0.27272727, 1.        , ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.27613941, 0.54545455, 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.2419571 , 0.54545455, 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.17292225, 0.59090909, 1.        , ..., 1.        , 0.        ,
        0.        ]])

In [41]:
import pickle

In [42]:
pickle.dump(weather_scaler, open('scaler_v2.sav', 'wb'))

In [43]:
station_numbers_sql = """
SELECT number
FROM stations
ORDER BY number ASC"""
station_numbers = pd.read_sql(station_numbers_sql, connection)

In [44]:
#pull in historical availability data and map with weather data for each station, before storing resulting dataframe
# in dictionary.

data_for_models_v2 = {}

for station_number in station_numbers['number']:
    sql = f"""
SELECT DATE(retrieved) as date, HOUR(retrieved) as hour, ROUND(avg(available_bikes)) as avg_available_bikes
FROM station_update
WHERE number = {station_number} 
GROUP BY HOUR(retrieved), DATE(retrieved)"""
    station_target = pd.read_sql(sql, connection)
    weather_with_availability = pd.merge(weather_data, station_target, on=['date','hour'])
    data_for_models_v2[station_number] = weather_with_availability

In [45]:
data_for_models_v2[6]

Unnamed: 0,date,day,time,hour,sunrise,sunset,main_description,wind_speed,temp,is_workday,rain_yn,daytime,morning-rush,evening-rush,avg_available_bikes
0,2021-02-26,6,0 days 19:45:51,19,0 days 07:19:29,0 days 17:56:46,Clouds,3.09,8.0,1,0,0,0,0,4.0
1,2021-03-02,3,0 days 08:11:07,8,0 days 07:10:20,0 days 18:04:23,Fog,1.54,0.0,1,0,1,1,0,0.0
2,2021-03-02,3,0 days 09:16:07,9,0 days 07:10:20,0 days 18:04:23,Mist,1.54,2.0,1,0,1,1,0,0.0
3,2021-03-02,3,0 days 10:15:34,10,0 days 07:10:20,0 days 18:04:23,Mist,1.54,5.0,1,0,1,0,0,0.0
4,2021-03-02,3,0 days 11:14:47,11,0 days 07:10:20,0 days 18:04:23,Mist,2.06,5.0,1,0,1,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,2021-04-12,2,0 days 14:20:50,14,0 days 05:32:00,0 days 19:19:11,Rain,4.12,8.0,1,1,1,0,0,5.0
992,2021-04-12,2,0 days 15:17:14,15,0 days 05:32:00,0 days 19:19:11,Clouds,4.63,8.0,1,0,1,0,0,4.0
993,2021-04-12,2,0 days 16:15:19,16,0 days 05:32:00,0 days 19:19:11,Clouds,4.63,8.0,1,0,1,0,0,4.0
994,2021-04-12,2,0 days 17:20:02,17,0 days 05:32:00,0 days 19:19:11,Rain,4.12,8.0,1,1,1,0,0,3.0


In [46]:
from sklearn.neighbors import KNeighborsRegressor

models_v2 = {}

for station, df in data_for_models_v2.items():
    X = df[['wind_speed', 'temp', 'is_workday', 'rain_yn', 'daytime', 'morning-rush', 'evening-rush']]
    y = df['avg_available_bikes']
    
    X = weather_scaler.transform(X)
    
    model = KNeighborsRegressor()
    model.fit(X, y)
    
    models_v2[station] = model

In [47]:
pickle.dump(models_v2, open('models_v2.sav', 'wb'))