In [1]:
import os
from urllib.parse import quote_plus

# Database settings. Each value is read from an environment variable so real
# credentials never have to be saved in the notebook; the literal after the
# variable name is only the placeholder used when that variable is unset.
URI = os.environ.get("DB_URI", "...")
PORT = int(os.environ.get("DB_PORT", 3306))
NAME = os.environ.get("DB_NAME", "...")
USER = os.environ.get("DB_USER", "...")
PASS = os.environ.get("DB_PASS", "...")

# User and password are percent-encoded: characters such as '@', ':' or '/'
# would otherwise be read as URL delimiters and corrupt the connection string.
CONNECTION_STRING = f"mysql://{quote_plus(USER)}:{quote_plus(PASS)}@{URI}:{PORT}/{NAME}"

In [2]:
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
import pandas as pd
import MySQLdb

In [3]:
# pandas.read_sql accepts a database URI string as its `con` argument (this
# requires SQLAlchemy to be installed), so the URI itself is used as the
# shared "connection" by every query function below.
connection = CONNECTION_STRING

In [4]:
def get_weather_forecast(date, time):
    """
    Pulls the forecast for a given date and hour from the five_day_forecast table.

    The closest forecast hour at or before the requested hour is used, e.g.
    requesting 14 returns the midday forecast rather than the 3pm one.

    :param date: string or datetime object
    :param time: integer hour of the day
    :return: dataframe (single row); kept as a dataframe for easier manipulation later.
    """
:

In [9]:
%%time
# Forecast for 21:00 on 2021-03-30; `forecast` is passed to
# transform_and_normalize_weather_data in a later cell.
forecast = get_weather_forecast('2021-03-30', 21)

Wall time: 2.53 s


In [6]:
def get_historical_weather_data(date, time):
    """Fetch past weather records that match a day of the week and an hour.

    Selects every row of ``dublin_weather`` whose ``date_time`` falls on the same
    day of the week as ``date`` and within hour ``time``.

    :param date: string or datetime object; only its day of the week is used.
    :param time: integer 0-23, the hour of the day to match.
    :return: dataframe indexed by ``date_time`` holding ``description``,
        ``wind_speed`` and the unaliased ``feels_like - 270`` expression.
    """
    # The arguments are bound as query parameters instead of being pasted into
    # the SQL text, so quotes or SQL fragments inside them cannot change the
    # statement. %(name)s is the placeholder style understood by the MySQL driver.
    sql = """
SELECT date_time, description, wind_speed, feels_like - 270
FROM dublin_weather
WHERE DAYOFWEEK(date_time) = DAYOFWEEK(%(date)s) AND HOUR(date_time) = %(hour)s ;"""
    # str() sends MySQL the same text the former f-string produced for strings
    # and datetimes; int() rejects a non-numeric hour before any query is sent.
    params = {"date": str(date), "hour": int(time)}
    historical = pd.read_sql(sql, connection, index_col='date_time', params=params)
    return historical

In [10]:
%%time
# Historical records requested with the same date/hour arguments as the
# forecast call, so both frames can go to transform_and_normalize_weather_data.
historical_data = get_historical_weather_data('2021-03-30', 21)

Wall time: 2.53 s


In [11]:
def transform_and_normalize_weather_data(historical_df, forecast_df):
    """Merge historical and forecast weather and scale every feature to [0, 1].

    The two frames are concatenated (historical rows first, forecast rows last),
    the text ``description`` column is replaced by a ``rain_yn`` flag that is true
    when the description contains the substring 'rain', and all columns are then
    min-max scaled.

    :param historical_df: dataframe of past weather records with a 'description' column.
    :param forecast_df: dataframe of forecast record(s) with the same columns.
    :return: new dataframe of the scaled values with a fresh 0..n-1 index; the
        input frames are not modified.
    """
    merged = pd.concat([historical_df, forecast_df])

    # Vectorised, case-sensitive substring test. na=False makes a missing
    # description count as "no rain" instead of raising on a non-string value.
    rain_yn = merged['description'].str.contains('rain', regex=False, na=False)

    features = merged.drop(columns='description')
    # Assign by position: the concatenated index may contain duplicate labels.
    features['rain_yn'] = rain_yn.to_numpy()

    min_max_scaler = preprocessing.MinMaxScaler()
    # Cast to float so the boolean flag is scaled like the numeric columns.
    x_scaled = min_max_scaler.fit_transform(features.to_numpy(dtype=float))
    return pd.DataFrame(x_scaled, columns=features.columns)

In [13]:
%%time
# Combine history and forecast into one scaled frame; the forecast is
# concatenated last, which is the row get_nearest_neighbour_index matches.
df = transform_and_normalize_weather_data(historical_data, forecast)

Wall time: 9.93 ms


In [14]:
def get_nearest_neighbour_index(normalized_df_with_input_as_final_row):
    """Find the row that is closest to the final row of a normalised dataframe.

    Every row except the last is treated as a candidate; the last row is the
    query. A 1-nearest-neighbour search (scikit-learn defaults) is run over the
    candidates.

    :param normalized_df_with_input_as_final_row: numeric dataframe whose final
        row is the record seeking a match.
    :return: positional (0-based) index of the nearest candidate row as an int;
        it can be used to look up the date of the matching record.
    :raises ValueError: if the dataframe has fewer than two rows, i.e. there is
        no candidate to compare the final row against.
    """
    frame = normalized_df_with_input_as_final_row
    if frame.shape[0] < 2:
        raise ValueError("need at least one candidate row plus the final query row")

    # One array conversion plus slicing replaces a per-row .iloc lookup.
    values = frame.to_numpy()
    candidates, query = values[:-1], values[-1:]

    neigh = NearestNeighbors(n_neighbors=1)
    neigh.fit(candidates)
    # kneighbors returns (distances, indices); only the index is needed.
    _, indices = neigh.kneighbors(query)
    return int(indices[0][0])

In [15]:
%%time
# Position, among the non-final rows of df, of the record closest to the final row.
get_nearest_neighbour_index(df)

Wall time: 0 ns


3

In [16]:
def get_average_data_all_stations(date, hour):
    """
    Returns bike data for all stations, averaged over one hour of one day.

    Joins ``stations`` with ``station_update`` and, for each station number,
    averages and rounds ``available_bikes`` over the updates retrieved on
    ``date`` during ``hour``.

    :param date: formatted as string or datetime object.
    :param hour: Integer 0 - 23
    :return: Dictionary with a "rowcount" entry (number of rows returned) followed
        by one list of values per selected column (number, name, address,
        bike_stands, latitude, longitude, banking, available_bikes, status).
    """
    # Bound parameters replace string interpolation: the arguments can no longer
    # alter the statement, and the extra str.format() pass, which raised on any
    # brace inside the date, is gone. %(name)s is the MySQL driver's placeholder.
    sql = """
    SELECT number, name, address, bike_stands, latitude, longitude, banking, 
    ROUND(avg(available_bikes)) as available_bikes, status
    FROM stations 
    INNER JOIN station_update USING (`number`)
    WHERE DATE(retrieved) = DATE(%(date)s) AND HOUR(retrieved) = %(hour)s 
    GROUP BY number;"""
    params = {"date": str(date), "hour": int(hour)}
    df = pd.read_sql(sql, connection, params=params)

    # "rowcount" goes first, then the columns in query order; tolist() converts
    # numpy scalars to plain Python values.
    result = {"rowcount": len(df)}
    result.update({field: df[field].values.tolist() for field in df})
    return result

In [None]:
%%time
# Example query: per-station averages for 15:00 on 2021-03-20.
get_average_data_all_stations('2021-03-20', 15)

In [4]:
def get_weather_data(date, time):
    """Fetch matching historical weather and the forecast in a single query.

    The result is the UNION of two selections, ordered by ``date_time``:

    * rows of ``dublin_weather`` on the same day of the week as ``date`` and in
      hour ``time``;
    * rows of ``five_day_forecast`` on ``date`` itself whose hour is ``time`` or
      one of the two hours before it.

    :param date: string or datetime object.
    :param time: integer hour of the day (0-23).
    :return: dataframe indexed by ``date_time`` holding ``description``,
        ``wind_speed`` and the unaliased ``feels_like - 270`` expression.
    """
    hour = int(time)
    # The hour alternatives are grouped in one IN (...) list. Written as a chain
    # of ORs after the AND, the earlier hours escaped the date filter (AND binds
    # tighter than OR) and matched forecast rows from every date.
    # Values are bound as parameters (%(name)s is the MySQL driver's
    # placeholder style) rather than interpolated into the SQL text.
    sql = """
SELECT date_time, description, wind_speed, feels_like - 270
FROM dublin_weather
WHERE DAYOFWEEK(date_time) = DAYOFWEEK(%(date)s) AND HOUR(date_time) = %(hour)s
UNION
SELECT date_time, description, wind_speed, feels_like - 270
FROM five_day_forecast
WHERE DATE(date_time) = DATE(%(date)s)
  AND HOUR(date_time) IN (%(hour)s, %(hour_minus_1)s, %(hour_minus_2)s)
ORDER BY date_time;"""
    params = {
        "date": str(date),
        "hour": hour,
        "hour_minus_1": hour - 1,
        "hour_minus_2": hour - 2,
    }
    weather = pd.read_sql(sql, connection, index_col='date_time', params=params)
    return weather

In [5]:
%%time
# Example query: history and forecast for 12:00 on 2021-04-06 in one frame.
get_weather_data('2021-04-06', 12)

Wall time: 4.59 s


Unnamed: 0_level_0,description,wind_speed,feels_like - 270
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-02 12:15:11,mist,3.09,6.84
2021-03-09 12:18:17,broken clouds,6.17,8.35
2021-03-16 12:15:02,scattered clouds,8.23,9.58
2021-03-23 12:16:42,few clouds,7.72,8.11
2021-03-30 12:19:45,few clouds,3.6,15.36
2021-04-06 12:00:00,broken clouds,6.6,3.98


In [None]:
def get_target_outcomes(date, hour):
    