In [1]:
import os
import pathlib
import re
import shutil

import pickle
import joblib
import sklearn
import requests
from bs4 import BeautifulSoup
import pandas as pd

from datetime import datetime, timedelta, timezone

# Integrating Models

This notebook is a short one for the sake of experimenting with how the pickle files work, and preparing them for the database notebook.

## Storing Models

The format of the models is subject to implementation changes. As of writing this, it is prefix_route_number_direction, where the middle number can be ignored.

In [2]:
os.listdir("./models/decision-tree")[0]

'decisionTreeModelWhole_102_8_1.pkl'

I'll simplify the naming a little for the ones to be put in storage.

In [4]:
src_folder = "./models/decision-tree"
out_folder = "./models/out"
pattern = "decisionTreeModelWhole_([\dA-Z]+)_(\d+)_(\d+)\.pkl"

# Create a common output folder.
pathlib.Path(out_folder).mkdir(parents=True, exist_ok=True)

originals = os.listdir(src_folder)
for src_name in originals:
    match = re.match(pattern, src_name)
    if not match:
        print("Failed to match:", src_name)
        continue
        
    line, _, direction = match.groups()

    # Convert from the historical dublin bus direction to the
    # GTFS direction. This just means subtracting one.
    direction =str(int(direction) - 1)
    
    out_name = "model_" + line + "_" + direction + ".pkl"
    
    src_path = src_folder + "/" + src_name
    out_path = out_folder + "/" + out_name
    
    shutil.copy(src_path, out_path)

## Using Models

Let's try with one. The goal is to get it to give a somewhat coherent answer, which means getting the inputs right.

In [3]:
def get_model(line, direction):
    path = f"./models/out/model_{line}_{direction}.pkl"
    return get_model2(path)

def get_model2(path):
    with open(path, "rb") as file:
        return pickle.load(file)

13 Features. That's a good sanity check.<br>
They should be the features described below:
```
STOPPOINTID      int64
Month            int64
Day              int64
Hour             int64
School           int64
RushHour         int64
Weekend          int64
Holiday          int64
rain           float64
temp           float64
wdsp             int64
vis              int64
clamt            int64
```

The stop number can be pulled from the database. The time features can be recreated from the current time. The weather feature, on the other hand, needs to be pulled from somehwere. OpenWeather and Met. both have some of the data we need. Neither has all. I'm sticking with OpenWeather for now for simplicity.

The response time seems to be about 300ms, which is fine - we don't need to cache it or store it.

## Time

```
Month            int64
Day              int64
Hour             int64
School           int64
RushHour         int64
Weekend          int64
Holiday          int64
```

In [4]:
def get_datetime_features(dt):
    
    # 'astimezone' with no arguments converts the datetime
    # to the current datetime - at some point we need to make sure that 
    # this is configured correctly in Django.
    dt = dt.astimezone()

    month = dt.month
    day = dt.day
    hour = dt.hour

    school = not (6 <= month <= 8)

    # From 7:00 to 8:30 or from 16:00 to 18:00
    max_morning_hour = 8 if dt.minute <= 30 else 7
    morning_rush = 7 <= dt.hour <= max_morning_hour
    evening_rush = 16 <= dt.hour <= 18
    rush_hour = morning_rush or evening_rush

    weekday = dt.weekday
    weekend = (weekday == 5) or (weekday == 6) 

    # Yes. These are hardcoded for 2021. This should be revisited.
    # Source: https://publicholidays.ie/2021-dates/
    holiday_options_2021 = [(1,1), (17,3), (5,4), (3,5),
            (7,6), (2,8), (25,10), (25,12), (26,12)]

    holiday = False
    for h_day, h_month in holiday_options_2021:
        holiday |= dt.day == h_day and dt.month == h_month
        
    return month, day, hour, school, rush_hour, weekend, holiday

## Weather

13 Features. That's a good sanity check.<br>
They should be the features described below:
```
rain           float64
temp           float64
wdsp             int64
vis              int64
clamt            int64
```

Example output from open weather:

```
{'dt': 1628618400,
 'main': {'temp': 297.71,
  'feels_like': 297.4,
  'temp_min': 297.71,
  'temp_max': 303.02,
  'pressure': 1012,
  'sea_level': 1012,
  'grnd_level': 1001,
  'humidity': 45,
  'temp_kf': -5.31},
 'weather': [{'id': 800,
   'main': 'Clear',
   'description': 'clear sky',
   'icon': '01d'}],
 'clouds': {'all': 0},
 'wind': {'speed': 1.07, 'deg': 265, 'gust': 0.98},
 'visibility': 10000,
 'pop': 0,
 'sys': {'pod': 'd'},
 'dt_txt': '2021-08-10 18:00:00'}
```

Example output from Met:

```
<time datatype="forecast" from="2021-08-09T18:00:00Z" to="2021-08-09T18:00:00Z">
<location altitude="34" latitude="53.3498" longitude="-6.2603">
<temperature id="TTT" unit="celsius" value="16.9"></temperature>
<winddirection deg="251.2" id="dd" name="W"></winddirection>
<windspeed beaufort="3" id="ff" mps="4.9" name="Lett bris"></windspeed>
<globalradiation unit="W/m^2" value="155.8"></globalradiation>
<humidity unit="percent" value="85.0"></humidity>
<pressure id="pr" unit="hPa" value="1008.3"></pressure>
<cloudiness id="NN" percent="81.7"></cloudiness>
<lowclouds id="LOW" percent="37.1"></lowclouds>
<mediumclouds id="MEDIUM" percent="4.5"></mediumclouds>
<highclouds id="HIGH" percent="58.0"></highclouds>
<dewpointtemperature id="TD" unit="celsius" value="14.5"></dewpointtemperature>
</location>
</time>
```

In [5]:
key = "23d2e5c8f2ccf80d120548326514d859"
name = "dublin"

url = f"http://api.openweathermap.org/data/2.5/forecast?q={name}&appid={key}"

def unix_to_dt(unix_seconds):
    return datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=unix_seconds)

def get_weather_features():
    
    response = requests.get(url)
    weather_data = response.json()["list"]

    target_dt = datetime.now(timezone.utc)
    for item in weather_data:
        dt = unix_to_dt(item["dt"])

        if target_dt < dt:
            target_entry = item
            break
    else:
        # Default to the last entry if the target datetime is
        # beyond the forecast range.
        target_entry = weather_data[-1]

    # This is wrong, I'll come back to it.
    rain = target_entry["main"]["humidity"]
    # Temperature is in celcius, not kelvin.
    temp = target_entry["main"]["temp"] - 273.15 
    wdsp = target_entry["wind"]["speed"]
    vis = target_entry["visibility"]
    # This is temporary, the feature is going to be removed.
    clamt = 5
    
    return rain, temp, wdsp, vis, clamt

## Applying the model

In [6]:
data = [100, *get_datetime_features(datetime.now()), *get_weather_features()]
print(data)

[100, 8, 12, 12, False, False, False, False, 76, 15.930000000000007, 1.72, 10000, 5]


In [8]:
get_model("63", 1).predict([[0]*12])

array([-19.])

```
STOPPOINTID      int64       2023
Month            int64       8
Day              int64       11
Hour             int64       4
School           int64       False
RushHour         int64       False
Weekend          int64       False
Holiday          int64       False
rain           float64       0
temp           float64       17
wdsp             int64       0
vis              int64       10000
clamt            int64       5
```

In [34]:

get_model("63", 0).predict([[2023, 8, 11, 4, False, False, False, False, 0, 17, 1.11, 10000, 5]])

array([3375.])

In [23]:
(datetime(2021,8,11,3) - datetime(1970, 1, 1)).total_seconds()

1628650800.0

In [44]:
data = [[2023, 8, 11, 4, False, False, False, False, 0, 17, 1.11, 10000, 5]]
for path in os.listdir("./models/out"):
    try:
        model = get_model2("./models/out/" + path)
    except:
        continue
    print(path + ": ", end="")
    print(model.predict(data))
    

model_102_0.pkl: [-9.]
model_102_1.pkl: [-21.]
model_104_0.pkl: [11.]
model_104_1.pkl: [2.]
model_11_0.pkl: [30.]
model_11_1.pkl: [32.]
model_13_0.pkl: [-9.]
model_13_1.pkl: [186.5]
model_140_0.pkl: [13.72222222]
model_140_1.pkl: [28.39130435]
model_150_0.pkl: [-12.5]
model_150_1.pkl: [-45.]
model_151_0.pkl: [-1.]
model_151_1.pkl: [29.]
model_15A_0.pkl: [-8.]
model_15A_1.pkl: [94.]
model_17A_0.pkl: [5.66666667]
model_17A_1.pkl: [-10.]
model_17_0.pkl: [23.]
model_17_1.pkl: [-25.]
model_1_0.pkl: [-32.]
model_1_1.pkl: [102.]
model_238_0.pkl: [39.]
model_25B_0.pkl: [-10.]
model_25B_1.pkl: [59.]
model_25_0.pkl: [20.25]
model_25_1.pkl: [40.]
model_27B_0.pkl: [12.]
model_31A_0.pkl: [31.]
model_31A_1.pkl: [-1.]
model_31_0.pkl: [16.]
model_31_1.pkl: [-16.]
model_37_0.pkl: [-1.]
model_37_1.pkl: [36.]
model_38A_0.pkl: [697.]
model_38A_1.pkl: [54.]
model_40B_0.pkl: [3.]
model_40B_1.pkl: [37.]
model_41C_0.pkl: [-44.]
model_41C_1.pkl: [-23.]
model_41_0.pkl: [-93.]
model_41_1.pkl: [25.]
model_44_0.pk

## Aside: Efforts with Met Eireann

In [152]:
# A response can have multiple 'time' elements, correpsonding to 
# different forecasts for different times. 
# We could parse the datetimes, sort them, and chose the closest time slot. 

# That's a lot of effort though. Alternatively API allows requests to
# be limited to a window in time. However, if that window is too
# narrow, we might not get an update at all.

# For now, I'm using a window of 2 hours, which seems to be more than
# enough for current times anyways. If that window fails, a second attempt
# is made with no window at all, so that we have some sort of data to work
# with.

def get_met_data(url):
    
    weather_data = requests.get(url)
    soup = BeautifulSoup(weather_data.content.decode())
    
    try:
        # There might be many of these elements. 
        # We just take one, it doesn't matter which.
        soup = soup.html.body.weatherdata.product.time
    except AttributeError:
        # If no such element exists, return None.
        return None
    
    return soup

def get_weather_data():
    
    # Coordinates for Dublin City
    lat = 53.3498
    lon = -6.2603
    
    base_url = ( 
        "http://metwdb-openaccess.ichec.ie/metno-wdb2ts"
        f"/locationforecast?lat={lat};long={lon}"
    )
    
    # A window in time of two hours, starting now.
    start = datetime.now().strftime("%Y-%m-%dT%H:%M")
    end = (datetime.now() + timedelta(hours=2)).strftime("%Y-%m-%dT%H:%M")
    
    url = base_url + f";from={start};to={end};"
    
    # Try with a window, then fallback to no window.
    soup = get_met_data(url)
    if soup is None:
        soup = get_met_data(base_url)
        
    if soup is None:
        raise ValueError("Unable to retrieve weather forecast")
        
    return soup

In [153]:
start = datetime.now()
soup = get_weather_data()
end = datetime.now()
delta = (end - start)
print(f"Time to get met data: {delta.microseconds//1000}ms ")

Time to get met data: 271ms 


In [151]:
soup

<time datatype="forecast" from="2021-08-09T18:00:00Z" to="2021-08-09T18:00:00Z">
<location altitude="34" latitude="53.3498" longitude="-6.2603">
<temperature id="TTT" unit="celsius" value="16.9"></temperature>
<winddirection deg="251.2" id="dd" name="W"></winddirection>
<windspeed beaufort="3" id="ff" mps="4.9" name="Lett bris"></windspeed>
<globalradiation unit="W/m^2" value="155.8"></globalradiation>
<humidity unit="percent" value="85.0"></humidity>
<pressure id="pr" unit="hPa" value="1008.3"></pressure>
<cloudiness id="NN" percent="81.7"></cloudiness>
<lowclouds id="LOW" percent="37.1"></lowclouds>
<mediumclouds id="MEDIUM" percent="4.5"></mediumclouds>
<highclouds id="HIGH" percent="58.0"></highclouds>
<dewpointtemperature id="TD" unit="celsius" value="14.5"></dewpointtemperature>
</location>
</time>

In [157]:
rain = 
temp = soup.temperature["value"]
wdsp = 
vis = 
clamt = temp

'16.9'

In [114]:
for tag in soup:
    if tag.name == "model":
        tag.extract()
print(soup.prettify())

<weatherdata created="2021-08-09T15:37:48Z" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:nonamespaceschemalocation="http://api.met.no/weatherapi/locationforecast/1.9/schema">
 <meta/>
 <product class="pointData">
  <time datatype="forecast" from="2021-08-09T16:00:00Z" to="2021-08-09T16:00:00Z">
   <location altitude="34" latitude="53.3498" longitude="-6.2603">
    <temperature id="TTT" unit="celsius" value="17.6">
    </temperature>
    <winddirection deg="245.5" id="dd" name="SW">
    </winddirection>
    <windspeed beaufort="4" id="ff" mps="5.6" name="Laber bris">
    </windspeed>
    <globalradiation unit="W/m^2" value="270.0">
    </globalradiation>
    <humidity unit="percent" value="79.1">
    </humidity>
    <pressure id="pr" unit="hPa" value="1008.1">
    </pressure>
    <cloudiness id="NN" percent="78.7">
    </cloudiness>
    <lowclouds id="LOW" percent="55.7">
    </lowclouds>
    <mediumclouds id="MEDIUM" percent="27.8">
    </mediumclouds>
    <highclouds id="

In [42]:
src = "./models/out"

pass_count = 0
fail_count = 0

for name in os.listdir(src):
    path = src + "/" + name
    try:
        with open(path, "rb") as file:
            x = pickle.load(file)
        pass_count += 1
    except:
        fail_count += 1
    break
    
print(pass_count, fail_count)
    

0 1
