In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle
import json
import re

from datetime import datetime, timedelta
import time
import os

import pickle
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression

## Load all data

Load a subset of the dublin bus data.

In [10]:
def get_bus_data(line=None, direction=None, show_tables=False):
    
    name = "MasterCleanedLeaving&TripsDataCombined.feather"
    return pd.read_feather(name)

In [11]:
all_data = get_bus_data()
all_data.head(2)

Unnamed: 0,index,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,LINEID,ROUTEID,DIRECTION,ArrDel,DepDel,Waiting_Time
0,0,2018-01-01,5972116,12,119,48030,48030,48012,48012,2693211,1,1_37,1,18,18,0
1,29,2018-01-01,5972116,13,44,48079,48079,48058,48058,2693211,1,1_37,1,21,21,0


## Partition data on disk

Loading in all the data to select a subset is very slow, better to save all of it in segments now that can be loaded and acted upon individually.

Determine which segments already exist, write the remaining ones.

In [14]:
lines = all_data.LINEID.unique()
directions = all_data.DIRECTION.unique()

segments = set()
for line in lines:
    for direction in directions:
        segments.add((line, direction))

segments = sorted(list(segments))

def get_path(segment):
    line, direction = segment
    return "segments/" + line + "_" + str(direction) + ".feather"

def save_segment(line, direction):
    
    print(f"Filtering segment")
    
    line_mask = all_data.LINEID == line
    dir_mask = all_data.DIRECTION == direction
    current_data = all_data[dir_mask & line_mask].copy()
    
    if len(current_data) == 0:
        print(f"Skipping segment - no data")
        return
    
    print(f"Cleaning segment")
    
    current_data.drop(["ArrDel", "DepDel", "Waiting_Time", "index"], 
           axis=1, inplace=True)
    
    # I'm renaming the columns for convenience, but they can
    # be renamed back later if needed.
    current_data.columns = [
        "date", "trip_id", "stop_sequence", "stop_id", 
        "planned_arr", "planned_dep", "arr", "dep", "vehicle_id", 
        "line_id", "route_id", "direction"
    ]
    
    current_data.sort_values(by=['trip_id', 'date','stop_sequence'], 
                      ascending = [False,True,True], inplace=True)
    
    current_data.reset_index(drop=True, inplace=True)
    
    print(f"Writing segment")
    
    current_data.to_feather(get_path((line, direction)))

def save_all_segments():
    
    print(len(segments), "segments total.")

    todo = set()
    for segment in segments:
        if not os.path.exists(get_path(segment)):
            todo.add(segment)
    todo = sorted(list(todo))

    print(len(segments) - len(todo), "already complete.", len(todo), "remain.")
    
    
    for i, segment in enumerate(todo):

        line, direction = segment
        path = get_path(segment)

        print(f"Working on segment {i+1}/{len(todo)}: {line}, {direction}")
        save_segment(line, direction)

In [15]:
save_segment("16", 1)
print("Complete")

Filtering segment
Cleaning segment
Writing segment
Complete


A function for loading a specific route/direction

In [3]:
def get_bus_data(line, direction):
    
    path = "segments/" + line + "_" + str(direction) + ".feather"
    
    return pd.read_feather(path)

In [4]:
get_bus_data("16", 1).head(2)

Unnamed: 0,date,trip_id,stop_sequence,stop_id,planned_arr,planned_dep,arr,dep,vehicle_id,line_id,route_id,direction
0,2018-12-27,8591828,1,7347,56880,56880,56891,56891,1000115,16,16_20,1
1,2018-12-27,8591828,2,3669,56950,56950,56972,57002,1000115,16,16_20,1


## Adding features

Define functions for adding various features, and test them on the test subset of data.

In [5]:
test_data = get_bus_data("16", 1)
display(len(test_data))

1557835

### Add journey time

This is the target feature. A row corresponds to a stop in a trips sequence on a date. The journey time is the time taken to arrive at this stop, starting from when the bus **arrived** at the last stop. (Not from when it departed)

Put another way, we're not separating dwell times here, and we are including the dwell for one stop in the time taken to reach the next step.

In [22]:
def add_journey_time(table):

    journey_times = np.zeros(len(table))
    journey_times[1:] = table.dep.values[1:] - table.arr.values[:-1]

    planned_journey_times = np.zeros(len(table))
    planned_journey_times[1:] = table.planned_dep.values[1:] - table.planned_arr.values[:-1]

    table["journey_time"] = journey_times
    table["planned_journey_time"] = planned_journey_times
    table["delay"] = journey_times - planned_journey_times
    table.drop(["planned_arr", "planned_dep", "dep"], axis=1, inplace=True)

    # Drop rows where the actual time is negative.
    # This includes all rows with sequence 1, naturally!
    # It also includes some rows that seem to be errors in the data.
    labels = table.index[table.journey_time <= 0]
    table.drop(labels, inplace=True)

In [22]:
test_data_2 = test_data.copy()
add_journey_time(test_data_2)
test_data_2.head(2)
display(len(test_data_2))

1538748

### Add datetime features

I know I said I tend to default to lists, but because of the size of all the routes put together I rewrote them as numpy statements. 
<br>Not the most readable code, but it is much faster than the list based approach I think and there are so many rows.

In [23]:
def add_datetime_features(table):

    # General breakdown of time
    
    month = table.date.dt.month.values
    day = table.date.dt.day.values
    hour = np.floor(table.arr.values/3600)
    
    # Are people going to school?

    school = ~((6 <= month) & (month <= 8))

    # Is it rush hour?

    def s(time): # s for "to seconds"
        h, m = int(time[:2]), int(time[3:])
        return h*3600 + m*60

    arr = table.arr.values
    rush_hour = np.full(len(table), False)
    for a, b in [("07:00", "08:30"), ("16:00", "18:00")]:
        rush_hour |= (arr >= s(a)) & (arr <= s(b))

    # Is it a weekend?

    weekday = table.date.dt.weekday.values
    weekend = (weekday == 5) | (weekday == 6)

    # Is it an official holiday?

    holiday_options = [
        '2018-01-01', '2018-03-17', '2018-04-02', '2018-05-07','2018-06-04', 
        '2018-08-06', '2018-10-29', '2018-12-25', '2018-12-26'
    ]

    holiday = np.full(len(table), False)
    for option in holiday_options:
        holiday |= table.date == np.datetime64(option)

    # Add these to the dataframe.

    table["month"] = month
    table["day"] = day
    table["hour"] = hour
    
    table["school"] = school
    table["rush_hour"] = rush_hour
    table["weekend"] = weekend
    table["holiday"] = holiday

In [24]:
test_data_3 = test_data_2.copy()
add_datetime_features(test_data_3)
test_data_3.head(2)
display(len(test_data_3))

1538748

### Add weather features

Load Met Eireann Historical 2018 weather data. We're only keeping certain columns.

In [2]:
def get_weather():
    
    weather = pd.read_csv("historic_weather.csv", parse_dates=[0])
    weather = weather[["weather_date", "rain", "temp", "wetb", "wdsp", "vis", "clamt"]]
    
    return weather

In [3]:
weather = get_weather()
weather.head(2)
display(len(weather))

8760

Merge with bus data based on date and hour.

In [25]:
def merged_with_weather(table):
    
    weather = get_weather()
    weather["date"] = pd.to_datetime(weather.weather_date.dt.date)
    weather["hour"] = weather.weather_date.dt.hour
    weather.drop(["weather_date"], axis=1, inplace=True)
    
    combined = pd.merge(table, weather, on=["date", "hour"])
    return combined

In [30]:
test_data_4 = merged_with_weather(test_data_3.copy())
test_data_4.head(3)
print(len(test_data_4))

1533242


## Summary of Features & Merging

A function to summarize adding features and merging with weather.

In [26]:
def add_features(bus_data):
    
    add_journey_time(bus_data)
    add_datetime_features(bus_data)

    combined_data = merged_with_weather(bus_data)

    return combined_data

In [33]:
test_data = get_bus_data("16", 1)
test_data = add_features(test_data)
test_data.head()
print(len(test_data))

1533242


## Comparison to Leah's Data

In [27]:
def get_bus_data2(line, direction):
    
    path = "wholeRoutesLeah/df_" + line + "_" + str(direction) + ".csv"
    
    return pd.read_csv(path, index_col=0)

In [28]:
data = get_bus_data("16", 1)
data2 = get_bus_data2("16", 1)

# data2 has all the features already added.
data = add_features(data)

From a basic display, there are some differences to be seen.

In [29]:
display(data.head(2))
display(data2.head(2))

print("Counts")
print("1:", len(data))
print("2:", len(data2))

Unnamed: 0,date,trip_id,stop_sequence,stop_id,arr,vehicle_id,line_id,route_id,direction,journey_time,...,school,rush_hour,weekend,holiday,rain,temp,wetb,wdsp,vis,clamt
0,2018-12-27,8591828,2,3669,56972,1000115,16,16_20,1,111.0,...,True,False,False,False,0.0,9.3,8.6,5,40000,7
1,2018-12-27,8591828,3,7349,57085,1000115,16,16_20,1,113.0,...,True,False,False,False,0.0,9.3,8.6,5,40000,7


Unnamed: 0,index,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,...,RushHour,Weekend,Holiday,weather_date,rain,temp,wetb,wdsp,vis,clamt
0,114812203,2018-12-27,8591828,2,3669,15:49:10,15:49:10,15:49:32,15:50:02,1000115,...,0,0,0,2018-12-27 15:00:00,0.0,9.3,8.6,5,40000,7
1,114812204,2018-12-27,8591828,3,7349,15:50:03,15:50:03,15:51:25,15:51:25,1000115,...,0,0,0,2018-12-27 15:00:00,0.0,9.3,8.6,5,40000,7


Counts
1: 1533242
2: 1536438


The column names need to be matched to go further with this.

In [32]:
display(data.columns); display(data2.columns)

Index(['date', 'trip_id', 'stop_sequence', 'stop_id', 'arr', 'vehicle_id',
       'line_id', 'route_id', 'direction', 'journey_time',
       'planned_journey_time', 'delay', 'month', 'day', 'hour', 'school',
       'rush_hour', 'weekend', 'holiday', 'rain', 'temp', 'wetb', 'wdsp',
       'vis', 'clamt'],
      dtype='object')

Index(['index', 'DAYOFSERVICE', 'TRIPID', 'PROGRNUMBER', 'STOPPOINTID',
       'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR',
       'ACTUALTIME_DEP', 'VEHICLEID', 'LINEID', 'ROUTEID', 'DIRECTION',
       'ArrDel', 'DepDel', 'Waiting_Time', 'priorstops_actualTime_Arr',
       'JourneyTime', 'planned_journey_time', 'delay', 'Month', 'Day', 'Hour',
       'School', 'RushHour', 'Weekend', 'Holiday', 'weather_date', 'rain',
       'temp', 'wetb', 'wdsp', 'vis', 'clamt'],
      dtype='object')

In [36]:
data2_v2 = data2.drop(["index", "VEHICLEID", "weather_date"], axis=1)
data2_v2.columns = ["date", "trip_id", "stop_sequence", "stop_id", "planned_arr", "planned_dep", "arr", "dep",
                   "line_id", "route_id", "direction", ]
display(data2_v2.columns)

Index(['TRIPID', 'PROGRNUMBER', 'STOPPOINTID', 'PLANNEDTIME_ARR',
       'PLANNEDTIME_DEP', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'LINEID',
       'DIRECTION', 'ArrDel', 'DepDel', 'Waiting_Time',
       'priorstops_actualTime_Arr', 'JourneyTime', 'planned_journey_time',
       'delay', 'Month', 'Day', 'Hour', 'School', 'RushHour', 'Weekend',
       'Holiday', 'rain', 'temp', 'wetb', 'wdsp', 'vis', 'clamt'],
      dtype='object')

In [37]:
data2.Waiting_Time

0          30
1           0
2           0
3           0
4           0
           ..
1536433     0
1536434     0
1536435     0
1536436     0
1536437     0
Name: Waiting_Time, Length: 1536438, dtype: int64

## Model Training & Eval

I'm keeping model training & eval in the same notebook. It's not a very long notebook since it's just retracing steps and shortening things.

In [90]:
# This is the list of features being included here, including the target feature.
# I imagine this will change later.

features = [
    #"journey_time",
    "delay",
    "stop_id",
    "month",
    "day",
    "hour",
    "rain",
    "temp",
    "wetb",
    "wdsp",
    "vis",
    "clamt"
]

target_feature = "delay"
#target_feature = "journey_time"

# The model to be used.

# I don't know if models should be reused, esp. with#
# with random_state being there, hence the function.

get_model = lambda: LinearRegression(copy_X=False)

def prepare_for_training(table):
    
    # Is there a less esoteric way to specify columns to keep
    # with inplace?
    to_remove = set(table.columns) - set(features)
    table.drop(to_remove, axis=1, inplace=True)
    
    y = table[target_feature]
    table.drop(target_feature, axis=1, inplace=True)
    X = table
    
    return X,y
    
    
def train(X, y):
    
    model = get_model()
    model.fit(X, y)
    
    return model
    

def eval_metrics(X, y):
    
    model = get_model()
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)
    model.fit(X_train, y_train)
    
    prediction = model.predict(X_test)
    
    ae = abs(y_test - prediction)
    mae = np.mean(ae)
    mape = np.mean(100*ae/y_test)
    
    return mae, mape


def get_path(line, direction, kind):
    
    ext = "pkl" if kind == "pickle" else "job"
    path = f"pickles/{line}_{direction}.{ext}"
    
    return path


def store(line, direction, model, kind):
    
    path = get_path(line, direction , kind)
    
    with open(path, "wb") as file:
        if kind == "pickle":
            pickle.dump(model, file)
        else:
            joblib.dump(model, file, compress=3)
        

In [77]:
def eval_single(line, direction):
    
    data = add_features(get_bus_data(line, direction))

    X, y = prepare_for_training(data)

    model = DecisionTreeRegressor(
        random_state = 10,
        max_depth = 25,
        max_leaf_nodes = 206000
    )
    
    #model = LinearRegression(copy_X=False)
    
    #model = MLPRegressor(random_state=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)
    model.fit(X_train, y_train)
    
    prediction = model.predict(X_test)
    
    ae = abs(y_test - prediction)
    mae = np.mean(ae)
    mape = np.mean(100*ae/y_test)
    
    print("MAE:", mae, "  MAPE:", mape)

In [78]:
eval_single("16", 1)

MAE: 45.72280904771383   MAPE: 44.272834951004704


Applying this to all routes

In [91]:
def get_segments():

    segments = set()
    for name in os.listdir("segments"):
        match = re.match("(\w+)_(\d)\.feather", name)
        if match is None:
            print("Skipping: "  +name)
        line, direction = match.groups()
        segments.add((line, int(direction)))
        
    return segments
    
def store_all(kind):
    
    segments = list(get_segments())
    segments = sorted(segments)
    
    paths = list()
    for line, direction in segments:
        paths.append(get_path(line, direction, kind))
        
    for i, path in enumerate(paths):
        if not os.path.exists(path):
            start_i = i - 1
            break
    else:
        start_i = len(paths) - 1
        
    if start_i == -1:
        start_i = 0
    
    print(len(segments), "total segements.")
    print(start_i, "already done. ", end="")
    print(len(segments) - start_i, " to do.")
    
    for i in range(start_i, len(segments)):
        
        line, direction = segments[i]
        path = paths[i]
        
        print("Segment:", i, f"({line}, {direction})")
        
        data = add_features(get_bus_data(line, direction))
        
        if len(data) < 5:
            print("SKIPPING - ", len(data), "data points")
            continue
        
        X, y = prepare_for_training(data)
        
        mae, mape = eval_metrics(X,y)
        print("MAE:", mae, "  MAPE:", mape)
        
        model = train(X,y)
        store(line, direction, model, kind)


In [92]:
def get_stored_size(kind):
    
    count = 0
    total_size = 0
    for line, direction in get_segments():
        path = get_path(line, direction, kind)
        if not os.path.exists(path):
            continue
        total_size += os.path.getsize(path)
        count += 1
        
    return total_size, count


In [93]:
byte_count, count = get_stored_size("pickle")
print(f"Total size: {byte_count/(2**30):.02f}Gb")
print(f"Total count: {count}")
print(f"(Segment count: {len(get_segments())})")

Total size: 0.00Gb
Total count: 0
(Segment count: 2)


In [95]:
store_all("job")

2 total segements.
0 already done. 2  to do.
Segment: 0 (16, 1)
MAE: 34.309222715229154   MAPE: inf
Segment: 1 (46A, 1)
MAE: 33.60453587969265   MAPE: inf


In [21]:
def count_records():
    
    segments = list(get_segments())
    
    error_count = 0
    record_counts = list()
    
    for i, (line, direction) in enumerate(segments):
    
        print(f"Checking segment {i+1}/{len(segments)}")
        print("Line:", line, "  Direction:", direction)
    
        try:
            data = get_bus_data(line, direction)
        except:
            error_count += 1
            continue
            
        count = len(data)
        record_counts.append([
            line, direction, count
        ])
        
        print("Count:", count)
    
    result = pd.DataFrame(
        record_counts, columns=["line", "direction", "count"]
    )
    
    return result, error_count

In [22]:
counts, error_count = count_records()

Checking segment 1/252
Line: 27   Direction: 1
Count: 1889983
Checking segment 2/252
Line: 41A   Direction: 2
Count: 21383
Checking segment 3/252
Line: 38B   Direction: 2
Count: 76262
Checking segment 4/252
Line: 29A   Direction: 1
Count: 574003
Checking segment 5/252
Line: 42   Direction: 1
Count: 629663
Checking segment 6/252
Line: 40E   Direction: 1
Count: 13153
Checking segment 7/252
Line: 31A   Direction: 2
Count: 210397
Checking segment 8/252
Line: 18   Direction: 2
Count: 667371
Checking segment 9/252
Line: 17A   Direction: 1
Count: 846350
Checking segment 10/252
Line: 41D   Direction: 1
Count: 192
Checking segment 11/252
Line: 66B   Direction: 1
Count: 239701
Checking segment 12/252
Line: 66X   Direction: 2
Count: 82388
Checking segment 13/252
Line: 17   Direction: 2
Count: 526751
Checking segment 14/252
Line: 33   Direction: 2
Count: 536045
Checking segment 15/252
Line: 27X   Direction: 2
Count: 19638
Checking segment 16/252
Line: 116   Direction: 1
Count: 4837
Checking segmen

Count: 233551
Checking segment 133/252
Line: 76A   Direction: 1
Count: 21051
Checking segment 134/252
Line: 15   Direction: 2
Count: 1781706
Checking segment 135/252
Line: 25X   Direction: 2
Count: 8008
Checking segment 136/252
Line: 122   Direction: 1
Count: 832950
Checking segment 137/252
Line: 118   Direction: 2
Count: 5957
Checking segment 138/252
Line: 38A   Direction: 1
Count: 327341
Checking segment 139/252
Line: 40D   Direction: 1
Count: 572341
Checking segment 140/252
Line: 33B   Direction: 2
Count: 124181
Checking segment 141/252
Line: 7B   Direction: 2
Count: 64761
Checking segment 142/252
Line: 238   Direction: 1
Count: 94047
Checking segment 143/252
Line: 102   Direction: 1
Count: 439915
Checking segment 144/252
Line: 46A   Direction: 1
Count: 2033310
Checking segment 145/252
Line: 75   Direction: 1
Count: 676884
Checking segment 146/252
Line: 41C   Direction: 2
Count: 723675
Checking segment 147/252
Line: 66A   Direction: 2
Count: 174554
Checking segment 148/252
Line: 70D