In [1]:
import dask.dataframe as dd
import pandas as pd
from datetime import datetime, timedelta, time
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import pickle
import joblib
import os
from tqdm import tqdm

In [2]:
# reading the data of a dask directory and converting into a pandas dataframe
def read_dd_to_pandas(route):
    """Load all CSV parts of one route directory into a single pandas DataFrame.

    Parameters
    ----------
    route : str
        Name of the sub-directory under ``~/data/leavetimes_split_by_route``
        whose ``*.part`` files are read.

    Returns
    -------
    pandas.DataFrame
        The concatenated parts with a fresh 0..n-1 index, ``DAYOFSERVICE``
        parsed as datetime, ``TRIPID`` and ``STOPPOINTID`` as categoricals,
        and the columns listed in ``unused_columns`` removed.
    """
    # Columns no later step reads. Previously some of them were converted
    # (datetime / category) and only then dropped, which was wasted work.
    unused_columns = [
        "Unnamed: 0",
        "LASTUPDATE",
        "VEHICLEID",
        "PLANNEDTIME_DEP",
        "ACTUALTIME_DEP",
        "SUPPRESSED",
        "JUSTIFICATIONID",
    ]

    route_dask_df = dd.read_csv("~/data/leavetimes_split_by_route/{}/*.part".format(route))
    # Drop while the frame is still lazy so the concatenated pandas frame
    # produced by compute() never holds the discarded columns.
    route_dask_df = route_dask_df.drop(unused_columns, axis=1)
    route_pd_df = route_dask_df.compute().reset_index(drop=True)

    # changing the datatypes
    route_pd_df["DAYOFSERVICE"] = pd.to_datetime(route_pd_df["DAYOFSERVICE"])

    # categorical identifiers; PROGRNUMBER is deliberately left numeric
    route_pd_df["TRIPID"] = route_pd_df["TRIPID"].astype("category")
    route_pd_df["STOPPOINTID"] = route_pd_df["STOPPOINTID"].astype("category")

    return route_pd_df

def holiday(df):
    """Return 1 when the row's DAYOFSERVICE counts as a holiday date, else 0.

    Despite the parameter name, ``df`` is a single row (anything indexable by
    ``"DAYOFSERVICE"`` whose value exposes ``.month`` and ``.day``); it is
    applied row-wise with ``DataFrame.apply(..., axis=1)``.

    A date is flagged when it falls anywhere in June, July, August or
    December, or on 17 March, 1 January or 1 May.
    """
    service_date = df["DAYOFSERVICE"]
    month, day = service_date.month, service_date.day

    # whole months that are flagged
    if month in (6, 7, 8, 12):
        return 1

    # individual (month, day) dates that are flagged
    single_days = ((3, 17), (1, 1), (5, 1))
    return 1 if (month, day) in single_days else 0
    
# create mapping function
def seconds_to_time(n):
    """Convert a number of seconds ``n`` into a ``datetime.timedelta``."""
    offset = timedelta(seconds=n)
    return offset

# creating new features
def create_features(route_pd_df):
    """Add the time, delay and calendar columns used downstream for training.

    The frame is modified in place and also returned.

    Parameters
    ----------
    route_pd_df : pandas.DataFrame
        Must contain ``DAYOFSERVICE`` (datetime64) plus ``PLANNEDTIME_ARR``
        and ``ACTUALTIME_ARR``, which are treated as second offsets from
        ``DAYOFSERVICE``.

    Returns
    -------
    pandas.DataFrame
        The same object with these columns added:

        * ``date_and_time`` - ``DAYOFSERVICE`` plus the whole hour of
          ``ACTUALTIME_ARR`` (1:30 maps to 1:00); used as the join key for
          the hourly weather data.
        * ``delay_amount`` - ``ACTUALTIME_ARR - PLANNEDTIME_ARR``.
        * ``planned_arr_hours`` / ``planned_arr_minutes`` - hour and minute
          components of ``PLANNEDTIME_ARR``.
        * ``day_of_week`` - day name of ``DAYOFSERVICE``.
        * ``holiday`` - 1/0 flag, same rule as ``holiday()``.
        * ``datetime_exact`` - ``DAYOFSERVICE`` plus ``PLANNEDTIME_ARR``.

    Notes
    -----
    Everything is computed with vectorised, index-aligned operations, so the
    result no longer depends on the frame having a default 0..n-1 index, and
    missing arrival times yield NaT/NaN instead of raising.
    """
    service_day = route_pd_df["DAYOFSERVICE"]
    planned_secs = route_pd_df["PLANNEDTIME_ARR"]
    actual_secs = route_pd_df["ACTUALTIME_ARR"]

    # Hour-resolution timestamp of the actual arrival. Offsets of 24h or more
    # roll over into the following calendar day.
    route_pd_df["date_and_time"] = service_day + pd.to_timedelta(actual_secs // 3600, unit="h")

    route_pd_df["delay_amount"] = actual_secs - planned_secs

    route_pd_df["planned_arr_hours"] = planned_secs // 3600
    route_pd_df["planned_arr_minutes"] = (planned_secs % 3600) // 60

    route_pd_df["day_of_week"] = service_day.dt.day_name()

    # Vectorised form of the row-wise holiday() rule; keep the two in sync.
    month = service_day.dt.month
    day = service_day.dt.day
    is_holiday = (
        month.isin([6, 7, 8, 12])
        | ((month == 3) & (day == 17))
        | ((month == 1) & (day == 1))
        | ((month == 5) & (day == 1))
    )
    route_pd_df["holiday"] = is_holiday.astype("int64")

    # Exact planned arrival timestamp, aligned on the frame's own index.
    route_pd_df["datetime_exact"] = service_day + pd.to_timedelta(planned_secs, unit="s")

    return route_pd_df

In [3]:
# Load the semicolon-separated trips file lazily with dask, keep only the
# trip id and its direction, and materialise the result as a pandas frame.
trips_df = (
    dd.read_csv("data/rt_trips_DB_2018.txt", sep=";")
    [["TRIPID", "DIRECTION"]]
    .compute()
)

In [8]:
# Distinct-value count for every column of the raw trips file.
# The full lazy frame gets its own name here: rebinding `trips_df` would
# replace the two-column pandas frame that later cells (the CSV export and
# the per-route training loop) rely on when the notebook is run top to bottom.
trips_all_ddf = dd.read_csv("data/rt_trips_DB_2018.txt", sep = ";")
trips_all_ddf.compute().nunique()

DATASOURCE              1
DAYOFSERVICE          360
TRIPID             658964
LINEID                130
ROUTEID               588
DIRECTION               2
PLANNEDTIME_ARR     64461
PLANNEDTIME_DEP       791
ACTUALTIME_ARR      68122
ACTUALTIME_DEP      66771
BASIN                   1
TENDERLOT               0
SUPPRESSED              1
JUSTIFICATIONID      3526
LASTUPDATE            360
NOTE                46690
dtype: int64

In [6]:
# Number of distinct values in each column of trips_df.
trips_df.nunique()

TRIPID       658964
DIRECTION         2
dtype: int64

In [4]:
# Shape of trips_df after removing exact duplicate rows.
# The de-duplicated frame is only displayed here; trips_df itself is unchanged.
trips_df.drop_duplicates().shape

(658964, 2)

In [7]:
# Save trips_df to a CSV extract, leaving out the index column.
trips_df.to_csv("data/trips_extract.csv", index = False)

In [4]:
# Restore the two previously fitted encoders from disk.
# joblib.load unpickles its input, so these files must come from a trusted source.
weather_encoder = joblib.load("models/weather_encoder.sav")          # encodes weather_main
loaded_encoder = joblib.load("models/days_of_week_one_hot_encoder.sav")  # encodes day_of_week

In [5]:
# Hourly weather observations.
weather_df = pd.read_csv("~/data/openweatherapi_2018_data_with_columns_removed2021-06-26 01:55:47.555341.csv")

# Weather descriptors become categoricals in a single astype call.
weather_df = weather_df.astype({"weather_main": "category", "weather_icon": "category"})

# Parse the timestamp column and strip any timezone so it is tz-naive.
weather_df["dt"] = pd.to_datetime(weather_df["dt"]).dt.tz_localize(None)

# Keep only the columns used later, to limit what stays in memory.
kept_weather_columns = ["dt", "humidity", "rain_1h", "weather_main"]
weather_df = weather_df[kept_weather_columns]

In [6]:
# Train one random-forest delay model per route directory and pickle it to
# models/rf_<route>.sav. Routes that exhaust memory are reported, not retried.
directory = 'data/leavetimes_split_by_route'
# NOTE(review): read_dd_to_pandas reads from "~/data/leavetimes_split_by_route",
# while this listing is relative to the working directory - confirm both point
# at the same place.

# Fixed seed so that re-running the notebook reproduces the same forests.
RANDOM_SEED = 42

# TRIPID -> DIRECTION lookup with repeated rows removed. If a TRIPID occurs
# more than once in trips_df, merging the raw frame multiplies every matching
# stop row and inflates memory until the later drop_duplicates collapses them.
trip_directions = trips_df.drop_duplicates()

for route in tqdm(os.listdir(directory)):
    try:
        # reading the dataframe and adding the engineered features
        route_pd_df = read_dd_to_pandas(route)
        route_pd_df = create_features(route_pd_df)

        # get the directions (joins on the columns the two frames share)
        route_pd_df = pd.merge(route_pd_df, trip_directions)

        # attach the hourly weather observation matching each arrival hour
        total_df = pd.merge(route_pd_df, weather_df, left_on = "date_and_time", right_on = "dt")
        # a trip should have at most one row per exact planned timestamp
        total_df = total_df.drop_duplicates(["TRIPID", "datetime_exact"])

        # removing outliers: keep rows whose delay is below 1800
        # (only the upper tail is trimmed; large negative delays are kept)
        weather_train_no_out = total_df[total_df["delay_amount"] < 1800]

        # setting up the training data for the models
        weather_train_target = weather_train_no_out["delay_amount"]
        training = weather_train_no_out[["PROGRNUMBER", "planned_arr_hours", "planned_arr_minutes", "holiday",  "DIRECTION", "humidity", "rain_1h"]]

        # Weekday names are one-hot encoded rather than numbered 1-7, because
        # a numeric code would imply an ordering between days.
        train_days = loaded_encoder.transform(weather_train_no_out["day_of_week"].values.reshape(-1,1))
        # encode the weather types
        weather_train = weather_encoder.transform(np.array(list(weather_train_no_out["weather_main"])).reshape(-1,1))

        # design matrix: numeric features, then weekday and weather encodings
        design_matrix = np.concatenate((training.values, train_days.toarray(), weather_train.toarray()), axis = 1)

        # training the model
        rf_weather = RandomForestRegressor(
            n_estimators = 100,
            max_depth = 5,
            n_jobs = -1,
            oob_score = True,
            random_state = RANDOM_SEED,
        )
        rf_weather.fit(design_matrix, weather_train_target)

        # The context manager closes the file even if pickling fails part-way.
        with open('models/rf_{}.sav'.format(route), 'wb') as model_file:
            pickle.dump(rf_weather, model_file)

    except MemoryError:
        # Resources on the server are shared, so report which routes ran out
        # of memory and need to be trained locally instead.
        print(route, "had an error with memory. train locally")

    

 11%|█         | 14/130 [42:32<7:38:04, 236.94s/it]

route39A had an error with memory. train locally


 21%|██        | 27/130 [1:11:31<6:20:43, 221.78s/it]

route145 had an error with memory. train locally


100%|██████████| 130/130 [4:54:58<00:00, 136.14s/it]  
