In [1]:
import numpy as np
import pandas as pd
import datetime
import time

In [2]:
df = pd.read_csv("additional_avl_data_all_in_one_file.csv", sep=";")
print(df.columns, df.dtypes)

df = df[df.scheduled_arrival_time != "Okänd"]
df = df[df.observed_arrival_time != "Okänd"]

Index(['LineNumber', 'direction', 'DateName', 'DayNameOfWeek',
       'planed_departure', 'StopPointName', 'BKVehicleID',
       'scheduled_arrival_time', 'observed_arrival_time',
       'scheduled_departure_time', 'observed_departure_time', 'DwellTime',
       'StopTime', 'Boardings', 'Alightings', 'CurrentLoad', 'CoveredDistance',
       'RunTime'],
      dtype='object') LineNumber                    int64
direction                    object
DateName                     object
DayNameOfWeek                object
planed_departure             object
StopPointName                object
BKVehicleID                   int64
scheduled_arrival_time       object
observed_arrival_time        object
scheduled_departure_time     object
observed_departure_time      object
DwellTime                     int64
StopTime                      int64
Boardings                   float64
Alightings                  float64
CurrentLoad                 float64
CoveredDistance               int64
RunTime     

In [3]:
def timestamp_to_second(timestr):
    x = time.strptime(timestr, '%H:%M:%S')
    a = datetime.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()
    return a

In [4]:
route_map = {}
routes = df['direction'].unique()

for route in routes:
    start_point = route.split("-")[0].strip()
    start_point = " ".join(elem for elem in start_point.split(" ")[1:])
    route_map[route] = {'start':  start_point, start_point: 0, 'num_stops': 0}

In [5]:
counter = []

for stop, route in zip(*(df['StopPointName'], df['direction'])):
    
    if stop not in route_map[route]:
        route_map[route]['num_stops'] += 1
        route_map[route][stop] = route_map[route]['num_stops']
        
    counter.append(route_map[route][stop])

In [9]:
df['counter'] = counter

In [7]:
df['scheduled_departure_time'] = df['scheduled_departure_time'].apply(timestamp_to_second)
df['observed_departure_time'] = df['observed_departure_time'].apply(timestamp_to_second)
df['scheduled_arrival_time'] = df['scheduled_arrival_time'].apply(timestamp_to_second)
df['observed_arrival_time'] = df['observed_arrival_time'].apply(timestamp_to_second)

In [10]:
df.head()

Unnamed: 0,LineNumber,direction,DateName,DayNameOfWeek,planed_departure,StopPointName,BKVehicleID,scheduled_arrival_time,observed_arrival_time,scheduled_departure_time,observed_departure_time,DwellTime,StopTime,Boardings,Alightings,CurrentLoad,CoveredDistance,RunTime,counter
0,1,1 Frihamnen - Essingetorget,14-Mar-2016,Måndag,00:05:00,Frihamnen,104028,300.0,333.0,300.0,333.0,0,0,,,,0,0,0
1,1,1 Frihamnen - Essingetorget,14-Mar-2016,Måndag,00:05:00,Frihamnsporten,104028,356.0,398.0,356.0,398.0,0,0,,,,363,65,1
2,1,1 Frihamnen - Essingetorget,14-Mar-2016,Måndag,00:05:00,Sehlstedtsgatan,104028,382.0,425.0,382.0,425.0,0,0,,,,169,27,2
3,1,1 Frihamnen - Essingetorget,14-Mar-2016,Måndag,00:05:00,Östhammarsgatan,104028,417.0,465.0,417.0,465.0,0,0,,,,222,40,3
4,1,1 Frihamnen - Essingetorget,14-Mar-2016,Måndag,00:05:00,Rökubbsgatan,104028,465.0,516.0,465.0,526.0,0,10,,,,303,51,4


In [15]:
df['CurrentLoad'] = df['CurrentLoad'].fillna(df['CurrentLoad'].median())

In [16]:
df_train = pd.DataFrame(columns=df.columns)
df_test = pd.DataFrame(columns=df.columns)

In [17]:
for line_num in df.LineNumber.unique():
    df_subset = df[df.LineNumber == line_num]
    
    df_subset_tr = df[(df.scheduled_departure_time >= 28800) & (df.scheduled_departure_time <= 36000)]
    df_subset_test = df[(df.scheduled_departure_time >= 57600) & (df.scheduled_departure_time <= 61200)]
    
    train_samples = df_subset_tr.sample(n=4, axis=0)
    test_samples = df_subset_tr.sample(n=2, axis=0)
    df_train = df_train.append(train_samples)
    df_test = df_test.append(test_samples)

In [18]:
df_train.head()

Unnamed: 0,LineNumber,direction,DateName,DayNameOfWeek,planed_departure,StopPointName,BKVehicleID,scheduled_arrival_time,observed_arrival_time,scheduled_departure_time,observed_departure_time,DwellTime,StopTime,Boardings,Alightings,CurrentLoad,CoveredDistance,RunTime,counter
2641381,76,2 Ljusterögatan - Ropsten,14-Apr-2016,Torsdag,08:57:00,Djurgårdsbron,107530,33511.0,33747.0,33511.0,33768.0,0,21,,,14.0,409,45,12
2041080,55,1 Motalavägen - Tanto,07-Apr-2016,Torsdag,08:24:00,Ropsten,107471,30565.0,30531.0,30565.0,30555.0,0,24,4.0,1.0,14.0,693,105,5
2083247,55,2 Tanto - Motalavägen,16-Mar-2016,Onsdag,08:04:00,Timmermansgränd,107485,29341.0,29349.0,29341.0,29369.0,0,20,,,14.0,286,48,4
2425062,67,2 Skansen - Frösundavik,17-Mar-2016,Torsdag,08:37:00,Solna stadshus,104055,33060.0,33420.0,33060.0,33439.0,0,19,0.0,2.0,14.0,283,43,22
2556295,76,1 Ropsten - Ljusterögatan,13-Apr-2016,Onsdag,09:12:00,Berwaldhallen,105393,33959.0,33897.0,33959.0,33897.0,0,0,,,14.0,267,37,12


In [19]:
delay_train = (df_train['observed_arrival_time'] - df_train['scheduled_arrival_time']).to_numpy()
load_train = df_train['CurrentLoad'].to_numpy()
stops_train = df_train['counter'].to_numpy()

delay_test = (df_test['observed_arrival_time'] - df_test['scheduled_arrival_time']).to_numpy()
load_test = df_test['CurrentLoad'].to_numpy()
stops_test = df_test['counter'].to_numpy()

print(delay_train.shape, load_train.shape, stops_train.shape)
print(delay_test.shape, load_test.shape, stops_test.shape)

(44,) (44,) (44,)
(22,) (22,) (22,)


In [20]:
X_train = np.concatenate([delay_train.reshape(-1, 1), load_train.reshape(-1, 1), stops_train.reshape(-1, 1)], axis=-1)
X_train = np.concatenate([np.ones((len(X_train), 1)), X_train], axis=-1).astype(float)

y_train = df_train['CoveredDistance'].to_numpy().reshape(-1, 1)
X_train.shape, y_train.shape

((44, 4), (44, 1))

In [21]:
X_test = np.concatenate([delay_test.reshape(-1, 1), load_test.reshape(-1, 1), stops_test.reshape(-1, 1)], axis=-1)
X_test = np.concatenate([np.ones((len(X_test), 1)), X_test], axis=-1).astype(float)

y_test = df_test['CoveredDistance'].to_numpy().reshape(-1, 1)
X_test.shape, y_test.shape

((22, 4), (22, 1))

In [22]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_t = scaler.fit_transform(X_train)
X_test_t = scaler.transform(X_test)

In [23]:
W_opt = np.linalg.pinv(X_train).dot(y_train)
W_opt_minmax = np.linalg.pinv(X_train_t).dot(y_train)

print(W_opt.shape, W_opt_minmax.shape)

(4, 1) (4, 1)


In [24]:
y_pred = X_test.dot(W_opt)
y_pred_minmax = X_test_t.dot(W_opt_minmax)

In [25]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
mse_minmax = mean_squared_error(y_pred=y_pred_minmax, y_true=y_test)

print(mse, mse_minmax)

29957.398866694126 51049.2156496279
