In [48]:
!pip install river
#order goes:
#imputing -> scaling -> encoding -> ARF instantiation -> incremental training with ADWIN




**Installing packages**

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import river
from river import preprocessing, stats, compose, metrics
from river import drift, ensemble, forest
from river.drift import ADWIN
from river.compose import Pipeline

**Extracting data**

In [129]:
dataset = pd.read_excel('ttc-bus-delay-data-2023.xlsx')
X = dataset.iloc[:, :-1] #or whatever column we want the INDEPENDENT variables (Weather, route number, time of day, etc.)
y = dataset.iloc[:, -1].values #or whatever column we want the DEPENDENT variables (I'm guessing the delay time)


print(X)



         Date  Route   Time     Day                Location  \
0  2023-01-01   91.0  02:30  Sunday   WOODBINE AND MORTIMER   
1  2023-01-01   69.0  02:34  Sunday          WARDEN STATION   
2  2023-01-01   35.0  03:06  Sunday            JANE STATION   
3  2023-01-01    NaN  03:14  Sunday         KIPLING STATION   
4  2023-01-01    NaN  03:43  Sunday          MEADOWALE LOOP   
..        ...    ...    ...     ...                     ...   
89 2023-01-01   17.0  21:20  Sunday  WARDEN LOOP AT STEELES   
90 2023-01-01   60.0  21:39  Sunday       STEELES AND KEELE   
91 2023-01-01   60.0  21:39  Sunday       HOWARD MOSCOE WAY   
92 2023-01-01   25.0  21:42  Sunday       PAPE AND MORTIMER   
93 2023-01-01   14.0  22:48  Sunday      CHAPLIN AND ORIOLE   

                 Incident  Min Delay  Min Gap Direction  
0               Diversion         81      111         S  
1                Security         22       44         S  
2   Cleaning - Unsanitary         30       60         N  
3          

**Imputing (Filling out missing values)**

In [130]:
#NOT WORKING

imputer = preprocessing.StatImputer(('Direction', stats.Mode()))

for index, row in X.iterrows():

    row_dict = row.to_dict()

    imputer.learn_one(row_dict)

    transformed_row = imputer.transform_one(row_dict)
    X.at[index, "Direction"] = transformed_row.get("Direction", row["Direction"])

print(X)

         Date  Route   Time     Day                Location  \
0  2023-01-01   91.0  02:30  Sunday   WOODBINE AND MORTIMER   
1  2023-01-01   69.0  02:34  Sunday          WARDEN STATION   
2  2023-01-01   35.0  03:06  Sunday            JANE STATION   
3  2023-01-01    NaN  03:14  Sunday         KIPLING STATION   
4  2023-01-01    NaN  03:43  Sunday          MEADOWALE LOOP   
..        ...    ...    ...     ...                     ...   
89 2023-01-01   17.0  21:20  Sunday  WARDEN LOOP AT STEELES   
90 2023-01-01   60.0  21:39  Sunday       STEELES AND KEELE   
91 2023-01-01   60.0  21:39  Sunday       HOWARD MOSCOE WAY   
92 2023-01-01   25.0  21:42  Sunday       PAPE AND MORTIMER   
93 2023-01-01   14.0  22:48  Sunday      CHAPLIN AND ORIOLE   

                 Incident  Min Delay  Min Gap Direction  
0               Diversion         81      111         S  
1                Security         22       44         S  
2   Cleaning - Unsanitary         30       60         N  
3          

In [128]:
print(dataset.iloc[0:10, 8])

0      S
1      S
2      N
3      S
4      S
5    NaN
6    NaN
7      E
8      W
9    NaN
Name: Direction, dtype: object


**Scaling (to values with mean = 0 and variance +- 1)**

In [70]:
scaler = preprocessing.StandardScaler()

for index, row in X.iterrows():
    row_dict = row.to_dict()

    scaler.learn_one({"Min Delay": row["Min Delay"]})

    scaled_value = scaler.transform_one({"Min Delay": row["Min Delay"]})

    X.at[index, "Min Delay"] = scaled_value["Min Delay"]

print(X)


         Date  Route   Time     Day                Location  \
0  2023-01-01   91.0  02:30  Sunday   WOODBINE AND MORTIMER   
1  2023-01-01   69.0  02:34  Sunday          WARDEN STATION   
2  2023-01-01   35.0  03:06  Sunday            JANE STATION   
3  2023-01-01    NaN  03:14  Sunday         KIPLING STATION   
4  2023-01-01    NaN  03:43  Sunday          MEADOWALE LOOP   
..        ...    ...    ...     ...                     ...   
89 2023-01-01   17.0  21:20  Sunday  WARDEN LOOP AT STEELES   
90 2023-01-01   60.0  21:39  Sunday       STEELES AND KEELE   
91 2023-01-01   60.0  21:39  Sunday       HOWARD MOSCOE WAY   
92 2023-01-01   25.0  21:42  Sunday       PAPE AND MORTIMER   
93 2023-01-01   14.0  22:48  Sunday      CHAPLIN AND ORIOLE   

                 Incident  Min Delay  Min Gap Direction  
0               Diversion   0.000000      111         S  
1                Security  -1.000000       44         S  
2   Cleaning - Unsanitary  -0.548494       60         N  
3          

  X.at[index, "Min Delay"] = scaled_value["Min Delay"]


**Encoding (Giving value = 1 to group categorical data)**

In [71]:
encoder = preprocessing.OneHotEncoder()

for index, row in X.iterrows():

    row_dict = row.to_dict()

    encoder.learn_one({"Incident": row["Incident"]})

    encoded_row = encoder.transform_one({"Incident": row["Incident"]})

    for key, value in encoded_row.items():
        if key not in X.columns:
            X[key] = 0

        X.at[index, key] = value

print(X)

         Date  Route   Time     Day                Location  \
0  2023-01-01   91.0  02:30  Sunday   WOODBINE AND MORTIMER   
1  2023-01-01   69.0  02:34  Sunday          WARDEN STATION   
2  2023-01-01   35.0  03:06  Sunday            JANE STATION   
3  2023-01-01    NaN  03:14  Sunday         KIPLING STATION   
4  2023-01-01    NaN  03:43  Sunday          MEADOWALE LOOP   
..        ...    ...    ...     ...                     ...   
89 2023-01-01   17.0  21:20  Sunday  WARDEN LOOP AT STEELES   
90 2023-01-01   60.0  21:39  Sunday       STEELES AND KEELE   
91 2023-01-01   60.0  21:39  Sunday       HOWARD MOSCOE WAY   
92 2023-01-01   25.0  21:42  Sunday       PAPE AND MORTIMER   
93 2023-01-01   14.0  22:48  Sunday      CHAPLIN AND ORIOLE   

                 Incident  Min Delay  Min Gap Direction  Incident_Diversion  \
0               Diversion   0.000000      111         S                   1   
1                Security  -1.000000       44         S                   0   
2   Cl

**ARF with ADWIN**

In [84]:
print(X.dtypes)

Date                              datetime64[ns]
Route                                    float64
Time                                      object
Day                                       object
Location                                  object
Incident                                  object
Min Delay                                float64
Min Gap                                    int64
Direction                                 object
Incident_Diversion                         int64
Incident_Security                          int64
Incident_Cleaning - Unsanitary             int64
Incident_Emergency Services                int64
Incident_Collision - TTC                   int64
Incident_Mechanical                        int64
Incident_Operations - Operator             int64
Incident_Investigation                     int64
dtype: object


In [89]:
arf = forest.ARFClassifier()
adwin = drift.ADWIN()
metric = metrics.Accuracy()
#whats next??