In [1]:
import pandas as pd
# !pip install river

In [2]:
dataset = pd.read_csv("hospital_wait.csv")

In [3]:
dataset.head()
print(type(dataset))

<class 'pandas.core.frame.DataFrame'>


In [4]:
params = {'converters': {'value': float},'parse_dates': {'time': "%Y-%m-%d %H:%M:%S"}}

dataset = dict()
from river import stream


In [5]:
from river import compose
from river import linear_model
from river import metrics
from river import evaluate
from river import preprocessing
from river import feature_extraction
from river import stats
from river import optim
from river import facto
from river import model_selection
from river import time_series
from river import tree
from river import dummy

In [19]:
import math
import datetime
import numpy as np

hour_list = [str(i) for i in range (0,24)]
minute_list = [f'{str(i)} min' for i in range (0, 60)]
def get_hour(x):
    x['h'] = x['time'].hour
    return x


def get_minute(x):
    x['m'] = x['time'].minute//15 + 1
    return x

def get_day(x):
    
    return {'d' : x['time'].day}

def get_hour_sin_and_cos(x):
    return {'sin_h' : np.sin(np.pi*(x['time'].hour)/12), 'cos_h': np.cos(np.pi*(x['time'].hour)/12)}


def get_minute_distances(x):
    x['sin_m'] = np.sin(np.pi*(x['time'].minute)/30)
    x['cos_m'] = np.cos(np.pi*(x['time'].minute)/30)
    return {'sin_m' : np.sin(np.pi*(x['time'].minute)/30), 'cos_m': np.cos(np.pi*(x['time'].minute)/30)}

def get_date_progress(x):
    return {'date': x['time'].toordinal() - datetime.datetime(2022, 1, 1, 0, 0).toordinal()}

i = 0
temp = [323,323,323,323]
cache = [temp]
my_dict = {}
for x, y in stream.iter_csv('hospital_wait.csv', target = 'value', **params):
    if i < 4:
        t = temp.copy()
        t[i] = y
        cache.append(t)      
        temp = t
        my_dict[x['time']] = t
    else:
        t = temp.copy()
        t.pop(0)
        t.append(y)
        cache.append(t)
        temp = t
        my_dict[x['time']] = t
    i += 1



def get_lag(x):
    lag_values = my_dict[x['time']]
    return {'lag_1': lag_values[0], 'lag_2': lag_values[1], 'lag_3': lag_values[2], 'lag_4': lag_values[3]}
    

In [20]:
def make_linear(pipeline, optimizer=None):
    model = pipeline
    if optimizer:
        model |= linear_model.LinearRegression(intercept_lr=.1, optimizer = optimizer)
    else:
        model |= linear_model.LinearRegression(intercept_lr=.1)
    return model

In [21]:

models = [linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)) for lr in [0.05, 0.02, 0.01, 0.005, 0.002, 0.0001]]


model = compose.Pipeline(
    ('features', compose.TransformerUnion(
        ('date_progress', compose.FuncTransformer(get_date_progress)),
        ('lags', compose.FuncTransformer(get_lag))
    )))

model += (
    get_hour | 
        feature_extraction.TargetAgg(
            by=['h'], how=stats.Mean()


))
model += (
    get_minute | 
        feature_extraction.TargetAgg(
            by=['m'], how=stats.Mean()


))

model |=  preprocessing.StandardScaler()
# model |= linear_model.LinearRegression(intercept_lr=.1)


# model |=  model_selection.EpsilonGreedyRegressor(models, epsilon=0.025, decay=0.1, burn_in=100, seed=1)
# model |= tree.HoeffdingAdaptiveTreeRegressor(grace_period=100, leaf_prediction='adaptive', model_selector_decay=0.9, seed=0)
# model = preprocessing.TargetStandardScaler(regressor=model)



In [11]:
metric = metrics.MAE() + metrics.R2()
evaluate.progressive_val_score(stream.iter_csv('hospital_wait.csv', target = 'value', **params), make_linear(model), metric, print_every=50)
# evaluate.progressive_val_score(stream.iter_csv('hospital_wait.csv', target = 'value', **params), dummy.StatisticRegressor(stats.Shift(1)), metric, print_every=50)
model.transform_one(x)

[50] MAE: 45.879082, R2: -0.004024
[100] MAE: 36.566037, R2: 0.694548
[150] MAE: 27.610535, R2: 0.818795
[200] MAE: 23.715372, R2: 0.838152
[250] MAE: 20.122645, R2: 0.860365
[300] MAE: 18.389003, R2: 0.88454
[350] MAE: 16.422354, R2: 0.912944
[400] MAE: 15.160021, R2: 0.92184
[450] MAE: 14.434028, R2: 0.92707
[500] MAE: 13.822398, R2: 0.932736
[550] MAE: 12.841361, R2: 0.938584
[600] MAE: 12.427435, R2: 0.941154
[650] MAE: 11.665425, R2: 0.946559
[700] MAE: 10.994886, R2: 0.948179
[750] MAE: 10.445997, R2: 0.951443
[800] MAE: 9.965217, R2: 0.952663
[850] MAE: 9.590534, R2: 0.956341
[900] MAE: 9.162267, R2: 0.958786
[950] MAE: 8.888767, R2: 0.961491
[1,000] MAE: 8.593528, R2: 0.962387


{'y_mean_by_m': -0.9807493324966388,
 'y_mean_by_h': -1.2949853252408325,
 'lag_1': -0.7824379451711704,
 'lag_2': -0.7807918309726335,
 'lag_3': -0.7347856615567347,
 'lag_4': -0.7332484358217568,
 'date': 1.8460647891520903}

In [15]:
evaluate.progressive_val_score(stream.iter_csv('hospital_wait.csv', target = 'value', **params), make_linear(model, optim.AMSGrad()), metric, print_every=50)

[50] MAE: 10.900357, R2: 0.937807
[100] MAE: 13.020433, R2: 0.92481
[150] MAE: 13.709595, R2: 0.922444
[200] MAE: 15.077885, R2: 0.917045
[250] MAE: 15.38864, R2: 0.915097
[300] MAE: 16.453746, R2: 0.908695
[350] MAE: 16.504566, R2: 0.90819
[400] MAE: 17.055613, R2: 0.905723
[450] MAE: 17.632432, R2: 0.90363
[500] MAE: 18.298425, R2: 0.900693
[550] MAE: 18.132511, R2: 0.901909
[600] MAE: 18.701719, R2: 0.896317
[650] MAE: 18.384302, R2: 0.8991
[700] MAE: 18.447953, R2: 0.898513
[750] MAE: 18.56945, R2: 0.898439
[800] MAE: 18.60872, R2: 0.898174
[850] MAE: 18.8057, R2: 0.898835
[900] MAE: 18.830628, R2: 0.899834
[950] MAE: 18.973025, R2: 0.899999
[1,000] MAE: 19.067957, R2: 0.899113


MAE: 19.067957, R2: 0.899113

In [16]:
evaluate.progressive_val_score(stream.iter_csv('hospital_wait.csv', target = 'value', **params), make_linear(model, optim.AdaMax()), metric, print_every=50)

[50] MAE: 20.003544, R2: 0.887407
[100] MAE: 20.959441, R2: 0.883191
[150] MAE: 21.199265, R2: 0.882055
[200] MAE: 21.870961, R2: 0.87963
[250] MAE: 21.957461, R2: 0.878298
[300] MAE: 22.54175, R2: 0.874101
[350] MAE: 22.521101, R2: 0.873612
[400] MAE: 22.859125, R2: 0.871535
[450] MAE: 23.197518, R2: 0.870005
[500] MAE: 23.63944, R2: 0.867683
[550] MAE: 23.491418, R2: 0.868462
[600] MAE: 23.871411, R2: 0.863874
[650] MAE: 23.610801, R2: 0.866027
[700] MAE: 23.669135, R2: 0.865163
[750] MAE: 23.800724, R2: 0.864285
[800] MAE: 23.843014, R2: 0.863761
[850] MAE: 24.019836, R2: 0.86341
[900] MAE: 24.106358, R2: 0.863575
[950] MAE: 24.267559, R2: 0.862516
[1,000] MAE: 24.402022, R2: 0.861135


MAE: 24.402022, R2: 0.861135

In [22]:
evaluate.progressive_val_score(stream.iter_csv('hospital_wait.csv', target = 'value', **params), make_linear(model, optim.SGD(0.1)), metric, print_every=50)
optim.SGD(0.1)

[50] MAE: 8,559,925.680405, R2: -2,280,640,855,808.125488
[100] MAE: 5,168,516,862.146273, R2: -365,726,910,050,027,264.
[150] MAE: 11,684,481,521.879799, R2: -765,857,359,835,622,656.
[200] MAE: 13,021,669,960.02702, R2: -765,732,661,103,516,800.
[250] MAE: 15,932,287,423.780483, R2: -866,877,416,142,424,448.
[300] MAE: 20,223,738,313.502239, R2: -1,051,351,845,618,932,736.
[350] MAE: 33,586,929,534.451107, R2: -1,782,200,783,362,059,776.
[400] MAE: 36,099,868,764.819435, R2: -1,862,249,576,205,182,976.
[450] MAE: 37,873,016,190.13961, R2: -1,920,548,239,264,868,608.
[500] MAE: 40,190,785,601.061897, R2: -2,040,322,328,215,726,592.
[550] MAE: 46,031,425,078.601448, R2: -2,238,722,521,546,386,432.
[600] MAE: 48,681,830,150.416641, R2: -2,308,128,699,321,106,432.
[650] MAE: 58,569,073,699.093475, R2: -2,769,104,415,200,223,232.
[700] MAE: 57,817,969,585.02079, R2: -2,748,390,399,962,011,648.
[750] MAE: 64,007,297,062.187157, R2: -3,005,706,265,930,449,920.
[800] MAE: 63,483,562,497.8910

SGD({'lr': Constant({'learning_rate': 0.1}), 'n_iterations': 0})

In [23]:
evaluate.progressive_val_score(stream.iter_csv('hospital_wait.csv', target = 'value', **params), make_linear(model, optim.Momentum()), metric, print_every=50)

[50] MAE: 59,203,749,741.711433, R2: -2,871,156,824,588,147,200.
[100] MAE: 58,640,530,785.481369, R2: -2,767,489,139,922,445,824.
[150] MAE: 58,315,060,991.175331, R2: -2,750,831,297,366,760,960.
[200] MAE: 59,972,305,563.013306, R2: -2,846,261,279,746,800,640.
[250] MAE: 70,866,516,239.097549, R2: -4,590,995,181,177,077,760.
[300] MAE: 109,302,798,983.656387, R2: -22,634,570,180,669,104,128.
[350] MAE: 124,632,512,588.713425, R2: -25,848,916,491,619,618,816.
[400] MAE: 194,058,988,516.042633, R2: -69,389,790,511,187,697,664.
[450] MAE: 251,933,165,095.091949, R2: -113,759,927,129,072,418,816.
[500] MAE: 286,657,576,219.223633, R2: -140,762,825,526,725,443,584.
[550] MAE: 303,198,403,227.450989, R2: -144,403,048,360,930,394,112.
[600] MAE: 332,391,567,458.433899, R2: -153,503,194,495,734,022,144.
[650] MAE: 349,128,567,802.829834, R2: -157,011,154,800,823,369,728.
[700] MAE: 362,224,191,082.246094, R2: -160,531,415,141,834,358,784.
[750] MAE: 387,424,149,155.195007, R2: -168,164,569,9

MAE: 1,074,015,630,172.842285, R2: -1,709,720,221,086,712,856,576.

In [None]:
from river import metrics
import matplotlib.pyplot as plt
import collections


queue = collections.deque([], 4)

def evaluate_model(model): 

    metric = metrics.Rolling(metrics.MAE(), 10)
    metric_b = metrics.Rolling(metrics.MAE(), 10)
    
    dates = []
    y_trues = []
    y_preds = []
    
    baseline = 0
    y_baseline = []
    for x, y in stream.iter_csv('hospital_wait.csv', target = 'value', **params):
        
        new_feats = {f"lag_{i}": v for i, v in enumerate(queue)}

        # copy of x
        x_ = dict(x)
        x_.update(new_feats)

        y_pred = model.predict_one(x_)
        model.learn_one(x_, y)

        queue.append(y)

        # Obtain the prior prediction and update the model in one go
        y_pred = model.predict_one(x)
        model.learn_one(x, y)

        # Update the error metric
        metric.update(y, y_pred)
        metric_b.update(y, baseline)
        
        # Store the true value and the prediction
        dates.append(x['time'])
        y_trues.append(y)
        y_preds.append(y_pred)
        y_baseline.append(baseline)
        baseline = y
        
    print(metric, metric_b)

    # Plot the results
    fig, ax = plt.subplots(figsize=(20, 6))
    ax.grid(alpha=0.75)
    ax.plot(dates, y_trues, lw=3, color='#2ecc71', alpha=800, label='Ground truth')
    ax.plot(dates, y_preds, lw=3, color='#e74c3c', alpha=800, label='Prediction')
    ax.plot(dates, y_baseline, lw=3, color='#e74c3c', alpha=800, label='Baseline')
    ax.legend()
    ax.set_title(metric)
evaluate_model(model)


In [None]:
def make_model(alpha):
    models = [linear_model.LinearRegression(optimizer=optim.SGD(lr=lr), loss=optim.losses.Quantile(alpha=alpha)) for lr in [0.05, 0.02, 0.01, 0.005, 0.002, 0.0001]]


    model = compose.Pipeline(
        ('features', compose.TransformerUnion(
            ('date_progress', compose.FuncTransformer(get_date_progress)),
            ('lags', compose.FuncTransformer(get_lag))
        )))

    model += (
        get_hour | 
            feature_extraction.TargetAgg(
                by=['h'], how=stats.Mean()


    ))
    # model += (
    #     get_minute | 
    #         feature_extraction.TargetAgg(
    #             by=['m'], how=stats.Mean()


    # ))

    model |=  preprocessing.StandardScaler()
    model |= preprocessing.TargetStandardScaler( 
        model_selection.UCBRegressor(
            models,
            delta=0.01, burn_in=100, seed=1
        )
    )
    return model


In [None]:
models = {
    'lower': make_model(alpha=0.05),
    'center': make_model(alpha=0.5),
    'upper': make_model(alpha=0.95)
}

dates = []
y_trues = []
y_preds = {
    'lower': [],
    'center': [],
    'upper': []
}

for x, y in stream.iter_csv('hospital_wait.csv', target = 'value', **params):
    y_trues.append(y)
    dates.append(x['time'])

    for name, model in models.items():
        y_preds[name].append(model.predict_one(x))
        model.learn_one(x, y)

    # Update the error metric
    metric.update(y, y_preds['center'][-1])

# Plot the results
fig, ax = plt.subplots(figsize=(10, 6))
ax.grid(alpha=0.75)
ax.plot(dates, y_trues, lw=3, color='#2ecc71', alpha=0.8, label='Truth')
ax.plot(dates, y_preds['center'], lw=3, color='#e74c3c', alpha=0.8, label='Prediction')
ax.fill_between(dates, y_preds['lower'], y_preds['upper'], color='#e74c3c', alpha=0.3, label='Prediction interval')
ax.legend()
ax.set_title(metric);