In [104]:
import data_loader
import eval_framework
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing
import sklearn.ensemble
import datetime

In [3]:
demands_train, demands_test, weather_train, weather_test = data_loader.load_splitted_data(split_strategy="final_weeks", split_size_w=1, week_selection=0, start_first_monday=False)

### Previous week baseline

In [25]:
prev_week = demands_train.loc[list(map(lambda idx: idx - datetime.timedelta(weeks=1), demands_test.index))]

In [92]:
baseline = eval_framework.performance_indicators(demands_test.fillna(0), prev_week.fillna(0))
baseline

Unnamed: 0_level_0,PI1,PI2,PI3
name_long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMA_A,1.652083,5.53,1.360035
DMA_B,1.131563,3.8625,0.891337
DMA_C,1.201354,3.5125,0.898924
DMA_D,3.958854,9.7075,2.916059
DMA_E,2.222708,6.635,2.036181
DMA_F,1.043333,2.965,1.082014
DMA_G,1.734375,5.515,2.049913
DMA_H,21.37875,27.705,12.495035
DMA_I,1.864583,9.605,1.144358
DMA_J,2.383333,10.625,1.29474


### Hour-Wise-LinReg

In [112]:
def predict(demands_train, weather_train, weather_test, method='direct', normalize=True):

    assert method in ['direct', 'delta'], 'Unknown Method'

    # Create hour_day indices
    idcs_hour_day = dict()
    for day_of_week in range(7):
        for hour_of_day in range(24):
            idcs_hour_day[(day_of_week, hour_of_day)] = list(filter(lambda timestamp: timestamp.day_of_week == day_of_week and timestamp.hour == hour_of_day, demands_train.index))

    # Create hour_day indices
    idcs_hour_day_test = dict()
    for day_of_week in range(7):
        for hour_of_day in range(24):
            idcs_hour_day_test[(day_of_week, hour_of_day)] = list(filter(lambda timestamp: timestamp.day_of_week == day_of_week and timestamp.hour == hour_of_day, weather_test.index))

    # Create DMA idcs 
    dmas = demands_train.columns

    # Initialize predictions
    preds = pd.DataFrame(index=weather_test.index, columns=dmas)

    # Loop over all combinations and predict
    for dma in dmas:
        print(f'DMA: {dma}...')
        for i, idx_hour_day in enumerate(idcs_hour_day.keys()):
            # Select data
            cur_demands = demands_train.loc[idcs_hour_day[idx_hour_day]][dma].to_numpy()
            cur_weather = weather_train.loc[idcs_hour_day[idx_hour_day]].to_numpy()

            # Remove NaNs
            cur_nan_mask = np.logical_or(np.isnan(cur_demands), np.any(np.isnan(cur_weather), axis=1))
            cur_demands = cur_demands[~cur_nan_mask]
            cur_weather = cur_weather[~cur_nan_mask]

            # Combination indices
            cur_combination_idcs = np.array([[x, y] for x in range(cur_weather.shape[0]) for y in range(cur_weather.shape[0]) if x != y])

            # Weather diff
            cur_weather_diff = cur_weather[cur_combination_idcs[:,0]] - cur_weather[cur_combination_idcs[:,1]]
            
            # Create X and y
            X = np.hstack((cur_demands[cur_combination_idcs[:,0], None], cur_weather_diff))
            y_direct = cur_demands[cur_combination_idcs[:,1]]
            y_delta = cur_demands[cur_combination_idcs[:,1]] - cur_demands[cur_combination_idcs[:,0]]


            if method == 'direct':
                y = y_direct
            elif method == 'delta':
                y = y_delta

            # Apply normalization
            if normalize:
                scaler_x = sk.preprocessing.StandardScaler()
                scaler_y = sk.preprocessing.StandardScaler()
                X = scaler_x.fit_transform(X)
                y = scaler_y.fit_transform(y[:,None])[:,0]

            # Apply linReg
            model = sk.ensemble.RandomForestRegressor(n_estimators=10)
            model.fit(X, y)

            ### Predict step
            # Select test weather data
            test_weather = weather_test.loc[idcs_hour_day_test[idx_hour_day]].to_numpy()

            # Combination indices
            test_combination_idcs = np.array([[x, y] for x in range(cur_weather.shape[0]) for y in range(test_weather.shape[0]) if x != y])

            # Weather diff
            test_weather_diff = cur_weather[test_combination_idcs[:,0]] - test_weather[test_combination_idcs[:,1]]

            # Create X
            X_test = np.hstack((cur_demands[test_combination_idcs[:,0], None], test_weather_diff))

            # Normalize
            if normalize:
                X_test = scaler_x.transform(X_test)

            # Predict
            preds_raw = model.predict(X_test)
            
            # Unnormalize
            if normalize:
                preds_raw = scaler_y.inverse_transform(preds_raw[:,None])[:,0]
                
            if method == 'direct':
                cur_pred = np.mean(preds_raw)
            elif method == 'delta':
                cur_pred = np.mean(cur_demands[test_combination_idcs[:,0]] + preds_raw)

            preds[dma][idcs_hour_day_test[idx_hour_day]] = cur_pred

    return preds

In [117]:
pred_direct = predict(demands_train, weather_train, weather_test, method='direct', normalize=False)
pred_delta = predict(demands_train, weather_train, weather_test, method='delta', normalize=False)

DMA: DMA_A...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_B...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_C...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_D...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_E...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_F...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_G...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_H...
0...
10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
DMA: DMA_I...
0...
10...

In [99]:
pred_direct_norm = predict(demands_train, weather_train, weather_test, method='direct', normalize=True)
pred_delta_norm = predict(demands_train, weather_train, weather_test, method='delta', normalize=True)

#### Results

In [115]:
eval_direct = eval_framework.performance_indicators(demands_test.fillna(0), pred_direct)
eval_direct

Unnamed: 0_level_0,PI1,PI2,PI3
name_long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMA_A,1.476215,4.802739,0.917648
DMA_B,0.941909,4.041846,0.803571
DMA_C,0.72038,2.60993,0.6357
DMA_D,2.781159,9.116727,2.023624
DMA_E,2.571026,5.982705,2.722481
DMA_F,1.141154,2.750902,1.255049
DMA_G,3.905833,7.216696,4.536028
DMA_H,1.19292,3.82382,1.337555
DMA_I,1.89965,3.628094,1.605949
DMA_J,1.769981,5.169232,1.449787


In [118]:
eval_delta = eval_framework.performance_indicators(demands_test.fillna(0), pred_delta)
eval_delta

Unnamed: 0_level_0,PI1,PI2,PI3
name_long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMA_A,1.463872,4.541449,0.927132
DMA_B,0.942105,3.846079,0.772269
DMA_C,0.704661,2.375269,0.616634
DMA_D,2.744082,9.506176,2.011798
DMA_E,2.443941,5.583743,2.582157
DMA_F,1.152615,2.620938,1.342835
DMA_G,3.657773,7.053272,4.343361
DMA_H,1.172838,3.723355,1.311356
DMA_I,1.912375,3.908551,1.569556
DMA_J,1.736818,4.999879,1.400504


In [100]:
eval_direct_norm = eval_framework.performance_indicators(demands_test.fillna(0), pred_direct_norm)
eval_direct_norm

Unnamed: 0_level_0,PI1,PI2,PI3
name_long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMA_A,1.534711,6.504001,1.039783
DMA_B,1.167635,5.203425,1.135145
DMA_C,1.055837,3.012829,0.845277
DMA_D,2.489922,8.766477,2.201887
DMA_E,2.888766,5.81645,3.199963
DMA_F,0.923799,2.42865,1.12792
DMA_G,4.489091,7.657941,5.011383
DMA_H,1.099208,3.934942,1.414858
DMA_I,1.680342,3.415067,1.596955
DMA_J,2.016123,5.436623,1.619652


In [101]:
eval_delta_norm = eval_framework.performance_indicators(demands_test.fillna(0), pred_delta_norm)
eval_delta_norm

Unnamed: 0_level_0,PI1,PI2,PI3
name_long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMA_A,1.534711,6.504001,1.039783
DMA_B,1.167635,5.203425,1.135145
DMA_C,1.055837,3.012829,0.845277
DMA_D,2.489922,8.766477,2.201887
DMA_E,2.888766,5.81645,3.199963
DMA_F,0.923799,2.42865,1.12792
DMA_G,4.489091,7.657941,5.011383
DMA_H,1.099208,3.934942,1.414858
DMA_I,1.680342,3.415067,1.596955
DMA_J,2.016123,5.436623,1.619652


#### Compare to baseline

In [116]:
baseline - eval_direct

Unnamed: 0_level_0,PI1,PI2,PI3
name_long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMA_A,0.175869,0.727261,0.442386
DMA_B,0.189653,-0.179346,0.087766
DMA_C,0.480974,0.90257,0.263224
DMA_D,1.177695,0.590773,0.892435
DMA_E,-0.348318,0.652295,-0.686301
DMA_F,-0.09782,0.214098,-0.173035
DMA_G,-2.171458,-1.701696,-2.486114
DMA_H,20.18583,23.88118,11.157479
DMA_I,-0.035066,5.976906,-0.461591
DMA_J,0.613352,5.455768,-0.155048


In [119]:
baseline - eval_delta

Unnamed: 0_level_0,PI1,PI2,PI3
name_long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMA_A,0.188211,0.988551,0.432902
DMA_B,0.189457,0.016421,0.119068
DMA_C,0.496694,1.137231,0.28229
DMA_D,1.214772,0.201324,0.904261
DMA_E,-0.221232,1.051257,-0.545976
DMA_F,-0.109281,0.344062,-0.260821
DMA_G,-1.923398,-1.538272,-2.293448
DMA_H,20.205912,23.981645,11.183679
DMA_I,-0.047792,5.696449,-0.425199
DMA_J,0.646516,5.625121,-0.105765
