In [2]:
import pandas as pd
import numpy as np

In [3]:
import optuna.integration.lightgbm as gbm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna as opt


In [30]:
class Data:
    def __init__(self, path: str) -> None:    
        demand_path = path + "/Demand_history.csv"
        existingEV_path = path + "/existing_EV_infrastructure_2018.csv"
        self.df_orig = pd.read_csv(demand_path)
        self.df_trans = self.df_orig
        self.years_window = 2
        self.y_cols = [f"n-{y}" for y in range(1, self.years_window + 1)]
        
    def clean(self):
        self.df_orig.loc[(self.df_orig != 0).any(1)]
        print(self.df_orig)
        
    def split(self):
        
        x_train = pd.DataFrame(columns=["x_coordinate", "y_coordinate", *self.y_cols])
        x_test = pd.DataFrame(columns=["x_coordinate", "y_coordinate", *self.y_cols])
        y_train = pd.Series(dtype=np.float64)
        y_test = pd.Series(dtype=np.float64)
        for y in self.df_trans.columns[self.df_trans.columns.str.startswith('20')]:
            y = int(y)
            if y < 2010 + self.years_window:
                continue
            y_cols = [f"{y - i}" for i in range(1, self.years_window + 1)]
            stack_x = self.df_trans.loc[:, self.df_trans.columns.isin(["x_coordinate", "y_coordinate", *y_cols])]
            y_dict = {f"{y - i}": f"n-{i}" for i in range(1, self.years_window + 1)}
            stack_x = stack_x.rename(columns=y_dict)
            stack_y = self.df_trans.loc[:, f"{y}"]
            if y == 2018:
                x_test = pd.concat([x_test, stack_x], axis=0, ignore_index=True)
                y_test = pd.concat([y_test, stack_y], axis=0, ignore_index=True)
                x_train = pd.concat([x_train, stack_x], axis=0, ignore_index=True)
                y_train = pd.concat([y_train, stack_y], axis=0, ignore_index=True)
            else:
                x_train = pd.concat([x_train, stack_x], axis=0, ignore_index=True)
                y_train = pd.concat([y_train, stack_y], axis=0, ignore_index=True)
        
        data.x_train = x_train
        data.x_test = x_test
        data.y_train = y_train
        data.y_test = y_test
        
    def addYearDemandfromForecast(self, year: int, predicted: np.array) -> None:
        predicted = predicted.reshape(-1, 1)
        self.df_trans[f"{year}"] = predicted        
                  
path = "data"
data = Data(path)
data.split()

In [31]:
class Model:
    def __init__(self, data: Data) -> None:
        self.data = data
    
    def fit(self) -> None:
        params = {
            "n_estimators": 10000,
            "objective": "regression",
            "metric": "rmse",
            "verbosity": -1,
            "boosting_type": "gbdt"
         }
        dtrain = gbm.Dataset(data.x_train, data.y_train, categorical_feature=[0])
        print(dtrain)
        deval = gbm.Dataset(data.x_test, data.y_test, reference=dtrain, categorical_feature=[0])
        self.model = gbm.train(params, train_set=dtrain, valid_sets=deval,
            early_stopping_rounds=100)
        
        pred = self.model.predict(data.x_train)
        train_rmse = np.sqrt(mean_squared_error(data.y_train, pred))
        train_mae = mean_absolute_error(data.y_train, pred)   
        train_r2 = r2_score(data.y_train, pred) 
        pred = self.model.predict(data.x_test)
        test_rmse = np.sqrt(mean_squared_error(data.y_test, pred))
        test_mae = mean_absolute_error(data.y_test, pred)   
        test_r2 = r2_score(data.y_test, pred)    
        
        print(f"training: rmse={train_rmse}, mae={train_mae}, r2={train_r2}")   
        print(f"test: rmse={test_rmse}, mae={test_mae}, r2={test_r2}")                        
    
    def predict(self, year: int) -> pd.Series:
        y_cols = [f"{year - i}" for i in range(1, data.years_window + 1)]
        x_forecast = \
            data.df_trans.loc[:,
                              data.df_trans.columns.isin(["x_coordinate", "y_coordinate", *y_cols])]        
        return self.model.predict(x_forecast)
    
    def set_params(self, params):
        params['n_estimators'] = int(params['n_estimators'])
        params['max_depth'] = int(params['max_depth'])
        self.model.set_params(**params)
        

In [32]:
model = Model(data)

In [33]:
model.fit()

[32m[I 2022-09-14 18:10:09,397][0m A new study created in memory with name: no-name-5b81bac5-3862-44e2-91d6-49931c85560c[0m


<lightgbm.basic.Dataset object at 0x283214af0>



[A
[A

Training until validation scores don't improve for 100 rounds




Did not meet early stopping. Best iteration is:
[10000]	valid_0's rmse: 0.535549



[A
[A[32m[I 2022-09-14 18:12:12,212][0m Trial 0 finished with value: 0.5355485619484984 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.5355485619484984.[0m

[A

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10000]	valid_0's rmse: 0.535549



[A
[A[32m[I 2022-09-14 18:14:26,200][0m Trial 1 finished with value: 0.5355485619484984 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.5355485619484984.[0m

[A

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10000]	valid_0's rmse: 0.829493



[A
[A[32m[I 2022-09-14 18:16:19,321][0m Trial 2 finished with value: 0.8294928199814869 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.5355485619484984.[0m

[A

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10000]	valid_0's rmse: 0.829493



[A
[A[32m[I 2022-09-14 18:18:25,064][0m Trial 3 finished with value: 0.8294928199814869 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.5355485619484984.[0m

[A

Training until validation scores don't improve for 100 rounds


In [22]:
best_params = model.model.params
print("Best params:", best_params)
print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

Best params: {'objective': 'regression', 'metric': 'rmse', 'verbosity': -1, 'boosting_type': 'gbdt', 'feature_pre_filter': False, 'lambda_l1': 0.22159388388878123, 'lambda_l2': 8.736663550797822e-06, 'num_leaves': 31, 'feature_fraction': 0.6, 'bagging_fraction': 0.409253204352181, 'bagging_freq': 1, 'min_child_samples': 20, 'num_iterations': 10000, 'early_stopping_round': 100, 'categorical_column': [0]}
  Params: 
    objective: regression
    metric: rmse
    verbosity: -1
    boosting_type: gbdt
    feature_pre_filter: False
    lambda_l1: 0.22159388388878123
    lambda_l2: 8.736663550797822e-06
    num_leaves: 31
    feature_fraction: 0.6
    bagging_fraction: 0.409253204352181
    bagging_freq: 1
    min_child_samples: 20
    num_iterations: 10000
    early_stopping_round: 100
    categorical_column: [0]


In [23]:
predicted_2019 = model.predict(2019)

In [24]:
data.addYearDemandfromForecast(2019, predicted_2019)

In [25]:
data.df_trans

Unnamed: 0,demand_point_index,x_coordinate,y_coordinate,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,0,0.5,0.5,0.352242,0.667932,0.958593,2.911901,4.338274,6.561995,8.454417,10.595324,13.119572,22.301643
1,1,1.5,0.5,0.325940,0.591964,0.862652,2.589068,4.196034,5.745551,8.753195,11.126995,12.020091,21.209306
2,2,2.5,0.5,0.373752,0.591890,0.969733,2.641432,3.541772,5.469161,8.414627,10.115336,14.018254,22.015564
3,3,3.5,0.5,0.420686,0.584055,0.906547,2.378577,3.888121,5.846089,9.083868,12.424885,15.012302,23.939516
4,4,4.5,0.5,0.475621,0.647940,0.981544,2.665400,4.218711,6.776609,8.851107,11.731131,16.355563,24.457957
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4091,4091,59.5,63.5,0.171015,0.334565,0.556055,1.373291,1.837586,2.517146,3.352280,4.149888,5.426193,7.024246
4092,4092,60.5,63.5,0.041716,0.061741,0.131291,0.386540,0.755846,0.941116,1.107797,1.309479,2.057450,6.398077
4093,4093,61.5,63.5,0.100895,0.180352,0.296299,0.705373,1.300220,1.608609,1.822806,2.333681,3.218519,6.822145
4094,4094,62.5,63.5,0.155353,0.290825,0.557803,1.516066,2.399426,2.719197,4.494515,6.096858,6.262574,13.666316


In [26]:
predicted_2020 = model.predict(2020)

In [27]:
data.addYearDemandfromForecast(2020, predicted_2020)

In [28]:
data.df_trans

Unnamed: 0,demand_point_index,x_coordinate,y_coordinate,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,0,0.5,0.5,0.352242,0.667932,0.958593,2.911901,4.338274,6.561995,8.454417,10.595324,13.119572,22.301643,26.205078
1,1,1.5,0.5,0.325940,0.591964,0.862652,2.589068,4.196034,5.745551,8.753195,11.126995,12.020091,21.209306,26.202895
2,2,2.5,0.5,0.373752,0.591890,0.969733,2.641432,3.541772,5.469161,8.414627,10.115336,14.018254,22.015564,26.407450
3,3,3.5,0.5,0.420686,0.584055,0.906547,2.378577,3.888121,5.846089,9.083868,12.424885,15.012302,23.939516,28.186608
4,4,4.5,0.5,0.475621,0.647940,0.981544,2.665400,4.218711,6.776609,8.851107,11.731131,16.355563,24.457957,28.743430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4091,4091,59.5,63.5,0.171015,0.334565,0.556055,1.373291,1.837586,2.517146,3.352280,4.149888,5.426193,7.024246,12.078931
4092,4092,60.5,63.5,0.041716,0.061741,0.131291,0.386540,0.755846,0.941116,1.107797,1.309479,2.057450,6.398077,11.255776
4093,4093,61.5,63.5,0.100895,0.180352,0.296299,0.705373,1.300220,1.608609,1.822806,2.333681,3.218519,6.822145,11.568899
4094,4094,62.5,63.5,0.155353,0.290825,0.557803,1.516066,2.399426,2.719197,4.494515,6.096858,6.262574,13.666316,19.844765


In [29]:
data.df_trans.to_csv("data/forecast.csv")