In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
#creating features
def io_times_daygap_diff1(df,method,bycolumn,calcolumn):
    """Aggregate ``calcolumn`` per ``bycolumn`` group and return it as a frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``bycolumn`` and ``calcolumn``.
    method : str
        One of 'max', 'min', 'range', 'std', 'avg', 'skew', 'kurt'.
    bycolumn : str
        Grouping column. The 'range' branch writes the group keys back into a
        single column of this name, so a single column label is assumed here
        (NOTE(review): confirm no caller passes a list).
    calcolumn : str
        Column whose values are aggregated; also embedded in the output name.

    Returns
    -------
    pandas.DataFrame
        One row per group. The value column is named
        ``'history_date' + calcolumn + method``. Columns are ordered
        ``[bycolumn, value]`` for every method except 'range', which keeps the
        original ``[value, bycolumn]`` order.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this fell
        through every branch and failed with UnboundLocalError).
    """
    reducers = {
        'max': lambda g: g.max(),
        'min': lambda g: g.min(),
        'range': lambda g: g.max() - g.min(),
        'std': lambda g: g.std(),
        'avg': lambda g: g.mean(),
        'skew': lambda g: g.skew(),
        # Each group is a Series, so the Series reducer is the right one;
        # pd.DataFrame.kurt raises TypeError on a Series in newer pandas.
        'kurt': lambda g: g.apply(pd.Series.kurt),
    }
    if method not in reducers:
        raise ValueError(
            "unsupported method %r; expected one of %s"
            % (method, sorted(reducers))
        )

    out_name = 'history_date' + calcolumn + method
    per_group = reducers[method](df.groupby(bycolumn)[calcolumn])

    if method == 'range':
        # Keep the historical layout of this branch: value column first,
        # group key appended as the last column, fresh 0..n-1 index.
        test_t = per_group.to_frame(name=out_name)
        test_t[bycolumn] = test_t.index
        return test_t.reset_index(drop=True)

    return per_group.reset_index(name=out_name)

In [3]:
# Column-name list followed by the three CSV splits (training / verifying / testing).
# NOTE(review): `features` is not referenced by the other cells visible here, and it
# lists 'NT' / 'ST' while the displayed testing frame shows 'hourlyNT' / 'hourlyST'
# -- confirm which spelling is intended before using this list for selection.
features = [
    'season', 'month', 'hour', 'hour_sin', 'hour_cos',
    'hourlyAverage_OAT', 'hourlyHumidity', 'hourlyUV_Index',
    'NT', 'ST',
    'hourlyCoolingLoad',
    'T-1', 'T-2', 'T-3', 'T-4', 'T-5',
    'Max', 'Min', 'Range', 'Std', 'Kurt', 'Skew', 'Median',
]

# Read the splits in the same order as before: training, verifying, testing.
train, verifying, test = map(
    pd.read_csv,
    (
        "datasets/hourly_training.csv",
        "datasets/hourly_verifying.csv",
        "datasets/hourly_testing.csv",
    ),
)

# Last expression of the cell: renders the testing frame as the cell output.
test

Unnamed: 0,Timestamp,hour,hour_sin,hour_cos,hourlyAverage_OAT,hourlyHumidity,hourlyUV_Index,hourlyNT,hourlyST,hourlyCoolingLoad,...,T-3,T-4,T-5,Max,Min,Range,Std,Kurt,Skew,Median
0,2021-09-24 0:00,0,0.000000,1.000000,28.954708,80.921200,0.0,,,1062.719164,...,885.710086,894.918349,1434.127740,1434.127740,885.710086,548.417654,189.110512,3.429324,1.798087,979.353555
1,2021-09-24 1:00,1,0.258819,0.965926,28.262833,85.972092,0.0,,,945.652157,...,922.273511,885.710086,894.918349,1062.719164,885.710086,177.009078,68.009676,-1.635463,0.695508,933.962834
2,2021-09-24 2:00,2,0.500000,0.866025,28.747625,82.472033,0.0,,,1051.510853,...,1036.433600,922.273511,885.710086,1062.719164,885.710086,177.009078,68.854089,-2.426079,-0.213766,991.042878
3,2021-09-24 3:00,3,0.707107,0.707107,28.895333,81.559533,0.0,,,1126.059879,...,1062.719164,1036.433600,922.273511,1126.059879,922.273511,203.786368,69.896861,-1.030628,-0.266835,1043.972227
4,2021-09-24 4:00,4,0.866025,0.500000,28.618458,82.609533,0.0,,,862.674206,...,945.652157,1062.719160,1036.433600,1126.059879,862.674206,263.385673,86.034919,0.078714,-0.795924,1043.972227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,2021-09-30 19:00,19,-0.965926,0.258819,29.724292,85.423775,0.1,,,1855.430844,...,6677.294998,6625.920220,6611.132620,6677.294998,1855.430844,4821.864154,1787.059578,2.235586,-1.680962,6564.979117
164,2021-09-30 20:00,20,-0.866025,0.500000,29.926167,83.867875,0.1,,,1278.982302,...,6518.825614,6677.295000,6625.920220,6677.295000,1278.982302,5398.312698,2260.787003,-2.067106,-0.613846,5519.947949
165,2021-09-30 21:00,21,-0.707107,0.707107,29.770333,86.922117,0.1,,,1133.854094,...,4521.070283,6518.825610,6677.295000,6677.295000,1133.854094,5543.440906,2356.742641,-2.571736,0.271166,3188.250564
166,2021-09-30 22:00,22,-0.500000,0.866025,29.572625,89.122967,0.1,,,1150.155113,...,1855.430844,4521.070280,6518.825610,6518.825610,1133.854094,5384.971516,2060.583459,-0.005284,1.236492,1567.206573


In [4]:
# Select the model inputs and the target for the training, verifying, and testing splits.
# 'season' and 'month' are commented out of the selection in this version.
_input_columns = [
    'hour', 'hour_sin', 'hour_cos',
    'hourlyAverage_OAT', 'hourlyHumidity', 'hourlyUV_Index',
    'T-1', 'T-2', 'T-3', 'T-4', 'T-5',
    'Max', 'Min', 'Range', 'Std', 'Kurt', 'Skew', 'Median',
]

# The same column selection is applied to every split.
train_input, verifying_input, test_input = (
    split[_input_columns] for split in (train, verifying, test)
)

# Target series: the hourlyCoolingLoad column of each split.
train_output, verifying_output, test_output = (
    split.hourlyCoolingLoad for split in (train, verifying, test)
)



In [9]:
# Fixed hyper-parameters for the XGBoost regressor (no search is run in this cell).
params = {
    'booster': 'gbtree',
    # 'reg:linear' is the deprecated name of the squared-error objective;
    # 'reg:squarederror' is its current spelling.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'n_estimators': 800,
    'max_depth': 11,
    'min_child_weight': 7,
    'gamma': 1.2,
    'subsample': 0.5,
    'colsample_bytree': 0.88,
    'alpha': 0.1,
    'lambda': 2,
    'eta': 0.057,
    'scale_pos_weight': 1,
    'seed': 0,
    # 'silent' was removed: XGBoost reports it as an unused parameter.
}


# model training
model = XGBRegressor(**params)
model.fit(train_input, train_output)

# making predictions
# NOTE(review): the model is scored directly on the test split; the verifying
# split prepared earlier is not used in this cell -- confirm that is intended.
predictions = model.predict(test_input)
rmse = math.sqrt(mean_squared_error(test_output, predictions))

# finding feature importance: one row per input column, integer-indexed
# (presumably in the column order of train_input -- confirm before labelling).
importances = pd.DataFrame(model.feature_importances_)
print(importances)


pd.DataFrame(predictions).to_csv('predictions.csv')

# prints the test RMSE, then model.score() on the test split as a percentage
print(rmse)
print(model.score(test_input, test_output)*100)

Parameters: { "silent" } are not used.

           0
0   0.102257
1   0.016148
2   0.185613
3   0.001468
4   0.000472
5   0.000814
6   0.397745
7   0.115369
8   0.000842
9   0.000665
10  0.000625
11  0.096054
12  0.006748
13  0.022019
14  0.017441
15  0.005092
16  0.028342
17  0.002286
102.66273009571589
99.82505614740832
