In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
import shap
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Read final_baseline_data.csv (resolved relative to the working directory)
# into a DataFrame; later cells select their columns from `total`.
total = pd.read_csv(filepath_or_buffer="final_baseline_data.csv")

In [3]:
# Column names selected from the loaded table, grouped by family.
# Order matters: the split cell slices the resulting array positionally and
# treats the LAST entry ('price actual') as the regression target.
features = (
    # weather measurements
    [
        'temp', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
        'rain_1h', 'rain_3h', 'snow_3h', 'clouds_all',
    ]
    # weather_main_* columns
    + [
        'weather_main_clear', 'weather_main_clouds', 'weather_main_drizzle',
        'weather_main_fog', 'weather_main_mist', 'weather_main_rain',
    ]
    # time_of_day_* and season_* columns
    + [
        'time_of_day_day', 'time_of_day_morning', 'time_of_day_night',
        'season_fall', 'season_spring', 'season_summer', 'season_winter',
    ]
    # generation * columns
    + [
        'generation biomass',
        'generation fossil brown coal/lignite',
        'generation fossil gas',
        'generation fossil hard coal',
        'generation fossil oil',
        'generation hydro pumped storage consumption',
        'generation hydro run-of-river and poundage',
        'generation hydro water reservoir',
        'generation nuclear',
        'generation other',
        'generation other renewable',
        'generation solar',
        'generation waste',
        'generation wind onshore',
    ]
    # load, then the target column (must stay last)
    + ['total load actual', 'price actual']
)

In [6]:
# Keep only the modelling columns; the last one in `features` is the target.
subset = total[features]

# Seed the shuffle so the 70/30 split -- and every metric computed from it --
# is reproducible under Restart & Run All. The model is already seeded with 42;
# without a seed here the scores still changed on every run.
training, testing = train_test_split(subset, test_size=0.30, random_state=42)

# Convert each partition to NumPy once, then slice positionally:
# every column except the last is an input, the last column is the target.
train_values = training.to_numpy()
test_values = testing.to_numpy()
X_train, y_train = train_values[:, :-1], train_values[:, -1]
X_test, y_test = test_values[:, :-1], test_values[:, -1]

In [7]:
# Configure the XGBoost regressor (seeded for repeatable training) and fit it
# on the training partition. `fit` stays the final expression so the notebook
# still displays the fitted estimator.
model_xgb = xgboost.XGBRegressor(
    n_estimators=800,
    learning_rate=0.06,
    max_depth=8,
    random_state=42,
)
model_xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.06, max_delta_step=0,
             max_depth=8, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=800, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
# In-sample fit: print the mean squared error, then leave R^2 as the cell's
# displayed value.
y_train_pred = model_xgb.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
print(train_mse)
r2_score(y_train, y_train_pred)

1.706993478843713


0.9914906961629674

In [12]:
# Held-out fit: print the mean squared error on the test partition, then leave
# R^2 as the cell's displayed value.
y_test_pred = model_xgb.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
print(test_mse)
r2_score(y_test, y_test_pred)

17.568081209194666


0.9134986726984762

In [13]:
# Cross-validate the estimator configuration on the training partition using
# cross_val_score's default fold count and scorer.
cross_val_score(estimator=model_xgb, X=X_train, y=y_train)

array([0.90600563, 0.90774365, 0.90475972, 0.90113558, 0.9032996 ])

In [None]:
# Cross-validate the same estimator configuration on the test partition.
# NOTE(review): cross_val_score refits copies of the estimator on folds of the
# data it is given, so this trains on held-out rows rather than scoring the
# already-fitted model_xgb -- confirm that this is the intended check.
cross_val_score(estimator=model_xgb, X=X_test, y=y_test)

### Save model

In [None]:
import pickle

# Serialize the fitted regressor to disk. Opening the file in a `with` block
# guarantees the handle is flushed and closed even if pickling raises; the
# previous bare open(...) was never closed.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
file_name = "XGBOOST_predict_price.pkl"
with open(file_name, "wb") as model_file:
    pickle.dump(model_xgb, model_file)