In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
def split_datetime(data, col="datetime"):
    """Add integer calendar features derived from a datetime column.

    The timezone of every datetime column in ``data`` is printed first, as a
    quick sanity check. The new feature columns are then assigned directly on
    ``data`` (the frame is modified in place) and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column ``col``.
    col : str, default "datetime"
        Name of the datetime column the features are derived from.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the columns ``year``, ``month``, ``week``,
        ``hour``, ``day_of_year``, ``day_of_month`` and ``day_of_week``
        added (or overwritten if they already existed).

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``data``.
    AttributeError
        If ``col`` is not datetime-like (the ``.dt`` accessor is unavailable).
    """
    # What columns are of type datetime?
    # 'datetime64' only matches timezone-naive columns; 'datetimetz' is needed
    # to also report the timezone-aware ones.
    datetime_columns = data.select_dtypes(include=['datetime64', 'datetimetz']).columns

    for c in datetime_columns:
        print(f"Timezone for {c} is {data[c].dt.tz}")

    timestamps = data[col].dt

    # Adding columns for date & time
    data['year']    = timestamps.year
    # data['quarter'] = timestamps.quarter
    data['month']   = timestamps.month
    data['week']    = timestamps.isocalendar().week  # ISO 8601 week number
    data['hour']    = timestamps.hour

    data['day_of_year']  = timestamps.day_of_year
    data['day_of_month'] = timestamps.day
    data['day_of_week']  = timestamps.day_of_week  # Monday=0 ... Sunday=6

    return data

In [None]:
# Read the merged dataset from parquet (path is relative to the notebook's working directory)
merged_df = pd.read_parquet(path='../data/merged_df.parquet')

In [None]:
# mapping days of the week names and converting to categorical variable
weekday_map = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

# Map only while the column still holds weekday codes. Once it holds names
# (e.g. when this cell is run a second time) mapping again would turn every
# value into NaN, so the cell is skipped in that case.
if 'day_of_week' in merged_df.columns and merged_df['day_of_week'].isin(list(weekday_map)).any():
    merged_df['day_of_week'] = merged_df['day_of_week'].map(weekday_map).astype('category')

In [None]:
# store the low-cardinality identifier columns with the pandas 'category' dtype
for categorical_column in ('county', 'product_type'):
    merged_df[categorical_column] = merged_df[categorical_column].astype('category')


In [None]:
# workday vs. weekend
# Estonian holidays
# increase in capacity
# aggregated weather of the previous and/or upcoming period
# aggregated prices of the previous period
# target from the previous year
# ...

In [None]:
# Build the modelling frame as a new object instead of an alias of merged_df:
# merged_df stays unchanged and this cell can be re-run without a KeyError
# on the already-dropped column.
# 'time_of_day' is removed because the model is not able to handle object type.
model_df = merged_df.drop(columns=['time_of_day'])

# split datetime into meaningful features of int types
# NOTE: split_datetime assigns 'day_of_week' from the datetime column, which
# replaces any 'day_of_week' column already present in the frame.
model_df = split_datetime(model_df)

# model is not able to handle datetime (timezone-naive or timezone-aware)
datetime_columns = model_df.select_dtypes(include=['datetime64', 'datetimetz']).columns
model_df = model_df.drop(columns=datetime_columns)

# drop na from target: rows without a target can neither be trained on nor scored
model_df = model_df.dropna(subset=['target'])


In [None]:
# Baseline: random 70/30 split over all rows, XGBoost regressor with default settings
X_train, X_test, y_train, y_test = train_test_split(
    model_df.drop(columns='target'),
    model_df['target'],
    test_size=0.3,
    random_state=0,
)

bst = XGBRegressor(enable_categorical=True).fit(X_train, y_train)
y_pred = bst.predict(X_test)

# main optimisation metric
test_mae = mean_absolute_error(y_test, y_pred)
train_mae = mean_absolute_error(y_train, bst.predict(X_train))
print('Mean absolute error test', test_mae)
print('Mean absolute error train', train_mae)

In [None]:
# first attempt gave us 50.75 mean absolute error

In [None]:
# summary statistics of data_block_id (the column the next split is based on)
model_df.data_block_id.describe()

In [None]:
# chronological split: data blocks below 450 train the model, the rest test it
Xy_train = model_df.loc[model_df['data_block_id'] < 450]
Xy_test = model_df.loc[model_df['data_block_id'] >= 450]

X_train, y_train = Xy_train.drop(columns='target'), Xy_train['target']
X_test, y_test = Xy_test.drop(columns='target'), Xy_test['target']

bst = XGBRegressor(enable_categorical=True).fit(X_train, y_train)
y_pred = bst.predict(X_test)

# main optimisation metric
test_mae = mean_absolute_error(y_test, y_pred)
train_mae = mean_absolute_error(y_train, bst.predict(X_train))
print('Mean absolute error test', test_mae)
print('Mean absolute error train', train_mae)

In [None]:
# divide by dates and use newer ones for validation

In [None]:
# feature importance of the most recently fitted model, titled via the plotting call itself
xgb.plot_importance(bst, title='Feature Importance')
plt.show()

In [None]:
# hours_ahead_forecast is treated as an important feature; probably something to drop

- visualisation
- split by date
- tweaking the parameters
- drop some features
- feature engineering
- overfitting with traditional train_test_split?
- try two models / multiple outputs / other models

In [None]:
# idea (not applied yet): drop identifier columns such as row_id before fitting

# data blocks below this id are used for training, the remaining ones for testing
split_datablock = 300

in_train_blocks = model_df['data_block_id'] < split_datablock
in_test_blocks = model_df['data_block_id'] >= split_datablock

Xy_train = model_df[in_train_blocks]
Xy_test = model_df[in_test_blocks]

X_train, y_train = Xy_train.drop(columns='target'), Xy_train['target']
X_test, y_test = Xy_test.drop(columns='target'), Xy_test['target']

bst = XGBRegressor(enable_categorical=True).fit(X_train, y_train)

y_pred_test = bst.predict(X_test)
y_pred_train = bst.predict(X_train)

# main optimisation metric
test_mae = mean_absolute_error(y_test, y_pred_test)
train_mae = mean_absolute_error(y_train, y_pred_train)
print('Mean absolute error test', test_mae)
print('Mean absolute error train', train_mae)

In [None]:
# train residuals (prediction - target) per row, coloured by month
train_residuals = y_pred_train - y_train
px.scatter(x=Xy_train.index, y=train_residuals, color=Xy_train['month'])

In [None]:
# Test residuals (prediction - target) per row, coloured by month.
# hover_data is given as a list of column names; a bare string may not be
# accepted by every plotly version (TODO confirm against the installed one).
px.scatter(
    data_frame=Xy_test,
    x=Xy_test.index,
    y=y_pred_test - y_test,
    color=Xy_test.month,
    hover_data=['day_of_week'],
    title='Test residuals (prediction - target)',
)

In [None]:
# Attach the test residuals (prediction - target) as a new column.
# Xy_test was obtained by filtering model_df, so assigning a column on it
# directly triggers pandas' SettingWithCopyWarning; `assign` returns a new
# frame instead and keeps the cell safe to re-run.
Xy_test = Xy_test.assign(residual=y_pred_test - y_test)

Xy_test.head()

In [None]:
# Create the large figure BEFORE drawing: calling plt.figure() after the
# heatmap opens a second, empty figure and leaves the heatmap at default size.
plt.figure(figsize=(20, 20))
sns.heatmap(Xy_test.corr(numeric_only=True), annot=False, cmap='RdBu', center=0)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
target_column = 'residual'
threshold = 0.15

# numeric columns only, restricted to the consumption rows
numeric_columns = Xy_test.select_dtypes(include='number').columns
numeric_df = Xy_test[numeric_columns]
numeric_df_cons = numeric_df.loc[numeric_df['is_consumption'] == 1]

# pairwise correlations; only the column for the residual is of interest
correlation_matrix = numeric_df_cons.corr()
target_correlations = correlation_matrix[target_column]

# keep the features whose absolute correlation with the residual exceeds the threshold
significant_correlations = target_correlations[target_correlations.abs() > threshold]

# single-column heatmap of the selected correlations
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(significant_correlations.to_frame(), annot=True, cmap='coolwarm', fmt=".2f", cbar=False, ax=ax)
ax.set_title(f'Significant Correlations with {target_column}, CONSUM ONLY (Threshold: {threshold})')
plt.show()

In [None]:
target_column = 'residual'
threshold = 0.15

# numeric columns only, restricted to the production rows (is_consumption == 0)
numeric_columns = Xy_test.select_dtypes(include='number').columns
numeric_df = Xy_test[numeric_columns]
numeric_df_prod = numeric_df.loc[numeric_df['is_consumption'] == 0]

# pairwise correlations; only the column for the residual is of interest
correlation_matrix = numeric_df_prod.corr()
target_correlations = correlation_matrix[target_column]

# keep the features whose absolute correlation with the residual exceeds the threshold
significant_correlations = target_correlations[target_correlations.abs() > threshold]

# single-column heatmap of the selected correlations
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(significant_correlations.to_frame(), annot=True, cmap='coolwarm', fmt=".2f", cbar=False, ax=ax)
ax.set_title(f'Significant Correlations with {target_column}, PRODUCTION ONLY (Threshold: {threshold})')
plt.show()

- residuals are bigger in the summer; we guess this is because production happens at that time

- residuals on the test data have a weekly pattern
- the last two months are predicted very poorly

- residuals differ depending on how we split our data
- we see unexplainable patterns in the residuals
- residuals for consumption and production correlate with different features

- try residual analysis with traditional test_train_split

- tweak the model

In [None]:
# Random 70/30 split again, this time keeping the predictions of both splits
# so the residual analysis can be repeated for this model.
X_train, X_test, y_train,  y_test = train_test_split(model_df.drop('target', axis=1), model_df['target'], test_size=0.3, random_state=0)

bst = XGBRegressor(enable_categorical=True)
bst.fit(X_train, y_train)

# Predict once per split and reuse the arrays for both the metric and the
# residual plots (previously the test set was predicted twice).
y_pred_test = bst.predict(X_test)
y_pred_train = bst.predict(X_train)
y_pred = y_pred_test  # same array under the name this cell assigned before

# main optimisation metric
print('Mean absolute error test', mean_absolute_error(y_test, y_pred_test))
print('Mean absolute error train', mean_absolute_error(y_train, y_pred_train))



- pretty much the same patterns with different train test splits


In [None]:
# list all columns of the modelling frame
model_df.columns

In [None]:
# columns excluded from the feature matrix, grouped by kind
id_columns = ['row_id', 'data_block_id', 'prediction_unit_id']
coordinate_columns = [
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather',
]
drop_columns = ['target', 'hours_ahead_forecast_weather', *id_columns, *coordinate_columns]

# random 70/30 split on the reduced feature set
X_train, X_test, y_train, y_test = train_test_split(
    model_df.drop(columns=drop_columns),
    model_df['target'],
    test_size=0.3,
    random_state=0,
)

bst = XGBRegressor(enable_categorical=True).fit(X_train, y_train)
y_pred = bst.predict(X_test)

# main optimisation metric
test_mae = mean_absolute_error(y_test, y_pred)
train_mae = mean_absolute_error(y_train, bst.predict(X_train))
print('Mean absolute error test', test_mae)
print('Mean absolute error train', train_mae)

In [None]:
# Train residuals (prediction - target) of the model fitted in the previous cell.
# The predictions are taken from `bst` directly: `y_pred_train` is only
# assigned by earlier cells and therefore belongs to a different model.
px.scatter(x=X_train.index, y=bst.predict(X_train) - y_train, color=X_train.month)

In [None]:
# Test residuals (prediction - target) of the model fitted on the reduced feature set.
# The predictions are taken from `bst` directly: `y_pred_test` is only
# assigned by earlier cells and therefore belongs to a different model.
# hover_data is given as a list of column names; a bare string may not be
# accepted by every plotly version (TODO confirm against the installed one).
px.scatter(
    data_frame=X_test,
    x=X_test.index,
    y=bst.predict(X_test) - y_test,
    color=X_test.month,
    hover_data=['day_of_week'],
)

In [None]:
# feature importance of the most recently fitted model; titled and shown
# explicitly, matching the earlier importance plot in this notebook
xgb.plot_importance(bst)
plt.title('Feature Importance')
plt.show()

# Hyperparameter tuning

## XGBoost Parameters

What parameters can we tune?
Source: https://xgboost.readthedocs.io/en/stable/parameter.html

- `booster` [default: `gbtree`]: 
  - Description: Specifies the booster type to use.
  - Options: 
    - `gbtree`: Uses tree-based models.
    - `dart`: Similar to `gbtree`, but with dropout.
    - `gblinear`: Uses linear functions.
    
- `eta` [default: `0.3`, alias: `learning_rate`]: 
  - Description: Step size shrinkage used in update to prevent overfitting. 
  - Range: `[0, 1]`

- `max_depth` [default: `6`]: 
  - Description: Maximum depth of a tree. Increasing this value will make the model more complex and likely to overfit. 
  - Range: `[0, ∞]` (0 indicates no limit)

- `subsample` [default: `1`]: 
  - Description: Subsample ratio of the training instances to prevent overfitting. 
  - Range: `(0, 1]`

- `lambda` [default: `1`, alias: `reg_lambda`]: 
  - Description: L2 regularization term on weights. 
  - Range: `[0, ∞]`

- `alpha` [default: `0`, alias: `reg_alpha`]: 
  - Description: L1 regularization term on weights. 
  - Range: `[0, ∞]`

- `eval_metric` [default: according to objective]: 
  - Description: Evaluation metrics for validation data. 
  - Note: A default metric is assigned according to the objective (e.g., `rmse` for regression, `logloss` for classification). Users can add multiple evaluation metrics.


## GridSearch

First, we search only over different maximum tree depths:

In [None]:
# hyperparameter tuning with gridsearch

from sklearn.model_selection import GridSearchCV

# Define a range of hyperparameters to tune
param_grid = {
    'max_depth': [3, 5, 7, 10],
    #'learning_rate': [0.01, 0.05, 0.1, 0.2],
    #'n_estimators': [100, 200, 300, 500],
    #'subsample': [0.7, 0.8, 0.9],
}

# Initialize the XGBRegressor with enable_categorical=True
xgb_reg = XGBRegressor(enable_categorical=True)

# Initialize GridSearchCV.
# scoring is set explicitly to the notebook's optimisation metric (MAE);
# without it the search ranks candidates by the estimator's default score.
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Fit GridSearchCV on the training split created earlier in the notebook
grid_search.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [None]:
# parameter combination taken from grid_search.best_params_
best_params

In [None]:
# estimator taken from grid_search.best_estimator_
best_model

In [None]:
# MAE of the grid-search winner on the held-out test split
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f"Mean Absolute Error (MAE): {mae}")

In [None]:
# checking best model's MAE on train set

# The train predictions are passed straight to the metric instead of being
# stored in `y_pred`, so `y_pred` keeps holding the test-set predictions.
mae = mean_absolute_error(y_train, best_model.predict(X_train))

print(f"Mean Absolute Error (MAE): {mae}")

## RandomizedSearch
Different parameters are tuned, and df is split into consumption/production.

In [None]:
# randomized search, but splitting the df into consumption/production, and choosing different parameters for tuning

from sklearn.model_selection import RandomizedSearchCV

drop_columns = [
    'target',
    'hours_ahead_forecast_weather',
    'row_id',
    'data_block_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather'
]
# max_depth 15 leads to overfitting
params = {
    'gamma': [0, 0.1, 1, 10],
    'max_depth': [4, 6, 8],
    'min_child_weight': [0, 1, 4, 8],
    'lambda': [0, 0.01, 0.1, 1],
    'num_parallel_tree': [1, 2, 3],
}


def _tune_segment(segment_df, feature_drop, param_distributions, label, seed=0):
    """Tune and evaluate an XGBoost regressor on one subset of the data.

    The subset is split 70/30 at random, a randomized hyperparameter search
    (10 candidates, 2-fold CV, scored by negative MAE) is fitted on the train
    part, and the train/test MAE are printed.

    Parameters
    ----------
    segment_df : pandas.DataFrame
        Rows to model; must contain 'target' and every column in ``feature_drop``.
    feature_drop : list of str
        Columns removed to obtain the feature matrix (includes 'target').
    param_distributions : dict
        Candidate values per hyperparameter, passed to RandomizedSearchCV.
    label : str
        Segment name used in the printed messages.
    seed : int, default 0
        Seed for both the train/test split and the parameter sampling, so
        that re-running the cell gives the same result.

    Returns
    -------
    tuple
        ``(search, y_train, y_test, y_pred_train, y_pred_test)``.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        segment_df.drop(columns=feature_drop),
        segment_df['target'],
        test_size=0.3,
        random_state=seed,
    )
    search = RandomizedSearchCV(
        estimator=XGBRegressor(enable_categorical=True),
        param_distributions=param_distributions,
        scoring='neg_mean_absolute_error',
        n_iter=10,
        cv=2,
        random_state=seed,
    )
    search.fit(X_tr, y_tr)
    pred_te = search.predict(X_te)
    pred_tr = search.predict(X_tr)
    print(f'Mean absolute error train {label}', mean_absolute_error(y_tr, pred_tr))
    print(f'Mean absolute error test {label}', mean_absolute_error(y_te, pred_te))
    return search, y_tr, y_te, pred_tr, pred_te


# The feature matrices stay local to the helper, so the notebook-level
# X_train / X_test keep matching y_train / y_test from the earlier split.

# consumption model
bst_cons, y_train_cons, y_test_cons, y_pred_train_cons, y_pred_test_cons = _tune_segment(
    model_df.query('is_consumption == 1'), drop_columns, params, 'consumption'
)

# production model
bst_prod, y_train_prod, y_test_prod, y_pred_train_prod, y_pred_test_prod = _tune_segment(
    model_df.query('is_consumption == 0'), drop_columns, params, 'production'
)

# overall score: targets and predictions are concatenated in the same
# (consumption, production) order so they stay aligned position by position
print(
    'Mean absolute error train overall',
    mean_absolute_error(
        np.concatenate([y_train_cons, y_train_prod]),
        np.concatenate([y_pred_train_cons, y_pred_train_prod])
    )
)
print(
    'Mean absolute error test overall',
    mean_absolute_error(
        np.concatenate([y_test_cons, y_test_prod]),
        np.concatenate([y_pred_test_cons, y_pred_test_prod])
    )
)



MAE is quite similar for the two hyperparameter searches; the best maximum tree depth is probably somewhere between 8 and 10.
We need to validate our model on the test dataset to see how reliable it is.