In [None]:
# Import the necessary libraries
%matplotlib inline
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import pickle

from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score as acs_score

from src.data.data_fetcher import get_raw_data
from src.features.preprocess_data import get_preprocessed_test_data, fetch_preprocessed_data
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed splits. Names follow the pattern
# {X|y}_{train|val}_{obs|est}_combined ("obs"/"est" presumably meaning
# observed/estimated data -- see src.features.preprocess_data).
(
    X_train_obs_combined,
    X_val_obs_combined,
    y_train_obs_combined,
    y_val_obs_combined,
    X_train_est_combined,
    X_val_est_combined,
    y_train_est_combined,
    y_val_est_combined,
) = fetch_preprocessed_data()
x_test_whole = get_preprocessed_test_data()

# Stack all four splits into one feature frame / target object and give both
# a fresh 0..n-1 index so positional fold indices line up between them.
x_whole = pd.concat(
    [
        X_train_obs_combined,
        X_val_obs_combined,
        X_train_est_combined,
        X_val_est_combined,
    ]
).reset_index(drop=True)
y_whole = pd.concat(
    [
        y_train_obs_combined,
        y_val_obs_combined,
        y_train_est_combined,
        y_val_est_combined,
    ]
).reset_index(drop=True)

x_whole.head()

In [None]:
# Summary statistics of the combined feature frame (quick sanity check of
# counts and value ranges before training).
x_whole.describe()

In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import numpy as np

# Cross-validation configuration: shuffled K-fold split with a fixed seed so
# the fold assignment is the same on every run.
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Accumulators for the training loop: one fitted model per fold, and the
# running sum of per-fold weighted MAE (divided by num_folds afterwards).
reg_models = []
total_mae = 0

def compute_sample_weight(data):
    """Return one weight per row of ``data``.

    Rows whose ``time_since_prediction`` is strictly positive (treated here as
    estimated data) get weight 2; every other row (observed data) gets weight 1.

    Returns a NumPy integer array with the same length as ``data``.
    """
    is_estimated = data['time_since_prediction'] > 0
    return np.where(is_estimated, 2, 1)

# Train one CatBoost regressor per fold. Each fold's held-out part serves both
# as the early-stopping eval set and as the data the fold score is computed on.
for fold_number, (train_index, test_index) in enumerate(kf.split(x_whole), start=1):
    # Positional split of features and targets for this fold
    X_train, y_train = x_whole.iloc[train_index], y_whole.iloc[train_index]
    X_test, y_test = x_whole.iloc[test_index], y_whole.iloc[test_index]

    # Per-row weights (2 for estimated rows, 1 for observed rows)
    train_weight = compute_sample_weight(X_train)
    test_weight = compute_sample_weight(X_test)

    # Wrap the data, labels and weights for CatBoost
    train_pool = Pool(data=X_train, label=y_train, weight=train_weight)
    test_pool = Pool(data=X_test, label=y_test, weight=test_weight)

    # The very large iteration cap is only an upper bound; training is
    # expected to end through early stopping on the eval set.
    reg = CatBoostRegressor(
        iterations=10000000,
        depth=8,
        learning_rate=0.001,
        loss_function='MAE',
        verbose=200,
    )
    reg.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100)
    reg_models.append(reg)

    # Weighted MAE on the held-out part: sum(w * |y - y_hat|) / sum(w)
    predictions = reg.predict(test_pool)
    absolute_errors = np.abs(y_test - predictions)
    weighted_mae = np.sum(test_weight * absolute_errors) / np.sum(test_weight)
    total_mae += weighted_mae

    print(f"Fold {fold_number}, Weighted Mean Absolute Error: {weighted_mae}")

# Mean of the per-fold scores
average_mae = total_mae / num_folds
print(f"Average Weighted Mean Absolute Error: {average_mae}")

In [None]:
def multi_predict(x_values: pd.DataFrame, models) -> np.ndarray:
    """
    Predict with every model in ``models`` and return the element-wise mean.

    Parameters
    ----------
    x_values : pd.DataFrame
        Feature rows, passed unchanged to each model's ``predict``.
    models : sequence
        Fitted models exposing ``predict(x_values)``. Must not be empty.

    Returns
    -------
    np.ndarray
        Float array holding the average of the individual predictions.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("multi_predict requires at least one model")

    # Stack into a fresh float array instead of accumulating with ``+=`` on the
    # first model's output: that would mutate an array owned by the model and
    # fails outright when the first prediction has an integer dtype.
    stacked = np.stack(
        [np.asarray(model.predict(x_values), dtype=float) for model in models]
    )
    return stacked.mean(axis=0)


In [None]:
# Build the observed-only and estimated-only subsets explicitly. No earlier
# cell defines them, so this cell would raise NameError on a fresh kernel.
# NOTE(review): assumes "whole" means train + validation for each source,
# mirroring how x_whole / y_whole are assembled -- confirm.
x_whole_obs = pd.concat([X_train_obs_combined, X_val_obs_combined]).reset_index(drop=True)
y_whole_obs = pd.concat([y_train_obs_combined, y_val_obs_combined]).reset_index(drop=True)
x_whole_est = pd.concat([X_train_est_combined, X_val_est_combined]).reset_index(drop=True)
y_whole_est = pd.concat([y_train_est_combined, y_val_est_combined]).reset_index(drop=True)

# Ensemble (fold-averaged) predictions for each subset
y_pred_val_obs_combined = multi_predict(x_whole_obs, reg_models)
y_pred_val_est_combined = multi_predict(x_whole_est, reg_models)

# Unweighted MAE per subset. These rows were also part of the K-fold training
# data, so the scores are optimistic compared with truly unseen data.
mae_obs_combined = mean_absolute_error(y_whole_obs, y_pred_val_obs_combined)
mae_est_combined = mean_absolute_error(y_whole_est, y_pred_val_est_combined)
print('MAE on validation observed data: ', mae_obs_combined)
print('MAE on validation estimated data: ', mae_est_combined)


# Visualization

In [None]:
import matplotlib.pyplot as plt

# Number of leading rows shown in the zoomed-in figures.
# NOTE(review): 24 * 7 * 4 corresponds to four weeks only if rows are hourly -- confirm.
ZOOM_ROWS = 24 * 7 * 4


def plot_actual_vs_predicted(actual, predicted, title, max_rows=None):
    """Draw actual and predicted values as two overlaid line plots.

    Parameters
    ----------
    actual : pandas Series/DataFrame
        Ground-truth values; the index is reset so both lines share 0..n-1.
    predicted : array-like
        Predictions in the same row order as ``actual``.
    title : str
        Figure title.
    max_rows : int, optional
        If given, only the first ``max_rows`` rows of both inputs are drawn.
    """
    actual = actual.reset_index(drop=True)
    if max_rows is not None:
        actual = actual.iloc[:max_rows]
        predicted = predicted[:max_rows]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(actual, label='Actual', linestyle='-', marker='o', markersize=5, alpha=0.7, color='blue')
    ax.plot(predicted, label='Predicted', linestyle='--', marker='x', markersize=5, alpha=0.7, color='orange')
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Row index')
    ax.set_ylabel('Target value')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()


# Ensemble predictions on everything the models were trained on (observed and
# estimated rows together) and on the estimated validation split.
train_prediction = multi_predict(x_whole, reg_models)
test_prediction = multi_predict(X_val_est_combined, reg_models)

# All training data: y_whole holds observed AND estimated rows, so the title
# says so instead of "Observed Data".
plot_actual_vs_predicted(
    y_whole,
    train_prediction,
    'Actual vs Predicted - All Data (Observed + Estimated)',
)
plot_actual_vs_predicted(
    y_whole,
    train_prediction,
    f'Actual vs Predicted - All Data (Observed + Estimated), first {ZOOM_ROWS} rows',
    max_rows=ZOOM_ROWS,
)

# Estimated validation split
plot_actual_vs_predicted(
    y_val_est_combined,
    test_prediction,
    'Actual vs Predicted - Estimated Data',
)
plot_actual_vs_predicted(
    y_val_est_combined,
    test_prediction,
    f'Actual vs Predicted - Estimated Data, first {ZOOM_ROWS} rows',
    max_rows=ZOOM_ROWS,
)

In [None]:
# Persist the list of per-fold models so predictions can be reproduced
# without retraining.
with open("catboost_models.pkl", "wb") as model_file:
    pickle.dump(reg_models, model_file)

In [None]:
# Read the pickled list of models back from disk.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this notebook or another trusted source.
with open("catboost_models.pkl", "rb") as model_file:
    loaded_reg_models = pickle.load(model_file)

In [None]:
# Predict with the models restored from disk rather than the in-memory list:
# this exercises the saved artefact and lets the cell run when training was
# skipped and only the pickle was loaded.
y_pred = multi_predict(x_test_whole, loaded_reg_models)


In [None]:
# Save the ensemble's test-set predictions through the project helper.
# (This stores predictions, not the models -- the models are pickled in an
# earlier cell.) The 'catboost' argument is presumably a label/name for the
# output; confirm in src.models.saving.
from src.models.saving import save_predictions

save_predictions(y_pred, 'catboost')