In [7]:
# Import the necessary libraries
%pip install catboost
%matplotlib inline
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import pickle

from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score as acs_score

from src.data.data_fetcher import get_raw_data
from src.features.preprocess_data import get_preprocessed_test_data, fetch_preprocessed_data
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Prepare data
# get_raw_data() returns twelve objects; they are unpacked in the order it yields them.
(
    train_a, train_b, train_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = get_raw_data()

# Pre-split, preprocessed features/targets for the "obs" and "est" subsets,
# plus the preprocessed test features.
(
    X_train_obs_combined, X_val_obs_combined,
    y_train_obs_combined, y_val_obs_combined,
    X_train_est_combined, X_val_est_combined,
    y_train_est_combined, y_val_est_combined,
) = fetch_preprocessed_data()
x_test_whole = get_preprocessed_test_data()

# Per-subset train+val concatenations (these keep their original index labels).
x_whole_obs = pd.concat([X_train_obs_combined, X_val_obs_combined])
y_whole_obs = pd.concat([y_train_obs_combined, y_val_obs_combined])

x_whole_est = pd.concat([X_train_est_combined, X_val_est_combined])
y_whole_est = pd.concat([y_train_est_combined, y_val_est_combined])

# Everything stacked together with a fresh 0..n-1 index, so that the positional
# indices produced by KFold line up between features and targets.
x_whole = pd.concat(
    [X_train_obs_combined, X_val_obs_combined, X_train_est_combined, X_val_est_combined],
    ignore_index=True,
)
y_whole = pd.concat(
    [y_train_obs_combined, y_val_obs_combined, y_train_est_combined, y_val_est_combined],
    ignore_index=True,
)

x_whole.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,is_day:idx,is_in_shadow:idx,precip_5min:mm,precip_type_5min:idx,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sun_azimuth:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location_a,location_b,location_c,sin_day_of_year,cos_day_of_year,sin_hour,cos_hour,sun_product,modified_solar_elevation,effective_radiation,time_since_prediction,cloud_ratio,cloud_cover_over_30%,sun_addition,is_freezing,is_snow,is_rain,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55
0,6.45,1.259,902436.125,194.449997,2426.75,0.0,277.625,88.175003,348536.40625,70.375,271995.53125,62.900002,1.0,0.0,0.0,0.0,1018.400024,0.0,0.0,58.174999,242.404755,0.0,281.774994,100.0,43816.324219,0.725,-0.25,-0.7,0.0,1.0,0.0,0.0,0.984306,0.176471,-0.707107,-0.707107,6205.315918,0.242142,0.301401,0,0.629,1.0,158.550003,0.0,0.0,0.0,,,,,,,,,
1,4.925,1.2755,0.0,0.0,971.25,0.0,273.600006,0.0,0.0,0.0,0.0,99.425003,0.0,1.0,0.0,0.0,1010.150024,0.0,0.0,73.949997,309.972748,0.1,276.625,99.425003,33486.976562,2.825,1.125,2.6,0.0,1.0,0.0,0.0,-0.702148,0.712031,-0.866025,0.5,0.0,0.0,0.0,0,1.0,1.0,0.0,0.0,0.0,0.0,,,,,,,,,
2,6.975,1.2035,797556.125,175.350006,3255.375,0.0,279.075012,50.824997,206314.859375,120.599998,537617.25,5.25,1.0,0.0,0.0,0.0,995.099976,0.0,0.0,45.700001,230.077255,0.0,289.299988,6.05,57556.351562,4.925,-0.275,4.925,0.0,0.0,1.0,0.0,-0.999371,0.035473,-0.5,-0.866025,6129.494629,0.226685,0.674081,0,0.867769,0.0,171.424988,0.0,0.0,0.0,,,,,,,,,
3,4.675,1.23925,0.0,0.0,2067.925049,0.0,272.924988,0.0,0.0,0.0,0.0,91.949997,0.0,1.0,0.0,0.0,981.574951,0.0,0.0,65.949997,252.160492,0.0,279.774994,98.974998,45661.75,3.8,1.1,3.625,0.0,1.0,0.0,0.0,-0.638384,0.769718,-0.866025,-0.5,0.0,0.0,0.0,0,0.929022,1.0,0.0,0.0,0.0,0.0,,,,,,,,,
4,9.45,1.225,2240127.75,591.5,115.849998,0.0,283.324982,138.100006,454697.09375,66.150002,185963.796875,96.349998,1.0,0.0,0.0125,0.25,1001.400024,0.0,88.900002,216.283997,0.275,284.0,98.199997,5958.75,4.525,4.1,-1.95,0.0,88.900002,216.283997,0.275,284.0,98.199997,5958.75,4.525,4.1,-1.95,0.0,0.0,0,1.0,-0.739392,-0.673275,-0.258819,-0.965926,9135.31543,0.571108,0.083015,0.0,0.981161,1.0,204.25,0.0,0.0,1.0


In [9]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import numpy as np

# Accumulators filled by the cross-validation loop: the running sum of per-fold
# weighted MAE and the list of fitted per-fold models.
total_mae, reg_models = 0, []

# Shuffled K-fold splitter with a fixed seed so the fold assignment is repeatable.
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

def compute_sample_weight(data):
    """Return per-row sample weights based on ``time_since_prediction``.

    Rows whose ``time_since_prediction`` is strictly positive (treated as
    estimated data) get weight 2; all other rows (treated as observed data)
    get weight 1.

    Args:
        data: Mapping-like object (e.g. a DataFrame) exposing a
            ``'time_since_prediction'`` column.

    Returns:
        NumPy integer array of 1s and 2s, one entry per row.
    """
    is_estimated = data['time_since_prediction'] > 0
    return np.where(is_estimated, 2, 1)

# Hyper-parameters shared by every fold's regressor. `iterations` is only an
# upper bound here: fit() below is called with early_stopping_rounds, which is
# what is expected to end training.
catboost_params = dict(
    iterations=10000000,
    depth=8,
    learning_rate=0.001,
    loss_function='MAE',
    verbose=200,
)

for train_index, test_index in kf.split(x_whole):
    # Slice this fold's training and held-out rows by position.
    X_train = x_whole.iloc[train_index]
    y_train = y_whole.iloc[train_index]
    X_test = x_whole.iloc[test_index]
    y_test = y_whole.iloc[test_index]

    # Per-row weights (2 where time_since_prediction > 0, else 1) for both parts.
    train_weight = compute_sample_weight(X_train)
    test_weight = compute_sample_weight(X_test)

    # Wrap features, labels and weights in CatBoost pools.
    train_pool = Pool(data=X_train, label=y_train, weight=train_weight)
    test_pool = Pool(data=X_test, label=y_test, weight=test_weight)

    # Train on the weighted pool; the held-out fold is the early-stopping eval set.
    reg = CatBoostRegressor(**catboost_params)
    reg.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100)
    reg_models.append(reg)

    # Score the held-out fold with the same weighting used for training.
    predictions = reg.predict(test_pool)
    absolute_errors = np.abs(y_test - predictions)
    weighted_mae = np.sum(test_weight * absolute_errors) / np.sum(test_weight)
    total_mae += weighted_mae

    print(f"Fold {len(reg_models)}, Weighted Mean Absolute Error: {weighted_mae}")

# Mean of the per-fold weighted MAE values.
average_mae = total_mae / num_folds
print(f"Average Weighted Mean Absolute Error: {average_mae}")


0:	learn: 318.9886077	test: 312.6121514	best: 312.6121514 (0)	total: 170ms	remaining: 19d 17h 5m 33s
200:	learn: 280.3981892	test: 274.5190665	best: 274.5190665 (200)	total: 4.59s	remaining: 2d 15h 25m 46s
400:	learn: 244.7800494	test: 239.5292051	best: 239.5292051 (400)	total: 9.05s	remaining: 2d 14h 41m 15s
600:	learn: 213.4104131	test: 208.8713912	best: 208.8713912 (600)	total: 13.6s	remaining: 2d 14h 52m 54s
800:	learn: 188.1612567	test: 184.1085503	best: 184.1085503 (800)	total: 17.9s	remaining: 2d 14h 1m 59s
1000:	learn: 168.9008429	test: 165.0950869	best: 165.0950869 (1000)	total: 22.1s	remaining: 2d 13h 18m 41s
1200:	learn: 153.7744009	test: 150.2344591	best: 150.2344591 (1200)	total: 26.2s	remaining: 2d 12h 32m 11s
1400:	learn: 141.8349193	test: 138.4925745	best: 138.4925745 (1400)	total: 30.3s	remaining: 2d 11h 59m 8s
1600:	learn: 132.6183590	test: 129.5837296	best: 129.5837296 (1600)	total: 34.3s	remaining: 2d 11h 32m 8s
1800:	learn: 125.3101823	test: 122.5186445	best: 122.5

KeyboardInterrupt: 

In [3]:
def multi_predict(x_values: pd.DataFrame, models) -> np.ndarray:
    """Predict with every model and return the element-wise mean prediction.

    Each model's ``predict(x_values)`` output is collected into a new stacked
    array before averaging, so no array returned by a model is modified in
    place and predictions of mixed integer/float dtype can be combined.

    Args:
        x_values: Feature matrix passed unchanged to each model's ``predict``.
        models: Iterable of fitted models exposing ``predict(x_values)``.
            All models must return predictions of the same shape.

    Returns:
        Array with the same shape as a single model's prediction, holding the
        arithmetic mean across models.

    Raises:
        IndexError: If ``models`` is empty. (IndexError is kept, rather than
            ValueError, because that is what indexing an empty list raised
            before.)
    """
    models = list(models)
    if not models:
        raise IndexError("multi_predict requires at least one model")

    # np.stack copies the individual predictions, so the models' outputs stay untouched.
    stacked = np.stack([np.asarray(model.predict(x_values)) for model in models])
    return np.mean(stacked, axis=0)


In [4]:
# Ensemble predictions on the full observed / estimated sets.
# NOTE: x_whole_obs and x_whole_est are the train + validation concatenations, and
# every one of their rows is part of x_whole, which the K-fold models were fitted on.
# These scores are therefore in-sample, not validation scores. The variable names
# still say "val" only so that any later cell referring to them keeps working.
y_pred_val_obs_combined = multi_predict(x_whole_obs, reg_models)
y_pred_val_est_combined = multi_predict(x_whole_est, reg_models)

# Unweighted Mean Absolute Error (MAE) of the averaged predictions on each subset.
mae_obs_combined = mean_absolute_error(y_whole_obs, y_pred_val_obs_combined)
mae_est_combined = mean_absolute_error(y_whole_est, y_pred_val_est_combined)
print('MAE on all observed data (train + val, in-sample): ', mae_obs_combined)
print('MAE on all estimated data (train + val, in-sample): ', mae_est_combined)


NameError: name 'reg_models' is not defined

# Visualization

In [None]:
import matplotlib.pyplot as plt

def plot_actual_vs_predicted(actual, predicted, title, limit=None):
    """Draw actual and predicted target values on one line chart.

    Args:
        actual: pandas object holding the true targets; its index is reset so
            it is plotted against row position.
        predicted: Array-like of predictions aligned positionally with ``actual``.
        title: Figure title.
        limit: If given, only the first ``limit`` rows of both series are
            drawn; ``None`` draws everything.
    """
    actual_values = actual.reset_index(drop=True)[:limit]
    predicted_values = predicted[:limit]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(actual_values, label='Actual', linestyle='-', marker='o', markersize=5, alpha=0.7, color='blue')
    ax.plot(predicted_values, label='Predicted', linestyle='--', marker='x', markersize=5, alpha=0.7, color='orange')
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Row index')
    ax.set_ylabel('Target value')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()


# Number of leading rows shown in the zoomed-in plots.
# NOTE(review): 24 * 7 * 4 is four weeks only if rows are hourly and in time order — confirm.
PREVIEW_ROWS = 24 * 7 * 4

train_prediction = multi_predict(x_whole, reg_models)
test_prediction = multi_predict(X_val_est_combined, reg_models)

# x_whole / y_whole hold observed AND estimated rows, so the titles say so
# (they previously read "Observed Data").
plot_actual_vs_predicted(
    y_whole, train_prediction,
    'Actual vs Predicted - All Data (Observed + Estimated)',
)
plot_actual_vs_predicted(
    y_whole, train_prediction,
    f'Actual vs Predicted - All Data (Observed + Estimated), first {PREVIEW_ROWS} rows',
    limit=PREVIEW_ROWS,
)

# Validation split of the estimated data.
plot_actual_vs_predicted(
    y_val_est_combined, test_prediction,
    'Actual vs Predicted - Estimated Data',
)
plot_actual_vs_predicted(
    y_val_est_combined, test_prediction,
    f'Actual vs Predicted - Estimated Data, first {PREVIEW_ROWS} rows',
    limit=PREVIEW_ROWS,
)

In [None]:
# Persist the list of fitted fold models to the working directory.
with open("catboost_models.pkl", "wb") as model_file:
    pickle.dump(reg_models, model_file)

In [5]:
# Read the pickled model list back from the working directory.
# NOTE: unpickling can execute arbitrary code — only load a file this notebook wrote.
with open("catboost_models.pkl", "rb") as model_file:
    loaded_reg_models = pickle.load(model_file)

FileNotFoundError: [Errno 2] No such file or directory: 'catboost_models.pkl'

In [None]:
# Averaged ensemble prediction for the preprocessed test features.
# NOTE(review): this uses the in-memory `reg_models`, not `loaded_reg_models`
# from the pickle cell — confirm which one is intended.
y_pred = multi_predict(x_values=x_test_whole, models=reg_models)


In [None]:
# Post-process the averaged test predictions.
# NOTE(review): the adjustments applied are defined in src.features.postprocess_data,
# which is not visible here — check that module for what is changed.

from src.features.postprocess_data import postprocess_data

# The helper receives both the test features and the raw predictions.
processed_y_pred = postprocess_data(x_test_whole, y_pred)

In [None]:
# Save the post-processed predictions (the model itself is not saved here).
from src.models.saving import save_predictions

# NOTE(review): 'catboost' is presumably a name/tag for the output — confirm in save_predictions.
save_predictions(processed_y_pred, 'catboost')

   id  prediction
0   0    0.000000
1   1    0.000000
2   2    0.093490
3   3   53.153196
4   4  308.907390
