In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

from src.data.data_fetcher import get_all_features, get_raw_data
from src.features.feature_engineering import prepare_data, get_location_datasets
from src.features.preprocess_data import get_preprocessed_test_data, fetch_preprocessed_data, fetch_preprocessed_uniform_data
pd.set_option('display.max_columns', 200)


from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')


%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\gunna\Documents\Maskinlæring\Prosjekt\power-predictor\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [35]:
train_a, train_b, train_c, X_train_estimated_a, X_train_estimated_b, X_train_estimated_c, X_train_observed_a, X_train_observed_b, X_train_observed_c, X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = get_raw_data()

X_train_obs_combined, X_val_obs_combined, y_train_obs_combined, y_val_obs_combined, X_train_est_combined, X_val_est_combined, y_train_est_combined, y_val_est_combined = fetch_preprocessed_data()
X_test_estimated_a_processed, X_test_estimated_b_processed, X_test_estimated_c_processed = get_preprocessed_test_data()

x_whole = pd.concat([X_train_obs_combined, X_val_obs_combined])
y_whole = pd.concat([y_train_obs_combined, y_val_obs_combined])
x_whole.reset_index(drop=True, inplace=True)
y_whole.reset_index(drop=True, inplace=True)

x_whole["pv_measurement"] = y_whole
x_a_train, x_b_train, x_c_train, y_a_train, y_b_train, y_c_train = get_location_datasets(x_whole)

In [36]:
x_whole = pd.concat([X_train_est_combined, X_val_est_combined])
y_whole = pd.concat([y_train_est_combined, y_val_est_combined])
x_whole.reset_index(drop=True, inplace=True)
y_whole.reset_index(drop=True, inplace=True)

x_whole["pv_measurement"] = y_whole
x_a_test, x_b_test, x_c_test, y_a_test, y_b_test, y_c_test = get_location_datasets(x_whole)


In [37]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Store training and testing data for each location in dictionaries
x_train_data = {'a': x_a_train, 'b': x_b_train, 'c': x_c_train}
y_train_data = {'a': y_a_train, 'b': y_b_train, 'c': y_c_train}

x_test_data = {'a': x_a_test, 'b': x_b_test, 'c': x_c_test}
y_test_data = {'a': y_a_test, 'b': y_b_test, 'c': y_c_test}

all_reg_models = []  # List to store regression models for all locations

# Function to drop 'pv_measurement' column if it exists
def drop_pv_measurement(dataframe):
    if 'pv_measurement' in dataframe.columns:
        return dataframe.drop('pv_measurement', axis=1)
    return dataframe

# Loop through locations
for loc in ['a', 'b', 'c']:
    reg_models = []
    total_mae = 0
    
    # Get train and test data for the current location
    x = drop_pv_measurement(x_train_data[loc])  # Drop 'pv_measurement' column
    y_whole = y_train_data[loc]
    
    for train_index, val_index in kf.split(x):
        reg = xgb.XGBRegressor(n_estimators=1000000,
                               early_stopping_rounds=50,
                               learning_rate=0.001,
                               objective="reg:linear",
                               eval_metric="mae",
                               sub_sample=0.9,
                               colsample_bytree=1.0,
                               gamma=0,
                               min_child_weight=0,
                               max_depth=9)
        
        X_train, X_val = x.iloc[train_index], x.iloc[val_index]
        y_train, y_val = y_whole.iloc[train_index], y_whole.iloc[val_index]
        
        # Drop 'pv_measurement' column from the test data too before fitting
        reg.fit(X_train, y_train,
                eval_set=[(drop_pv_measurement(x_test_data[loc]), y_test_data[loc])],
                verbose=100)
        
        reg_models.append(reg)
        predictions = reg.predict(X_val)
        
        mae = mean_absolute_error(y_val, predictions)
        total_mae += mae
        
        print(f"Location {loc}, Fold {len(reg_models)}, Mean Absolute Error: {mae}")

    average_mae = total_mae / num_folds
    print(f"Location {loc}, Average Mean Absolute Error: {average_mae}")
    
    all_reg_models.append(reg_models)  # Add the models for the current location to the main list


[0]	validation_0-mae:748.15645
[9]	validation_0-mae:742.11812
Location a, Fold 1, Mean Absolute Error: 846.5560624501098
[0]	validation_0-mae:746.58211
[9]	validation_0-mae:740.66160
Location a, Fold 2, Mean Absolute Error: 854.689364268268
[0]	validation_0-mae:741.83699
[9]	validation_0-mae:735.89686
Location a, Fold 3, Mean Absolute Error: 873.165303901145
[0]	validation_0-mae:744.41961
[9]	validation_0-mae:738.41715
Location a, Fold 4, Mean Absolute Error: 866.5761249822966
[0]	validation_0-mae:742.84472
[9]	validation_0-mae:736.87239
Location a, Fold 5, Mean Absolute Error: 857.5100863475576
Location a, Average Mean Absolute Error: 859.6993883898755
[0]	validation_0-mae:114.39475
[9]	validation_0-mae:113.42786
Location b, Fold 1, Mean Absolute Error: 142.2927025999762
[0]	validation_0-mae:114.96853
[9]	validation_0-mae:114.00939
Location b, Fold 2, Mean Absolute Error: 141.58362910150177
[0]	validation_0-mae:115.17350
[9]	validation_0-mae:114.22302
Location b, Fold 3, Mean Absolute

In [38]:
def multi_predict(x_values :pd.DataFrame, models) -> pd.DataFrame:
    """
    Function for predicting on multiple models and averaging the results
    """
    results = models[0].predict(x_values)
    for model in models[1:]:
        model: xgb.XGBRegressor
        prediction = model.predict(x_values)
        results += prediction
    
    results = results / len(models)

    return results

In [39]:
locations = ["location_a", "location_b", "location_c"]

X_test_estimated_a_processed = X_test_estimated_a_processed.drop(locations, axis=1)
X_test_estimated_b_processed = X_test_estimated_b_processed.drop(locations, axis=1)
X_test_estimated_c_processed = X_test_estimated_c_processed.drop(locations, axis=1)



In [40]:
X_test_estimated_c_processed.columns

Index(['absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
       'clear_sky_energy_1h:J', 'clear_sky_rad:W', 'cloud_base_agl:m',
       'dew_or_rime:idx', 'dew_point_2m:K', 'diffuse_rad:W',
       'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'is_day:idx', 'is_in_shadow:idx',
       'precip_5min:mm', 'precip_type_5min:idx', 'pressure_50m:hPa',
       'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
       'sun_azimuth:d', 'super_cooled_liquid_water:kgm2', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'wind_speed_w_1000hPa:ms',
       'sin_day_of_year', 'cos_day_of_year', 'sin_hour', 'cos_hour',
       'sun_product', 'modified_solar_elevation', 'effective_radiation',
       'cloud_ratio', 'cloud_cover_over_30%', 'sun_addition', 'is_freezing',
       'is_snow', 'is_rain'],
      dtype='object')

In [41]:
# Predict on the cleaned validation set
from src.features.preprocess_data import get_final_prediction

y_val_pred_est_a = multi_predict(X_test_estimated_a_processed, all_reg_models[0])
y_val_pred_est_b = multi_predict(X_test_estimated_b_processed, all_reg_models[1])
y_val_pred_est_c = multi_predict(X_test_estimated_c_processed, all_reg_models[2])

# Combine the predictions into a single array
y_pred_test_est_combined = get_final_prediction(y_val_pred_est_a, y_val_pred_est_b, y_val_pred_est_c)
y_pred_test_est_combined.shape

y_pred_test_est_combined.describe()

Unnamed: 0,id,time,prediction
count,2160.0,2160,2160.0
mean,1079.5,2023-06-05 21:53:59.999999744,290.409668
min,0.0,2023-05-01 00:00:00,84.642372
25%,539.75,2023-05-23 11:45:00,88.05014
50%,1079.5,2023-06-03 23:30:00,103.563084
75%,1619.25,2023-06-19 11:15:00,668.267517
max,2159.0,2023-07-03 23:00:00,713.925903
std,623.682612,,275.720642


In [42]:
from src.models.saving import save_predictions


save_predictions(y_pred_test_est_combined, '3 models test')


y_pred_test_est_combined.describe()
print(y_pred_test_est_combined[y_pred_test_est_combined["prediction"] < 0])

   id  prediction
0   0  667.778809
1   1  667.778809
2   2  667.778809
3   3  667.778809
4   4  670.068054
Empty DataFrame
Columns: [id, time, location, prediction]
Index: []
