In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import PredictionErrorDisplay

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [3]:
# Import data
obs_A = pd.read_parquet('../../preprocessing/data/obs_A.parquet')
obs_B = pd.read_parquet('../../preprocessing/data/obs_B.parquet')
obs_C = pd.read_parquet('../../preprocessing/data/obs_C.parquet')
est_A = pd.read_parquet('../../preprocessing/data/est_A.parquet')
est_B = pd.read_parquet('../../preprocessing/data/est_B.parquet')
est_C = pd.read_parquet('../../preprocessing/data/est_C.parquet')
test_A = pd.read_parquet('../../preprocessing/data/test_A.parquet')
test_B = pd.read_parquet('../../preprocessing/data/test_B.parquet')
test_C = pd.read_parquet('../../preprocessing/data/test_C.parquet')

In [4]:
# Concatinate
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Step 1: Filter A to include only months from March to October
A = A[A['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]

# Step 2: Identify unique days within May, June, and July
summer_months = A[A['date_forecast'].dt.month.isin([5, 6, 7])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()

# Step 3: Sample these days for val_A
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.2), replace=False)
val_A = A[A['date_forecast'].dt.date.isin(sampled_days)]

# Step 4: Define train_A as the remaining data
train_A = A[~A['date_forecast'].dt.date.isin(sampled_days)]

# Prepare your features and target variables
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_val_A = val_A.drop(columns='pv_measurement')
y_val_A = val_A['pv_measurement']

# Repeat for B and C
B = B[B['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = B[B['date_forecast'].dt.month.isin([5, 6, 7])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.2), replace=False)
val_B = B[B['date_forecast'].dt.date.isin(sampled_days)]
train_B = B[~B['date_forecast'].dt.date.isin(sampled_days)]
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_val_B = val_B.drop(columns='pv_measurement')
y_val_B = val_B['pv_measurement']

C = C[C['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = C[C['date_forecast'].dt.month.isin([5, 6, 7])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.2), replace=False)
val_C = C[C['date_forecast'].dt.date.isin(sampled_days)]
train_C = C[~C['date_forecast'].dt.date.isin(sampled_days)]
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_val_C = val_C.drop(columns='pv_measurement')
y_val_C = val_C['pv_measurement']


In [5]:
train_A = TabularDataset(train_A)
train_B = TabularDataset(train_B)
train_C = TabularDataset(train_C)
val_A = TabularDataset(val_A)
val_B = TabularDataset(val_B)
val_C = TabularDataset(val_C)
test_A = TabularDataset(test_A)
test_B = TabularDataset(test_B)
test_C = TabularDataset(test_C)

In [6]:
label = 'pv_measurement'

In [7]:
# Fit the predictors
predictor_A = TabularPredictor(label=label, eval_metric='mean_absolute_error').fit(A, presets='good_quality', tuning_data=val_A, use_bag_holdout=True, ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'})

No path specified. Models will be saved in: "AutogluonModels/ag-20231111_212041/"
Presets specified: ['good_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231111_212041/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 21.6.0: Wed Aug 10 14:28:23 PDT 2022; root:xnu-8020.141.5~2/RELEASE_ARM64_T6000
Disk Space Avail:   53.26 GB / 494.38 GB (10.8%)
Train Data Rows:    34085
Train Data Columns: 16
Tuning Data Rows:    1608
Tuning Data Columns: 16
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 630.59471, 1165.90242)
	If 'regression' is not the correct problem_type

KeyboardInterrupt: 

In [None]:
predictor_B = TabularPredictor(label=label, eval_metric='mean_absolute_error').fit(B, presets='medium_quality', tuning_data=val_B, use_bag_holdout=True, ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'})

In [None]:
predictor_C = TabularPredictor(label=label, eval_metric='mean_absolute_error').fit(C, presets='medium_quality', tuning_data=val_C, use_bag_holdout=True, ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'})

In [None]:
# predictor_A.evaluate(test_A, silent=True)
# predictor_B.evaluate(B, silent=True)
# predictor_C.evaluate(C, silent=True)

In [None]:
# predictor_A.leaderboard(test_A, silent=False)
# predictor_B.leaderboard(B, silent=True)
# predictor_C.leaderboard(C, silent=True)

predictor_A.plot_ensemble_model()
predictor_B.plot_ensemble_model()
predictor_C.plot_ensemble_model()

In [None]:
# Loading best predictors
# predictor_A = TabularPredictor.load("AutogluonModels/ag-20231106_113214/")
# predictor_B = TabularPredictor.load("AutogluonModels/ag-20231106_114820/")
# predictor_C = TabularPredictor.load("AutogluonModels/ag-20231106_120433/")

In [None]:
# test_A = TabularDataset(test_A)
# test_B = TabularDataset(test_B)
# test_C = TabularDataset(test_C)

# test_A.describe()

In [None]:
# Create submission

output_file = 'autogluon_submission_nice_mais.csv'

pred_A = predictor_A.predict(test_A)
pred_B = predictor_B.predict(test_B)
pred_C = predictor_C.predict(test_C)

pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Create an id array
ids = np.arange(0, len(predictions))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions
})

# Save to CSV
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

In [None]:
# For testing
pred_A = predictor_A.predict(val_A.drop(columns='pv_measurement'))
pred_B = predictor_B.predict(val_B.drop(columns='pv_measurement'))
pred_C = predictor_C.predict(val_C.drop(columns='pv_measurement'))

# Evaluate 
mae_A = mean_absolute_error(val_A['pv_measurement'], pred_A)
mae_B = mean_absolute_error(val_B['pv_measurement'], pred_B)
mae_C = mean_absolute_error(val_C['pv_measurement'], pred_C)

mae = mean_absolute_error(np.concatenate([val_A['pv_measurement'], val_B['pv_measurement'], val_C['pv_measurement']]), np.concatenate([pred_A, pred_B, pred_C]))


print(f"MAE A: {mae_A}")
print(f"MAE B: {mae_B}")
print(f"MAE C: {mae_C}")
print(f"MAE: {mae}")

# Best score features
# MAE A: 329.7136452557737
# MAE B: 67.42051219372165
# MAE C: 46.51223822089323
# MAE: 180.95353998757088

# All features
# MAE A: 290.2708872310664
# MAE B: 65.34583249980328
# MAE C: 42.413845591266046
# MAE: 175.11389348063562

In [None]:
pred_A = predictor_A.predict(val_A.drop(columns='pv_measurement'))
y_A = val_A['pv_measurement']
fig, ax = plt.subplots()
PredictionErrorDisplay.from_predictions(y_A, pred_A, ax=ax)
ax.plot([y_A.min(), y_A.max()], [y_A.min(), y_A.max()], '--r', linewidth=2)
ax.set_title('Prediction Error Plot')
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
plt.show()

In [None]:
# Assume your peak is around observation number 0 to 1000
start_observation = 0
end_observation = 500

# Slice the DataFrame to include only the range of interest
peak_data = val_A[start_observation:end_observation]

# Slice the predictions in the same way if they're in a separate DataFrame
peak_predictions = predictor_A.predict(val_A.drop(columns='pv_measurement'))[start_observation:end_observation]

# Now plot only that slice
plt.figure(figsize=(10, 6))
plt.plot(peak_data['pv_measurement'], label='True Values', linewidth=0.5)
plt.plot(peak_predictions, label='Predictions', linewidth=0.5)
plt.xlabel('Observation Number')
plt.ylabel('Target Value')
plt.title('True Values vs Predictions for Peak')
plt.legend()
plt.show()
