In [1]:
# Import the necessary libraries
%matplotlib inline
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score as acs_score

%pip install mljar-supervised
from supervised.automl import AutoML

from src.data.data_fetcher import get_raw_data
from src.features.preprocess_data import get_preprocessed_test_data, fetch_preprocessed_data, get_final_prediction
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw frames and the preprocessed train / validation / test splits.
(
    train_a, train_b, train_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = get_raw_data()

(
    X_train_obs_combined, X_val_obs_combined,
    y_train_obs_combined, y_val_obs_combined,
    X_train_est_combined, X_val_est_combined,
    y_train_est_combined, y_val_est_combined,
) = fetch_preprocessed_data()

(
    X_test_estimated_a_processed,
    X_test_estimated_b_processed,
    X_test_estimated_c_processed,
) = get_preprocessed_test_data()

# Stack the `obs` train and validation splits into one training set and give
# it a fresh 0..n-1 index; the `est` splits are left untouched.
x_whole = pd.concat([X_train_obs_combined, X_val_obs_combined]).reset_index(drop=True)
y_whole = pd.concat([y_train_obs_combined, y_val_obs_combined]).reset_index(drop=True)

# Quick visual check of the stacked feature frame.
x_whole.head()

In [None]:
# Configure the mljar AutoML regressor and train it on the stacked `obs` data.
RANDOM_SEED = 42

# Shuffled, non-stratified 5-fold cross-validation with a fixed seed.
cv_strategy = {
    "validation_type": "kfold",
    "k_folds": 5,
    "shuffle": True,
    "stratify": False,
    "random_seed": RANDOM_SEED,
}

# NOTE(review): mljar documents `total_time_limit` in seconds, so a value of 1
# leaves almost no training budget — confirm this is not a leftover
# smoke-test setting before relying on the resulting model.
automl_settings = {
    "mode": 'Compete',
    "total_time_limit": 1,
    "ml_task": 'regression',
    "eval_metric": 'mae',
    "validation_strategy": cv_strategy,
    "random_state": RANDOM_SEED,
    "n_jobs": -1,
    "golden_features": True,
}
automl = AutoML(**automl_settings)

automl.fit(x_whole, y_whole)

In [None]:

# Score the fitted AutoML model on the `est` validation split. This split is
# not part of `x_whole` (which stacks only the `obs` train/validation splits),
# so it acts as a hold-out set here.
test_prediction = automl.predict(X_val_est_combined)

mae_observed = mean_absolute_error(
    y_true=y_val_est_combined,
    y_pred=test_prediction,
)

print(f'Mean Absolute Error on Validation Set: {mae_observed}')

In [None]:
import matplotlib.pyplot as plt

# Compare the targets of the `est` validation split with the model's
# predictions for that same split. The previous title labelled this figure as
# "Observed Data", which did not match the series being plotted.
fig, ax = plt.subplots(figsize=(12, 6))

# Actual values, re-indexed to 0..n-1 so they line up positionally with the
# prediction array.
ax.plot(
    y_val_est_combined.reset_index(drop=True),
    label='Actual',
    linestyle='-',
    marker='o',
    markersize=5,
    alpha=0.7,
    color='blue',
)
# Predicted values for the same rows.
ax.plot(
    test_prediction,
    label='Predicted',
    linestyle='--',
    marker='x',
    markersize=5,
    alpha=0.7,
    color='orange',
)

# Title and axis labels so the figure stands on its own when skimmed.
ax.set_title('Actual vs Predicted - Estimated Validation Data', fontsize=16)
ax.set_xlabel('Sample index')
ax.set_ylabel('Target value')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Predict on each of the three preprocessed test sets (a, b, c), in that order.
y_pred_a, y_pred_b, y_pred_c = (
    automl.predict(test_features)
    for test_features in (
        X_test_estimated_a_processed,
        X_test_estimated_b_processed,
        X_test_estimated_c_processed,
    )
)

# Merge the three per-set predictions into one result via the project helper.
y_pred = get_final_prediction(y_pred_a, y_pred_b, y_pred_c)


In [None]:
# Persist the combined test predictions (not the fitted model) through the
# project's saving helper.
from src.models.saving import save_predictions

# NOTE(review): the label ends with a trailing space — confirm that is
# intended, since it is passed to the helper verbatim.
run_label = 'AutoML mljar Supervised '
save_predictions(y_pred, run_label)