# Ensemble from model outputs loaded from CSV

In [None]:
import numpy as np
import pandas as pd
import optuna
from sklearn.metrics import mean_squared_error

## Loading data

Each model's predictions should be stored in `'submissions/{model}_train.csv'` for training the weights and in `'submissions/{model}.csv'` for the final submission.

In [None]:
# Ground truth: keep only the join key and the target column `tow`.
true_values = pd.read_csv('data/challenge_set.csv').loc[:, ['flight_id', 'tow']]

# Names of the base models to blend; each name is used as a file-name stem
# under 'submissions/' and as the suffix of that model's prediction column.
# NOTE(review): 'xdgboost' looks like a typo for 'xgboost' — confirm against
# the actual file names on disk before changing it.
models = [
  'catboost',
  'xdgboost',
  'lightgbm',
  'mlp',
  'saint',
]

In [None]:
# Start from the ground truth and attach one prediction column per model.
combined_df = true_values.copy()
for model in models:
  train_preds = pd.read_csv(f'submissions/{model}_train.csv')
  train_preds = train_preds[['flight_id', 'tow']]
  # The incoming `tow` collides with the ground-truth `tow`, so pandas keeps
  # the left column unchanged and names the right one `tow_{model}`.
  combined_df = pd.merge(
    combined_df,
    train_preds,
    on='flight_id',
    suffixes=('', f'_{model}'),
  )

# Targets and the prediction matrix (one column per model, in `models` order).
prediction_columns = [f'tow_{model}' for model in models]
y_true = combined_df['tow'].values
predictions = combined_df[prediction_columns].values

## Finding the best weights

In [None]:
def objective(trial):
  """Return the RMSE of the weighted blend proposed by one Optuna trial.

  One raw weight in [0, 1] is sampled per entry of `models` (parameter name
  `weight_{model}`); the weights are rescaled to sum to 1 and applied to the
  columns of `predictions`, and the result is scored against `y_true`.

  Args:
    trial: The `optuna` trial used to sample the raw weights.

  Returns:
    The root mean squared error of the blended predictions, as a float.

  Raises:
    optuna.TrialPruned: If every sampled weight is zero, since such a draw
      cannot be normalised.
  """
  raw_weights = np.array(
    [trial.suggest_float(f'weight_{model}', 0.0, 1.0) for model in models]
  )
  total = raw_weights.sum()
  if total == 0.0:
    # Dividing by zero would yield NaN predictions and make the metric raise,
    # aborting the whole study; skip this single trial instead.
    raise optuna.TrialPruned()
  weights = raw_weights / total
  ensemble_preds = np.dot(predictions, weights)
  # Take the square root explicitly: the `squared=False` keyword is deprecated
  # in recent scikit-learn releases and later removed.
  rmse = float(np.sqrt(mean_squared_error(y_true, ensemble_preds)))
  return rmse

# Search for the raw blend weights that minimise the value returned by
# `objective`, using a budget of 100 trials.
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=100)

In [None]:
# Collect the best trial's raw weights in `models` order and rescale them so
# they sum to 1, mirroring the normalisation applied inside `objective`.
best_weights = study.best_params
weights = np.array([best_weights[f'weight_{model}'] for model in models])
weights = weights / weights.sum()
weights

## Final Ensemble Submission

In [None]:
# Build the final blend from each model's submission file.
# Every file is read exactly once and its `tow` column is renamed up front, so
# the per-model column names do not depend on a merge suffix collision (the
# previous version read the first model's file twice and merged it with itself).
submission_frames = [
  pd.read_csv(f'submissions/{model}.csv')[['flight_id', 'tow']]
    .rename(columns={'tow': f'tow_{model}'})
  for model in models
]

submission_combined_df = submission_frames[0]
for model_df in submission_frames[1:]:
  # Default inner join: only flights present in every file are kept.
  submission_combined_df = submission_combined_df.merge(model_df, on='flight_id')

# Columns are selected in `models` order, the same order used to build `weights`.
submission_predictions = submission_combined_df[[f'tow_{model}' for model in models]].values
ensemble_submission_preds = np.dot(submission_predictions, weights)
submission_combined_df['tow'] = ensemble_submission_preds

In [None]:
# Write only the join key and the blended prediction, without the index column.
ensemble_output = submission_combined_df[['flight_id', 'tow']]
ensemble_output.to_csv('submissions/ensemble.csv', index=False)