# Ensemble of model outputs loaded from CSV

In [1]:
# Uncomment to install the solver backend into the kernel's environment:
# %pip install "qpsolvers[proxqp]"

In [2]:
import qpsolvers
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## Loading data

Each model's predictions should be saved to `'submissions/{model}_train.csv'` (used for fitting the ensemble weights) and to `'submissions/{model}.csv'` (used for the final submission).

In [5]:
# Base models whose CSV outputs are blended; 'mlp' and 'saint' are currently left out.
models = ['catboost', 'xgboost', 'lightgbm']

# Keep only the flight identifier and the target column from the challenge set.
true_values = pd.read_csv('data/challenge_set.csv').loc[:, ['flight_id', 'tow']]

In [20]:
# Hold out 20% of the targets with a fixed seed; only the held-out part is used
# to fit the ensemble weights. The result is indexed rather than unpacked into
# `_`, because IPython uses `_` for the last cell output.
y_true = train_test_split(true_values['tow'].values, test_size=0.2, random_state=42)[1]

# One column of predictions per model, in the same order as `models`.
# NOTE(review): assumes each '{model}_train.csv' holds predictions for exactly
# these held-out rows, in the same order — confirm against the training notebooks.
predictions = np.column_stack([
  pd.read_csv(f'submissions/{model}_train.csv')['tow'].to_numpy()
  for model in models
])

# Fail early with a readable message instead of a shape error inside the solver.
if predictions.shape[0] != y_true.shape[0]:
  raise ValueError(
    f'Prediction files have {predictions.shape[0]} rows but the hold-out split has {y_true.shape[0]}'
  )

## Finding the best weights

In [66]:
# Bounded least squares: fit one weight per model so that the weighted sum of
# the prediction columns approximates y_true, with every weight kept in [0, 1].
# The weights are not constrained to sum to one.
weights = qpsolvers.solve_ls(
  predictions,
  y_true,
  lb=np.zeros(len(models)),
  ub=np.ones(len(models)),
  solver='proxqp',
)

# solve_ls returns None when the solver finds no solution; stop here rather
# than failing later in np.dot with an unrelated-looking error.
if weights is None:
  raise RuntimeError('proxqp did not return a solution for the ensemble weights')

weights

array([8.65886309e-01, 1.34329641e-01, 4.77878426e-22])

## Final Ensemble Submission

In [67]:
# Read every model's submission once and rename its prediction column to
# 'tow_<model>' so the per-model columns can sit side by side.
model_frames = [
  pd.read_csv(f'submissions/{model}.csv')[['flight_id', 'tow']].rename(columns={'tow': f'tow_{model}'})
  for model in models
]

submission_combined_df = model_frames[0]
for model, model_df in zip(models[1:], model_frames[1:]):
  # validate= makes pandas raise on duplicated flight_ids instead of multiplying rows.
  merged_df = submission_combined_df.merge(model_df, on='flight_id', validate='one_to_one')
  # An inner merge silently drops flights missing from either side; refuse to
  # build a partial submission.
  if not (len(merged_df) == len(submission_combined_df) == len(model_df)):
    raise ValueError(f'submissions/{model}.csv does not cover the same flight_ids as the other models')
  submission_combined_df = merged_df

# Blend the per-model columns (ordered like `models`, hence like `weights`).
submission_predictions = submission_combined_df[[f'tow_{model}' for model in models]].values
ensemble_submission_preds = np.dot(submission_predictions, weights)
submission_combined_df['tow'] = ensemble_submission_preds

In [68]:
# Write only the identifier and the blended prediction; the per-model columns
# are left out of the submission file.
submission_combined_df.to_csv(
  'submissions/ensemble.csv',
  columns=['flight_id', 'tow'],
  index=False,
)