In [None]:
from IPython.display import Markdown as md
from IPython.display import HTML

# Title and public URL of this notebook; both are interpolated into the HTML banners below.
NOTEBOOK_NAME = '🌪 Ensembling and rounding techniques comparison'
NOTEBOOK_URL = 'https://www.kaggle.com/fergusfindley/ensembling-and-rounding-techniques-comparison'

# Render an info banner that links to the notebook itself and to its comments page
# (NOTEBOOK_URL with "/comments" appended).
HTML(f'''<div class="alert alert-block alert-info">
If you find <a href={NOTEBOOK_URL}>this notebook</a> useful or you just like it, please upvote ▲.<br>
If you are using any part of this notebook, please link to <a href={NOTEBOOK_URL}>{NOTEBOOK_NAME}</a> notebook.<br>
In case of any question/feedback don't hesitate to <a href={NOTEBOOK_URL}/comments>comment</a> below.
</div>''')

<hr>

In [None]:
# Render the notebook title (taken from NOTEBOOK_NAME), the TL;DR line and a link to the
# author's previous notebook as centred HTML headings.
HTML(f'''<h1><center>{NOTEBOOK_NAME}</center></h1>
<h2><center><b>TL;DR</b> One can improve the final result by using fit-based weighted averaging with some geometric rounding on top of it.</center></h2>
<center>
<h3><span style="color:#20BEFF;">Please check my previous notebook <a href="https://www.kaggle.com/fergusfindley/tpsjan22-eda-baseline-train-submission">EDA,baseline,train&submission</a></span></h3></center>''')

<a id="references"></a>
<h1 id="references" class="list-group-item list-group-item-action active" data-toggle="list" style='background:#20BEFF; border:0; color:white' role="tab" aria-controls="home"><center>References</center></h1>
    
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a>

> * [Wikipedia Ensemble_learning](https://en.wikipedia.org/wiki/Ensemble_learning)
> * [Model averaging](https://www.mm218.dev/posts/2021/01/model-averaging/)
> * [Blending Ensemble Machine Learning With Python](https://machinelearningmastery.com/blending-ensemble-machine-learning-with-python/)
> * [Ensemble Learning: 5 Main Approaches](https://www.kdnuggets.com/2019/01/ensemble-learning-5-main-approaches.html)

<a id="toc"></a>
<div class="list-group" id="list-tab" role="tablist">
<h1 id="toc" class="list-group-item list-group-item-action active" data-toggle="list" style='background:#20BEFF; border:0; color:white' role="tab" aria-controls="home"><center>Table of contents</center></h1>

0. [References](#references) 🎓
1. [Libraries](#libraries) 📚
1. [Load Datasets](#load-datasets) 🧱
1. [Ensemble Techniques](#ensamble-techniques) 🌪
    1. [Simple mean (Equal Weights)](#simple-mean)
    1. [Fit-Based Weights](#fit-based)
    1. [Fit-Based Weights Rounded](#fit-based-rounded)
    1. [Fit-Based Weights Geo-Rounded](#fit-based-geo-rounded)
    1. [Final comparison](#final-comparison)
1. [Submission](#submission) 📝

<a id="libraries"></a>
<div class="list-group" id="list-tab" role="tablist">
<h1 id="libraries" class="list-group-item list-group-item-action active" data-toggle="list" style='background:#20BEFF; border:0; color:white' role="tab" aria-controls="home"><center>Libraries</center></h1>
    
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a>

In [None]:
import numpy as np
import pandas as pd
from kaggle_colors_util import *

In [None]:
# Seed value kept in one place for any step that needs a random state.
RANDOM_STATE = 42

# Competition input directory and the CSV files located inside it.
DIRECTORY_PATH = "../input/tabular-playground-series-jan-2022"
TRAIN_CSV = f"{DIRECTORY_PATH}/train.csv"
TEST_CSV = f"{DIRECTORY_PATH}/test.csv"
SUBMISSION_CSV = f"{DIRECTORY_PATH}/sample_submission.csv"

# Column names referenced by the cells below.
ID, TARGET, DATE = 'row_id', 'num_sold', 'date'

In [None]:
# Table-style rule: change the background of the cell under the mouse pointer
# (for row hover use <tr> instead of <td> in the selector).
cell_hover = dict(
    selector='td:hover',
    props=[('background-color', COLOR_BLUE_LIGHT)],
)
# Table-style rule: colour every header cell except the one holding the index name.
headers = dict(
    selector='th:not(.index_name)',
    props=f'background-color: {COLOR_BLUE}; color: white;',
)

<a id="load-datasets"></a>
<div class="list-group" id="list-tab" role="tablist">
<h1 id="load-datasets" class="list-group-item list-group-item-action active" data-toggle="list" style='background:#20BEFF; border:0; color:white' role="tab" aria-controls="home"><center>Load Datasets</center></h1>
    
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a>

In [None]:
# Train and test are read with identical options: the DATE column is parsed as datetime.
train_df, test_df = (
    pd.read_csv(csv_path, parse_dates=[DATE])
    for csv_path in (TRAIN_CSV, TEST_CSV)
)
submission_df = pd.read_csv(SUBMISSION_CSV)

# Identifier column of the test set, reused later when the submission files are written.
test_ids = test_df[ID]

In [None]:
# Load the prediction files of several public notebooks. Each variable is named after the
# leaderboard score the file is paired with further down (lb_417724 <-> 4.17724).
lb_417724 = pd.read_csv("../input/tps-jan2022-best-predictions/submission-Scaling is better than blending-417724.csv")
lb_424305 = pd.read_csv("../input/tps-jan2022-best-predictions/submission-TPS Jan 2022 Automated Ensembling-424305.csv")
lb_433625 = pd.read_csv("../input/tps-jan2022-best-predictions/submission-TPS2201_Hybrid_Time_Series-433625.csv")
lb_435293 = pd.read_csv("../input/tps-jan2022-best-predictions/submission-TPSJAN22-03 Linear Model-435293.csv")
# NOTE(review): this is the only file name that spells the score with a dot ("4.38188") --
# confirm it matches the actual file in the dataset.
lb_438188 = pd.read_csv("../input/tps-jan2022-best-predictions/submission-TPSJAN22-06 LightGBM Quickstart-4.38188.csv")
lb_452597 = pd.read_csv("../input/tps-jan2022-best-predictions/submission-EnsembleModel with addon GDP  Amazons Sales Data-452597.csv")
lb_454527 = pd.read_csv("../input/tps-jan2022-best-predictions/submission-Time Series Transformer Infer-454527.csv")

<a id="ensamble-techniques"></a>
<div class="list-group" id="list-tab" role="tablist">
<h1 id="ensamble-techniques" class="list-group-item list-group-item-action active" data-toggle="list" style='background:#20BEFF; border:0; color:white' role="tab" aria-controls="home"><center>Ensemble techniques</center></h1>
    
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a>

| link                                                                                     | model type| Leaderboard Score| 
|------------------------------------------------------------------------------------------|-----------|------------------|
| https://www.kaggle.com/ambrosm/scaling-is-better-than-blending                           |	ensemble | 4.17724|
| https://www.kaggle.com/sytuannguyen/tps-jan-2022-automated-ensembling                    |	ensemble | 4.24305|
| https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series                           |	Linear Regression | 4.33625|
| https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model                                  |	Linear Regression | 4.35293|
| https://www.kaggle.com/ambrosm/tpsjan22-06-lightgbm-quickstart                           |	LightGBM | 4.38188|
| https://www.kaggle.com/anirudhyadav9784/ensemblemodel-with-addon-gdp-amazon-s-sales-data |	CatBoost | 4.52597|
| https://www.kaggle.com/yamqwe/time-series-transformer-infer                              |	DNN      | 4.54527|

In [None]:
# Map each public leaderboard score to the TARGET column of the matching submission.
# Insertion order (best score first) is relied upon by the cells that follow.
score_and_submission_dict = {
    leaderboard_score: submission[TARGET]
    for leaderboard_score, submission in (
        (4.17724, lb_417724),
        (4.24305, lb_424305),
        (4.33625, lb_433625),
        (4.35293, lb_435293),
        (4.38188, lb_438188),
        (4.52597, lb_452597),
        (4.54527, lb_454527),
    )
}

<a id="simple-mean"></a>
## **<span style="color:#58355E;">Simple mean (Equal Weights)</span>**
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a> 

In [None]:
# Stack all submissions into one 2-D array and average across submissions (axis 0),
# giving every submission the same weight.
simple_mean_pred = np.array(list(score_and_submission_dict.values())).mean(axis=0)

<a id="fit-based"></a>
## **<span style="color:#58355E;">Fit-Based Weights</span>**
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a> 

In [None]:
# Leaderboard scores (dict keys) and the stacked predictions (dict values), in matching order.
scores = np.array(list(score_and_submission_dict))
submissions = np.array(list(score_and_submission_dict.values()))

In [None]:
# Weight each submission by the inverse of its leaderboard score (lower score -> larger weight)
# and normalise so that the weights sum up to 1. np.sum reduces the array natively instead of
# iterating over it element by element with Python's builtin sum().
weights_fit_based = (1 / scores) / np.sum(1 / scores)

In [None]:
display(md('We can see that the first model with lowest SMAPE has the highest weight - the biggest contribution to the final solution'))

# One row per submission: its leaderboard score next to the weight derived from it.
weights_table = pd.DataFrame(zip(scores, weights_fit_based), columns=['Leaderboard Score', 'Weights fit-based']).style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0; use its
# replacement Styler.hide(axis='index') when it exists and fall back on older versions.
weights_table = weights_table.hide(axis='index') if hasattr(weights_table, 'hide') else weights_table.hide_index()
display(weights_table.set_table_styles([cell_hover, headers]))

In [None]:
# Weighted average across submissions (axis 0) using the score-derived weights.
fit_based_pred = np.average(a=submissions, axis=0, weights=weights_fit_based)

<a id="fit-based-rounded"></a>
## **<span style="color:#58355E;">Fit-Based Weights Rounded</span>**
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a> 

In [None]:
# Plain rounding of the weighted average to zero decimals.
fit_based_pred_rounded = np.round(fit_based_pred)

<a id="fit-based-geo-rounded"></a>
## **<span style="color:#58355E;">Fit-Based Weights Geo-Rounded</span>**
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a> 

In [None]:
from math import ceil, floor, sqrt

def geometric_round(arr):
    """Round values using the geometric mean of the neighbouring integers as threshold.

    A value ``x`` becomes ``floor(x)`` when ``x < sqrt(floor(x) * ceil(x))`` and
    ``ceil(x)`` otherwise. Consequences of that rule:

    * whole numbers are returned unchanged;
    * values strictly between 0 and 1 are rounded up to 1 (the threshold is 0);
    * negative values are always rounded down, because the threshold is never negative;
    * NaN stays NaN.

    Parameters
    ----------
    arr : array_like
        Values to round.

    Returns
    -------
    numpy.ndarray
        The rounded values, with the same shape as the input.
    """
    values = np.asarray(arr)
    lower = np.floor(values)
    upper = np.ceil(values)
    # The threshold is computed once and applied in a single pass; values below it go
    # down, everything else (including values exactly on the threshold) goes up.
    threshold = np.sqrt(lower * upper)
    return np.where(values < threshold, lower, upper)

In [None]:
# Apply the geometric-mean rounding rule to the weighted average.
fit_based_pred_geo_rounded = geometric_round(arr=fit_based_pred)

Both `round()` and `geometric_round()` give almost identical results...

In [None]:
# Element-wise agreement between geometric rounding and plain rounding.
same_value_mask = fit_based_pred_geo_rounded == fit_based_pred_rounded
geo_vs_simple_round = pd.DataFrame(same_value_mask).value_counts()  # not much different...

# Count directly on the boolean mask instead of indexing the value counts with the integer
# keys 1/0: that lookup depends on 1/0 being resolved as True/False and raises when one of
# the two outcomes does not occur at all.
n_same = int(np.count_nonzero(same_value_mask))
n_different = int(np.size(same_value_mask)) - n_same

display(md(f"...there are {n_same} records with the same value as using simple `round()` and **only {n_different}**, where **values differ**"))

so as [@siukeitin](https://www.kaggle.com/siukeitin) mentioned in his [sensitivity analysis discussion](https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/299162) 
>  `np.round`, which uses arithmetic mean instead of geometric mean, is a good approximation.

<a id="final-comparison"></a>
## **<span style="color:#58355E;">Final comparison</span>**
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a> 

In [None]:
# The dictionary is keyed by leaderboard score and lower is better, so the best public
# result is the smallest key.
best_public_score = min(score_and_submission_dict.keys())

# The relative gain of the best ensemble is (4.17724 - 4.15147) / 4.17724, i.e. about 0.6%.
display(md(f"The best public submission up to date (08-01-2022) is version 3 of https://www.kaggle.com/ambrosm/scaling-is-better-than-blending with {best_public_score} leaderboard score.<br> The table below shows that one can improve this result by about 0.6% by using fit-based weighted averaging with some geometric rounding on top of it."))

In [None]:
# Scores of the four ensembles are hard-coded literals; the first row is the best public score.
final_comparison = pd.DataFrame([["Best public",  best_public_score],
                                 ["Simple mean (Equal Weights)", 4.15632],
                                 ["Fit-Based Weights", 4.15641],
                                 ["Fit-Based Weights Rounded", 4.15193],
                                 ["Fit-Based Weights Geo-Rounded", 4.15147]],
                                columns=['Ensemble technique', 'Leaderboard Score'])
# Relative change versus the previous row, negated so that a lower (better) score is positive.
final_comparison['Percentage change'] = -final_comparison['Leaderboard Score'].pct_change()
# Relative improvement versus the best public score. Dividing by the reference score makes the
# value a real fraction, so the '{:.2%}' format below no longer shows an absolute score
# difference multiplied by 100 as if it were a percentage.
final_comparison['Improvement over best'] = (
    (best_public_score - final_comparison['Leaderboard Score']) / best_public_score
)

comparison_table = final_comparison.style.format(na_rep='-',
                                                 formatter={'Percentage change': '{:.4%}'.format,
                                                            'Improvement over best': '{:.2%}'.format})
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0; use its
# replacement Styler.hide(axis='index') when it exists and fall back on older versions.
comparison_table = comparison_table.hide(axis='index') if hasattr(comparison_table, 'hide') else comparison_table.hide_index()
display(comparison_table.set_table_styles([cell_hover, headers]))

<a id="submission"></a>
<div class="list-group" id="list-tab" role="tablist">
<h1 id="submission" class="list-group-item list-group-item-action active" data-toggle="list" style='background:#20BEFF; border:0; color:white' role="tab" aria-controls="home"><center>Submission</center></h1>
    
<a href="#toc" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover" title="Go to TableOfContents">Go to TOC</a>

In [None]:
def save_submission(final_pred, postfix):
    """Write predictions to ``submission-{postfix}.csv`` in the current working directory.

    The file has two columns, named by the module-level ``ID`` and ``TARGET`` constants,
    filled with the module-level ``test_ids`` and the given predictions; no index column
    is written.

    Parameters
    ----------
    final_pred : array_like
        Predicted values, one per entry of ``test_ids`` and in the same order.
    postfix : str
        Text inserted into the output file name.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test ids. Pairing the two
        with ``zip`` would silently truncate to the shorter one and write an incomplete file.
    """
    ids = np.asarray(test_ids)
    predictions = np.asarray(final_pred)
    if len(predictions) != len(ids):
        raise ValueError(
            f"Expected {len(ids)} predictions (one per test id), got {len(predictions)}"
        )

    submission_df = pd.DataFrame({ID: ids, TARGET: predictions})
    submission_df.to_csv(f"submission-{postfix}.csv", index=False)

In [None]:
# Write one submission file per ensemble variant, each under its own file-name suffix.
for postfix, predictions in (
    ('simple_mean', simple_mean_pred),
    ('fit_based', fit_based_pred),
    ('fit_based_rounded', fit_based_pred_rounded),
    ('fit_based_geo_rounded', fit_based_pred_geo_rounded),
):
    save_submission(predictions, postfix)

***

<div class="alert alert-block alert-success">  
Yupi! With some ensembling techniques we've managed to improve the Public Leaderboard Score 📈 <br>Hope you like this notebook 😊
</div>

In [None]:
# Render the closing banner: a box with COLOR_GREY background that repeats the upvote request
# and links to the notebook (for citation) and to its comments page.
HTML(f'''
<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:{COLOR_GREY};
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
    <p style="padding: 10px; color:white;">
If you find this notebook useful or you just like it, please upvote ▲.<br>
        Use this link <a href={NOTEBOOK_URL}>{NOTEBOOK_NAME}</a> to cite.
        Questions/feedback? → <a href={NOTEBOOK_URL}/comments>comment</a>.
    </p>
</div>''')


***