# Evaluation for Tabular Weather Data

Outline:
1. Download evaluation data
2. Load baseline models (you can load your own models here)
3. Evaluate models
4. Generate prediction file

## 1. Download evaluation data

In [2]:
# TODO: add the download link for the evaluation data.
# The code below expects the file at the relative path data_eval/eval.csv.

import pandas as pd

# Load the evaluation data file as a pandas data frame and preview its first rows
df_eval = pd.read_csv('data_eval/eval.csv')
df_eval.head()

Unnamed: 0,topography_bathymetry,sun_elevation,climate_temperature,climate_pressure,cmc_0_0_0_1000,cmc_0_0_0_2_grad,cmc_0_0_0_2_interpolated,cmc_0_0_0_2_next,cmc_0_0_0_2,cmc_0_0_0_500,...,cmc_0_1_66_0_grad,cmc_0_1_66_0_next,cmc_0_1_67_0_grad,cmc_0_1_67_0_next,cmc_0_1_68_0_grad,cmc_0_1_68_0_next,gfs_2m_dewpoint_grad,gfs_2m_dewpoint_next,gfs_total_clouds_cover_low_grad,gfs_total_clouds_cover_low_next
0,298.0,-16.337754,7.78,742.524856,277.912061,-2.039062,275.247046,273.207983,275.247046,256.772266,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.55899,-10.608984,0.0,0.0
1,376.0,30.120293,22.282143,723.331713,292.545093,-0.317285,291.059949,290.954187,291.271472,250.076855,...,0.0,0.0,0.0,0.0,0.0,0.0,2.099976,8.749994,0.0,0.0
2,83.0,1.232164,14.19,761.034372,274.791968,-4.542041,273.794141,272.280127,276.822168,257.556323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.100006,-10.349982,0.0,0.0
3,5.0,-13.62162,7.946429,764.253037,276.940283,-1.329907,277.582633,277.139331,278.469238,257.171875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.125946,-2.341925,0.0,0.0
4,257.0,-20.644158,6.874286,653.700954,285.847852,-0.947852,277.626253,277.310303,278.258154,252.567334,...,0.0,0.0,0.002677,0.00358,0.0,0.0,0.199982,2.35,26.0,91.0


## 2. Load baseline models

In [None]:
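# Download the baseline models archive.
# NOTE: this only fetches baseline-models.tar; unpack it (e.g. `tar -xf baseline-models.tar`)
# and make sure dir_path in the next cell points at the folder holding the seed*.cbm files.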
! wget https://storage.yandexcloud.net/yandex-research/shifts/weather/baseline-models.tar

In [6]:
import catboost

# Restore the trained baseline models from disk.
# Model i of the ensemble is expected at <dir_path>/seed<i>.cbm (i starts at 1).

dir_path = '../regression-baseline-models'

# The baseline release provides 10 models
ensemble_size = 10

baseline_models = []
for ind in range(1, ensemble_size + 1):
    # Build the path of this ensemble member, then load it into a fresh regressor
    model_file = f'{dir_path}/seed{ind}.cbm'
    model = catboost.CatBoostRegressor()
    model.load_model(model_file)
    baseline_models.append(model)

## 3. Evaluate models

In [7]:

# Get ensemble of predictions for each data point

import numpy as np

def get_predictions(features_df, model):
    '''
    Runs a single model over the given features.

    features_df: features to predict on, passed to the model unchanged
    model: any object exposing predict(features)

    Return: whatever model.predict(features_df) returns.
        For the baseline models this is expected to be
        an array [num_samples x 2],
        where
            num_samples = number of rows in features_df
            2 = [mean, variance]
    '''
    predictions = model.predict(features_df)
    return predictions


def get_all_predictions(features_df, models_list):
    '''
    Collects the predictions of every model in models_list on the same
    features and stacks them along a new leading (ensemble) axis.

    Return: array [ensemble_size x num_samples x 2],
        where
            ensemble_size = number of models in models_list
            num_samples = number of rows in features_df
            2 = [mean, variance]
        (the trailing dimension assumes each model predicts
        [mean, variance] per row)
    '''
    # One array per ensemble member, in the order of models_list
    per_model_preds = [
        np.asarray(get_predictions(features_df, single_model))
        for single_model in models_list
    ]
    return np.stack(per_model_preds, axis=0)


# Run every baseline model on the evaluation frame and stack the results;
# the expected shape is [ensemble_size x num_samples x 2] (see get_all_predictions).
# NOTE(review): the whole df_eval frame is passed as features -- confirm it holds
# exactly the feature columns the models were trained on.
all_preds = get_all_predictions(df_eval, baseline_models)

In [9]:
# Choose any uncertainty measure to calculate uncertainty scores
# This tutorial uses total variance as the uncertainty measure

# NOTE(review): `uncertainty` is presumably a module shipped alongside this
# notebook; it must be importable from the working directory -- confirm.
from uncertainty import ensemble_uncertainties_regression

# Compute the uncertainty measures from the stacked ensemble predictions;
# the result is indexed by measure name below
all_uncertainty = ensemble_uncertainties_regression(all_preds)
# 'tvar' selects the total variance scores, later used as the UNCERTAINTY column
uncertainties = all_uncertainty['tvar']

## 4. Generate prediction file

In [10]:
# Row identifiers: consecutive integers starting at 1, one per row of df_eval
num_rows = len(df_eval)
ids = np.arange(1, num_rows + 1)

# Point prediction per row: take entry 0 of the last axis of all_preds
# (each model's predicted mean) and average it over the ensemble axis (axis 0)
member_means = all_preds[:, :, 0]
preds = member_means.mean(axis=0)

# The uncertainty scores were already computed in the previous step

# Gather everything to be submitted into a single data frame
submission_columns = {
    'ID': ids,
    'PRED': preds,
    'UNCERTAINTY': uncertainties,
}
df_submission = pd.DataFrame(data=submission_columns)

df_submission.head()

Unnamed: 0,ID,PRED,UNCERTAINTY
0,1,3.163811,0.819192
1,2,16.831751,0.703844
2,3,2.345691,1.067789
3,4,2.586296,1.355215
4,5,11.203381,2.417814


In [11]:
# Save as csv
out_file = 'df_submission.csv'
# index=False leaves the data frame's row index out of the file, so the csv
# contains only the ID, PRED and UNCERTAINTY columns
df_submission.to_csv(out_file, index=False)

In [None]:
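# Pack the csv into a gzip-compressed tar archive (df_submission.tar.gz).
# The csv name is hard-coded here, so keep it in sync with out_file from the previous cell.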
! tar -zcvf df_submission.tar.gz df_submission.csv