# Bronze Box
* Objective: 'RMSEWithUncertainty'
* Number of models: 29
* Features: 60

In [1]:
import datetime
import os
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool, CatBoostClassifier

# Load data

In [2]:
%%time

# Load the preprocessed train and test frames from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own preprocessing step.
with open('../data/processed/train.pcl', 'rb') as f:
    train = pickle.load(f)

with open('../data/processed/test.pcl', 'rb') as f:
    test = pickle.load(f)

# Sort with the target as the primary key. The training cell later assigns
# rows to folds by positional index (position % folds), so this ordering
# decides which rows land in which fold.
train = train.sort_values(['fact_temperature', 'climate', 'fact_cwsm_class',
                           'fact_latitude', 'fact_longitude', 'fact_time'])

# Relabel both frames 0..n-1 so labels match positions after the sort.
# RangeIndex yields the same labels as a materialised list of ints without
# building millions of Python objects and without copying the frame.
train.index = pd.RangeIndex(train.shape[0])
test.index = pd.RangeIndex(test.shape[0])

print('Train:', train.shape)
print('Test:', test.shape, '\n')

Train: (3129592, 131)
Test: (1137731, 125) 

Wall time: 6.33 s


# Model params

In [3]:
# Name of the label column the regressors are fitted on.
target = 'fact_temperature'

# Input columns handed to the CatBoost Pool. The order below is kept exactly
# as-is because it is the column order the models are trained with.
features = [
    'x1', 'wrf_t2', 'x2',
    'gfs_temperature_sea', 'cmc_0_0_0_2_interpolated',
    'gfs_temperature_sea_interpolated',
    'wrf_t2_interpolated', 'cmc_0_0_0_1000',
    'topography_bathymetry', 'cmc_0_0_0_2',
    'sun_elevation', 'wrf_t2_next', 'gfs_pressure',
    'gfs_soil_temperature', 'climate_pressure',
    'cmc_0_3_0_0_next', 'wrf_rh2', 'cmc_0_3_0_0',
    'climate_temperature', 'cmc_0_0_0_2_grad',
    'gfs_temperature_10000', 'gfs_humidity',
    'cmc_0_1_0_0', 'cmc_0_0_7_925', 'gfs_wind_speed',
    'gfs_temperature_5000', 'gfs_a_vorticity',
    'gfs_temperature_97500', 'gfs_temperature_7000',
    'wrf_psfc', 'wrf_wind_u',
    'gfs_total_clouds_cover_low_next',
    'cmc_0_2_2_500', 'gfs_r_velocity',
    'gfs_temperature_sea_grad', 'cmc_0_0_0_850',
    'cmc_timedelta_s', 'wrf_wind_v',
    'gfs_precipitable_water', 'cmc_0_3_5_850',
    'cmc_0_2_3_10', 'cmc_0_0_7_2', 'cmc_0_0_0_925',
    'cmc_0_0_7_850', 'cmc_0_3_5_925',
    'gfs_temperature_25000', 'cmc_0_3_1_0',
    'gfs_temperature_95000', 'wrf_t2_grad',
    'gfs_2m_dewpoint_next', 'cmc_0_2_2_700',
    'cmc_0_2_3_925', 'gfs_cloudness',
    'gfs_temperature_20000', 'cmc_0_2_3_500',
    'gfs_temperature_80000', 'gfs_temperature_15000',
    'cmc_0_0_6_2', 'gfs_total_clouds_cover_low',
    'cmc_0_2_2_1000',
]

# Sanity print: the notebook header states 60 features.
print('Features:', len(features))

Features: 60


In [4]:
# CatBoost configuration

# Keyword arguments forwarded to the CatBoostRegressor constructor.
# The training cell adds a per-model 'random_state' entry to this dict.
model_params = dict(
    eval_metric='RMSEWithUncertainty',
    objective='RMSEWithUncertainty',
    iterations=5000,
    learning_rate=0.3,
    depth=9,
    l2_leaf_reg=20,
    one_hot_max_size=120,
    task_type='CPU',
    thread_count=22,
)

# Keyword arguments forwarded to model.fit().
train_params = dict(
    early_stopping_rounds=150,
    silent=True,
    plot=True,
)

# Extra keyword arguments forwarded to every Pool(); currently none.
features_params = dict()

# Train models

In [None]:
%%time

# Train `total_models` CatBoost regressors. Model k is validated on the rows
# whose position satisfies position % folds == k % folds and trained on all
# other rows. Each fitted model is written to models/.
n_models = 200       # only used as a tag in the saved file name
folds = 25
seed_shift = 0
total_models = 29    # > folds, so models 25..28 reuse folds 0..3 with new seeds

# Create the output directory up front so that saving a model after a long
# training run cannot fail on a missing directory.
os.makedirs('models', exist_ok=True)

# Fold id of every row position. This does not depend on the model, so it is
# computed once here instead of re-scanning all rows in Python per model.
fold_of_row = np.arange(train.shape[0]) % folds

for model_number in range(total_models):
    # Fix seed
    seed = model_number + seed_shift
    # Mutates the shared params dict: after the loop it holds the last seed.
    model_params['random_state'] = seed
    np.random.seed(seed)
    print(f'\nModel: {model_number}, seed: {seed}')
    print(str(datetime.datetime.now())[:19])

    # Split train / validation by row position
    is_val = fold_of_row == model_number % folds
    train_idx = np.flatnonzero(~is_val)
    val_idx = np.flatnonzero(is_val)
    # .tolist() keeps the printed form identical to a plain Python list.
    print('Train indexes:', train_idx[:folds].tolist())
    # Despite the label, these are validation rows taken from `train`.
    print('Test indexes:', val_idx[:folds][:5].tolist())
    train_subset = train.iloc[train_idx]
    val_subset = train.iloc[val_idx]

    train_dataset = Pool(data=train_subset[features], label=train_subset[target], **features_params)
    eval_dataset = Pool(data=val_subset[features], label=val_subset[target], **features_params)

    # Train model; the validation Pool is passed as the eval set to fit().
    model = CatBoostRegressor(**model_params)
    model.fit(train_dataset, eval_set=eval_dataset, **train_params)
    model.save_model(f'models/bronze_box_{model_number}_{folds}_{n_models}.cbm')