In [1]:
# Prints a fixed string; presumably a quick check that the kernel is responsive.
print('hello')

hello


In [2]:
from pycaret.regression import *

from tqdm import tqdm

import pickle
import warnings
import pandas as pd
import random
import os
import numpy as np

# warnings.filterwarnings('ignore')

def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Exports ``PYTHONHASHSEED`` into the process environment and seeds both
    the standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed; it is stringified for the environment variable.
    """
    # The three steps are independent of each other, so their order is free.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    
seed_everything(42) # Fix the random seeds

# Training table read from the working directory.
train_df = pd.read_csv('./train.csv')

# NOTE(review): filter(regex=...) keeps every column whose name contains the
# pattern, so these splits assume no other column name contains 'X' or 'Y'.
train_x = train_df.filter(regex='X') # Input : X features
train_y = train_df.filter(regex='Y') # Output : Y features

# Test inputs with the 'ID' column dropped.
test_x = pd.read_csv('./test.csv').drop(columns=['ID'])

In [4]:
def lg_nrmse(gt, preds):
    """Weighted sum of per-target NRMSE over the first 14 target columns.

    For each of the first 14 columns the RMSE is divided by the mean absolute
    ground-truth value of that column. The first 8 of those values are summed
    with weight 1.2 and the remaining 6 with weight 1.0.

    The RMSE is computed with NumPy directly, so the function does not depend
    on a ``metrics`` module being in scope or on scikit-learn's ``squared``
    keyword.

    Args:
        gt: 2-D array-like of ground-truth values, one column per target.
            Callers are expected to pass targets only (no 'ID' column).
        preds: 2-D array-like of predictions with the same layout as ``gt``.

    Returns:
        The weighted NRMSE sum as a float.

    Raises:
        IndexError: If either input is not 2-D or has fewer than 14 columns.
    """
    n_targets = 14   # number of target columns that are scored
    n_weighted = 8   # leading targets that receive the 1.2 weight

    # np.asarray lets DataFrames and nested lists be indexed positionally.
    gt = np.asarray(gt)
    preds = np.asarray(preds)
    if gt.shape[1] < n_targets or preds.shape[1] < n_targets:
        raise IndexError(
            f'lg_nrmse needs at least {n_targets} target columns, '
            f'got {gt.shape[1]} and {preds.shape[1]}'
        )

    gt = gt[:, :n_targets].astype(float)
    preds = preds[:, :n_targets].astype(float)

    # Column-wise RMSE, normalised by the column-wise mean absolute target.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:])
    return score

In [5]:
def lg_nrmse_12(gt, preds):
    """NRMSE of one target, scaled by the 1.2 weight.

    Computes the RMSE between ``gt`` and ``preds``, divides it by the mean
    absolute ground-truth value and multiplies the result by 1.2. The RMSE is
    computed with NumPy so that no ``metrics`` module or scikit-learn
    ``squared`` keyword is required.

    Args:
        gt: Array-like of ground-truth values (intended for a single target
            column; for 2-D input the per-column RMSEs are averaged).
        preds: Array-like of predictions with the same shape as ``gt``.

    Returns:
        ``1.2 * RMSE / mean(|gt|)`` as a float.
    """
    # Converting to arrays makes pandas inputs compare by position, not index.
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    rmse = np.mean(np.sqrt(np.mean((gt - preds) ** 2, axis=0)))
    nrmse = rmse / np.mean(np.abs(gt))

    return nrmse * 1.2

def lg_nrmse_10(gt, preds):
    """Unweighted NRMSE of one target.

    Computes the RMSE between ``gt`` and ``preds`` and divides it by the mean
    absolute ground-truth value. The RMSE is computed with NumPy so that no
    ``metrics`` module or scikit-learn ``squared`` keyword is required.

    Args:
        gt: Array-like of ground-truth values (intended for a single target
            column; for 2-D input the per-column RMSEs are averaged).
        preds: Array-like of predictions with the same shape as ``gt``.

    Returns:
        ``RMSE / mean(|gt|)`` as a float.
    """
    # Converting to arrays makes pandas inputs compare by position, not index.
    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    rmse = np.mean(np.sqrt(np.mean((gt - preds) ** 2, axis=0)))
    nrmse = rmse / np.mean(np.abs(gt))

    return nrmse

In [6]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device

In [None]:
# add_metric('nrmse10', 'NRMSE10', lg_nrmse_10, greater_is_better = False)
# remove_metric('NRMSE')
# remove_metric('NRMSE10')

## Tune_bag_ens : 1.94822

In [32]:
# Tune -> bag -> blend pipeline, run once per Y column.
# For each target: set up a PyCaret experiment, keep the three best models by
# RMSE, tune each, wrap each tuned model in a bagging ensemble, blend the
# bagged models, refit with finalize_model and predict on test_x.
submit = pd.read_csv('./sample_submission.csv')
best_models_dic = {}
tuned_best_models_dic = {}

for col in train_y.columns:
    # Training frame for this target: all X features plus the one Y column.
    train_tmp = pd.concat([train_x, train_y[col]], axis=1)

    reg = setup(
        data=train_tmp,
        target=col,
        session_id=42,
        use_gpu=True,
        normalize=True,
        train_size=0.8,
        silent=True,
    )

    # Top three candidates ranked by RMSE under 5-fold CV.
    best_models = compare_models(sort='RMSE', n_select=3, fold=5)
    best_models_dic[col] = best_models

    tuned_best_models = []
    for model in best_models:
        tuned_best_models.append(
            tune_model(model, optimize='RMSE', early_stopping=True, fold=5)
        )
    tuned_best_models_dic[col] = tuned_best_models

    bagged_models = [
        ensemble_model(model, method='Bagging') for model in tuned_best_models
    ]

    blender = blend_models(estimator_list=bagged_models, optimize='RMSE', fold=5)

    final_model = finalize_model(blender)
    pred = predict_model(final_model, data=test_x)

    # Keep the prediction both in the submission frame and in a per-target CSV.
    submit[col] = pred['Label']
    pred['Label'].to_csv(f'{col}_pred.csv', index=False)

submit.to_csv('pycaret_tune_bag_ens.csv', index=False)

# Persist the selected and the tuned models for later inspection.
for pickle_path, payload in (
    ('pycaret_tune_bag_ens_bm.pickle', best_models_dic),
    ('pycaret_tune_bag_ens_tbm.pickle', tuned_best_models_dic),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file, protocol=pickle.HIGHEST_PROTOCOL)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4949,0.4019,0.634,0.081,0.0232,0.0188
1,0.4861,0.3917,0.6258,0.0931,0.0229,0.0185
2,0.4847,0.3865,0.6217,0.0929,0.0227,0.0184
3,0.4825,0.3898,0.6243,0.0848,0.0228,0.0183
4,0.4837,0.3853,0.6208,0.0935,0.0227,0.0184
Mean,0.4864,0.391,0.6253,0.0891,0.0228,0.0185
Std,0.0044,0.0059,0.0047,0.0052,0.0002,0.0002




## Bag : 1.94648

In [7]:
# Bagging-only pipeline: for each selected Y column, take the single best
# model by RMSE, wrap it in a bagging ensemble, refit with finalize_model and
# predict on test_x. Tuning and blending are not used in this variant.
submit = pd.read_csv('./sample_submission.csv')
best_models_dic = {}
tuned_best_models_dic = {}

# NOTE(review): columns[13:] restricts this run to the columns from position
# 13 onward; the other targets are not recomputed here. Confirm this is an
# intentional resume and not a leftover.
for col in train_y.columns[13:]:
    # Training frame for this target: all X features plus the one Y column.
    train_tmp = pd.concat([train_x, train_y[col]], axis=1)

    reg = setup(
        data=train_tmp,
        target=col,
        session_id=42,
        use_gpu=True,
        normalize=True,
        train_size=0.8,
        silent=True,
    )

    # n_select=1: the result is handed to ensemble_model as one estimator.
    best_models = compare_models(sort='RMSE', n_select=1, fold=5)
    best_models_dic[col] = best_models

    bagged = ensemble_model(best_models, method='Bagging')

    final_model = finalize_model(bagged)
    pred = predict_model(final_model, data=test_x)

    # Keep the prediction both in the submission frame and in a per-target CSV.
    labels = pred['Label']
    submit[col] = labels
    labels.to_csv(f'{col}_pred.csv', index=False)
    
# submit.to_csv('pycaret_single.csv', index=False)

# with open('pycaret_tune_bag_ens_bm.pickle', 'wb') as file:
#     pickle.dump(best_models_dic, file, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('pycaret_tune_bag_ens_tbm.pickle', 'wb') as file:
#     pickle.dump(tuned_best_models_dic, file, protocol=pickle.HIGHEST_PROTOCOL) 

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.493,0.3964,0.6296,0.0782,0.023,0.0187
1,0.4923,0.3984,0.6312,0.1034,0.0231,0.0187
2,0.4809,0.386,0.6213,0.1058,0.0227,0.0183
3,0.4901,0.3915,0.6257,0.0939,0.0229,0.0187
4,0.4781,0.3743,0.6118,0.1035,0.0223,0.0182
5,0.4898,0.3947,0.6283,0.0915,0.0229,0.0186
6,0.4726,0.3715,0.6095,0.0802,0.0223,0.018
7,0.4918,0.4061,0.6373,0.0932,0.0232,0.0187
8,0.486,0.3849,0.6204,0.1063,0.0227,0.0185
9,0.4804,0.3799,0.6164,0.094,0.0225,0.0183


In [8]:
# Rebuild the submission from the per-target prediction files on disk.
# NOTE(review): this relies on a '{col}_pred.csv' file existing for every Y
# column. More than one cell in this notebook writes files with that name, so
# confirm which pipeline produced each file before trusting the result.
submit = pd.read_csv('./sample_submission.csv')

for col in train_y.columns:
    # Each file holds a single 'Label' column of predictions for that target.
    col_tmp = pd.read_csv(f'{col}_pred.csv')
    submit[col] = col_tmp['Label']

submit.to_csv('pycaret_single.csv', index=False)

## bag multiple : 현재 실험 중

In [None]:
# Experiment: for each Y column, take the three best models by RMSE and build
# two blended ensembles from them, one from bagged wrappers and one from
# boosted wrappers. Each blend is finalized, saved, and used to predict test_x.
bag = pd.read_csv('./sample_submission.csv')
boost = pd.read_csv('./sample_submission.csv')

# NOTE(review): columns[1:] skips the first Y column, so that column in both
# output files keeps whatever sample_submission.csv contains. Confirm this is
# an intentional resume and not a leftover.
for col in tqdm(train_y.columns[1:]):
    # Training frame for this target: all X features plus the one Y column.
    train_tmp = pd.concat([train_x, train_y[col]], axis=1)

    reg = setup(data=train_tmp,
                target=col,
                session_id = 42,
                normalize = True,
                train_size = 0.8,
                silent= True,
                fold=5)

    best_models = compare_models(sort='RMSE', n_select=3)

    # --- Bagging variant ---
    bagged_models = []
    for model in best_models:
        bagged = ensemble_model(model, method='Bagging')
        bagged_models.append(bagged)

    blender = blend_models(estimator_list=bagged_models, optimize='RMSE')

    final_model = finalize_model(blender)
    save_model(final_model, f'pycaret_mt_bag_{col}')
    pred = predict_model(final_model, data=test_x)

    bag[col] = pred['Label']
    pred['Label'].to_csv(f'{col}_pred_mt_bag.csv', index=False)

    # --- Boosting variant ---
    boosted_models = []
    for model in best_models:
        boosted = ensemble_model(model, method='Boosting')
        # Collect the boosted wrapper itself. Appending `bagged` here would
        # make the "boost" blend three copies of the last bagged model.
        boosted_models.append(boosted)

    blender = blend_models(estimator_list=boosted_models, optimize='RMSE')

    final_model = finalize_model(blender)
    save_model(final_model, f'pycaret_mt_boost_{col}')
    pred = predict_model(final_model, data=test_x)

    boost[col] = pred['Label']
    pred['Label'].to_csv(f'{col}_pred_mt_boost.csv', index=False)

bag.to_csv('pycaret_mt_bag.csv', index=False)
boost.to_csv('pycaret_mt_boost.csv', index=False)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2972,0.1427,0.3777,0.0485,0.1873,0.5691
1,0.2979,0.1433,0.3786,0.0436,0.1876,0.5044
2,0.298,0.1422,0.3771,0.0312,0.1886,0.5473
3,0.2998,0.1436,0.3789,0.0391,0.1903,0.602
4,0.2967,0.1419,0.3767,0.0409,0.1855,0.5028
Mean,0.2979,0.1427,0.3778,0.0407,0.1879,0.5451
Std,0.001,0.0006,0.0008,0.0057,0.0016,0.0381


IntProgress(value=0, description='Processing: ', max=6)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


## (Bag + Boost + Tune)을 Stacking 또는 Blending 하기

In [None]:
# For each Y column: take the three best models by RMSE, derive a bagged, a
# boosted and a tuned variant of each (nine estimators in total), then combine
# them once by blending and once by stacking. Both combined models are
# finalized, saved, and used to predict test_x.
blending = pd.read_csv('./sample_submission.csv')
stacking = pd.read_csv('./sample_submission.csv')

for col in tqdm(train_y.columns):
    # Training frame for this target: all X features plus the one Y column.
    train_tmp = pd.concat([train_x, train_y[col]], axis=1)

    reg = setup(data=train_tmp,
                target=col,
                session_id = 42,
                normalize = True,
                train_size = 0.8,
                use_gpu=True,
                silent= True,
                fold=5)

    best_models = compare_models(sort='RMSE', n_select=3)

    models = []
    for model in best_models:
        # Distinct local names so the estimators do not overwrite any
        # DataFrames called `bag` / `boost` defined by other cells.
        bagged = ensemble_model(model, method='Bagging', optimize='RMSE')
        boosted = ensemble_model(model, method='Boosting', optimize='RMSE')
        tuned = tune_model(model, optimize='RMSE', early_stopping=True)
        models.extend([bagged, boosted, tuned])

    # --- Blending ---
    blender = blend_models(estimator_list=models, optimize='RMSE')
    final_model = finalize_model(blender)
    save_model(final_model, f'pycaret_blending_{col}')
    pred = predict_model(final_model, data=test_x)

    # Store into the `blending` frame created at the top of this cell
    # (the misspelled name `bledning` was undefined and raised NameError).
    blending[col] = pred['Label']
    pred['Label'].to_csv(f'{col}_pred_blending.csv', index=False)

    # --- Stacking ---
    stacker = stack_models(estimator_list=models, optimize='RMSE')
    final_model = finalize_model(stacker)
    save_model(final_model, f'pycaret_stacking_{col}')
    pred = predict_model(final_model, data=test_x)

    stacking[col] = pred['Label']
    pred['Label'].to_csv(f'{col}_pred_stacking.csv', index=False)

blending.to_csv('pycaret_blending.csv', index=False)
stacking.to_csv('pycaret_stacking.csv', index=False)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2675,0.1204,0.347,0.0427,0.1541,0.3102
1,0.2686,0.1195,0.3456,0.0408,0.1512,0.2515
2,0.267,0.1201,0.3466,0.0406,0.1526,0.2802
3,0.2721,0.1237,0.3517,0.0396,0.1555,0.3118
4,0.2688,0.1223,0.3498,0.0394,0.1534,0.2696
Mean,0.2688,0.1212,0.3481,0.0406,0.1534,0.2847
Std,0.0018,0.0016,0.0022,0.0012,0.0015,0.0234


IntProgress(value=0, description='Processing: ', max=6)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


