In [1]:
# Notebook 출력설정
# 주요 라이브러리 임포트

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline
import matplotlib.pylab as plt
plt.rcParams["figure.figsize"] = (15,5)
plt.rcParams['lines.linewidth'] = 1
plt.rcParams['axes.grid'] = True
import seaborn as sns

In [2]:
# 원본 데이터셋 로드
dir_dataset = "C:/Users/0stix/Datasets/"
name_project = '2203-dacon-abalone'
df_train = pd.read_csv(dir_dataset+name_project+'/train.csv')
df_test = pd.read_csv(dir_dataset+name_project+'/test.csv')
df_sub = pd.read_csv(dir_dataset+name_project+'/sample_submission.csv')

len_train = len(df_train)
df_all = pd.concat([df_train, df_test], axis=0)
target = 'Target'

In [3]:
df_all.head()
# df_all.info()
# df_all.describe()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15.0
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8.0
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18.0
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13.0
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6.0


In [4]:
# Whole weight = 전체 무게
# Shucked weight = 껍데기제거(살) 무게
# Viscra weight = 내장 무게
# Shell weight = 껍데기 무게

In [5]:
df_v0 = pd.DataFrame()

df_v0 = pd.get_dummies(df_all['Gender'].astype('category').cat.codes, prefix='G')

df_v0['L'] = df_all['Lenght']
df_v0['D'] = df_all['Diameter']
df_v0['H'] = df_all['Height']

# df_v0['V'] = df_v0['L'] * df_v0['D'] * df_v0['H']
df_v0['L/D'] = df_v0['L'] / df_v0['D']
df_v0['D/H'] = df_v0['D'] / df_v0['H']
df_v0['H/L'] = df_v0['H'] / df_v0['L']

df_v0['wWl'] = df_all['Whole Weight']
df_v0['wSk'] = df_all['Shucked Weight']
df_v0['wVs'] = df_all['Viscra Weight']
df_v0['wSl'] = df_all['Shell Weight']

df_v0['rSk'] = df_v0['wSk'] / df_v0['wWl']
df_v0['rVs'] = df_v0['wVs'] / df_v0['wWl']
df_v0['rSl'] = df_v0['wSl'] / df_v0['wWl']

# df_v0[target] = df_all[target] + .5
df_v0[target] = df_all[target]

In [6]:
def tr_te_split(df_all, len_train, target):
    X = df_all.drop(target, axis=1)
    y = df_all[target]
    X_tr = X[:len_train]
    y_tr = y[:len_train]
    X_te = X[len_train:]
    return X_tr, y_tr, X_te

In [7]:
# 고속 데이터셋 평가
def eval_df(X, y, lst_model):
    from sklearn.model_selection import cross_val_score
    
    grd_score = []
    for model_ in lst_model:
        grd_score.append(cross_val_score(
            model_, 
            X, 
            y, 
            cv=5, 
            scoring="neg_mean_absolute_error", 
            n_jobs=-1))
        
    return grd_score

In [8]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

lst_model = [
    # XGBRegressor(),
    CatBoostRegressor(silent=True)
]

X_tr, y_tr, X_te = tr_te_split(df_v0, len_train, target)

grd_score = eval_df(X_tr, y_tr, lst_model)

In [9]:
-np.mean(list(map(np.mean, grd_score))) / np.mean(np.abs(y_tr))

0.1583315869166241

In [10]:
np.mean(grd_score)

-1.5694160490857711

In [11]:
import numpy as np

def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [12]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import time

def objective(trial):
    params = {
        "random_state":trial.suggest_categorical("random_state", [2022]),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.0001, 0.3),
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        # "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "n_estimators": 1000,
        "max_depth":trial.suggest_int("max_depth", 2, 9),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
        # 'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        # 'task_type': trial.suggest_categorical('task_type', ['GPU']),
        'loss_function': trial.suggest_categorical('loss_function', ['MAE']),
        'eval_metric': trial.suggest_categorical('eval_metric', ['MAE'])
    }
    
    model = CatBoostRegressor(**params)
    
    time_s = time.time()
    # for key_, value_ in params.items():
    #     print(key_, value_)
        
    X_train_tmp, X_valid_tmp, y_train_tmp, y_valid_tmp = train_test_split(X_tr, y_tr, test_size=0.3, random_state=42)
    model.fit(
        X_train_tmp, y_train_tmp,
        eval_set=[(X_valid_tmp, y_valid_tmp)],
        early_stopping_rounds=25, 
        verbose=0,
    )
        
    y_train_pred = model.predict(X_train_tmp)
    y_valid_pred = model.predict(X_valid_tmp)
    train_mae = mae(y_train_tmp, y_train_pred)
    valid_mae = mae(y_valid_tmp, y_valid_pred)
    
    print(f'MAE of Train: {train_mae}, MAE of Validation: {valid_mae}', time.time() - time_s)
    print()
    
    return valid_mae

In [13]:
import optuna
from optuna.samplers import TPESampler

TRIALS = 100
TIMEOUT = 3600

sampler_ = TPESampler()
study_ = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'minimize',
    sampler = sampler_,
)
study_.optimize(objective, n_trials=TRIALS, timeout=TIMEOUT, show_progress_bar=1, )
print(study_.best_value, study_.best_trial.params)

[32m[I 2022-03-24 19:18:40,757][0m A new study created in memory with name: cat_parameter_opt[0m


  0%|          | 0/100 [00:00<?, ?it/s]

MAE of Train: 2.249423790565075, MAE of Validation: 2.1061272660385595 3.0334739685058594

[32m[I 2022-03-24 19:18:43,806][0m Trial 0 finished with value: 2.1061272660385595 and parameters: {'random_state': 2022, 'learning_rate': 0.00016123439555128274, 'bagging_temperature': 1.5366747679267607, 'max_depth': 8, 'random_strength': 36, 'l2_leaf_reg': 1.562275817754908e-05, 'min_child_samples': 94, 'max_bin': 343, 'loss_function': 'MAE', 'eval_metric': 'MAE'}. Best is trial 0 with value: 2.1061272660385595.[0m
MAE of Train: 1.4962434473599504, MAE of Validation: 1.4624829136230373 0.6789975166320801

[32m[I 2022-03-24 19:18:44,489][0m Trial 1 finished with value: 1.4624829136230373 and parameters: {'random_state': 2022, 'learning_rate': 0.004083342155160508, 'bagging_temperature': 50.649470741380625, 'max_depth': 4, 'random_strength': 72, 'l2_leaf_reg': 1.5190249766874579e-05, 'min_child_samples': 56, 'max_bin': 315, 'loss_function': 'MAE', 'eval_metric': 'MAE'}. Best is trial 1 with

In [14]:
best_params = study_.best_params

X_train_tmp, X_valid_tmp, y_train_tmp, y_valid_tmp = train_test_split(X_tr, y_tr, test_size=0.3, random_state=42)
model_tmp = CatBoostRegressor(**best_params, n_estimators=30000, verbose=1000).fit(X_train_tmp, y_train_tmp, eval_set=[(X_valid_tmp, y_valid_tmp)], early_stopping_rounds=35)

# X_train_tmp, X_valid_tmp, y_train_tmp, y_valid_tmp = train_test_split(X_tr, y_tr, test_size=0.3, random_state=42)
# model_tmp = CatBoostRegressor(**study_.best_params, 
#                               n_estimators = 1000, 
#                               verbose=1000)
# # model_tmp.set_params(n_estimators = 30000)
# model_tmp.fit(X_train_tmp, y_train_tmp, eval_set=[(X_valid_tmp, y_valid_tmp)], early_stopping_rounds=35)

# if allow_optimize:
#     sampler = TPESampler(seed=42)

#     study = optuna.create_study(
#         study_name = 'cat_parameter_opt',
#         direction = 'minimize',
#         sampler = sampler,
#     )
#     study.optimize(objective, n_trials=TRIALS)
#     print("Best Score:",study.best_value)
#     print("Best trial",study.best_trial.params)
    
#     best_params = study.best_params
    
#     X_train_tmp, X_valid_tmp, y_train_tmp, y_valid_tmp = train_test_split(X, y, test_size=0.3, random_state=42)
#     model_tmp = CatBoostRegressor(**best_params, n_estimators=30000, verbose=1000).fit(X_train_tmp, y_train_tmp, eval_set=[(X_valid_tmp, y_valid_tmp)], early_stopping_rounds=35)

0:	learn: 2.3095110	test: 2.1518699	best: 2.1518699 (0)	total: 11.1ms	remaining: 5m 33s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 1.376118569
bestIteration = 47

Shrink model to first 48 iterations.


In [15]:
cat_reg = CatBoostRegressor(**study_.best_trial.params, n_estimators=model_tmp.get_best_iteration(), verbose=1000)
cat_reg.fit(X_tr, y_tr)
df_sub[target] = cat_reg.predict(X_te)

import datetime
now = datetime.datetime.now()
str_datetime = now.strftime("%y%m%d_%H%M%S")
df_sub.to_csv(dir_dataset+'submission-'+name_project+'-'+str_datetime+'.csv', index = 0)

0:	learn: 2.2516894	total: 1.21ms	remaining: 55.8ms
46:	learn: 1.2855243	total: 76.8ms	remaining: 0us


In [16]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'min_child_weight': [1, 2, 3, 4, 5]
}

In [17]:
from sklearn.model_selection import GridSearchCV
allow_optimize = 1
if allow_optimize:
    param_grid={
#         'max_depth': [4,5,6,7,8,9],
        'max_depth': [5],
        #'n_estimators': [100,200,300,400,500,600,700,800,900,1000],
        'n_estimators': [75],
#         'min_child_weight' : [1,2,3,4,5,6],
        'min_child_weight' : [1],
        
#       'gpu_id' : [0]
        }

    regressor = XGBRegressor(
#         tree_method = 'gpu_hist', 
#         predictor = 'gpu_predictor'
    )
    CV_regressor = GridSearchCV(regressor, param_grid, cv=3, scoring="neg_mean_absolute_error", n_jobs= -1, return_train_score = True, verbose = 1)
    CV_regressor.fit(X, y)
    
    print("The best hyperparameters are : ","\n")
    print(CV_regressor.best_params_)

NameError: name 'X' is not defined