In [1]:
# Notebook 출력설정
# 주요 라이브러리 임포트

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# Pandas display options for notebook output.
# `None` disables truncation of long cell text. The legacy `-1` sentinel has been
# deprecated since pandas 1.0 and is no longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline
import matplotlib.pylab as plt
# Notebook-wide matplotlib defaults: wide figures, thin lines, grid on.
plt.rcParams.update({
    "figure.figsize": (15, 5),
    'lines.linewidth': 1,
    'axes.grid': True,
})
import seaborn as sns

In [69]:
# Load the original dataset (train, test and sample-submission files).
# NOTE(review): absolute, machine-specific path; consider making this configurable.
dir_dataset = "C:/Users/0stix/Datasets/"
name_project = '2203-dacon-abalone'
df_train = pd.read_csv(dir_dataset+name_project+'/train.csv')
df_test = pd.read_csv(dir_dataset+name_project+'/test.csv')
df_sub = pd.read_csv(dir_dataset+name_project+'/sample_submission.csv')

# Number of training rows; used later to split the stacked frame back apart.
len_train = len(df_train)
# Stack train on top of test. ignore_index=True gives the combined frame unique
# 0..n-1 row labels; without it the test rows repeat the train rows' labels,
# which makes any label-based alignment or lookup on df_all ambiguous.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
target = 'Target'

In [9]:
# Preview the first rows of the stacked train+test frame.
df_all.head()
# Other quick inspections, currently disabled:
# df_all.info()
# df_all.describe()

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15.0
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8.0
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18.0
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13.0
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6.0


In [None]:
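# Whole weight = total weight of the abalone
# Shucked weight = weight of the meat (shell removed)
# Viscra weight = weight of the viscera / gut ("Viscra" is the dataset's own spelling)
# Shell weight = weight of the shell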

In [10]:
# Distinct values present in the Gender column of the stacked frame.
df_all['Gender'].unique()

array(['M', 'I', 'F'], dtype=object)

In [4]:
# Number of rows in the training frame (the split point inside df_all).
len_train

1253

In [113]:
def _safe_ratio(numer, denom):
    """Return element-wise ``numer / denom`` with +/-inf results replaced by NaN.

    Dividing by a zero denominator yields +/-inf in pandas; NaN is used instead
    so downstream models see a missing value rather than an infinite one.
    """
    return (numer / denom).replace([np.inf, -np.inf], np.nan)


# Feature set v0. Starts from one-hot columns of the integer category codes of
# Gender (column names are prefixed with 'G_').
df_v0 = pd.get_dummies(df_all['Gender'].astype('category').cat.codes, prefix='G')

# Raw size measurements ('Lenght' is the dataset's own column spelling).
df_v0['L'] = df_all['Lenght']
df_v0['D'] = df_all['Diameter']
df_v0['H'] = df_all['Height']

# Shape ratios between the three size measurements.
df_v0['L/D'] = _safe_ratio(df_v0['L'], df_v0['D'])
df_v0['D/H'] = _safe_ratio(df_v0['D'], df_v0['H'])
df_v0['H/L'] = _safe_ratio(df_v0['H'], df_v0['L'])

# Raw weights ('Viscra Weight' is the dataset's own column spelling).
df_v0['wWl'] = df_all['Whole Weight']
df_v0['wSk'] = df_all['Shucked Weight']
df_v0['wVs'] = df_all['Viscra Weight']
df_v0['wSl'] = df_all['Shell Weight']

# Each component weight as a fraction of the whole weight.
df_v0['rSk'] = _safe_ratio(df_v0['wSk'], df_v0['wWl'])
df_v0['rVs'] = _safe_ratio(df_v0['wVs'], df_v0['wWl'])
df_v0['rSl'] = _safe_ratio(df_v0['wSl'], df_v0['wWl'])

# Target goes last; it is copied unchanged from the stacked frame.
df_v0[target] = df_all[target]

In [114]:
def tr_te_split(df_all, len_train, target):
    """Split a stacked train+test frame back into its parts by row position.

    Args:
        df_all: DataFrame whose first ``len_train`` rows are the training rows
            and whose remaining rows are the test rows.
        len_train: Number of leading rows that belong to the training set.
        target: Name of the target column to separate from the features.

    Returns:
        Tuple ``(X_tr, y_tr, X_te)``: training features, training target and
        test features. The input frame itself is not modified.
    """
    features = df_all.drop(columns=target)
    labels = df_all[target]
    # Positional slicing: row labels are irrelevant, only row order matters.
    X_tr, X_te = features.iloc[:len_train], features.iloc[len_train:]
    y_tr = labels.iloc[:len_train]
    return X_tr, y_tr, X_te

In [115]:
# Quick cross-validated evaluation of a dataset
def eval_df(X, y, lst_model, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1):
    """Cross-validate every model in ``lst_model`` on ``(X, y)``.

    Args:
        X: Feature matrix passed through to ``cross_val_score``.
        y: Target values passed through to ``cross_val_score``.
        lst_model: Iterable of estimators to evaluate, in order.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (default 5).
        scoring: Scoring name forwarded to ``cross_val_score``. The default
            ``"neg_mean_absolute_error"`` returns negated MAE, so negate the
            values again to read them as an error.
        n_jobs: Parallelism forwarded to ``cross_val_score`` (default -1).

    Returns:
        List with one ``cross_val_score`` result per model, in the same order
        as ``lst_model``.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from sklearn.model_selection import cross_val_score

    return [
        cross_val_score(model_, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
        for model_ in lst_model
    ]

In [116]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Split the engineered frame back into train features/target and test features.
X_tr, y_tr, X_te = tr_te_split(df_v0, len_train, target)

# Models to cross-validate (XGBRegressor is currently left out of the list).
lst_model = [CatBoostRegressor(silent=True)]

# One array of per-fold scores per model.
grd_score = eval_df(X_tr, y_tr, lst_model)

In [117]:
# Average the per-fold scores of each model, average across models, and flip the sign.
-np.mean([np.mean(fold_scores) for fold_scores in grd_score])

1.6703697059342588

In [97]:
# Show the raw cross-validation scores collected by eval_df (one entry per model).
grd_score

[array([-1.4576537 , -1.62852356, -1.69092835, -1.64521161, -1.92953131]),
 array([-1.32727061, -1.55295134, -1.63503327, -1.60421292, -1.72761211])]

In [86]:
import datetime

# Fit the final model on the training split and predict the test rows.
cat_reg = CatBoostRegressor(silent=True)
cat_reg.fit(X_tr, y_tr)
df_sub[target] = cat_reg.predict(X_te)

# A timestamp in the file name gives each run its own submission file.
now = datetime.datetime.now()
str_datetime = now.strftime("%y%m%d_%H%M%S")
path_submission = dir_dataset + 'submission-' + name_project + '-' + str_datetime + '.csv'
# index=False: do not write the DataFrame index as an extra CSV column.
df_sub.to_csv(path_submission, index=False)

In [None]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

# def score_dataset(X, y, model=XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor'), model_2 = CatBoostRegressor(task_type = 'GPU', silent=True)):
def score_dataset(X, y, model=XGBRegressor(), model_2 = CatBoostRegressor(silent=True)):
    """Return the average 5-fold cross-validated MAE of two models on ``(X, y)``.

    Args:
        X: Feature DataFrame. Object-dtype columns are label-encoded, a
            ``'week'`` column (if present) is cast to int and a ``'row_id'``
            column (if present) is dropped. The caller's frame is left untouched.
        y: Target values passed through to ``cross_val_score``.
        model: First estimator to score. The default instance is created once,
            when the function is defined.
        model_2: Second estimator to score. The default instance is created
            once, when the function is defined.

    Returns:
        Mean of the two models' mean MAE over the folds, as a positive number.
    """
    # Work on a copy so the encoding below never leaks into the caller's frame.
    X = X.copy()
    # Label-encode object columns; integer labels are adequate for the
    # tree-based models used here (one-hot would suit linear models better).
    for colname in X.select_dtypes(["object"]).columns:
        X[colname] = LabelEncoder().fit_transform(X[colname])
    # These two columns belong to one particular dataset; handle them only
    # when present so the function also works on frames without them.
    if 'week' in X.columns:
        X['week'] = X['week'].astype(int)
    X = X.drop(columns='row_id', errors='ignore')
    # Metric for TPS Mar22 competition is MAE (Mean Absolute Error)
    score_xgb = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_absolute_error", n_jobs=1
    )

    score_cat = cross_val_score(
        model_2, X, y, cv=5, scoring="neg_mean_absolute_error", n_jobs=1
    )

    # Scores are negated MAE; average both models and flip the sign back.
    score = -0.5 * (score_xgb.mean() + score_cat.mean())
    return score

# NOTE(review): `df_data` and its 'congestion' column are not defined in any
# visible cell of this notebook; this cell looks carried over from a different
# dataset. Confirm where `df_data` comes from before running.
#df_data = df_data.reset_index().set_index('row_id')
#df_data = df_data.drop(outliers_index,axis=0)
#df_data = df_data.reset_index().set_index('time')

# Keep only the rows with a non-null 'congestion' value, then pop that column
# off as a one-column target frame.
x = df_data[df_data['congestion'].isnull() == False].copy()
y = pd.DataFrame(x.pop('congestion'))

# Cross-validated baseline score of the two default models.
baseline_score = score_dataset(x, y)
print(f"Baseline score: {baseline_score:.5f} MAE")

In [None]:
import numpy as np

def NMAE(true, pred):
    """Normalized mean absolute error: MAE divided by the mean absolute true value.

    Both inputs are converted to flat float arrays before comparison, so plain
    lists, NumPy arrays, pandas Series (compared by position, not by index
    label) and single-column frames are all handled the same way.

    Args:
        true: Ground-truth values.
        pred: Predicted values, with the same number of elements as ``true``.

    Returns:
        ``mean(|true - pred|) / mean(|true|)`` as a float. The result is inf or
        NaN when the mean absolute true value is zero.

    Raises:
        ValueError: If ``true`` and ``pred`` have different numbers of elements.
    """
    true_arr = np.asarray(true, dtype=float).ravel()
    pred_arr = np.asarray(pred, dtype=float).ravel()
    # Fail loudly instead of letting NumPy broadcast mismatched inputs.
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"true and pred must have the same number of elements, "
            f"got {true_arr.size} and {pred_arr.size}"
        )
    mae = np.mean(np.abs(true_arr - pred_arr))
    score = mae / np.mean(np.abs(true_arr))
    return score