In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Kaggle_Dataset/Mushroom

/content/drive/MyDrive/Kaggle_Dataset/Mushroom


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load the train and test tables from the current working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# The id column is not a useful feature, so remove it from both frames.
df_train = df_train.drop(columns='id')
df_test = df_test.drop(columns='id')

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
class SafeLabelEncoder(LabelEncoder):
    """Label encoder that tolerates categories it did not see while fitting.

    Every value is compared by its string form. A category learned by
    ``fit`` is encoded as its position in the sorted ``classes_`` array;
    any other value is encoded as ``-1``.
    """

    def __init__(self):
        super().__init__()
        # Start with no known classes so transform() before fit() yields only -1.
        self.classes_ = np.array([])

    @staticmethod
    def _as_str_array(y):
        """Return ``y`` as a NumPy array holding the string form of each value."""
        return np.asarray(y).astype(str)

    def fit(self, y):
        """Learn the sorted set of distinct string values in ``y``.

        Returns:
            self, so calls can be chained.
        """
        self.classes_ = np.unique(self._as_str_array(y))
        return self

    def transform(self, y):
        """Encode ``y`` as integer codes, using ``-1`` for unknown categories.

        Assumes ``classes_`` is sorted, as produced by ``fit``.

        Returns:
            An integer NumPy array with one code per element of ``y``.
        """
        values = self._as_str_array(y)
        n_classes = len(self.classes_)
        if n_classes == 0:
            # Nothing has been learned, so every value is unknown.
            return np.full(values.shape, -1, dtype=np.int64)

        # One vectorised binary search replaces a linear scan per element.
        positions = np.searchsorted(self.classes_, values)
        # Values sorting after every known class land at n_classes; clamp them
        # so the equality check below can index safely.
        positions = np.minimum(positions, n_classes - 1)
        is_known = self.classes_[positions] == values
        return np.where(is_known, positions, -1)

    def _safe_transform(self, val):
        """Encode a single string ``val``; kept for backward compatibility.

        Returns the index of ``val`` in ``classes_``, or ``-1`` if absent.
        """
        if val in self.classes_:
            return np.where(self.classes_ == val)[0][0]
        # Unseen value.
        return -1

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoded form."""
        return self.fit(y).transform(y)

In [8]:
# tqdm library: visualizes loop progress
from tqdm import tqdm

In [9]:
# Numeric features and the target are excluded from the per-column encoding loop.
skip = {'cap-diameter', 'stem-height', 'stem-width', 'class'}
SLE = SafeLabelEncoder()

for col in tqdm(df_train.columns):
    if col in skip:
        continue
    # Fit on the training column only, then reuse that mapping for the test
    # column so both frames share the same codes.
    df_train[col] = SLE.fit_transform(df_train[col])
    df_test[col] = SLE.transform(df_test[col])

# The target is encoded last, with the same encoder object refitted on it.
df_train['class'] = SLE.fit(df_train['class']).transform(df_train['class'])

100%|██████████| 21/21 [19:09<00:00, 54.75s/it]


In [10]:
# pop() removes 'class' from df_train in place, so X (the same object as
# df_train) holds only the features afterwards. Re-running this cell fails
# because the column is already gone.
y = df_train.pop('class')
X = df_train

In [11]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=156,
)

In [12]:
# hyperopt library: provides hyperparameter optimization
import hyperopt
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

In [15]:
from sklearn.model_selection import cross_val_score

In [17]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [18]:
# CatBoost
from catboost import CatBoostClassifier

In [20]:
# CatBoost hyperparameter search space.
# NOTE: a later cell rebinds cat_search_space to a space without border_count
# and with a continuous l2_leaf_reg.
cat_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    depth=hp.quniform('depth', 4, 10, 1),
    l2_leaf_reg=hp.randint('l2_leaf_reg', 1, 10),
    border_count=hp.quniform('border_count', 32, 255, 1),
)

In [32]:
# Search space read by cat_obj: learning rate, tree depth and L2 leaf regularization.
cat_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    depth=hp.quniform('depth', 4, 10, 1),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 1, 10),
)

In [33]:
def cat_obj(search_space):
    """Hyperopt objective for CatBoost.

    Builds a 100-iteration CatBoostClassifier from the sampled
    hyperparameters and scores it with 3-fold cross-validated accuracy on
    the module-level ``X_train`` / ``y_train``.

    Args:
        search_space: mapping with 'learning_rate', 'depth' and 'l2_leaf_reg'.

    Returns:
        dict with 'loss' (negative mean accuracy, since fmin minimizes) and
        'status'.
    """
    params = {
        'iterations': 100,
        'learning_rate': search_space['learning_rate'],
        # hp.quniform samples arrive as floats, so depth is cast to int.
        'depth': int(search_space['depth']),
        'l2_leaf_reg': search_space['l2_leaf_reg'],
        'logging_level': 'Silent',
    }
    model = CatBoostClassifier(**params)

    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3)
    return {'loss': -np.mean(fold_scores), 'status': STATUS_OK}

In [28]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
# Run 50 TPE-guided evaluations of cat_obj; trial_val receives the trial history.
trial_val = Trials()

best = fmin(
    fn=cat_obj,
    space=cat_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
)

100%|██████████| 50/50 [2:24:35<00:00, 173.52s/trial, best loss: -0.9912449870047307]


In [35]:
# Final CatBoost model: the tuned hyperparameters with 500 iterations.
cat_final_params = {
    'iterations': 500,
    'learning_rate': round(best['learning_rate'], 5),
    'depth': int(best['depth']),  # fmin reports depth as a float
    'l2_leaf_reg': best['l2_leaf_reg'],
    'logging_level': 'Silent',
}
cat_wrapper = CatBoostClassifier(**cat_final_params)

In [37]:
# Best CatBoost hyperparameters returned by fmin
best

{'depth': 10.0,
 'l2_leaf_reg': 2.765948425400872,
 'learning_rate': 0.20729401044979315}

In [38]:
# Evaluate on both the training rows and the held-out rows while fitting.
train_pair = (X_train, y_train)
valid = [train_pair, (X_valid, y_valid)]
cat_wrapper.fit(*train_pair, eval_set=valid, verbose=True)

<catboost.core.CatBoostClassifier at 0x7fcc85cf2020>

In [39]:
# NOTE(review): despite the name, predict() appears to return class labels rather
# than probabilities (confirm against the CatBoost docs); if so, the 0.5 threshold
# just maps those labels to plain Python ints.
cat_pred_probs = cat_wrapper.predict(df_test)
cat_preds = [1 if label > 0.5 else 0 for label in cat_pred_probs]

In [40]:
# Re-read the test ids (the column was dropped from df_test earlier) and write
# the CatBoost submission: 1 -> 'p', anything else -> 'e'.
id = pd.read_csv('test.csv')['id']

with open("result_cat.csv", "w") as f2:
    f2.write("id,class\n")
    for i, p in zip(id, cat_preds):
        f2.write(f"{i},p\n" if p == 1 else f"{i},e\n")

In [41]:
# XGBoost
from xgboost import XGBClassifier

In [42]:
# Search space read by xgb_obj.
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

In [43]:
def xgb_obj(search_space):
    """Hyperopt objective for XGBoost.

    Builds a 100-tree XGBClassifier from the sampled hyperparameters and
    scores it with 3-fold cross-validated accuracy on the module-level
    ``X_train`` / ``y_train``.

    Args:
        search_space: mapping with 'max_depth', 'min_child_weight',
            'learning_rate' and 'colsample_bytree'.

    Returns:
        dict with 'loss' (negative mean accuracy, since fmin minimizes) and
        'status'.
    """
    # hp.quniform samples arrive as floats; the tree parameters are cast to int.
    tree_depth = int(search_space['max_depth'])
    child_weight = int(search_space['min_child_weight'])

    # NOTE(review): 'gpu_hist' and use_label_encoder may be deprecated in newer
    # XGBoost releases. Confirm against the installed version.
    model = XGBClassifier(
        n_estimators=100,
        max_depth=tree_depth,
        min_child_weight=child_weight,
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss',
        tree_method='gpu_hist',
        use_label_encoder=False,
        n_jobs=-1,
    )

    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3)
    return {'loss': -np.mean(fold_scores), 'status': STATUS_OK}

In [44]:
# Fresh trial history for the XGBoost search; this rebinds trial_val.
trial_val = Trials()

xgb_best = fmin(
    fn=xgb_obj,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
)

100%|██████████| 50/50 [28:59<00:00, 34.79s/trial, best loss: -0.9923557614789322]


In [45]:
# Best XGBoost hyperparameters returned by fmin
xgb_best

{'colsample_bytree': 0.500868678105999,
 'learning_rate': 0.0720334754812404,
 'max_depth': 18.0,
 'min_child_weight': 2.0}

In [46]:
# Final XGBoost model: the tuned hyperparameters with up to 400 trees and
# early stopping after 50 rounds.
xgb_final_params = {
    'n_estimators': 400,
    'learning_rate': round(xgb_best['learning_rate'], 5),
    'max_depth': int(xgb_best['max_depth']),  # fmin reports these as floats
    'min_child_weight': int(xgb_best['min_child_weight']),
    'colsample_bytree': round(xgb_best['colsample_bytree'], 5),
    'early_stopping_rounds': 50,
    'eval_metric': 'logloss',
}
xgb_wrapper = XGBClassifier(**xgb_final_params)

In [47]:
# Log loss is reported for both sets: validation_0 is the training data and
# validation_1 is the held-out data.
train_pair = (X_train, y_train)
valid = [train_pair, (X_valid, y_valid)]
xgb_wrapper.fit(*train_pair, eval_set=valid, verbose=True)

[0]	validation_0-logloss:0.63094	validation_1-logloss:0.63091
[1]	validation_0-logloss:0.57840	validation_1-logloss:0.57851
[2]	validation_0-logloss:0.52776	validation_1-logloss:0.52793
[3]	validation_0-logloss:0.48808	validation_1-logloss:0.48829
[4]	validation_0-logloss:0.44801	validation_1-logloss:0.44834
[5]	validation_0-logloss:0.41141	validation_1-logloss:0.41180
[6]	validation_0-logloss:0.38076	validation_1-logloss:0.38122
[7]	validation_0-logloss:0.35345	validation_1-logloss:0.35395
[8]	validation_0-logloss:0.32850	validation_1-logloss:0.32905
[9]	validation_0-logloss:0.30524	validation_1-logloss:0.30578
[10]	validation_0-logloss:0.28282	validation_1-logloss:0.28337
[11]	validation_0-logloss:0.26287	validation_1-logloss:0.26343
[12]	validation_0-logloss:0.24684	validation_1-logloss:0.24739
[13]	validation_0-logloss:0.22973	validation_1-logloss:0.23028
[14]	validation_0-logloss:0.21486	validation_1-logloss:0.21548
[15]	validation_0-logloss:0.20139	validation_1-logloss:0.20203
[1

In [48]:
# NOTE(review): despite the name, predict() appears to return class labels rather
# than probabilities (confirm against the XGBoost docs); if so, the 0.5 threshold
# just maps those labels to plain Python ints.
xgb_pred_probs = xgb_wrapper.predict(df_test)
xgb_preds = [1 if label > 0.5 else 0 for label in xgb_pred_probs]

In [49]:
# Re-read the test ids (the column was dropped from df_test earlier) and write
# the XGBoost submission: 1 -> 'p', anything else -> 'e'.
id = pd.read_csv('test.csv')['id']

with open("result_xgb.csv", "w") as f2:
    f2.write("id,class\n")
    for i, p in zip(id, xgb_preds):
        f2.write(f"{i},p\n" if p == 1 else f"{i},e\n")

In [50]:
# Soft-voting ensemble of the two fitted models.
#
# The probabilities are requested here with predict_proba() instead of reusing
# the earlier predict() outputs. Those hold hard 0/1 labels, so "x + c >= 1" on
# them marks a row as 'p' whenever either model says so (a logical OR), not when
# the averaged probability reaches 0.5.
test_ids = pd.read_csv('test.csv', usecols=['id'])['id']

# Column 1 is the probability of class 1, which the submission files map to 'p'.
xgb_proba = xgb_wrapper.predict_proba(df_test)[:, 1]
cat_proba = cat_wrapper.predict_proba(df_test)[:, 1]
ensemble_is_p = (xgb_proba + cat_proba) / 2 >= 0.5

with open("result_xgb_cat.csv", "w") as f2:
    f2.write("id,class\n")
    f2.writelines(
        f"{row_id},{'p' if is_p else 'e'}\n"
        for row_id, is_p in zip(test_ids, ensemble_is_p)
    )