In [1]:
import hyperopt
# Show which hyperopt version this notebook runs against.
print(hyperopt.__version__)

0.2.7


In [2]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
from hyperopt import hp

# Hyperopt search space for the XGBoost hyper-parameters tuned in this notebook.
# The quniform entries use a step of 1; their samples are cast with int() where
# they are consumed.
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)


In [4]:
import xgboost
# Show which xgboost version this notebook runs against.
print(xgboost.__version__)

2.1.1


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance

import warnings
# Silences every warning for the rest of the session, library deprecation notices included.
warnings.filterwarnings('ignore')

In [6]:
# Load the train/test CSVs and discard the 'id' column from both frames.
df = pd.read_csv('/content/train.csv').drop(columns='id')
data = pd.read_csv('/content/test.csv').drop(columns='id')

# Class balance of the target, followed by a preview of the training frame.
print(df['class'].value_counts())
df.head(3)

class
p    1705396
e    1411549
Name: count, dtype: int64


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w


In [7]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

class SafeLabelEncoder(LabelEncoder):
    """Label encoder that maps categories unseen during ``fit`` to ``-1``.

    Every value is converted to ``str`` before fitting and transforming, so
    missing values are encoded through their string form instead of raising.
    """

    def __init__(self):
        super().__init__()
        # Start with an empty vocabulary: transform() before fit() yields -1 for every value.
        self.classes_ = np.array([])

    @staticmethod
    def _as_str(y):
        """Return ``y`` with every element converted to ``str``."""
        # Series/ndarray keep their own astype semantics; plain sequences are wrapped first.
        if hasattr(y, "astype"):
            return y.astype(str)
        return np.asarray(y).astype(str)

    def fit(self, y):
        """Learn the sorted set of distinct string values in ``y``.

        Returns:
            self, so calls can be chained.
        """
        self.classes_ = np.unique(self._as_str(y))
        return self

    def transform(self, y):
        """Encode ``y`` as integer positions in ``classes_``.

        Values that are not present in ``classes_`` are encoded as ``-1``.

        Returns:
            np.ndarray of int64 with one code per element of ``y``.
        """
        y = self._as_str(y)
        # Build the label -> code table once so each element costs one hash lookup
        # instead of a linear scan of classes_. setdefault keeps the first index
        # should classes_ ever contain duplicates.
        lookup = {}
        for code, label in enumerate(self.classes_):
            lookup.setdefault(label, code)
        return np.fromiter(
            (lookup.get(val, -1) for val in y),
            dtype=np.int64,
            count=len(y),
        )

    def _safe_transform(self, val):
        """Encode a single value; returns ``-1`` when it is not in ``classes_``."""
        if val in self.classes_:
            return np.where(self.classes_ == val)[0][0]
        else:
            # Unknown category.
            return -1

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoding."""
        return self.fit(y).transform(y)

In [8]:
# Columns skipped by the per-column encoding loop: presumably the numeric
# features, plus the target, which is encoded separately afterwards.
Pass = {'cap-diameter', 'stem-height', 'stem-width', 'class'}
le = SafeLabelEncoder()
for key in df.columns:
    print(key)
    if key not in Pass:
        # Fit on the training column, then apply the same mapping to train and test;
        # categories that occur only in the test set come back as -1.
        le.fit(df[key])
        df[key] = le.transform(df[key])
        data[key] = le.transform(data[key])

# Encode the target last; the encoder is refit here, so `le` ends up holding the class mapping.
df['class'] = le.fit_transform(df['class'])
X = df.drop(columns='class')
y = df['class']
print(y.value_counts())

class
cap-diameter
cap-shape
cap-surface
cap-color
does-bruise-or-bleed
gill-attachment
gill-spacing
gill-color
stem-height
stem-width
stem-root
stem-surface
stem-color
veil-type
veil-color
has-ring
ring-type
spore-print-color
habitat
season
class
1    1705396
0    1411549
Name: count, dtype: int64


In [9]:
# Hold out 10% of the labelled data as an evaluation set; the seed is fixed for a repeatable split.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=156,
)

In [10]:
from xgboost import XGBClassifier
from hyperopt import STATUS_OK
from sklearn.model_selection import cross_val_score

def objective_func(search_space):
    """Hyperopt objective: negated mean 3-fold CV accuracy of an XGBoost classifier.

    Args:
        search_space: dict of sampled hyper-parameters with the keys 'max_depth',
            'min_child_weight', 'learning_rate' and 'colsample_bytree'.

    Returns:
        dict with 'loss' (the negated mean accuracy, since hyperopt minimises)
        and 'status' set to STATUS_OK.
    """
    # Named `clf` rather than `xgb` so the module alias `import xgboost as xgb` is not shadowed.
    clf = XGBClassifier(
        n_estimators=100,
        # The sampled values are cast because these two parameters are integer-valued.
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss',
        # XGBoost 2.x deprecates tree_method='gpu_hist': the algorithm is 'hist' and the
        # accelerator is chosen via `device`. Fall back to the CPU when CUDA is missing.
        tree_method='hist',
        device='cuda' if torch.cuda.is_available() else 'cpu',
        # `use_label_encoder` is no longer a recognised parameter in XGBoost 2.x, so it is not passed.
        n_jobs=-1,  # use all CPU cores
    )

    accuracy = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=3)

    return {'loss': -1 * np.mean(accuracy), 'status': STATUS_OK}




In [11]:
from hyperopt import fmin , tpe , Trials

trial_val = Trials()

# Seed the TPE sampler so the search (and therefore `best`) is reproducible across
# runs; hyperopt 0.2.7 expects a numpy Generator for `rstate`.
best = fmin(
    fn=objective_func,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
    rstate=np.random.default_rng(156),
)

best

100%|██████████| 50/50 [29:20<00:00, 35.20s/trial, best loss: -0.9923678816124797]


{'colsample_bytree': 0.5395317035035201,
 'learning_rate': 0.08094810900900441,
 'max_depth': 19.0,
 'min_child_weight': 2.0}

In [15]:
# Final model: reuse the tuned hyper-parameters with more boosting rounds and early stopping.
tuned_params = {
    'learning_rate': round(best['learning_rate'], 5),
    'max_depth': int(best['max_depth']),
    'min_child_weight': int(best['min_child_weight']),
    'colsample_bytree': round(best['colsample_bytree'], 5),
}
xgb_wrapper = XGBClassifier(
    n_estimators=400,
    early_stopping_rounds=50,
    eval_metric='logloss',
    **tuned_params,
)

# Both the training and the held-out split are logged every round.
evals = [(X_train, y_train), (X_eval, y_eval)]
xgb_wrapper.fit(X_train, y_train, eval_set=evals, verbose=True)

[0]	validation_0-logloss:0.62332	validation_1-logloss:0.62329
[1]	validation_0-logloss:0.56466	validation_1-logloss:0.56481
[2]	validation_0-logloss:0.50921	validation_1-logloss:0.50944
[3]	validation_0-logloss:0.46558	validation_1-logloss:0.46587
[4]	validation_0-logloss:0.42322	validation_1-logloss:0.42367
[5]	validation_0-logloss:0.38509	validation_1-logloss:0.38561
[6]	validation_0-logloss:0.35349	validation_1-logloss:0.35408
[7]	validation_0-logloss:0.32555	validation_1-logloss:0.32619
[8]	validation_0-logloss:0.30025	validation_1-logloss:0.30096
[9]	validation_0-logloss:0.27661	validation_1-logloss:0.27732
[10]	validation_0-logloss:0.25441	validation_1-logloss:0.25514
[11]	validation_0-logloss:0.23485	validation_1-logloss:0.23557
[12]	validation_0-logloss:0.21895	validation_1-logloss:0.21966
[13]	validation_0-logloss:0.20241	validation_1-logloss:0.20313
[14]	validation_0-logloss:0.18812	validation_1-logloss:0.18891
[15]	validation_0-logloss:0.17535	validation_1-logloss:0.17617
[1

In [16]:
# predict() returns hard 0/1 labels, which made the 0.5 threshold a no-op.
# Ask for the positive-class probability explicitly and threshold it here.
pred_probs = xgb_wrapper.predict_proba(data)[:, 1]
preds = (pred_probs > 0.5).astype(int).tolist()

preds[:5]

[0, 1, 1, 1, 0]

In [17]:
# Only the 'id' column is needed for the submission, so skip parsing the other columns.
# Named `test_ids` so the builtin `id` is not shadowed.
test_ids = pd.read_csv('/content/test.csv', usecols=['id'])['id']

# Map the encoded predictions back to the original labels (1 -> 'p', anything else -> 'e').
# Building a DataFrame raises on a length mismatch between ids and predictions,
# where zip() would silently truncate the output.
submission = pd.DataFrame({
    'id': test_ids,
    'class': np.where(np.asarray(preds) == 1, 'p', 'e'),
})
submission.to_csv('result.csv', index=False)