In [1]:
import pandas as pd
import random
import os
import numpy as np
from collections import Counter
from matplotlib import pyplot

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
from xgboost import XGBClassifier

In [3]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds the standard-library ``random`` module and NumPy's global
    generator with ``seed``, and stores ``str(seed)`` in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed unchanged to ``random.seed`` and
            ``np.random.seed``.
    """
    random.seed(seed)
    hash_seed = str(seed)
    # NOTE(review): this only writes the variable into os.environ; whether the
    # running interpreter's hashing is affected is not shown here - confirm.
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)


seed_everything(37)  # pin the seed for the whole notebook

In [4]:
# Read the train and test tables from the relative 'open' directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('open/train.csv', 'open/test.csv')
)

In [5]:
# Identifier columns are removed from both tables; the training table
# additionally loses its two Y_* columns so that only features remain.
id_cols = ['PRODUCT_ID', 'TIMESTAMP']

train_y = train_df['Y_Class']  # classification target
train_x = train_df.drop(columns=id_cols + ['Y_Class', 'Y_Quality'])
test_x = test_df.drop(columns=id_cols)

In [6]:
# Replace every NaN with 0 in both feature tables.
train_x, test_x = (frame.fillna(0) for frame in (train_x, test_x))

In [7]:
# Columns that are label-encoded to integer codes below.
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    # Fit on the training values only, then encode the training column.
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])

    # Labels that occur only in test_x are appended as new classes
    # (important): without this, transform() raises ValueError on them.
    unseen = [value for value in np.unique(test_x[col])
              if value not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)

    test_x[col] = le.transform(test_x[col])

print('Done.')

Done.


In [8]:
from xgboost import XGBClassifier

# Build the classifier; only random_state is set explicitly, every other
# argument is left at the library default.
xgbc = XGBClassifier(random_state=37)

In [9]:
# Train on the prepared feature table and the Y_Class labels.
xgbc.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=37, reg_alpha=0, ...)

In [10]:
# Predict a class for every row of the prepared test feature table.
pred = xgbc.predict(test_x)

In [11]:
# Load the sample submission table; its Y_Class column is overwritten with
# the model predictions before it is written back out.
submit = pd.read_csv('open/sample_submission.csv')

In [12]:
# Store the predictions in the Y_Class column.
# NOTE(review): assumes the submission rows are in the same order as test.csv - confirm.
submit['Y_Class'] = pred

In [13]:
# Write the submission to the working directory without the index column.
submit.to_csv('./baseline_submission_7.csv', index=False)