In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import random
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
from sklearn.inspection import permutation_importance
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
import catboost
import optuna

import eli5
from eli5.sklearn import PermutationImportance

In [3]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable and seeds
    both the standard-library ``random`` module and NumPy's global generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(37)  # fix the seed

In [4]:
# Read the training set, the test set and the sample submission, in that order,
# from CSV files located in the current working directory.
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [5]:
# Convert both identifier columns of the training frame to pandas' category dtype.
for cat_col in ('PRODUCT_CODE', 'LINE'):
    train[cat_col] = train[cat_col].astype('category')

In [6]:
# Remove columns that are entirely NaN (or entirely 0) from both train and test.
def remove_col(train_df, test_df, n_meta_cols=6):
    """Drop uninformative feature columns from both frames, in place.

    A column of ``train_df`` positioned after the first ``n_meta_cols``
    columns is removed when it holds no non-NaN value (``nunique() == 0``)
    or when its only value is 0 (and it contains no NaN).

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; it decides which columns are removed. Mutated in place.
    test_df : pandas.DataFrame
        Test frame; the same columns are removed from it. Mutated in place.
    n_meta_cols : int, default 6
        Number of leading columns of ``train_df`` that are never examined.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)`` - the same objects that were passed in.
    """
    # Decide first, drop once: a single drop per frame avoids rebuilding the
    # frame for every removed column.
    to_drop = [
        col for col in train_df.columns[n_meta_cols:]
        if train_df[col].nunique() == 0 or list(train_df[col].unique()) == [0]
    ]
    train_df.drop(columns=to_drop, inplace=True)
    # errors='ignore': a column flagged in train may be missing from test; that
    # must not raise after train has already been modified.
    test_df.drop(columns=to_drop, inplace=True, errors='ignore')
    return train_df, test_df

In [7]:
# Apply the column filter; train decides which columns leave both frames.
train, test = remove_col(train_df=train, test_df=test)

In [8]:
# Remove the PRODUCT_ID, TIMESTAMP and Y_Quality columns from the training frame.
non_feature_cols = ['PRODUCT_ID', 'TIMESTAMP', 'Y_Quality']
train.drop(columns=non_feature_cols, inplace=True)

# Keep in test every remaining training column except the first one, in the same order.
# NOTE(review): presumably the skipped first column is Y_Class - confirm against the CSV layout.
shared_cols = train.columns[1:]
test = test[shared_cols]

In [9]:
# Separate the Y_Class target from the remaining (feature) columns.
train_y = train['Y_Class']
train_x = train.drop(columns=['Y_Class'])

In [10]:
# Remove a feature when its correlation coefficient with an earlier feature is +/-1.
def same_corr_drop(train_df, tol=0.0):
    """Drop features that are perfectly correlated with an earlier feature.

    The correlation matrix is computed over every column of ``train_df``
    except the first two. A column is removed when the absolute correlation
    between it and any column placed before it reaches ``1 - tol``. NaN
    correlations never trigger a removal.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame to prune. Mutated in place.
    tol : float, default 0.0
        Slack allowed below a perfect correlation of 1. The default only
        removes columns whose absolute correlation is 1.

    Returns
    -------
    pandas.DataFrame
        ``train_df`` itself, without the redundant columns.
    """
    corr = train_df[train_df.columns[2:]].corr()
    # Use the raw matrix so that every lookup is positional; integer keys on a
    # label-indexed Series are not a reliable way to address a position.
    perfect = np.abs(corr.to_numpy()) >= 1.0 - tol
    # Keep only (earlier, later) pairs: the strict upper triangle. Column i is
    # redundant when any row e < i is perfectly correlated with it.
    redundant = np.triu(perfect, k=1).any(axis=0)
    train_df.drop(columns=list(corr.columns[redundant]), inplace=True)
    return train_df

# Prune the redundant training features, then restrict test to the surviving columns.
train_x = same_corr_drop(train_df=train_x)
kept_columns = train_x.columns
test = test[kept_columns]

In [11]:
# Preview the first five rows of the feature frame.
train_x.head(n=5)

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2799,X_2800,X_2801,X_2837,X_2839,X_2840,X_2841,X_2842,X_2843,X_2871
0,T050304,A_31,,,,,,,,,...,25.0,22.925926,20.0,9.04,5.02,7.0,40.06,0.000331,3.3e-05,
1,T050307,A_31,,,,,,,,,...,26.0,25.05,24.0,8.0,5.0,7.05,36.54,0.00115,9e-06,
2,T050304,A_31,,,,,,,,,...,26.0,23.962963,22.0,9.04,5.02,6.52,40.05,0.000332,3.3e-05,
3,T050307,A_31,,,,,,,,,...,26.0,25.037037,23.0,7.49,5.01,7.03,40.03,0.00121,8e-06,
4,T050304,A_31,,,,,,,,,...,26.0,24.0,22.0,9.04,5.03,6.52,40.07,0.000334,4.1e-05,


### [0, 1] <-> [2]로 Y_Class를 2분할

In [15]:
# Relabel class 0 as class 1 in a copy of the target, then display the result.
train_y_ex0 = train_y.replace(to_replace=0, value=1)
train_y_ex0

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    1
595    1
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

In [None]:
# Score a CatBoost classifier with repeated stratified 5-fold CV (3 repeats).
# random_state matches the value given to seed_everything(37), so re-running
# this cell reproduces the same folds.
rskfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=37)

f1_list = []     # macro-F1 of every fold
class_rate = []  # share of Y_Class 0 / 1 / 2 in every fold's validation part
# Folds are stratified on the merged labels (train_y_ex0), while the model is
# fitted and scored on the original labels (train_y).
# NOTE(review): confirm that mixing the two label sets is intended.
splits = rskfold.split(train_x, train_y_ex0)
for fold, (train_index, val_index) in enumerate(tqdm(splits, total=rskfold.get_n_splits())):
    # split() yields positional indices, so rows are selected with .iloc.
    x_trn, x_val = train_x.iloc[train_index], train_x.iloc[val_index]
    y_trn, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    clf = catboost.CatBoostClassifier(verbose=0, task_type='GPU') # cpu -> thread_count=5
    # NOTE(review): no eval_set is passed, so early_stopping_rounds presumably
    # has nothing to monitor - confirm against the CatBoost documentation.
    clf.fit(x_trn, y_trn, early_stopping_rounds=100, cat_features=['PRODUCT_CODE', 'LINE'])

    # f1_score expects (y_true, y_pred); predictions are flattened to 1-D.
    y_pred = np.ravel(clf.predict(x_val))
    f1_list.append(f1_score(y_val, y_pred, average='macro'))
    # One count per class: 0, 1 and 2.
    class_counts = np.array([(y_val == label).sum() for label in (0, 1, 2)])
    class_rate.append(class_counts / len(y_val))

# Report the worst and the best fold, each with its own class distribution.
worst, best = int(np.argmin(f1_list)), int(np.argmax(f1_list))
print(f'{f1_list[worst]} : {class_rate[worst]} ~ {f1_list[best]} : {class_rate[best]}')
print('mean :', np.mean(f1_list))