In [15]:
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import os
import random

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.inspection import permutation_importance
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import f1_score

from imblearn.over_sampling import BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek

import matplotlib.pyplot as plt
from pylab import rcParams
plt.style.use('fivethirtyeight')
plt.rc("font", family="Malgun Gothic")
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline
import seaborn as sns

import catboost
import optuna

In [16]:
def seed_everything(seed):
    """Seed the global random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``.
    """
    seed_text = str(seed)
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': seed_text})
    np.random.seed(seed)


seed_everything(37)  # fix the seed for the whole notebook

In [17]:
# Read the training set, the test set and the submission template from the
# working directory (in that order).
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [18]:
# Cast the two identifier-like columns to pandas' category dtype in both frames.
for frame in (train, test):
    for cat_col in ('PRODUCT_CODE', 'LINE'):
        frame[cat_col] = frame[cat_col].astype('category')

In [19]:
def remove_col(train_df, test_df, n_meta_cols=6):
    """Drop feature columns that carry no information in the training frame.

    Only columns from position ``n_meta_cols`` onwards are examined. A column
    is removed when, in ``train_df``, it is either

    * entirely NaN, or
    * a single value repeated in every row with no NaN mixed in.

    A column holding one distinct value *plus* NaNs is kept. The decision is
    made on ``train_df`` alone and the same columns are then removed from
    ``test_df``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; modified in place.
    test_df : pandas.DataFrame
        Test frame; modified in place. Columns selected for removal that are
        not present here are skipped instead of raising ``KeyError``.
    n_meta_cols : int, default 6
        Number of leading columns of ``train_df`` that are never examined.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train_df`` and ``test_df`` objects, after the drop.
    """
    to_drop = []
    for name in train_df.columns[n_meta_cols:]:
        column = train_df[name]
        n_values = column.nunique()  # NaN is not counted as a value here
        if n_values == 0 or (n_values == 1 and not column.isna().any()):
            to_drop.append(name)

    # Drop everything in one go rather than once per column: each drop walks
    # the whole frame, which adds up on a wide table.
    train_df.drop(columns=to_drop, inplace=True)
    test_df.drop(columns=to_drop, inplace=True, errors='ignore')
    return train_df, test_df

In [20]:
# Remove the columns that remove_col judges uninformative on train from both
# frames; the frames are modified in place and rebound to the same names.
train, test = remove_col(train, test)

In [21]:
# Remove the identifier, the timestamp and the Y_Quality column from train so
# that only Y_Class and the feature columns remain.
train = train.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Quality'])

# Give test exactly train's feature columns, in train's order. The target is
# excluded by name rather than by position, and .copy() makes test an
# independent frame so later column assignments are not made on a derived slice.
test = test[train.columns.drop('Y_Class')].copy()

In [22]:
# Separate the Y_Class target from the feature matrix.
train_y = train['Y_Class']
train_x = train.drop(columns=['Y_Class'])

In [23]:
# Replace missing values with 0 in every column except the first two of each
# frame (presumably LINE and PRODUCT_CODE, the categorical columns — confirm).
for frame in (train_x, test):
    value_cols = frame.columns[2:]
    frame[value_cols] = frame[value_cols].fillna(0)

In [25]:
# Train a CatBoost classifier on the GPU, passing PRODUCT_CODE and LINE as
# categorical features, then store its test predictions in the submission frame.
clf = catboost.CatBoostClassifier(task_type='GPU', verbose=0)
clf.fit(train_x, train_y, cat_features=['PRODUCT_CODE', 'LINE'])
pred = clf.predict(test)
subm['Y_Class'] = pred

In [26]:
# Show how many test rows were assigned to each predicted class.
subm['Y_Class'].value_counts()

1    264
0     35
2     11
Name: Y_Class, dtype: int64

In [27]:
# Write the submission frame to disk without the DataFrame index.
subm.to_csv('./submission_34.csv', index=False)

In [30]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Second experiment: start again from the raw CSV files.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

# Target, and feature matrices with the identifier/timestamp (and, for train,
# both target columns) removed; every missing value is replaced by 0.
train_y = train_df['Y_Class']
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality']).fillna(0)
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP']).fillna(0)

# Turn the qualitative columns into integer codes.
qual_col = ['LINE', 'PRODUCT_CODE']

for col_name in qual_col:
    # Fit the encoder on train only, then encode train with it.
    le = LabelEncoder().fit(train_x[col_name])
    train_x[col_name] = le.transform(train_x[col_name])

    # Labels that occur in test but not in train are appended to the encoder's
    # classes before test is encoded (presumably so that transform does not
    # reject them — confirm against the LabelEncoder documentation).
    for test_label in np.unique(test_x[col_name]):
        if test_label not in le.classes_:
            le.classes_ = np.append(le.classes_, test_label)
    test_x[col_name] = le.transform(test_x[col_name])
print('Done.')

# Train a CatBoost classifier on the GPU using the label-encoded features (no
# categorical features are declared here) and store its test predictions in
# the submission frame.
clf = catboost.CatBoostClassifier(task_type='GPU', verbose=0)
clf.fit(train_x, train_y)
pred = clf.predict(test_x)
subm['Y_Class'] = pred

Done.


In [32]:
# Show how many test rows were assigned to each predicted class.
subm['Y_Class'].value_counts()

1    264
0     42
2      4
Name: Y_Class, dtype: int64

In [39]:
# Print every row-aligned pair of predictions on which the two saved
# submission files disagree (left: perm_del file, right: feat_del file).
perm_del_pred = pd.read_csv('submission_35_perm_del.csv').Y_Class
feat_del_pred = pd.read_csv('submission_36_feat_del.csv').Y_Class
for a, b in zip(perm_del_pred, feat_del_pred):
    if a != b:
        print(a, b)

1 0
1 0
2 1
2 1
2 1
1 0
1 0
2 1
2 1
2 1
2 1
2 1
2 1
2 1
1 0
2 1


In [38]:
# Scratch arithmetic: 10 out of 300 expressed as a fraction.
# NOTE(review): the comparison cell above prints 16 differing pairs and the
# value_counts outputs in this notebook sum to 310 rows, so neither number
# here matches what is shown — confirm what this was meant to measure.
10/300

0.03333333333333333