In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import BorderlineSMOTE
import catboost

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

In [18]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
subm = pd.read_csv('./sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [28]:
train_A = train[train['PRODUCT_CODE']=='A_31']
train_T = train[train['PRODUCT_CODE']=='T_31']
train_O = train[train['PRODUCT_CODE']=='O_31']

test_A = test[test['PRODUCT_CODE']=='A_31']
test_T = test[test['PRODUCT_CODE']=='T_31']
test_O = test[test['PRODUCT_CODE']=='O_31']

In [29]:
print(len(train_A))
print(len(train_T))
print(len(train_O))

print(len(test_A))
print(len(test_T))
print(len(test_O))

249
343
6
67
239
4


In [30]:
train_A.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [31]:
# 결측치가 5% 이상이면 제거
def remove_col(train_df, test_df):
    train_df, test_df = train_df.copy(), test_df.copy()
    for x in train_df.columns[6:]:
        if len(train_df[train_df[x].isnull()]) > 0.05 or len(test_df[test_df[x].isnull()]) > 0.05:
            train_df.drop(columns=[x], inplace=True)
            test_df.drop(columns=[x], inplace=True)
    return train_df, test_df

In [32]:
train_A, test_A = remove_col(train_A, test_A)
train_T, test_T = remove_col(train_T, test_T)
train_O, test_O = remove_col(train_O, test_O)

In [33]:
train_A.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_284,X_955,X_956,X_1002,...,X_2856,X_2857,X_2858,X_2859,X_2860,X_2861,X_2862,X_2863,X_2864,X_2865
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,1.0,51.0,-26.0,-5.740741,...,181.6,139.6,131.646667,115.4,209.0,197.286667,189.0,383.0,368.296296,353.0
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,1.0,62.0,-26.0,3.471698,...,177.1,145.5,128.748276,119.7,198.0,193.296552,185.6,383.0,367.735849,353.0
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,1.0,69.0,-34.0,7.320755,...,183.6,128.0,115.365517,104.0,193.4,179.82069,165.5,383.0,367.320755,353.0
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,1.0,68.0,-22.0,7.075472,...,179.5,126.2,112.082759,94.5,190.3,181.92069,165.8,384.0,369.188679,353.0
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,1.0,59.0,-22.0,8.981481,...,188.0,152.1,138.066667,109.7,208.6,196.393333,182.6,383.0,367.351852,352.0


In [11]:
def split_df_Class(train_df, test_df):
    train_x_df = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality', 'LINE', 'PRODUCT_CODE'])
    train_y_df = train_df['Y_Class']

    test_x_df = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'LINE', 'PRODUCT_CODE'])
    return train_x_df, train_y_df, test_x_df

def split_df_Quality(train_df, test_df):
    train_x_df = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality', 'LINE', 'PRODUCT_CODE'])
    train_y_df = train_df['Y_Quality']

    test_x_df = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'LINE', 'PRODUCT_CODE'])
    return train_x_df, train_y_df, test_x_df

In [12]:
train_x_A, train_y_A, test_x_A = split_df_Class(train_A, test_A)
train_x_T, train_y_T, test_x_T = split_df_Class(train_T, test_T)
train_x_O, train_y_O, test_x_O = split_df_Quality(train_O, test_O)

In [13]:
train_x_A.head()

Unnamed: 0,X_284,X_955,X_956,X_1002,X_1003,X_1004,X_1005,X_1077,X_1078,X_1079,...,X_2856,X_2857,X_2858,X_2859,X_2860,X_2861,X_2862,X_2863,X_2864,X_2865
0,1.0,51.0,-26.0,-5.740741,1.0,1.0,1.0,11569.0,-16.0,139.0,...,181.6,139.6,131.646667,115.4,209.0,197.286667,189.0,383.0,368.296296,353.0
1,1.0,62.0,-26.0,3.471698,1.0,1.0,1.0,11605.0,-8.0,139.0,...,177.1,145.5,128.748276,119.7,198.0,193.296552,185.6,383.0,367.735849,353.0
2,1.0,69.0,-34.0,7.320755,1.0,1.0,1.0,11643.0,-5.0,139.0,...,183.6,128.0,115.365517,104.0,193.4,179.82069,165.5,383.0,367.320755,353.0
3,1.0,68.0,-22.0,7.075472,1.0,1.0,1.0,11690.0,1.0,139.0,...,179.5,126.2,112.082759,94.5,190.3,181.92069,165.8,384.0,369.188679,353.0
4,1.0,59.0,-22.0,8.981481,1.0,1.0,1.0,11924.0,9.0,140.0,...,188.0,152.1,138.066667,109.7,208.6,196.393333,182.6,383.0,367.351852,352.0


In [14]:
def Borderline_SMOTE_Class(train_x_df, train_y_df):
    B_SMOTE = BorderlineSMOTE()
    x_b_smote, y_b_smote = B_SMOTE.fit_resample(train_x_df, train_y_df)
    return x_b_smote, y_b_smote

In [15]:
train_x_A, train_y_A = Borderline_SMOTE_Class(train_x_A, train_y_A)
train_x_T, train_y_T = Borderline_SMOTE_Class(train_x_T, train_y_T)

In [16]:
test_x_A

Unnamed: 0,X_284,X_955,X_956,X_1002,X_1003,X_1004,X_1005,X_1077,X_1078,X_1079,...,X_2856,X_2857,X_2858,X_2859,X_2860,X_2861,X_2862,X_2863,X_2864,X_2865
3,1.0,50.0,15.0,26.846154,1.0,1.0,1.0,641.0,19.0,716.0,...,159.4,201.3,179.739286,149.9,198.0,191.450000,183.8,467.0,444.192308,423.0
4,1.0,43.0,13.0,25.980769,1.0,1.0,1.0,675.0,27.0,716.0,...,175.3,227.9,190.664286,162.4,210.2,193.082143,179.7,465.0,443.211539,423.0
5,1.0,32.0,10.0,22.018868,1.0,1.0,1.0,2603.0,24.0,717.0,...,160.7,190.7,170.910345,148.9,203.3,192.375862,181.7,466.0,441.830189,422.0
6,1.0,41.0,10.0,22.339623,1.0,1.0,1.0,2605.0,20.0,718.0,...,138.9,178.7,162.317241,120.6,201.5,194.351724,175.1,464.0,445.075472,423.0
7,2.0,-1.0,-31.0,-12.074074,1.0,1.0,1.0,32193.0,-17.0,139.0,...,167.7,184.8,176.040000,157.8,208.0,190.800000,168.3,384.0,369.462963,354.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,2.0,5.0,-12.0,-1.925926,1.0,1.0,1.0,24469.0,-4.0,134.0,...,151.2,167.1,155.386667,145.6,188.4,174.146667,165.4,384.0,369.481481,352.0
285,2.0,6.0,-11.0,-1.629630,1.0,1.0,1.0,25365.0,-4.0,134.0,...,155.1,173.7,157.010000,144.8,187.2,172.956667,157.8,384.0,369.203704,352.0
286,2.0,10.0,-6.0,1.603774,1.0,1.0,1.0,26110.0,4.0,133.0,...,162.2,213.7,159.755172,135.8,187.6,176.496552,167.8,384.0,368.924528,352.0
292,2.0,12.0,-3.0,5.886792,1.0,1.0,1.0,58366.0,8.0,134.0,...,151.7,163.8,151.727586,137.6,170.3,163.200000,154.9,383.0,365.056604,343.0


In [20]:
for train_x, train_y, test_x in zip([train_x_A, train_x_T], [train_y_A, train_y_T], [test_x_A, test_x_T]):

    clf = catboost.CatBoostClassifier(verbose=0).fit(train_x, train_y)
    pred = clf.predict(test_x)
    
    for i, idx in enumerate(test_x.index):
        subm['Y_Class'][idx] = pred[i]
    
reg = catboost.CatBoostRegressor(verbose=0).fit(train_x_O, train_y_O)
pred = reg.predict(test_x_O)
for i, p in enumerate(pred):
    if p > 0.5349: pred[i] = 2
    elif p < 0.5250762: pred[i] = 0
    else: pred[i] = 1
for i, idx in enumerate(test_x_O.index):
    subm['Y_Class'][idx] = pred[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subm['Y_Class'][idx] = pred[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subm['Y_Class'][idx] = pred[i]


In [27]:
subm.to_csv('./submission_30.csv', index=False)

In [26]:
subm.Y_Class.value_counts()

1    259
0     43
2      8
Name: Y_Class, dtype: int64