In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
def seed_everything(seed):
    """Seed the Python and NumPy global random generators with ``seed``.

    ``seed`` is also exported as the ``PYTHONHASHSEED`` environment variable.
    Python reads that variable at interpreter start-up, so the export can only
    influence processes launched after this call, not the hash seed of the
    kernel that is already running.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(37)

In [58]:
# Read the train and test tables from CSV files in the working directory
# (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [10]:
# Label vector used for training.
train_y = train_df['Y_Class']

# Feature frames: drop the identifier/timestamp columns from both tables and,
# for train, the Y_Class label plus Y_Quality.
# NOTE(review): Y_Quality is presumably a target-side value that test.csv does
# not carry - confirm against the data description.
train_x = train_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)
test_x = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [11]:
# Replace every missing value in both feature frames with 0.
train_x, test_x = (frame.fillna(0) for frame in (train_x, test_x))

In [13]:
# Categorical feature columns that are turned into integer codes.
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    # Fit the encoder on the training values only, then encode train.
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])

    # Values that occur only in test are appended to the fitted classes so
    # that transform() does not fail on them; they receive the codes that
    # follow the training classes.
    unseen = [value for value in np.unique(test_x[col]) if value not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test_x[col] = le.transform(test_x[col])

In [14]:
# Preview the first rows of the raw training frame. The zero-fill and label
# encoding above were applied to train_x/test_x, so train_df still holds the
# original NaNs and string categories.
train_df.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [15]:
# Distinct values of the LINE column in the raw training frame.
train_df.LINE.unique()

array(['T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305'],
      dtype=object)

In [41]:
# Preview a one-hot encoding of LINE and PRODUCT_CODE. The result is only
# displayed, not assigned, so train_df itself is left unchanged.
pd.get_dummies(train_df,columns=['LINE','PRODUCT_CODE'])

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,X_1,X_2,X_3,X_4,X_5,X_6,...,X_2875,LINE_T010305,LINE_T010306,LINE_T050304,LINE_T050307,LINE_T100304,LINE_T100306,PRODUCT_CODE_A_31,PRODUCT_CODE_O_31,PRODUCT_CODE_T_31
0,TRAIN_000,1,0.533433,2022-06-13 5:14,,,,,,,...,,0,0,1,0,0,0,1,0,0
1,TRAIN_001,2,0.541819,2022-06-13 5:22,,,,,,,...,,0,0,0,1,0,0,1,0,0
2,TRAIN_002,1,0.531267,2022-06-13 5:30,,,,,,,...,,0,0,1,0,0,0,1,0,0
3,TRAIN_003,2,0.537325,2022-06-13 5:39,,,,,,,...,,0,0,0,1,0,0,1,0,0
4,TRAIN_004,1,0.531590,2022-06-13 5:47,,,,,,,...,,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,2.0,95.0,0.0,45.0,10.0,0.0,...,,0,0,0,0,0,1,0,0,1
594,TRAIN_594,0,0.524022,2022-09-08 22:38,,,,,,,...,,0,0,1,0,0,0,1,0,0
595,TRAIN_595,0,0.521289,2022-09-08 22:47,,,,,,,...,,0,0,1,0,0,0,1,0,0
596,TRAIN_596,1,0.531375,2022-09-08 14:38,40.0,94.0,0.0,45.0,11.0,0.0,...,,0,0,0,0,1,0,0,1,0


## 제품별 라인에 들어갈 때 생기는 특정 데이터 정리

In [39]:
# Number of training rows per Y_Class value (the column used as train_y).
train_df['Y_Class'].value_counts()

1    407
2    103
0     88
Name: Y_Class, dtype: int64

In [40]:
# Number of training rows per PRODUCT_CODE value.
train_df['PRODUCT_CODE'].value_counts()

T_31    343
A_31    249
O_31      6
Name: PRODUCT_CODE, dtype: int64

In [43]:
# Distinct values of the PRODUCT_CODE column in the raw training frame.
train_df.PRODUCT_CODE.unique()

array(['A_31', 'T_31', 'O_31'], dtype=object)

In [66]:
# For each PRODUCT_CODE value, list the X_ columns that have at least one
# missing value among that product's rows (kept in frame column order).
prod_cols_dict = {
    code: [
        col
        for col, n_missing in train_df[train_df['PRODUCT_CODE'] == code].isnull().sum().items()
        if n_missing > 0 and 'X_' in col
    ]
    for code in train_df.PRODUCT_CODE.unique()
}

In [67]:
# For each LINE value, list the X_ columns that have at least one missing
# value among that line's rows (kept in frame column order).
line_cols_dict = {
    line_name: [
        col
        for col, n_missing in train_df[train_df['LINE'] == line_name].isnull().sum().items()
        if n_missing > 0 and 'X_' in col
    ]
    for line_name in train_df.LINE.unique()
}

In [68]:
# Distinct LINE values again, for reference while reading the per-line results.
train_df.LINE.unique()

array(['T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305'],
      dtype=object)

In [79]:
# For every (product, line) pair, print how many X_ columns have missing values
# in the product group, how many in the line group, and how many both share.
# The loop variables keep the names `prod` and `line` on purpose: later cells
# read the values these loops leave behind.
for prod in train_df.PRODUCT_CODE.unique():
    prod_missing = prod_cols_dict[prod]
    for line in train_df.LINE.unique():
        line_missing = line_cols_dict[line]
        shared = set(prod_missing) & set(line_missing)
        print('prod {}:{}, line {}:{}, cross : {}'.format(
            prod, len(prod_missing), line, len(line_missing), len(shared)))
    print()  # blank line between products

prod A_31:2633, line T050304:1972, cross : 1972
prod A_31:2633, line T050307:1425, cross : 1425
prod A_31:2633, line T100304:2639, cross : 2397
prod A_31:2633, line T100306:2219, cross : 1977
prod A_31:2633, line T010306:2004, cross : 2004
prod A_31:2633, line T010305:2004, cross : 2004

prod T_31:2645, line T050304:1972, cross : 1742
prod T_31:2645, line T050307:1425, cross : 1195
prod T_31:2645, line T100304:2639, cross : 2639
prod T_31:2645, line T100306:2219, cross : 2219
prod T_31:2645, line T010306:2004, cross : 1774
prod T_31:2645, line T010305:2004, cross : 1774

prod O_31:2225, line T050304:1972, cross : 1322
prod O_31:2225, line T050307:1425, cross : 775
prod O_31:2225, line T100304:2639, cross : 2219
prod O_31:2225, line T100306:2219, cross : 2219
prod O_31:2225, line T010306:2004, cross : 1354
prod O_31:2225, line T010305:2004, cross : 1354



In [134]:
# X_ columns that have missing values in *every* product group and *every*
# line group. Intersecting the per-group sets is sufficient; a preliminary
# union of all groups is a superset of each one and would drop out of the
# intersection anyway.
# This cell does not assign `prod`/`line`; cells that read those names rely on
# the pairwise product/line report loop having run.
missing_col_groups = [set(prod_cols_dict[code]) for code in train_df.PRODUCT_CODE.unique()]
missing_col_groups += [set(line_cols_dict[line_id]) for line_id in train_df.LINE.unique()]

# set.intersection needs at least one operand, so fall back to an empty set
# when there are no groups at all.
tmp = set.intersection(*missing_col_groups) if missing_col_groups else set()

In [135]:
# Number of X_ columns with missing values in every product group and every
# line group (assumes `tmp` is still the set built in the previous cell).
len(tmp)

630

In [112]:
# Size of the missing-column overlap for one (product, line) pair.
# `prod` and `line` are not set in this cell: they hold whatever the most
# recently executed loop over products/lines left behind (its last pair).
len(set(prod_cols_dict[prod]).intersection(line_cols_dict[line]))

1354

In [136]:
# Summary statistics for the columns collected in `tmp`.
# A DataFrame must not be indexed with a set (pandas >= 2.0 raises TypeError,
# and earlier versions returned the columns in arbitrary set order), so build
# a list by walking train_df.columns, which also keeps the frame's own order.
train_df[[col for col in train_df.columns if col in tmp]].describe()

Unnamed: 0,X_2149,X_2185,X_2151,X_2669,X_2092,X_2337,X_1973,X_2381,X_2163,X_2614,...,X_2355,X_2069,X_2216,X_2104,X_1168,X_1980,X_2349,X_2145,X_2658,X_2640
count,62.0,62.0,62.0,0.0,39.0,62.0,33.0,62.0,62.0,62.0,...,62.0,39.0,62.0,62.0,94.0,33.0,62.0,62.0,0.0,0.0
mean,56.806452,1.101613,51.824194,,0.121096,50.46129,2.3e-05,57.380645,53.032258,53.622581,...,52.316129,0.122445,52.598387,6e-06,262.458959,4.389364e-06,53.237097,50.816129,,
std,2.170357,0.0127,4.077515,,0.008983,1.012925,7e-06,2.869388,0.606731,1.625455,...,1.253087,0.008163,4.894208,4e-06,6.918203,2.807657e-06,0.844118,4.678279,,
min,55.2,1.1,48.8,,0.110433,49.7,8e-06,55.2,52.2,52.3,...,51.4,0.11296,48.9,3e-06,245.612903,8.09e-07,52.6,47.3,,
25%,55.2,1.1,48.8,,0.115833,49.7,1.6e-05,55.2,52.6,52.3,...,51.4,0.116715,48.9,4e-06,257.871976,1.72e-06,52.6,47.3,,
50%,55.2,1.1,48.8,,0.120529,49.7,2.5e-05,55.2,52.6,53.2,...,51.4,0.1219,48.9,4e-06,262.649698,6.02e-06,52.6,47.3,,
75%,59.2,1.1,57.45,,0.12275,51.6,2.9e-05,61.0,53.8,54.4,...,53.8,0.12495,59.2,9e-06,267.016129,7.05e-06,54.4,57.175,,
max,62.0,1.2,57.7,,0.158667,52.5,3e-05,62.0,54.1,59.6,...,55.5,0.152985,59.4,2.8e-05,281.903226,7.97e-06,54.4,57.4,,


In [111]:
# Summary statistics (over all training rows) for the columns that have
# missing values both in product `prod` and in line `line`.
# `prod` and `line` are not set in this cell: they hold whatever the most
# recently executed loop over products/lines left behind. The displayed column
# order follows set iteration order.
train_df[list(set(prod_cols_dict[prod])&(set(line_cols_dict[line])))].describe()

Unnamed: 0,X_2821,X_2427,X_2151,X_2163,X_2614,X_1847,X_2746,X_2327,X_2814,X_2412,...,X_2788,X_1782,X_2371,X_142,X_601,X_245,X_2605,X_2532,X_92,X_2216
count,120.0,120.0,62.0,62.0,62.0,120.0,120.0,62.0,120.0,120.0,...,120.0,120.0,62.0,120.0,87.0,120.0,62.0,120.0,175.0,62.0
mean,51.7075,249.762583,51.824194,53.032258,53.622581,0.0,180.085,59.556452,52.4325,1.087895,...,23.410017,0.738658,56.806452,0.0,457.712644,4.05,51.227419,55.766667,28.0,52.598387
std,4.031018,0.611615,4.077515,0.606731,1.625455,0.0,0.035857,1.164027,4.847769,0.032102,...,0.511297,0.078288,2.170357,0.0,27.181093,1.436908,1.646274,0.517157,0.0,4.894208
min,48.8,244.83,48.8,52.2,52.3,0.0,180.0,58.7,48.9,1.02,...,22.416667,0.610898,55.2,0.0,413.0,3.0,48.1,55.4,28.0,48.9
25%,48.8,249.78,48.8,52.6,52.3,0.0,180.1,58.7,48.9,1.095076,...,23.028846,0.682983,55.2,0.0,438.0,3.0,50.2,55.4,28.0,48.9
50%,48.8,249.92,48.8,52.6,53.2,0.0,180.1,58.7,48.9,1.1,...,23.448161,0.718151,55.2,0.0,460.0,3.0,50.2,55.4,28.0,48.9
75%,56.75,249.96,57.45,53.8,54.4,0.0,180.1,61.1,58.8,1.1,...,23.827586,0.787566,59.2,0.0,478.0,6.0,53.5,56.4,28.0,59.2
max,57.7,249.98,57.7,54.1,59.6,0.0,180.1,61.6,59.4,1.3,...,24.366667,1.049364,62.0,0.0,500.0,6.0,53.6,57.1,28.0,59.4


In [113]:
# Index range, column count, dtype breakdown and memory use of the raw frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Columns: 2881 entries, PRODUCT_ID to X_2875
dtypes: float64(2876), int64(1), object(4)
memory usage: 13.1+ MB


In [124]:
# Collect the columns shared by the current (prod, line) pair whose standard
# deviation over all training rows is at most 0.035857. Columns whose std is
# NaN fail the comparison and are left out.
# NOTE(review): the threshold equals the std printed for X_2746 in the
# describe() output above; it looks hand-picked rather than derived - confirm.
# `prod` and `line` are not set in this cell: they hold whatever the most
# recently executed loop over products/lines left behind.
std_zero = [
    col
    for col in set(prod_cols_dict[prod]) & set(line_cols_dict[line])
    if train_df[col].std() <= 0.035857
]
print(len(std_zero))

382


In [126]:
# Distinct values of X_92 (NaN is reported as a value by unique()).
train_df['X_92'].unique()

array([nan, 28.])

In [129]:
# Keep the describe() table for the (prod, line) overlap columns.
# NOTE: this rebinds `tmp` to a DataFrame; the cell that builds the common
# missing-column set uses the same name for a set, so the meaning of `tmp`
# depends on which of the two cells ran last.
# `prod` and `line` hold whatever the most recently executed loop left behind.
tmp = train_df[list(set(prod_cols_dict[prod])&(set(line_cols_dict[line])))].describe()

In [132]:
# Distinct non-null counts among the described columns; a count of 0 marks a
# column that is NaN in every training row. Requires `tmp` to be the
# describe() DataFrame, not the set of column names.
tmp.loc['count'].unique()

array([120.,  62.,   0.,  33.,  94., 110.,  70.,  23.,  42.,  87., 118.,
        39., 100.,  10., 174.,  78., 104., 175.,   6., 119.,  99.,  52.])

In [142]:
# Summary statistics for every numeric column of the raw training frame,
# stored in `desc` and displayed.
desc = train_df.describe()
desc

Unnamed: 0,Y_Class,Y_Quality,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
count,598.0,598.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0,349.0,...,100.0,100.0,100.0,100.0,100.0,99.0,0.0,0.0,0.0,0.0
mean,1.025084,0.530896,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,,,,
std,0.565069,0.007401,5.895256,4.10764,0.0,0.0,0.489019,0.0,4.373824,0.215571,...,7.011828,8.13899,7.158917,8.913065,4.52781,0.0,,,,
min,0.0,0.500856,1.0,87.0,0.0,45.0,10.0,0.0,45.0,10.0,...,32.12,31.7,32.56,30.49,61.67,1.0,,,,
25%,1.0,0.527535,2.0,93.0,0.0,45.0,10.0,0.0,45.0,10.0,...,49.485,52.2,42.16,49.915,63.645,1.0,,,,
50%,1.0,0.530436,2.0,95.0,0.0,45.0,10.0,0.0,45.0,10.0,...,53.425,55.925,51.46,56.175,65.14,1.0,,,,
75%,1.0,0.533433,2.0,98.0,0.0,45.0,11.0,0.0,51.0,10.0,...,55.2875,58.975,55.03,57.175,67.115,1.0,,,,
max,2.0,0.578841,103.0,102.0,0.0,45.0,11.0,0.0,62.0,11.0,...,60.24,68.66,60.41,59.93,79.75,1.0,,,,


In [143]:
# Distinct non-null counts across all numeric columns; a count of 0 marks a
# column that is NaN in every training row.
desc.loc['count'].unique()

array([598., 349., 175., 174.,  78.,  42., 120., 129., 249.,  59.,  70.,
       348.,  87.,  10.,   0., 247., 248., 110., 118.,  94., 104.,   6.,
       233., 119.,  52., 100.,  33.,  62.,  39.,  23.,  99.])