## import

In [1]:
!pip install optuna

Collecting optuna
  Using cached optuna-3.6.1-py3-none-any.whl (380 kB)
Collecting colorlog
  Using cached colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Using cached alembic-1.13.1-py3-none-any.whl (233 kB)
Collecting typing-extensions>=4
  Using cached typing_extensions-4.11.0-py3-none-any.whl (34 kB)
Collecting Mako
  Using cached Mako-1.3.3-py3-none-any.whl (78 kB)
Installing collected packages: typing-extensions, Mako, colorlog, alembic, optuna
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.7.4.3
    Uninstalling typing-extensions-3.7.4.3:
      Successfully uninstalled typing-extensions-3.7.4.3
Successfully installed Mako-1.3.3 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1 typing-extensions-4.11.0

[notice] A new release of pip available: 22.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.5.3 requires numpy~=1.19.2, but you have numpy 1.18.5 which is incompatible.
tensorflow 2.5.3 requires typing-extensions~=3.7.4, but you have typing-extensions 4.11.0 which is incompatible.


In [4]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score # 정확도 함수


In [5]:
def seed_everything(seed):
    """Seed the random number sources this notebook uses so reruns are reproducible.

    Seeds Python's built-in ``random`` module and NumPy's global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            ``str()`` form is written to ``PYTHONHASHSEED``.

    Returns:
        None.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so assigning it
        here does not change string hashing in the already-running kernel; the
        value is only inherited by processes launched afterwards.
    """
    hash_seed = str(seed)
    random.seed(seed)  # built-in `random` module
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)  # NumPy global generator
    # If PyTorch is added to this notebook, seed it here as well:
    # torch.manual_seed(seed)       # CPU generator
    # torch.cuda.manual_seed(seed)  # generator of the current GPU


seed_everything(37)  # fix the seed for the whole notebook

## data load

In [6]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [7]:
# Preview the raw training frame.
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.531590,2022-06-13 5:47,T050304,A_31,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,2022-09-08 22:38,T050304,A_31,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,TRAIN_595,0,0.521289,2022-09-08 22:47,T050304,A_31,,,,,...,,,,,,1.0,,,,
596,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,,,,


In [8]:
# Build the training features: drop the identifier, the timestamp and the two Y_* columns.
train_x = train_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

In [9]:
# Inspect the training features after the column drop.
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,T050307,A_31,,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,T050304,A_31,,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,T050307,A_31,,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,T050304,A_31,,,,,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
594,T050304,A_31,,,,,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,T050304,A_31,,,,,,,,,...,,,,,,1.0,,,,
596,T100304,O_31,40.0,94.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,


In [10]:
# Classification target: the Y_Class column of the training frame.
train_y = train_df['Y_Class']

In [11]:
# Inspect the target values.
train_y

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    0
595    0
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

In [12]:
# Preview the raw test frame.
test_df

Unnamed: 0,PRODUCT_ID,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TEST_000,2022-09-09 2:01,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
1,TEST_001,2022-09-09 2:09,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
2,TEST_002,2022-09-09 8:42,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
3,TEST_003,2022-09-09 10:56,T010305,A_31,,,,,,,...,,,,,,,,,,
4,TEST_004,2022-09-09 11:04,T010306,A_31,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,2022-11-05 11:18,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
306,TEST_306,2022-11-05 16:39,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
307,TEST_307,2022-11-05 16:47,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
308,TEST_308,2022-11-05 20:53,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,


In [13]:
# Build the test features: drop the identifier and the timestamp.
test_x = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [14]:
# Inspect the test features after the column drop.
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


## Data preprocessing (전처리)

In [15]:
# Preprocessing step: replace every missing value (NaN) with 0 in both feature frames.
train_x, test_x = train_x.fillna(0), test_x.fillna(0)

In [16]:
# Inspect the training features after the NaN -> 0 fill.
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0
3,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,37.74,39.17,52.17,30.58,71.78,0.0,0.0,0.0,0.0,0.0
4,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.70,41.89,46.93,33.09,76.97,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,50.0,10.0,...,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0
594,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,49.47,53.07,50.89,55.10,66.49,1.0,0.0,0.0,0.0,0.0
595,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,1.0,0.0,0.0,0.0,0.0
596,T100304,O_31,40.0,94.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0


In [17]:
# Inspect the test features after the NaN -> 0 fill.
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,T010305,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,T010306,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1]:
# Print the LINE and X_1 columns of the test features.
# NOTE(review): the recorded output is a NameError because this cell was executed as In [1]
# on a fresh kernel, before test_x existed; re-run it after the cells above (or delete it
# if it was only a scratch check).
print(test_x[['LINE','X_1']])

NameError: name 'test_x' is not defined

In [18]:
# Names of the categorical (string-valued) feature columns.
qual_col = ['LINE', 'PRODUCT_CODE']

# Print the sorted distinct values of each categorical column in the test features.
for col_name in qual_col:
    distinct_values = np.unique(test_x[col_name])
    print(distinct_values)

['T010305' 'T010306' 'T050304' 'T050307' 'T100304' 'T100306']
['A_31' 'O_31' 'T_31']


In [19]:
# Look at the LINE column of the training features.
train_x['LINE']

0      T050304
1      T050307
2      T050304
3      T050307
4      T050304
        ...   
593    T100306
594    T050304
595    T050304
596    T100304
597    T100306
Name: LINE, Length: 598, dtype: object

In [20]:
# Fit an encoder on the training LINE column and print the classes it learned.
le = LabelEncoder().fit(train_x['LINE'])
print(le.classes_)

['T010305' 'T010306' 'T050304' 'T050307' 'T100304' 'T100306']


In [21]:
# Qualitative -> quantitative: label-encode the categorical columns of both frames.
#
# Label encoding sorts the distinct string values in ascending order and maps
# them to 0, 1, 2, ... . It suits tree-based models (decision tree, random
# forest), where the numeric gaps between codes do not influence the model.
# It should not be used with models that are sensitive to those gaps, such as
# logistic regression, SVMs and neural networks.
qual_col = ['LINE', 'PRODUCT_CODE']

for col_name in qual_col:
    # The encoder is fitted on the training column only.
    le = LabelEncoder().fit(train_x[col_name])
    train_x[col_name] = le.transform(train_x[col_name])

    # Labels that occur only in the test column are appended after the fitted
    # classes so that transform() can encode them as well.
    unseen_labels = [
        label for label in np.unique(test_x[col_name]) if label not in le.classes_
    ]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, unseen_labels)
    test_x[col_name] = le.transform(test_x[col_name])


In [22]:
# Print the training features after label encoding (LINE and PRODUCT_CODE now hold integer codes).
print(train_x)

     LINE  PRODUCT_CODE   X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
1       3             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
2       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
3       3             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
4       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
..    ...           ...   ...   ...  ...   ...   ...  ...   ...   ...  ...   
593     5             2   2.0  95.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
594     2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
595     2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
596     4             1  40.0  94.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
597     5             1  21.0  87.0  0.0  45.0  10.0  0.0  61.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  X_2872  X_

In [23]:
# Print the test features after label encoding (LINE and PRODUCT_CODE now hold integer codes).
print(test_x)

     LINE  PRODUCT_CODE  X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       5             2  2.0  94.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
1       4             2  2.0  93.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
2       4             2  2.0  95.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
3       0             0  0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
4       1             0  0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
..    ...           ...  ...   ...  ...   ...   ...  ...   ...   ...  ...   
305     5             2  2.0  91.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
306     4             2  2.0  96.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
307     5             2  2.0  91.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
308     5             2  2.0  95.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   
309     5             2  2.0  87.0  0.0  45.0  10.0  0.0  51.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  X_2872  X_2873  X_2874

## randomforest classifier 분류 모델 학습

In [24]:
# Random forest with default hyper-parameters; random_state is fixed (same value as the
# notebook seed) so that training is repeatable.
RF = RandomForestClassifier(random_state=37)
RF = RF.fit(train_x, train_y)

## 추론 예측

In [35]:
# Predict a class label for every row of the test features and print the raw array.
preds = RF.predict(test_x)
print(preds)

[1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


## Submit

In [39]:
# Load the submission template from the working directory.
submit = pd.read_csv('./sample_submission.csv')

In [40]:
# Preview the submission template.
submit

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0
...,...,...
305,TEST_305,0
306,TEST_306,0
307,TEST_307,0
308,TEST_308,0


In [42]:
# Overwrite the template's Y_Class column with the model predictions.
# NOTE(review): the assignment is positional - it assumes the template rows are in the
# same order as the rows of test.csv; confirm against the PRODUCT_ID columns.
submit['Y_Class'] = preds

In [45]:
# Check which class labels end up in the submission.
# NOTE(review): the recorded output lists only classes 0 and 1 although the training
# target also contains class 2 - worth checking class balance before submitting.
np.unique(submit['Y_Class']) 

array([0, 1], dtype=int64)

In [None]:
# Write the submission to CSV without the DataFrame index column.
# NOTE(review): this cell has no execution count in the saved notebook, so the file may
# not have been written yet - run it to produce the output.
submit.to_csv('./pretice.csv',index=False)