# 의사결정나무 이해하기

In [1]:
# 로지스틱 회귀는 모수적 모델이라 결정 함수가 항상 S자(시그모이드) 모양이다.
# 의사결정나무는 비모수적 모델이므로 데이터 분포에 맞춰 결정 경계의 모양을 자유롭게 구성할 수 있다.
# soft voting은 각 클래스 예측 확률의 평균으로, hard voting은 가장 많은 표를 받은 클래스로 결정한다.

# 머신러닝 모델 훈련&예측

## 패키지들 불러오기

In [3]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_v2 import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)
from tqdm import tqdm

In [4]:
# Locations of the per-sample CSV folders and of the training label table.
train_folder = '../train/'
test_folder = '../test/'
train_label_path = '../etc/train_label.csv'

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sort the listings so the row order of the loaded frames -- and therefore the
# seeded (random_state=0) model fitted on them -- is reproducible everywhere.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# The first CSV column becomes the index of the label table.
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every file in parallel with ``func`` and concatenate the results.

    ``func`` is called once per entry of ``files`` in a worker process, with
    ``folder``, ``train_label``, ``event_time`` and ``nrows`` bound as keyword
    arguments. The per-file results are joined with ``pandas.concat`` in the
    order of ``files``.

    NOTE: one ``event_time`` is applied to every file. Per the original
    author's note this assumes all CSV files switch to "state B" at the same
    moment, even though the real switch time may differ from file to file.

    Args:
        func: Picklable per-file loader, called as
            ``func(file, folder=..., train_label=..., event_time=..., nrows=...)``.
        files: File names to load (any iterable; it is materialized once).
        folder: Forwarded to ``func``.
        train_label: Forwarded to ``func``.
        event_time: Forwarded to ``func``.
        nrows: Forwarded to ``func``.

    Returns:
        The concatenation of all per-file results.

    Raises:
        ValueError: If ``files`` is empty (there is nothing to concatenate).
    """
    files = list(files)
    if not files:
        # pd.concat([]) would raise a less helpful ValueError; fail early and
        # avoid spinning up a worker pool for nothing.
        raise ValueError('data_loader_all_v2: `files` is empty, nothing to load')

    func_fixed = partial(func, folder=folder, train_label=train_label,
                         event_time=event_time, nrows=nrows)

    # The former `if __name__ == '__main__'` guard lived inside the function:
    # whenever this function was imported from a module the guard was False,
    # the pool never ran and `df_list` was unbound. The context manager also
    # guarantees the workers are cleaned up if a loader raises.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        df_list = list(tqdm(pool.imap(func_fixed, files), total=len(files)))

    return pd.concat(df_list)

## 데이터 불러오기

In [6]:
# Load the training set in parallel. Per the original author's note,
# event_time=10 with nrows=40 keeps only the 30 seconds of samples between
# t=10s and t=40s of each file.
# NOTE(review): the exact meaning of event_time/nrows is defined inside
# data_loader_v2, which is not visible here -- confirm.
train = data_loader_all_v2(
    data_loader_v2,
    train_list,
    folder=train_folder,
    train_label=train_label,
    event_time=10,
    nrows=40,
)

100%|██████████| 826/826 [01:54<00:00,  7.21it/s]


In [7]:
# Load the test set with the same event_time/nrows window as the training set;
# no label table is passed (train_label=None).
test = data_loader_all_v2(
    data_loader_v2,
    test_list,
    folder=test_folder,
    train_label=None,
    event_time=10,
    nrows=40,
)

100%|██████████| 718/718 [01:38<00:00,  7.25it/s]


## 데이터 분리! 중요한 부분

In [9]:
# Preview the first five rows of the loaded training frame.
train.head(n=5)

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,30.474394,8.691177,8.714483,8.687399,8.72123,207.697895,165.86573,-6.018876999999999e-19,0.0,-0.002136,...,1.0,1.0,1.0,60.0,0.0,0.0,1.42162e-05,85.4,0.0,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,192.66508,191.006871,-3.9187579999999997e-19,0.0,0.00171,...,1.0,1.0,1.0,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.7991789999999997e-19,0.0,0.000493,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636970999999999e-19,0.0,0.000318,...,1.0,1.0,1.0,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
0,30.475773,8.790241,8.735125,8.703167,8.72103,193.269046,195.98489,-6.379752e-20,0.0,-9.1e-05,...,1.0,1.0,1.0,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110


In [10]:
# Split the training frame: the 'label' column is the target, every other
# column is a feature.
y_train = train['label']
X_train = train.drop(columns=['label'])

## 모델 정의

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Random forest with otherwise default settings: fixed seed for repeatable
# fits, progress logging enabled, and all available cores used (n_jobs=-1).
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

## 모델 훈련

In [14]:
# Fit the forest on the feature matrix and its labels.
model.fit(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.7min finished


RandomForestClassifier(n_jobs=-1, random_state=0, verbose=1)

## 모델 예측

In [15]:
# predict_proba returns, for every test row, the class probabilities averaged
# over all trees (soft voting). Note that RandomForestClassifier.predict()
# also selects the class with the highest mean probability; it is not a hard
# majority vote.
pred = model.predict_proba(test)
# The columns of `pred` follow model.classes_. Label them with the real class
# values instead of positions 0..k-1, so each probability stays attached to
# the right label even if the training labels are not exactly 0..k-1.
submission = pd.DataFrame(data=pred, columns=model.classes_)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.7s finished


In [16]:
# Give each prediction row the index of the test row it came from and call
# that index 'id'.
# NOTE(review): presumably the index identifies the source file, so several
# rows share one id -- confirm against data_loader_v2.
submission.index = test.index
submission.index.name = 'id'
# Order by id, then average the per-row probabilities into one row per id.
submission = submission.sort_index().groupby('id').mean()

In [None]:
#submission.to_csv('feat5120.csv') # 저장