# Index

#### 01 : Setting (import Library)  

#### 02 : NumPy API

- 개요 : 머신러닝의 근본 / 행렬 배열 계산
- 차원
    - [ ] = axis 중첩수 : 차원수
    - axis0 : 신규차원의 방향이 axis0  
- 생성
    - import numpy as np
    - np.array( [ndarray] )
    - np.arange( start = int , stop = int )
    - np.zeros( (int, int) ) : shape 는 튜플로 전달한다
    - np.ones( (int, int) ) : shape 는 튜플로 전달한다
- 재구성
    - A.reshape( axis0 size , axis1 size , axis2 size ... )
        - -1 을 넣으면 나머지 입력값에 맞춰 해당 axis 의 크기를 자동으로 계산해 준다
- 인덱싱
    - Sampling : array[ int , int ]
    - Slicing : array[ 1:3 , : ]
    - Fancy index : array[ [1,2,3], 1:3 ]
        - 연속된 값의 경우 차원이 축소 되지 않는다
    - Boolean index : array_answer = array[ array > 5 ]
- 정렬
    - np.sort( [ndarray] )
        - 원 행렬은 유지한 채 재정렬된 행렬을 반환
    - ndarray.sort()
        - 원 행렬 자체를 지정한 형태로 재정렬 후 , 반환값은 None
        - default : 오름차순 , 내림차순은 np.sort( ndarray )[::-1] ( ndarray.sort() 는 None 을 반환하므로 [::-1] 을 바로 붙일 수 없다 )
    - np.argsort( )
- 선형대수 연산
    - 내적 : np.dot( A , B)
    - 전치 : np.transpose( A )

#### 03 : Pandas API  

- 개요 : Python 데이터 처리에 있어 가장 인기있는 라이브러리
- 구성
    - Series : Index / DataFrame : Index & Column
- 생성
    - import pandas as pd
    - pd.read_csv('파일명.csv')
    - pd.DataFrame(dic_name)
- DATA check
    - A.head()
    - A.tail()
    - A.info()
    - A.shape ( 메서드가 아닌 속성이므로 괄호 없이 사용 )
    - A.describe()
    - df_name['col_name'].value_counts()
        - dropna = True : default
        - dropna = False : null 값도 count 를 진행한다
- Type change
    - List -> ndarray -> DF
    - dict -> DF
    - DF -> ndarray -> List
    - DF -> dict
- axis 추가
    - A['new_col'] = data or A['col'] func
- axis 수정
    - A['org_col'] = data or A['col'] func
- axis 삭제 : 지정axis ( col or row ) 삭제
    - A.drop(['name_1', 'name_2',...], axis = , inplace = )
        - inplace = False (default)
        - inplace = True
- axis 재정렬
    - A.reset_index(drop = , inplace = )
- axis 재명명
    - A.rename(columns = {'old_1' : 'new_1' , 'old_2' : 'new_2'}, inplace = ) 
- 필터링
    - 기본식 : A['col_name']
    - 명칭기반
        - A.loc[ row_label , 'col_name' ]
    - 위치기반
        - A.iloc[ row_position , col_position ]
    - Boolean indexing
        - A[ boolean ]
- 정렬
    - A.sort_values(by = 'col_name' , ascending = , inplace = )
- Aggregation (집합연산)
    - .sum()
    - .max()
    - .min()
    - .count()
    - .mean()
- GroupBy
    - A.groupby(by='col_name').agg({ 'col_1' : [np.func_1, np.func_2], 'col_2' : np.func_3 })
- Null data
    - null check
        - A['col'].isna()
    - null fill
        - A['col'].fillna('a')
        - A['col'].replace(np.nan, 'new_data', inplace = )
- 고유값 확인
    - A['col'].value_counts() : 객체 고유값의 수량
    - A['col'].nunique() : 객체 고유값의 종류 가짓 수
- DF 변경
    - A['col'].replace( { 'org_data' : 'new_data' }, inplace = )
    - A['col'].replace(np.nan, 'new_data', inplace = )
- lambda
    - func = lambda x : return_value
    - .apply(lambda x : return)

# 01 : Setting (Import Libraries)

In [7]:
# 1 data EDA Library
import numpy as np 
import pandas as pd
import random as rnd 
import missingno as msno
import sklearn
import xgboost
import lightgbm

In [2]:
# 2 plot vizualize Library
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns 
sns.set(font_scale=1.0) 
plt.style.use('seaborn-whitegrid') 
sns.set_theme(style='whitegrid', font_scale=1.0) 

In [3]:
# 3 ML / DL Library
import tensorflow as tf
import keras as kr 
from keras import layers
import torch

In [4]:
# 4 scikit-learn algorithm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.svm import SVC    
from xgboost import XGBClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB 

In [5]:
# 5 모델 튜닝 및 평가
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn import model_selection 

In [6]:
# 6 Silence warnings (pandas emits warning messages very frequently)
import sys
import warnings
# Install a catch-all "ignore" filter for every warning category.
warnings.simplefilter(action='ignore')

# 02 : Training

## 00 - Setting

In [1]:
# Numeric / tabular stack and the notebook display helper.
import numpy as np
import pandas as pd
import sklearn
from IPython.display import display

# Dataset loader, classifier and metric used by the sections that follow.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the iris dataset once; every later cell reads from this object.
iris = load_iris()

# The returned object exposes dict-style keys; list them to see what it holds.
print(iris.keys())


dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


## 01 - ndarray_data set

In [51]:
# Echo the whole dataset object so the notebook renders its raw contents.
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

## 02 - DF_data set

In [54]:
import pandas as pd

# Wrap the feature matrix in a DataFrame (one column per feature name) and
# append the target values as an extra 'label' column.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(label=iris.target)

# Preview the first three rows.
iris_df.head(3)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


## 03 - 임의 분할 : train_test_split()

In [80]:
# Hold-out evaluation: fit on 70% of the samples and score on the other 30%.
dt_clf = DecisionTreeClassifier(random_state=5)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=5
)

dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the value is unchanged, but the correct
# order matters as soon as the metric is swapped for an asymmetric one.
accuracy_tts = accuracy_score(y_test, pred)

print('# train_test_split() : ', np.round(accuracy_tts,4))

# train_test_split() :  0.9556


## 04 - Kfold

In [75]:
# Fresh classifier for this section. The cell used to assign it to `df_clf`
# (typo) while the loop trained `dt_clf`, i.e. it silently reused whatever
# estimator an earlier cell had left behind.
dt_clf = DecisionTreeClassifier(random_state=5)

from sklearn.model_selection import KFold
# Plain K-fold with default settings (no shuffling is requested here).
kfold = KFold(n_splits=3)

accuracy_kfold = []
n = 0

# NOTE(review): the recorded run scored 0.0 on every fold. Presumably the iris
# rows are stored grouped by class, so each unshuffled test fold contains a
# class that is absent from its training folds - confirm, and compare with the
# StratifiedKFold section.
# kfold.split() yields one (train indices, test indices) pair per fold.
for n, (train_index, test_index) in enumerate(kfold.split(iris.data), start=1):
    X_train, X_test = iris.data[train_index], iris.data[test_index]
    y_train, y_test = iris.target[train_index], iris.target[test_index]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Documented argument order is (y_true, y_pred).
    accuracy = accuracy_score(y_test, pred)
    print('# Kfold {} 차 결과 : {:.4f}'.format(n, np.round(accuracy,4)))

    accuracy_kfold.append(accuracy)

# After the loop, n holds the number of folds that were evaluated.
print('# 1~{} 차 Kfold 교차검증 평균 : {:.4f}'.format(n, np.round(np.mean(accuracy_kfold),4)))

# Kfold 1 차 결과 : 0.0000
# Kfold 2 차 결과 : 0.0000
# Kfold 3 차 결과 : 0.0000
# 1~3 차 Kfold 교차검증 평균 : 0.0000


## 05 - Stratified Kfold

In [76]:
# Fresh classifier so this section does not depend on earlier fits.
dt_clf = DecisionTreeClassifier(random_state=5)

from sklearn.model_selection import StratifiedKFold
stf_kfold = StratifiedKFold(n_splits=3)

accuracy_stf_kfold = []
n = 0

# Unlike KFold.split(), the stratified splitter also receives the labels
# (iris.target) so it can take them into account when building the folds.
for n, (train_index, test_index) in enumerate(stf_kfold.split(iris.data, iris.target), start=1):
    X_train, X_test = iris.data[train_index], iris.data[test_index]
    y_train, y_test = iris.target[train_index], iris.target[test_index]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Documented argument order is (y_true, y_pred).
    accuracy = accuracy_score(y_test, pred)
    print('# stf_Kfold {} 차 결과 : {:.4f}'.format(n, np.round(accuracy,4)))

    accuracy_stf_kfold.append(accuracy)

# After the loop, n holds the number of folds that were evaluated.
print('# 1~{} 차 Kfold 교차검증 평균 : {:.4f}'.format(n, np.round(np.mean(accuracy_stf_kfold),4)))


# stf_Kfold 1 차 결과 : 0.9800
# stf_Kfold 2 차 결과 : 0.9400
# stf_Kfold 3 차 결과 : 0.9800
# 1~3 차 Kfold 교차검증 평균 : 0.9667


## 06 - cross_val_score

In [77]:
dt_clf = DecisionTreeClassifier(random_state=5)

# cross_val_score is the function this cell actually calls, so import it here;
# the import line used to bring in only cross_val_predict / cross_validate and
# relied on an unrelated earlier cell for cross_val_score.
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate

# One call runs the split / fit / predict / score loop for cv = 3 folds and
# returns one accuracy per fold.
# NOTE(review): scikit-learn documents that an integer cv with a classifier
# uses stratified folds, which is consistent with the recorded scores matching
# the StratifiedKFold section - confirm against the installed version.
accuracy_cvs = cross_val_score(dt_clf, iris.data, iris.target, scoring='accuracy', cv=3)

print('교차 검증별 정확도 : ', np.round(accuracy_cvs,4))
print('평균 교차검증 정확도 : ', np.round(np.mean(accuracy_cvs),4))

교차 검증별 정확도 :  [0.98 0.94 0.98]
평균 교차검증 정확도 :  0.9667


## 07 - GridSearch CV
교차검증과 최적 하이퍼 파라미터 튜닝을 한번에

In [127]:
# Split the data: 70% for the grid search, 30% held out for the final check.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.3, random_state = 5)

# NOTE(review): unlike the other sections no random_state is passed here, so
# repeated runs are presumably not guaranteed to be identical - confirm.
dtree = DecisionTreeClassifier()

# Hyper-parameter grid (dict): 3 max_depth values x 2 min_samples_split values
# = 6 candidate settings.
parameter = { 'max_depth' : [1,2,3], 'min_samples_split' : [2,3] }

import pandas as pd

from sklearn.model_selection import GridSearchCV
# Configure GridSearchCV
#   every candidate in param_grid is evaluated on cv = 3 train/validation folds:
#   (max_depth) * (min_samples_split) * (cv) = 3 * 2 * 3 = 18 fits
#   refit = True asks for the best candidate to be fitted once more afterwards
grid_dtree = GridSearchCV(dtree, param_grid = parameter, cv = 3, refit = True)

# Train / validate every candidate on the training split only.
grid_dtree.fit(X_train, y_train)

# Convert the search results for inspection: dict -> DataFrame.
score_df = pd.DataFrame(grid_dtree.cv_results_)
display(score_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']])

# Summarise the search: best parameter set and its mean cross-validation score.
print('GridSerachCV 최적 parameter : ', grid_dtree.best_params_)
print('GridSearchCV 최고 정확도 : {:.4f}'.format(grid_dtree.best_score_))

# Estimator produced by the refit step of GridSearchCV.
estimator = grid_dtree.best_estimator_

# Evaluate on the held-out test split.
#   best_estimator_ has already been fitted by the refit step, so it is used
#   for prediction directly without another fit() call.
pred = estimator.predict(X_test)
# Documented argument order is (y_true, y_pred).
accuracy_grd = accuracy_score(y_test, pred)
print('# 테스트 데이터 정확도 : {:.4f}'.format(accuracy_grd))

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.67619,5,0.657143,0.685714,0.685714
1,"{'max_depth': 1, 'min_samples_split': 3}",0.67619,5,0.657143,0.685714,0.685714
2,"{'max_depth': 2, 'min_samples_split': 2}",0.961905,1,0.971429,0.971429,0.942857
3,"{'max_depth': 2, 'min_samples_split': 3}",0.961905,1,0.971429,0.971429,0.942857
4,"{'max_depth': 3, 'min_samples_split': 2}",0.961905,1,0.971429,0.971429,0.942857
5,"{'max_depth': 3, 'min_samples_split': 3}",0.961905,1,0.971429,0.971429,0.942857


GridSerachCV 최적 parameter :  {'max_depth': 2, 'min_samples_split': 2}
GridSearchCV 최고 정확도 : 0.9619
# 테스트 데이터 정확도 : 0.9333


## 08 - 결과 비교

In [123]:
# Side-by-side summary of the accuracies collected by the sections above.
# Each inner list is one group of (label, value) lines; groups are separated
# by an empty line when printed.
report_sections = [
    [
        ('# train_test_split 정확도 : ', np.round(accuracy_tts,4)),
    ],
    [
        ('# Kfold 교차검증 별 정확도 : ', np.round(accuracy_kfold,4)),
        ('## Kfold 교차검증 평균 정확도 : ', np.round(np.mean(accuracy_kfold),4)),
    ],
    [
        ('# Stratified Kfold 교차검증 별 정확도 : ', np.round(accuracy_stf_kfold,4)),
        ('## Stratified Kfold 교차검증 평균 정확도 : ', np.round(np.mean(accuracy_stf_kfold),4)),
    ],
    [
        ('# cross_val_scroe 교차검증 별 정확도 : ', np.round(accuracy_cvs,4)),
        ('## cross_val_score 교차검증 평균 정확도 : ', np.round(np.mean(accuracy_cvs),4)),
    ],
    [
        ('# GridSearchCV 최적 튜닝 시 각 정확도 : ', np.round(score_df['mean_test_score'].tolist(),4)),
        ('# GridSearchCV 최적 튜닝 시 최대 정확도 : ', np.round(np.max(score_df['mean_test_score']),4)),
        # NOTE(review): accuracy_grd is the single hold-out test accuracy from
        # the GridSearchCV section (np.mean of one value leaves it unchanged),
        # although the label calls it a cross-validation accuracy.
        ('## GridSearchCV 하이퍼튜닝 최종 교차검증 정확도 : ', np.round(np.mean(accuracy_grd),4)),
    ],
]

for section_no, section in enumerate(report_sections):
    if section_no > 0:
        print('')
    for label, value in section:
        print(label, value)

# train_test_split 정확도 :  0.9556

# Kfold 교차검증 별 정확도 :  [0. 0. 0.]
## Kfold 교차검증 평균 정확도 :  0.0

# Stratified Kfold 교차검증 별 정확도 :  [0.98 0.94 0.98]
## Stratified Kfold 교차검증 평균 정확도 :  0.9667

# cross_val_scroe 교차검증 별 정확도 :  [0.98 0.94 0.98]
## cross_val_score 교차검증 평균 정확도 :  0.9667

# GridSearchCV 최적 튜닝 시 각 정확도 :  [0.6762 0.6762 0.9619 0.9619 0.9619 0.9619]
# GridSearchCV 최적 튜닝 시 최대 정확도 :  0.9619
## GridSearchCV 하이퍼튜닝 최종 교차검증 정확도 :  0.9333
