#### 학습 목표
- 머신러닝 분류 모델을 이용하고, 여러 가지 평가지표를 적용하여 성능을 확인
- 의학(당뇨병 여부 판단) : 재현율(Recall) 지표를 확인
    

In [1]:
from sklearn.tree         import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble     import RandomForestClassifier

from sklearn.preprocessing   import LabelEncoder , OneHotEncoder , StandardScaler , MinMaxScaler , Binarizer 
from sklearn.model_selection import train_test_split , GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score , roc_auc_score
from sklearn.metrics import confusion_matrix, precision_recall_curve , roc_curve

import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns

import missingno as ms
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# datasets

In [2]:
# Load the diabetes dataset from a CSV path relative to the working directory.
diabetesDF = pd.read_csv('./data/diabetes.csv')
# Preview the first rows as the cell's displayed result.
diabetesDF.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


* Pregnancies: 임신 횟수
* Glucose: 포도당 부하 검사 수치
* BloodPressure: 혈압(mm Hg)
* SkinThickness: 팔 삼두근 뒤쪽의 피하지방 측정값(mm)
* Insulin: 혈청 인슐린(mu U/ml)
* BMI: 체질량지수(체중(kg)/(키(m))^2)
* DiabetesPedigreeFunction: 당뇨 내력 가중치 값
* Age: 나이
* Outcome: 클래스 결정 값(0또는 1)

In [3]:
# Summarize the columns: dtypes, non-null counts and memory usage.
diabetesDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Count how many rows fall into each Outcome class to check class balance.
diabetesDF['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [None]:
# 모델생성 (3가지)
# 전처리(결측치 , 이상치 , 표준화&정규화)
# 모델 셀렉션 , 교차검증
# 학습, 예측, 평가

In [None]:
# 임계값 정밀도-재현율을 확인 및 시각화
# 분포확인


# StandardScaler 클래스를 이용해 피처 데이터 세트에 일괄적으로 스케일링 적용하고 0값을 평균값으로 대체한 데이터 세트로 학습/예측
# 분류결정 임계값을 변경하면서 성능 측정


In [23]:
# AUC 성능평가 확인

def metrics_evaluation(y_test, y_pred=None, y_pred_proba=None):
    """Print accuracy, precision, recall, F1 and ROC AUC for a set of predictions.

    The metrics are computed with the scikit-learn functions imported at the
    top of the file, using their default settings.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted class labels. Required in practice; the ``None``
            default is kept only for signature compatibility.
        y_pred_proba: Positive-class scores/probabilities handed to
            ``roc_auc_score``. When omitted, AUC is printed as ``N/A``
            instead of failing.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    # Every label-based metric needs y_pred, so fail early with a clear message
    # rather than letting the first metric call raise an opaque error.
    if y_pred is None:
        raise ValueError("y_pred is required to compute the classification metrics")

    # AUC is computed from scores, not hard labels; skip it when none were given.
    auc = "N/A" if y_pred_proba is None else roc_auc_score(y_test, y_pred_proba)

    print("정확도 : {} , 정밀도 : {} , 재현율 : {} , 조화평균(F1) : {} , AUC : {} "
          .format(accuracy_score(y_test, y_pred),
                  precision_score(y_test, y_pred),
                  recall_score(y_test, y_pred),
                  f1_score(y_test, y_pred),
                  auc))

In [25]:
# Baseline run: train and evaluate on the raw data, with no cleansing applied.

# Every column except the last is a feature; the last column is the label.
X = diabetesDF.iloc[:, :-1]
y = diabetesDF.iloc[:, -1]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=156, stratify=y
)

# Fit the three classifiers in a fixed order (logistic regression, decision
# tree, random forest) and collect hard predictions plus positive-class
# probabilities for each.
lr_clf = LogisticRegression().fit(X_train, y_train)
lr_y_pred = lr_clf.predict(X_test)

dt_clf = DecisionTreeClassifier().fit(X_train, y_train)
dt_y_pred = dt_clf.predict(X_test)

rf_clf = RandomForestClassifier().fit(X_train, y_train)
rf_y_pred = rf_clf.predict(X_test)

# roc_auc_score is fed the probability of class 1 (second predict_proba column).
lr_pred_proba = lr_clf.predict_proba(X_test)[:, 1]
dt_pred_proba = dt_clf.predict_proba(X_test)[:, 1]
rf_pred_proba = rf_clf.predict_proba(X_test)[:, 1]

# Print one report per model, separated by a blank line.
_reports = (
    ('로지스틱', lr_y_pred, lr_pred_proba),
    ('의사결정트리', dt_y_pred, dt_pred_proba),
    ('랜덤포레스트', rf_y_pred, rf_pred_proba),
)
for _pos, (_title, _labels, _scores) in enumerate(_reports):
    if _pos > 0:
        print()
    print(_title)
    metrics_evaluation(y_test, _labels, _scores)



로지스틱
정확도 : 0.7727272727272727 , 정밀도 : 0.7209302325581395 , 재현율 : 0.5740740740740741 , 조화평균(F1) : 0.6391752577319588 , AUC : 0.7918518518518518 

의사결정트리
정확도 : 0.6818181818181818 , 정밀도 : 0.5454545454545454 , 재현율 : 0.5555555555555556 , 조화평균(F1) : 0.5504587155963303 , AUC : 0.6527777777777778 

랜덤포레스트
정확도 : 0.7857142857142857 , 정밀도 : 0.7560975609756098 , 재현율 : 0.5740740740740741 , 조화평균(F1) : 0.6526315789473683 , AUC : 0.8240740740740742 


#### 스케일링 변환 시 주의사항: 테스트 데이터에 scaler를 다시 fit하지 말고, 학습 데이터로 fit한 scaler로 transform만 적용할 것

In [9]:
# Toy single-feature column vectors: train holds 0..10, test holds 0..5.
X_train = np.arange(11)[:, np.newaxis]
X_test = np.arange(6)[:, np.newaxis]

In [11]:
# Display both arrays together with their shapes.
X_train , X_test , X_train.shape , X_test.shape

(array([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10]]),
 array([[0],
        [1],
        [2],
        [3],
        [4],
        [5]]),
 (11, 1),
 (6, 1))

In [13]:
# Learn the min/max from the training data, then scale that same data.
scaler = MinMaxScaler().fit(X_train)
scaler_train = scaler.transform(X_train)

# Flatten to 1-D so the before/after values print on a single line each.
print('original : ' ,  X_train.ravel())
print('scaler   : ' ,  scaler_train.ravel())

original :  [ 0  1  2  3  4  5  6  7  8  9 10]
scaler   :  [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [14]:
# Pitfall demo: a brand-new scaler is fitted on the test data itself, so the
# test set's own min/max (0 and 5) define the range and 5 is mapped to 1.0,
# a different scale from the one learned on the training data.
scaler = MinMaxScaler().fit(X_test)
scaler_test = scaler.transform(X_test)

# Flatten to 1-D so the before/after values print on a single line each.
print('original : ' ,  X_test.ravel())
print('scaler   : ' ,  scaler_test.ravel())

original :  [0 1 2 3 4 5]
scaler   :  [0.  0.2 0.4 0.6 0.8 1. ]


In [16]:
# Fit once on the training data and reuse the fitted scaler for both sets, so
# train and test share the same scale.
scaler = MinMaxScaler().fit(X_train)

scaler_train = scaler.transform(X_train)
print('original : ' ,  X_train.ravel())
print('scaler   : ' ,  scaler_train.ravel())

# Only transform the test data; the scaler is not refitted here.
scaler_test = scaler.transform(X_test)
print('original : ' ,  X_test.ravel())
print('scaler   : ' ,  scaler_test.ravel())

original :  [ 0  1  2  3  4  5  6  7  8  9 10]
scaler   :  [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
original :  [0 1 2 3 4 5]
scaler   :  [0.  0.1 0.2 0.3 0.4 0.5]
