# Pima 아메리카 원주민 당뇨병 예측모델

In [1]:
import pandas as pd
import numpy as np

### 데이터 불러오기/전처리

In [2]:
pima = pd.read_csv('../00. data/pima/diabetes.csv')
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
total_count = pima['Glucose'].count()
for feature in zero_features:
    zero_count = pima[pima[feature] == 0][feature].count()
    percent = 100 * zero_count / total_count
    print(f'{feature}의 0 값 개수: {zero_count}, 비율은 {percent:.2f}%')

Glucose의 0 값 개수: 5, 비율은 0.65%
BloodPressure의 0 값 개수: 35, 비율은 4.56%
SkinThickness의 0 값 개수: 227, 비율은 29.56%
Insulin의 0 값 개수: 374, 비율은 48.70%
BMI의 0 값 개수: 11, 비율은 1.43%


In [5]:
pima[zero_features] = pima[zero_features].replace(0, pima[zero_features].mean())

### 데이터 정규화, 학습/테스트 데이터 분리

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
X = pima.iloc[:, :-1]
y = pima.iloc[:, -1]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=11)

### GridSearchCV

In [7]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_leaf': [1, 5, 8],
    'min_samples_split': [2, 3, 5]
}

In [8]:
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=5, scoring='accuracy', refit=True)
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')

In [9]:
print('GridSearchCV 최적 파라미터:', grid_dtree.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dtree.best_score_))

GridSearchCV 최적 파라미터: {'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.7606


In [10]:
estimator = grid_dtree.best_estimator_
pred = estimator.predict(X_test)
pred_proba = estimator.predict_proba(X_test)

### 평가

In [11]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print(f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}, F1 스코어: {f1:.4f}, ROC AUC 스코어: {roc_auc:.4f}')

In [12]:
get_clf_eval(y_test, pred)

오차 행렬
[[86 14]
 [35 19]]
정확도: 0.6818, 정밀도: 0.5758, 재현율: 0.3519, F1 스코어: 0.4368, ROC AUC 스코어: 0.6059


### 판단 기준 임의 설정

In [13]:
from sklearn.preprocessing import Binarizer
my_threshold_1 = 0.5
pred_proba_1 = pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=my_threshold_1)
custom_pred_1 = binarizer.fit_transform(pred_proba_1)

get_clf_eval(y_test, custom_pred_1)

오차 행렬
[[86 14]
 [35 19]]
정확도: 0.6818, 정밀도: 0.5758, 재현율: 0.3519, F1 스코어: 0.4368, ROC AUC 스코어: 0.6059


In [14]:
my_threshold_2 = 0.4
pred_proba_2 = pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=my_threshold_2)
custom_pred_2 = binarizer.fit_transform(pred_proba_2)

get_clf_eval(y_test, custom_pred_2)

오차 행렬
[[86 14]
 [35 19]]
정확도: 0.6818, 정밀도: 0.5758, 재현율: 0.3519, F1 스코어: 0.4368, ROC AUC 스코어: 0.6059


In [15]:
my_threshold_3 = 0.6
pred_proba_3 = pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=my_threshold_3)
custom_pred_3 = binarizer.fit_transform(pred_proba_3)

get_clf_eval(y_test, custom_pred_3)

오차 행렬
[[86 14]
 [35 19]]
정확도: 0.6818, 정밀도: 0.5758, 재현율: 0.3519, F1 스코어: 0.4368, ROC AUC 스코어: 0.6059


In [16]:
my_threshold_4 = 0.45
pred_proba_4 = pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=my_threshold_4)
custom_pred_4 = binarizer.fit_transform(pred_proba_4)

get_clf_eval(y_test, custom_pred_4)

오차 행렬
[[86 14]
 [35 19]]
정확도: 0.6818, 정밀도: 0.5758, 재현율: 0.3519, F1 스코어: 0.4368, ROC AUC 스코어: 0.6059


### LogisticRegression

In [17]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
lr_pred_proba = lr_clf.predict_proba(X_test)

get_clf_eval(y_test, lr_pred)

오차 행렬
[[89 11]
 [29 25]]
정확도: 0.7403, 정밀도: 0.6944, 재현율: 0.4630, F1 스코어: 0.5556, ROC AUC 스코어: 0.6765


In [18]:
lr_threshold_1 = 0.5
lr_pred_proba_1 = lr_pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=lr_threshold_1)
lr_custom_pred_1 = binarizer.fit_transform(lr_pred_proba_1)

get_clf_eval(y_test, lr_custom_pred_1)

오차 행렬
[[89 11]
 [29 25]]
정확도: 0.7403, 정밀도: 0.6944, 재현율: 0.4630, F1 스코어: 0.5556, ROC AUC 스코어: 0.6765


In [19]:
lr_threshold_2 = 0.4
lr_pred_proba_2 = lr_pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=lr_threshold_2)
lr_custom_pred_2 = binarizer.fit_transform(lr_pred_proba_2)

get_clf_eval(y_test, lr_custom_pred_2)

오차 행렬
[[84 16]
 [23 31]]
정확도: 0.7468, 정밀도: 0.6596, 재현율: 0.5741, F1 스코어: 0.6139, ROC AUC 스코어: 0.7070


In [20]:
lr_threshold_3 = 0.6
lr_pred_proba_3 = lr_pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=lr_threshold_3)
lr_custom_pred_3 = binarizer.fit_transform(lr_pred_proba_3)

get_clf_eval(y_test, lr_custom_pred_3)

오차 행렬
[[94  6]
 [30 24]]
정확도: 0.7662, 정밀도: 0.8000, 재현율: 0.4444, F1 스코어: 0.5714, ROC AUC 스코어: 0.6922


### RandomForestClassifier

In [21]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=11)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_pred_proba = rf_clf.predict_proba(X_test)

get_clf_eval(y_test, rf_pred)

오차 행렬
[[84 16]
 [27 27]]
정확도: 0.7208, 정밀도: 0.6279, 재현율: 0.5000, F1 스코어: 0.5567, ROC AUC 스코어: 0.6700


In [22]:
rf_threshold_1 = 0.5
rf_pred_proba_1 = rf_pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=rf_threshold_1)
rf_custom_pred_1 = binarizer.fit_transform(rf_pred_proba_1)

get_clf_eval(y_test, rf_custom_pred_1)

오차 행렬
[[84 16]
 [27 27]]
정확도: 0.7208, 정밀도: 0.6279, 재현율: 0.5000, F1 스코어: 0.5567, ROC AUC 스코어: 0.6700


In [23]:
rf_threshold_2 = 0.4
rf_pred_proba_2 = rf_pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=rf_threshold_2)
rf_custom_pred_2 = binarizer.fit_transform(rf_pred_proba_2)

get_clf_eval(y_test, rf_custom_pred_2)

오차 행렬
[[79 21]
 [17 37]]
정확도: 0.7532, 정밀도: 0.6379, 재현율: 0.6852, F1 스코어: 0.6607, ROC AUC 스코어: 0.7376


In [24]:
rf_threshold_3 = 0.6
rf_pred_proba_3 = rf_pred_proba[:, 1].reshape(-1, 1)

binarizer = Binarizer(threshold=rf_threshold_3)
rf_custom_pred_3 = binarizer.fit_transform(rf_pred_proba_3)

get_clf_eval(y_test, rf_custom_pred_3)

오차 행렬
[[94  6]
 [33 21]]
정확도: 0.7468, 정밀도: 0.7778, 재현율: 0.3889, F1 스코어: 0.5185, ROC AUC 스코어: 0.6644
