In [2]:
import pandas as pd
import numpy as np
import plotly.express as px


from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score

from sklearn import datasets





import warnings
warnings.filterwarnings('ignore')

In [3]:
pima = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/diabetes.csv')
pima.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
pima.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# 평가지표 출력하는 함수 설정
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

In [6]:
# 피쳐 데이터 세트 X, 레이블 데이터 세트 y 를 추출
X = pima.iloc[:,:-1]
y = pima['Outcome']

# 데이터를 훈련과 테스트 데이터 셋으로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 156, stratify=y)

# 로지스틱 회귀로 학습, 예측 및 평가 수행
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)

오차행렬:
 [[88 12]
 [23 31]]

정확도: 0.7727
정밀도: 0.7209
재현율: 0.5741
F1: 0.6392
AUC: 0.7270


In [7]:
#표
#for col in pima.columns:
#  fig = px.histogram(pima, x=col, nbins=20)
#  fig.show()

In [8]:
get_clf_eval(y_test, pred)

오차행렬:
 [[88 12]
 [23 31]]

정확도: 0.7727
정밀도: 0.7209
재현율: 0.5741
F1: 0.6392
AUC: 0.7270


In [9]:
feature_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
pima[feature_list] = pima[feature_list].replace(0, np.nan)

# 위 5개 feature 에 대해 0값을 평균 값으로 대체
mean_features = pima[feature_list].mean()
pima[feature_list] = pima[feature_list].replace(np.nan, mean_features)

In [26]:
X = pima.iloc[:, :-1]
y = pima.iloc[:, -1]

# StandardScaler 클래스를 상용하여 데이터 세트에 스케일링 적용
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state=156, stratify = y)

# 로지스틱 회귀로 학습, 예측
lr_clf = LogisticRegression()
lr_clf.fit(X_train,  y_train)
pred = lr_clf.predict(X_test)
print(X_train)

[[ 0.93691372 -0.41710645  0.62826949 ...  0.50104784  1.97855132
   0.83038113]
 [ 0.04601433  0.24043946  1.12461028 ...  0.38461094 -0.72747502
  -0.87137393]
 [-0.54791859  0.04317569 -2.01888141 ...  1.40343386  0.14533928
  -0.61611067]
 ...
 [-0.84488505  1.48977668  0.13192869 ... -1.3182788  -0.07512938
  -0.0204964 ]
 [-0.54791859 -0.21984267 -0.69530596 ... -0.24123743 -0.15365247
  -1.04154944]
 [-1.14185152  0.07605298 -1.35709369 ... -1.55115261 -0.06002879
  -1.04154944]]


In [11]:
pima.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.435949,12.096346,8.790942,85.021108,6.875151,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.202592,29.15342,155.548223,32.4,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [12]:
get_clf_eval(y_test, pred)

오차행렬:
 [[89 11]
 [21 33]]

정확도: 0.7922
정밀도: 0.7500
재현율: 0.6111
F1: 0.6735
AUC: 0.7506


In [13]:
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    #thresholds list 객체 내의 값을 iteration 하면서 평가 수행
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('\n임계값: ', custom_threshold)
        get_clf_eval(y_test, custom_predict)


In [14]:

#thresholds = [0.2,0.22,0.24,0.26,0.28,0.3,0.32,0.34,0.36,0.38,0.4,0.42,0.44,0.46,0.48,0.5]
pred_proba = lr_clf.predict_proba(X_test)
#get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1, 1), thresholds)


In [15]:
# 임계값을 설정
binarizer = Binarizer(threshold=0.38)

# 위에서 구한 predict_proba() 예측확률의 array에서 1에 해당하는 컬럼 값을 대입하여 Binarizer 반환하기
pred_th = binarizer.fit_transform(pred_proba[:, 1].reshape(-1, 1))

get_clf_eval(y_test, pred_th)

오차행렬:
 [[80 20]
 [15 39]]

정확도: 0.7727
정밀도: 0.6610
재현율: 0.7222
F1: 0.6903
AUC: 0.7611


In [28]:
Jack = np.array([3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885])
Rose = np.array([6,148,72,35,0,33.6,0.627,50])
ME1 = np.array([0,102,78,40,90,34.5,0.238,24])
ME2 = np.array([2,111,60,0,0,26.2,0.343,23])
ME3 = np.array([1,128,82,17,183,27.5,0.115,22])
ME4 = np.array([6,148,72,35,0,33.6,0.627,50])

sample_passengers = np.array([Jack,Rose,ME1,ME2,ME3,ME4])

scaler = StandardScaler()
scaler_sample = scaler.fit_transform(sample_passengers)

print(lr_clf.predict(scaler_sample))

[0 1 0 0 0 1]


In [29]:
import joblib 
joblib.dump(lr_clf.predict, 'data.pkl')
 

['data.pkl']

In [30]:
from google.colab import files
files.download('data.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>