## 머신러닝 모델 학습
---

In [10]:
import pandas as pd

# Load the raw table, discard the columns this analysis does not use, and
# give the remaining columns descriptive names -- all in one chain so the
# cell can be re-run without depending on earlier in-place edits.
origin_df = (
    pd.read_csv('./heart.csv')
    .drop(columns=['oldpeak', 'slp', 'caa', 'restecg', 'thall'])
    .rename(
        columns={
            'age': 'Age',
            'sex': 'Sex',
            'cp': 'PainType',
            'trtbps': 'BloodPressure',
            'chol': 'CholestoralDensity',
            'fbs': 'BloodSugar',
            'thalachh': 'MaxHeartRate',
            'exng': 'IsExerciseInduced',
            'output': 'Result',
        }
    )
)

# Separate the target column ('Result') from the predictor columns.
df_label = origin_df['Result']
df_feature = origin_df.drop(columns=['Result'])

### One-Hot Encoding

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Columns that get expanded into one indicator column per distinct value.
atomic_col_names = [
    'Sex',
    'PainType',
    'BloodSugar',
    'IsExerciseInduced',
]

# get_dummies replaces each listed column with `<name>_<value>` indicator
# columns and leaves every other column untouched.
df_feature_oh = pd.get_dummies(data=df_feature, columns=atomic_col_names)

# Preview the first rows of the encoded table.
df_feature_oh.head()

Unnamed: 0,Age,BloodPressure,CholestoralDensity,MaxHeartRate,Sex_0,Sex_1,PainType_0,PainType_1,PainType_2,PainType_3,BloodSugar_0,BloodSugar_1,IsExerciseInduced_0,IsExerciseInduced_1
0,63,145,233,150,0,1,0,0,0,1,0,1,1,0
1,37,130,250,187,0,1,0,0,1,0,1,0,1,0
2,41,130,204,172,1,0,0,1,0,0,1,0,1,0
3,56,120,236,178,0,1,0,1,0,0,1,0,1,0
4,57,120,354,163,1,0,1,0,0,0,1,0,0,1


#### 불필요한 특성을 포함한 경우와 제외한 경우의 모델 학습 비교
1. 포함한 상태

In [12]:
from sklearn.model_selection import train_test_split

# Split into training data and test data (8 : 2 by default).
def split(feature, label, test_size=0.2, random_state=11):
    """Split features and labels into train and test subsets.

    Args:
        feature: Feature table passed to ``train_test_split``.
        label: Labels passed to ``train_test_split`` alongside ``feature``.
        test_size: Forwarded to ``train_test_split``; the default 0.2 keeps
            the original 8 : 2 split.
        random_state: Forwarded to ``train_test_split``; the default 11 keeps
            the original, reproducible split.

    Returns:
        Whatever ``train_test_split`` returns for two inputs; callers unpack
        it as ``x_train, x_test, y_train, y_test``.
    """
    return train_test_split(
        feature,
        label,
        test_size=test_size,
        random_state=random_state,
    )

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardizing after the train/test split and then joining produced NaN,
# because the row labels of the two frames no longer matched. As a workaround
# the whole feature table is standardized first and split afterwards.
# NOTE(review): fitting the scaler on every row lets the rows that later become
# the test set influence the scaling statistics (data leakage) -- consider
# fitting the scaler on the training rows only.

con_indices = ['Age', 'BloodPressure', 'CholestoralDensity', 'MaxHeartRate']  # names of the continuous columns

x_scaled = StandardScaler().fit_transform(df_feature[con_indices])  # standardize the continuous columns only

# fit_transform returns a plain array, so the original row labels are lost.
# Reusing df_feature's index keeps the join below aligned row-for-row even when
# the index is not a default 0..n-1 range; mismatched labels are what turn
# into NaN after a join.
df_scaled_feature = pd.DataFrame(x_scaled, columns=con_indices, index=df_feature.index)

# Attach the one-hot encoded columns (everything except the continuous ones)
# to the standardized continuous columns.
df_scaled_feature = df_scaled_feature.join(df_feature_oh.drop(columns=con_indices))

# Split the standardized features and the labels into train / test sets.
x_train_s, x_test_s, y_train_s, y_test_s = split(df_scaled_feature, df_label)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score

# Model training with logistic regression.
def logisticReg(x_train, y_train, x_test, y_test):
    """Fit a default LogisticRegression and score it on held-out data.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict on.
        y_test: True labels for ``x_test``.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)

    predicted = classifier.predict(x_test)

    return accuracy_score(y_test, predicted)

# Accuracy on the single hold-out split of the standardized data.
holdout_accuracy = logisticReg(x_train_s, y_train_s, x_test_s, y_test_s)
print('정확도 :', holdout_accuracy)

정확도 : 0.819672131147541


In [15]:
import numpy as np
from sklearn.model_selection import KFold

def _take_rows(data, positions):
    """Select rows by position from a pandas object or any other indexable."""
    # KFold yields positional indices, so pandas objects must be indexed with
    # .iloc; label-based lookup only works by accident on a 0..n-1 index.
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]


# K-fold cross-validation
def LR_KFold(n_splits, feature, label, shuffle=False, random_state=None):
    """Return the mean logistic-regression accuracy over K folds.

    Args:
        n_splits: Number of folds handed to ``KFold``.
        feature: Feature table; the folds are generated from this argument
            (not from a notebook-level global).
        label: Labels, row-aligned with ``feature``.
        shuffle: Forwarded to ``KFold``. The default ``False`` keeps the
            original behaviour of contiguous, unshuffled folds.
            NOTE(review): if the rows happen to be ordered by label,
            unshuffled folds can be heavily class-imbalanced -- consider
            ``shuffle=True``.
        random_state: Seed forwarded to ``KFold``; only used when ``shuffle``
            is true, because KFold rejects a seed without shuffling.

    Returns:
        ``np.mean`` of the per-fold accuracies returned by ``logisticReg``.
    """
    cv = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    accs = []

    # Split the data that was actually passed in, so any feature table works.
    for train_index, val_index in cv.split(feature):
        x_train = _take_rows(feature, train_index)
        y_train = _take_rows(label, train_index)

        x_val = _take_rows(feature, val_index)
        y_val = _take_rows(label, val_index)

        accs.append(logisticReg(x_train, y_train, x_val, y_val))

    return np.mean(accs)

# Mean 5-fold accuracy on the full standardized feature table.
kfold_mean_accuracy = LR_KFold(5, df_scaled_feature, df_label)
print('평균 정확도 :', kfold_mean_accuracy)

평균 정확도 : 0.6926775956284155


2. 포함하지 않은 상태

In [16]:
# Data manipulation: build a reduced feature table without the indicator
# columns this experiment treats as unnecessary.
unnecessary_indices = ['PainType_3', 'BloodSugar_0', 'BloodSugar_1']

df_scaled_feature2 = df_scaled_feature.drop(columns=unnecessary_indices)

# Same 5-fold evaluation as before, now on the reduced table.
mean_accuracy = LR_KFold(5, df_scaled_feature2, df_label)

print('데이터 조작 후 KFold 평균 정확도 :', mean_accuracy)

데이터 조작 후 KFold 평균 정확도 : 0.6992896174863389


In [17]:
# Vary KFold's n_splits from 3 through 10 and report the mean accuracy for each.
for i in range(3, 11):
    fold_accuracy = LR_KFold(i, df_scaled_feature2, df_label)
    print(str(i) + '번 나눔 : ', fold_accuracy)

3번 나눔 :  0.5610561056105611
4번 나눔 :  0.6664912280701755
5번 나눔 :  0.6992896174863389
6번 나눔 :  0.7060130718954248
7번 나눔 :  0.729009362730293
8번 나눔 :  0.7453769559032717
9번 나눔 :  0.732224202812438
10번 나눔 :  0.7386021505376343


- n_splits를 8로 설정했을 때(8번 나눴을 때) 평균 정확도가 약 0.745로 가장 높게 나왔음