In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용 - 리눅스 사용할때
import scipy
import scipy.stats as stats

import os
import re
import time
import datetime as dt # 사용시 datetime.dt 로 사용 # 현재 시간 사용
from datetime import datetime # 사용시 함수로 바로 사용 #-> 시간 비교

In [2]:
### 한글 폰트 설치 - 윈도우 사용할 때
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import platform

# Register a Korean-capable font so Hangul text renders in matplotlib figures.
# The Malgun Gothic file below only exists on Windows; when it is missing
# (Linux/macOS, or a different Windows install) keep matplotlib's default font
# instead of failing the whole cell.
path = 'c:/Windows/Fonts/malgun.ttf'  # alternative: 'c:/Windows/Fonts/NanumGothic.ttf'
if os.path.isfile(path):
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Keep font_name defined so later cells that read it still work.
    font_name = font_manager.FontProperties().get_name()
    print(f"Font file not found at {path}; keeping matplotlib default font '{font_name}'.")

In [3]:
# Silence all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (for example
# pandas chained-assignment or scikit-learn convergence warnings) that may
# point at real problems; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

## 데이터 전처리

In [4]:
# Load the heart-disease dataset from a path relative to this notebook
# and preview the first rows.
df_heart = pd.read_csv("../../dataset/heart_2020_cleaned.csv")
df_heart.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [5]:
# Remove BMI outliers with the 1.5 * IQR rule.
# The quartiles are computed from the loaded data rather than pasted in as
# literals, so the bounds stay correct if the dataset is refreshed.
Q1 = float(df_heart['BMI'].quantile(0.25))
Q3 = float(df_heart['BMI'].quantile(0.75))
IQR = Q3-Q1
print(f"(1) Q1-1.5*IQR 값 : {Q1-1.5*IQR}")
print(f"(2) Q3+1.5*IQR 값 : {Q3+1.5*IQR}")

# Expression string later passed to DataFrame.query() to keep in-range rows.
condition_bmi = f"{Q1-1.5*IQR} <= BMI <= {Q3+1.5*IQR}"

(1) Q1-1.5*IQR 값 : 12.945
(2) Q3+1.5*IQR 값 : 42.505


In [6]:
# BMI category definition
def classify_bmi(bmi):
    """Map a numeric BMI value to its weight-category label.

    The bands are half-open and contiguous: [.., 18.5), [18.5, 25), [25, 30),
    [30, 35), [35, 40), [40, ..). Using only upper bounds avoids the gaps left
    by closed intervals such as ``18.5 <= bmi <= 24.9`` / ``25 <= bmi``, where a
    value like 24.95 matched no band and fell through to the last category.

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str
        One of '저체중', '정상 체중', '과체중', '경도 비만', '중도 비만', '고도 비만'.
        NaN compares False against every bound and therefore yields '고도 비만'.
    """
    if bmi < 18.5:
        return '저체중'
    elif bmi < 25:
        return '정상 체중'
    elif bmi < 30:
        return '과체중'
    elif bmi < 35:
        return '경도 비만'
    elif bmi < 40:
        return '중도 비만'
    else:
        return '고도 비만'

# Sleep-duration category definition
def classify_sleep(hours):
    """Return the sleep label for a nightly sleep duration in hours.

    Below 6 hours -> '불면증'; 6 to 9 hours inclusive -> '정상';
    anything else (including NaN) -> '과다 수면'.
    """
    if hours < 6:
        return '불면증'
    # At this point hours is not below 6, so only the upper bound matters.
    if hours <= 9:
        return '정상'
    return '과다 수면'

# Binarise the physical-health score
def classify_physical(value):
    """Label a physical-health value as '나쁨' (below 3.24) or '좋음' (otherwise).

    NOTE(review): the 3.24 cut-off is presumably a summary statistic of the
    column - confirm. If the column counts days of poor health, the two labels
    look inverted - TODO confirm against the dataset description.
    """
    return '나쁨' if value < 3.24 else '좋음'

# Binarise the mental-health score
def classify_mental(value):
    """Label a mental-health value as '나쁨' (below 3.80) or '좋음' (otherwise).

    NOTE(review): the 3.80 cut-off is presumably a summary statistic of the
    column - confirm. If the column counts days of poor health, the two labels
    look inverted - TODO confirm against the dataset description.
    """
    return '나쁨' if value < 3.80 else '좋음'

# Collapse the age brackets into three life stages
def classify_age_category(age_range):
    """Return the life-stage label for an age-bracket string.

    '18-24' -> '청소년기'; '25-29' through '45-49' -> '청년기';
    '50-54' through '80 or older' -> '노년기'. Any other value returns None.
    """
    stage_brackets = (
        ('청소년기', ('18-24',)),
        ('청년기', ('25-29', '30-34', '35-39', '40-44', '45-49')),
        ('노년기', ('50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older')),
    )
    for stage, brackets in stage_brackets:
        if age_range in brackets:
            return stage
    return None

# Drop BMI outliers to create df_heart_stable.
# .copy() makes the filtered frame independent of df_heart, so the column
# assignments below write to a real DataFrame rather than to a possible
# view of the original (the chained-assignment hazard pandas warns about).
df_heart_stable = df_heart.query(condition_bmi).copy()

# Replace each raw column with its categorical label.
df_heart_stable['BMI'] = df_heart_stable['BMI'].apply(classify_bmi)
df_heart_stable['SleepTime'] = df_heart_stable['SleepTime'].apply(classify_sleep)
df_heart_stable['PhysicalHealth'] = df_heart_stable['PhysicalHealth'].apply(classify_physical)
df_heart_stable['MentalHealth'] = df_heart_stable['MentalHealth'].apply(classify_mental)
df_heart_stable['AgeCategory'] = df_heart_stable['AgeCategory'].apply(classify_age_category)

In [7]:
# Check row count, non-null counts and dtypes after filtering and categorisation.
df_heart_stable.info()

<class 'pandas.core.frame.DataFrame'>
Index: 309399 entries, 0 to 319793
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   HeartDisease      309399 non-null  object
 1   BMI               309399 non-null  object
 2   Smoking           309399 non-null  object
 3   AlcoholDrinking   309399 non-null  object
 4   Stroke            309399 non-null  object
 5   PhysicalHealth    309399 non-null  object
 6   MentalHealth      309399 non-null  object
 7   DiffWalking       309399 non-null  object
 8   Sex               309399 non-null  object
 9   AgeCategory       309399 non-null  object
 10  Race              309399 non-null  object
 11  Diabetic          309399 non-null  object
 12  PhysicalActivity  309399 non-null  object
 13  GenHealth         309399 non-null  object
 14  SleepTime         309399 non-null  object
 15  Asthma            309399 non-null  object
 16  KidneyDisease     309399 non-null  object
 

## 머신러닝 모델링

### 0. 레이블 인코딩

In [8]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every column (features and target) to integer codes.
# A separate fitted encoder is stored per column: refitting one shared encoder
# would discard each column's label <-> code mapping, making it impossible to
# translate codes back with label_encoders[column].inverse_transform(...).
# NOTE: this cell rewrites df_heart_stable in place; re-running it without
# rebuilding df_heart_stable first replaces the stored mappings with
# integer -> integer ones.
label_encoders = {}
for column in df_heart_stable.columns:
    label_encoder = LabelEncoder()
    df_heart_stable[column] = label_encoder.fit_transform(df_heart_stable[column])
    label_encoders[column] = label_encoder

df_heart_stable

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,3,1,0,0,0,1,0,0,0,5,2,1,4,1,1,0,1
1,0,4,0,0,1,0,0,0,0,0,5,0,1,4,2,0,0,0
2,0,2,1,0,0,1,1,0,1,0,5,2,1,1,2,1,0,0
3,0,4,0,0,0,0,0,0,0,0,5,0,0,2,2,0,0,1
4,0,4,0,0,0,1,0,1,0,1,5,0,1,4,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319789,0,4,0,0,0,0,0,0,0,2,3,0,1,0,2,0,0,0
319790,1,2,1,0,0,1,0,1,1,0,3,2,0,1,2,1,0,0
319791,0,2,1,0,0,0,0,0,1,1,3,0,1,4,1,1,0,0
319792,0,4,0,0,0,0,0,0,0,1,3,0,1,2,2,0,0,0


### 1. 로지스틱 회귀

In [9]:
# Build the training / test split (70 % / 30 %).
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

# Split features and target in ONE call so their rows are guaranteed to stay
# aligned, instead of relying on two separate calls sharing the same seed.
# With the same random_state and test_size this yields the same partition.
labeled_x_train, labeled_x_test, labeled_y_train, labeled_y_test = train_test_split(
    labeled_x, labeled_y, test_size=0.3, random_state=42
)

In [10]:
# Fit a logistic-regression classifier (default settings) on the training split.
from sklearn.linear_model import LogisticRegression
logisticRegression=LogisticRegression()
logisticRegression.fit(labeled_x_train,labeled_y_train)

In [11]:
# Predict on the held-out test split and print the per-class
# precision / recall / F1 report for the logistic-regression model.
labeled_y_test_predict = logisticRegression.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     84934
           1       0.55      0.08      0.14      7886

    accuracy                           0.92     92820
   macro avg       0.74      0.54      0.55     92820
weighted avg       0.89      0.92      0.89     92820



### 2. 랜덤 포레스트

In [12]:
# Build the training / test split (70 % / 30 %) for the random-forest section.
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

# Split features and target in ONE call so their rows are guaranteed to stay
# aligned, instead of relying on two separate calls sharing the same seed.
# With the same random_state and test_size this yields the same partition.
labeled_x_train, labeled_x_test, labeled_y_train, labeled_y_test = train_test_split(
    labeled_x, labeled_y, test_size=0.3, random_state=42
)

In [13]:
# Fit a random-forest classifier on the training split; random_state is fixed
# so repeated runs build the same forest.
from sklearn.ensemble import RandomForestClassifier
random_forest= RandomForestClassifier(random_state=42)
random_forest.fit(labeled_x_train, labeled_y_train)

In [14]:
# Predict on the held-out test split and print the per-class
# precision / recall / F1 report for the random-forest model.
labeled_y_test_predict= random_forest.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     84934
           1       0.38      0.12      0.18      7886

    accuracy                           0.91     92820
   macro avg       0.65      0.55      0.57     92820
weighted avg       0.88      0.91      0.89     92820



### 3. K-최근접 이웃 (KNeighbors)

In [15]:
# Build the training / test split (70 % / 30 %) for the KNN section.
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

# Split features and target in ONE call so their rows are guaranteed to stay
# aligned, instead of relying on two separate calls sharing the same seed.
# With the same random_state and test_size this yields the same partition.
labeled_x_train, labeled_x_test, labeled_y_train, labeled_y_test = train_test_split(
    labeled_x, labeled_y, test_size=0.3, random_state=42
)

In [16]:
# Fit a k-nearest-neighbours classifier on standardised features.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Feature scaling (important for KNN): the scaler is fitted on the training
# split only and then applied to the test split.
# Note: labeled_x_train / labeled_x_test are overwritten with the scaler's
# output here, so later cells see the scaled data until the split is rebuilt.
scaler = StandardScaler()
labeled_x_train = scaler.fit_transform(labeled_x_train)
labeled_x_test = scaler.transform(labeled_x_test)

k = 5  # number of neighbours
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(labeled_x_train, labeled_y_train)

In [17]:
# Predict on the (scaled) test split and print the per-class
# precision / recall / F1 report for the KNN model.
labeled_y_test_predict= knn.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     84934
           1       0.36      0.13      0.19      7886

    accuracy                           0.91     92820
   macro avg       0.64      0.55      0.57     92820
weighted avg       0.88      0.91      0.89     92820



### 4. XGBoost

In [18]:
# Build the training / test split (70 % / 30 %) for the XGBoost section.
# Re-splitting from df_heart_stable also replaces the scaled arrays that the
# KNN section stored in labeled_x_train / labeled_x_test.
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

# Split features and target in ONE call so their rows are guaranteed to stay
# aligned, instead of relying on two separate calls sharing the same seed.
# With the same random_state and test_size this yields the same partition.
labeled_x_train, labeled_x_test, labeled_y_train, labeled_y_test = train_test_split(
    labeled_x, labeled_y, test_size=0.3, random_state=42
)

In [21]:
# Fit an XGBoost classifier on the training split.
from xgboost import XGBClassifier
# use_label_encoder=False is passed to avoid a warning message; eval_metric='logloss'
# measures the gap between predicted probabilities and true labels in binary classification.
# NOTE(review): newer xgboost releases may deprecate or ignore use_label_encoder -
# confirm against the installed version.
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')  
xgb_model.fit(labeled_x_train, labeled_y_train)  # train the model

In [22]:
# Predict on the held-out test split and print the per-class
# precision / recall / F1 report for the XGBoost model.
labeled_y_test_predict= xgb_model.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     84934
           1       0.54      0.08      0.14      7886

    accuracy                           0.92     92820
   macro avg       0.73      0.54      0.55     92820
weighted avg       0.89      0.92      0.89     92820



### 5. Support Vector Machine

In [23]:
# Build the training / test split (70 % / 30 %) for the SVM section.
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

# Split features and target in ONE call so their rows are guaranteed to stay
# aligned, instead of relying on two separate calls sharing the same seed.
# With the same random_state and test_size this yields the same partition.
labeled_x_train, labeled_x_test, labeled_y_train, labeled_y_test = train_test_split(
    labeled_x, labeled_y, test_size=0.3, random_state=42
)

In [24]:
# Fit a support-vector classifier (default settings, fixed seed) on the training split.
# NOTE(review): the features are passed unscaled here, unlike the KNN section,
# and the report printed for this model shows 0.00 precision/recall for class 1 -
# consider standardising the inputs and/or class weighting; confirm the effect.
from sklearn.svm import SVC
svm_model = SVC(random_state=42)  # create the SVM model instance
svm_model.fit(labeled_x_train, labeled_y_train)  # train the model

In [25]:
# Predict on the held-out test split and print the per-class
# precision / recall / F1 report for the SVM model.
labeled_y_test_predict= svm_model.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84934
           1       0.00      0.00      0.00      7886

    accuracy                           0.92     92820
   macro avg       0.46      0.50      0.48     92820
weighted avg       0.84      0.92      0.87     92820



## 분류 모델 생성 결과

#### (1) 모든 모델의 accuracy가 0.91 ~ 0.92임. 다만 이는 시험 데이터의 클래스 0 비율(84934/92820 ≈ 0.915)과 거의 같아, 전부 0으로 예측해도 얻을 수 있는 수준이므로 accuracy만으로는 성능을 판단하기 어려움.
#### (2) 다만, 클래스 1의 정밀도(precision 값)가 낮은 것으로 보아, 모델이 양성을 잘 구별하지 못하고 있음.
#### (3) 클래스 1의 재현율(recall)도 0.00 ~ 0.13으로 매우 낮아, 실제 클래스 1 샘플의 대부분을 찾아내지 못함.

##  해결 방안
#### (1) 데이터 균형 맞추기 : 오버샘플링, 언더샘플링 등
#### (2) 하이퍼 파라미터 조정 : 클래스 가중치 부여(예: class_weight='balanced') 등