In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# mpl.rc('font', family='NanumBarunGothic') # 혹은 다른 설치한 Nanum 폰트 사용 - 리눅스 사용할때
import scipy
import scipy.stats as stats

import os
import re
import time
import datetime as dt # 사용시 datetime.dt 로 사용 # 현재 시간 사용
from datetime import datetime # 사용시 함수로 바로 사용 #-> 시간 비교

In [2]:
### 한글 폰트 설치 - 윈도우 사용할 때
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import platform

path = 'c:/Windows/Fonts/malgun.ttf'  # 또는 나눔고딕: 'c:/Windows/Fonts/NanumGothic.ttf'
font_name = font_manager.FontProperties(fname=path).get_name()
rc('font', family=font_name)

In [3]:
# 경고창 무시
import warnings
warnings.filterwarnings('ignore')

## 데이터 불러오기

In [4]:
df_heart_stable = pd.read_csv("../../dataset/df_heart_stable_new.csv", index_col = 0)
df_heart_stable.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,0,1,0,0,0,5,2,1,4,1,1,0,1
1,0,20.34,0,0,1,0,0,0,0,0,5,0,1,4,2,0,0,0
2,0,26.58,1,0,0,1,1,0,1,0,5,2,1,1,2,1,0,0
3,0,24.21,0,0,0,0,0,0,0,0,5,0,0,2,2,0,0,1
4,0,23.71,0,0,0,1,0,1,0,1,5,0,1,4,2,0,0,0


## 1. SMOTE

In [5]:
# 훈련용, 시험용 데이터셋 설정
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

labeled_x_train,labeled_x_test = train_test_split(labeled_x,test_size = 0.2, random_state = 42)
labeled_y_train,labeled_y_test = train_test_split(labeled_y,test_size = 0.2, random_state = 42)

In [6]:
# SMOTE 오버샘플링
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
labeled_x_train_resampled, labeled_y_train_resampled = smote.fit_resample(labeled_x_train, labeled_y_train)

In [7]:
# 훈련용 데이터 학습
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 데이터 스케일링 (KNN에 중요)
scaler = StandardScaler()
labeled_x_train_resampled = scaler.fit_transform(labeled_x_train_resampled)
labeled_x_test = scaler.transform(labeled_x_test)

k = 5  # K 값 설정
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(labeled_x_train_resampled, labeled_y_train_resampled)

In [8]:
# 시험용 데이터 예측률 확인
labeled_y_test_predict = knn.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.95      0.81      0.87     56642
           1       0.19      0.51      0.28      5238

    accuracy                           0.78     61880
   macro avg       0.57      0.66      0.58     61880
weighted avg       0.88      0.78      0.82     61880



## 2. 랜덤 오버샘플링

In [9]:
# 훈련용, 시험용 데이터셋 설정
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

labeled_x_train,labeled_x_test = train_test_split(labeled_x,test_size = 0.2, random_state = 42)
labeled_y_train,labeled_y_test = train_test_split(labeled_y,test_size = 0.2, random_state = 42)

In [10]:
# 랜덤 오버샘플링
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
labeled_x_train_resampled, labeled_y_train_resampled = ros.fit_resample(labeled_x_train, labeled_y_train)

In [11]:
# 훈련용 데이터 학습
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 데이터 스케일링 (KNN에 중요)
scaler = StandardScaler()
labeled_x_train_resampled = scaler.fit_transform(labeled_x_train_resampled)
labeled_x_test = scaler.transform(labeled_x_test)

k = 5  # K 값 설정
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(labeled_x_train_resampled, labeled_y_train_resampled)

In [12]:
# 시험용 데이터 예측률 확인
labeled_y_test_predict = knn.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.94      0.85      0.90     56642
           1       0.22      0.45      0.29      5238

    accuracy                           0.82     61880
   macro avg       0.58      0.65      0.59     61880
weighted avg       0.88      0.82      0.84     61880



## 3. ADASYN

In [13]:
# 훈련용, 시험용 데이터셋 설정
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

labeled_x_train,labeled_x_test = train_test_split(labeled_x,test_size = 0.2, random_state = 42)
labeled_y_train,labeled_y_test = train_test_split(labeled_y,test_size = 0.2, random_state = 42)

In [14]:
# ADASYN 오버샘플링
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(random_state=42)
labeled_x_train_resampled, labeled_y_train_resampled = adasyn.fit_resample(labeled_x_train, labeled_y_train)

In [15]:
# 훈련용 데이터 학습
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 데이터 스케일링 (KNN에 중요)
scaler = StandardScaler()
labeled_x_train_resampled = scaler.fit_transform(labeled_x_train_resampled)
labeled_x_test = scaler.transform(labeled_x_test)

k = 5  # K 값 설정
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(labeled_x_train_resampled, labeled_y_train_resampled)

In [16]:
# 시험용 데이터 예측률 확인
labeled_y_test_predict = knn.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.95      0.79      0.86     56642
           1       0.19      0.53      0.28      5238

    accuracy                           0.77     61880
   macro avg       0.57      0.66      0.57     61880
weighted avg       0.88      0.77      0.81     61880



## 4. Borderline-SMOTE

In [17]:
# 훈련용, 시험용 데이터셋 설정
from sklearn.model_selection import train_test_split
labeled_x = df_heart_stable.drop(columns=['HeartDisease'])
labeled_y = df_heart_stable['HeartDisease']

labeled_x_train,labeled_x_test = train_test_split(labeled_x,test_size = 0.2, random_state = 42)
labeled_y_train,labeled_y_test = train_test_split(labeled_y,test_size = 0.2, random_state = 42)

In [18]:
# Borderline-SMOTE 오버샘플링
from imblearn.over_sampling import BorderlineSMOTE
borderline_smote = BorderlineSMOTE(random_state=42)
labeled_x_train_resampled, labeled_y_train_resampled = adasyn.fit_resample(labeled_x_train, labeled_y_train)

In [19]:
# 훈련용 데이터 학습
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 데이터 스케일링 (KNN에 중요)
scaler = StandardScaler()
labeled_x_train_resampled = scaler.fit_transform(labeled_x_train_resampled)
labeled_x_test = scaler.transform(labeled_x_test)

k = 5  # K 값 설정
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(labeled_x_train_resampled, labeled_y_train_resampled)

In [20]:
# 시험용 데이터 예측률 확인
labeled_y_test_predict = knn.predict(labeled_x_test)
from sklearn.metrics import classification_report
print("\n분류 보고서:")
print(classification_report(labeled_y_test, labeled_y_test_predict))


분류 보고서:
              precision    recall  f1-score   support

           0       0.95      0.79      0.86     56642
           1       0.19      0.53      0.28      5238

    accuracy                           0.77     61880
   macro avg       0.57      0.66      0.57     61880
weighted avg       0.88      0.77      0.81     61880



## K-최근접 모델 오버샘플링 결과
#### (1) 4가지 오버샘플링을 진행 (SMOTE, ADASYN, 랜덤 오버샘플링, Borderline SMOTE)
#### (2) 전체적으로 비슷함.