In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from math import sqrt
import matplotlib.pyplot as plt

# Read the cp949-encoded health-screening CSV into `df` and print its first rows.
df = pd.read_csv(
    "C:\\Users\\NMPL-SKKWON\\Desktop\\국민건강보험공단_건강검진정보_20221231.CSV",
    encoding='cp949',
)
print(df.head())


In [None]:
# Columns removed from the screening table before any further processing.
columns_to_drop = [
    '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤',
    '치아우식증유무', '치석', '구강검진수검여부',
    '시력(좌)', '시력(우)', '청력(좌)', '청력(우)',
    '시도코드', '기준년도',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Discard every row in which either blood-pressure column (수축기혈압, 이완기혈압) is NaN.
blood_pressure_columns = ['수축기혈압', '이완기혈압']
df_filtered = df.dropna(subset=blood_pressure_columns)
print("\n수축기혈압 또는 이완기혈압에 NaN 값이 있는 행 제거 후 DataFrame:")
print(df_filtered)


In [None]:
# Drop rows that contain two or more NaN values.
# `thresh` is the minimum number of non-NaN cells a row must have to be kept,
# so (number of columns - 1) tolerates at most one missing value per row.
min_non_nan = df_filtered.shape[1] - 1
df_filtered = df_filtered.dropna(thresh=min_non_nan)
print("\n전체 행에 NaN 값이 두 개 이상 존재하는 행 제거 후 DataFrame:")
print(df_filtered)


In [None]:
# Rebind `df` to the NaN-filtered frame; `df` and `df_filtered` now name the same object.
df = df_filtered

In [None]:
from sklearn.impute import SimpleImputer, KNNImputer


# Binary label `고혈압유무`: 1 when 수축기혈압 >= 140 and 이완기혈압 >= 90, otherwise 0.
# Built from vectorised column comparisons rather than a row-wise `DataFrame.apply`,
# which calls a Python lambda once per row; the resulting values are the same.
# NOTE(review): the label requires BOTH thresholds to be met. If the intended rule is
# "either threshold met", `&` must become `|` — confirm with the study definition.
df['고혈압유무'] = ((df['수축기혈압'] >= 140) & (df['이완기혈압'] >= 90)).astype('int64')
# (The original cell ended with a "data normalisation" note; no scaling happens in this cell.)

In [None]:
# Keep only the rows whose 연령대코드(5세단위) value is 4 or greater.
age_mask = df['연령대코드(5세단위)'] >= 4
df = df[age_mask]

In [None]:
# Renumber the rows 0..n-1 after the filters above; the old index is discarded (drop=True).
df = df.reset_index(drop=True)

In [None]:
# Display the filtered screening table as the cell output.
df

In [None]:
import pandas as pd

# Prescription file(s) to load; every path in the list is read and the results are stacked.
file_paths = ["C:\\Users\\NMPL-SKKWON\\Desktop\\국민건강보험공단_의약품처방정보_20221231\\3.국민건강보험공단_의약품처방정보_2022_2(수정).CSV"]

# Read every file and combine them into a single DataFrame.
dfs = [pd.read_csv(file, encoding='cp949') for file in file_paths]
df2 = pd.concat(dfs, ignore_index=True)

# ATC code prefixes treated as antihypertensive drugs.
antihypertensive_atc_codes = [
    'C02', 'C03', 'C07', 'C08', 'C09'
    # More prefixes can be added here if needed.
]

# Prescription rows (df2) whose ATC code starts with one of the prefixes above.
# `na=False` makes a missing ATC code count as "no match"; without it the mask
# contains NaN and boolean indexing raises a ValueError.
is_antihypertensive = df2['약품일반성분명코드(ATC코드)'].str.startswith(
    tuple(antihypertensive_atc_codes), na=False
)
antihypertensive_df2 = df2[is_antihypertensive]

# Serial numbers of subscribers with at least one such prescription.
antihypertensive_serial_numbers = antihypertensive_df2['가입자일련번호'].unique()

# Remove those subscribers from the screening table (df).
df = df[~df['가입자일련번호'].isin(antihypertensive_serial_numbers)]

# Check the result.
print(df.head())

# Inner-join the screening rows with ONE prescription row per subscriber.
# The prescription table is reduced to the first row per serial number before the
# merge, which selects the same ATC code as merging everything and de-duplicating
# afterwards, but without materialising one merged row per prescription.
first_prescription = df2[['가입자일련번호', '약품일반성분명코드(ATC코드)']].drop_duplicates(
    subset=['가입자일련번호']
)
merged_df = pd.merge(df, first_prescription, on='가입자일련번호', how='inner')
# Still needed in case the screening table itself repeats a serial number.
merged_df = merged_df.drop_duplicates(subset=['가입자일련번호'])

# Check the result.
print(merged_df.head())


In [None]:
# Display the merged screening/prescription table as the cell output.
merged_df

In [None]:
# From here on `df` is the merged table, renumbered 0..n-1 (old index discarded).
df = merged_df.reset_index(drop=True)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
import pandas as pd
import matplotlib.font_manager as fm
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:

# Split the table into features (X) and target (y).
x_columns = ['신장(5cm단위)', '체중(5kg단위)', '혈청지피티(ALT)', '혈청지오티(AST)', '혈청크레아티닌', '연령대코드(5세단위)',
             '허리둘레', '식전혈당(공복혈당)', '혈색소', '감마지티피']
X = df[x_columns]
y = df['고혈압유무']

# Append the coded columns to the feature matrix.
X = pd.concat([X, df[['성별', '흡연상태', '음주여부', '요단백']]], axis=1)

# Fill missing values with each column's median (not the mean).
# The original index is passed through so the rows stay aligned with `df` in the
# concat below even when `df` does not carry a 0..n-1 index.
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Round the coded columns so an imputed median such as x.5 does not leave a
# fractional code behind.
rounded_columns = ['성별', '흡연상태', '음주여부', '요단백', '연령대코드(5세단위)']
X[rounded_columns] = np.round(X[rounded_columns])

# Features plus target in one frame. This is built AFTER the rounding step: a later
# cell rebuilds X from `data`, so creating it earlier would silently discard the rounding.
data = pd.concat([X, df[['고혈압유무']]], axis=1)






In [None]:
# Display the imputed feature matrix as the cell output.
X

In [None]:
# Name of the target column.
target_column = '고혈압유무'

# 1. Standardise the current feature matrix.
#    (X_scaled is not referenced again anywhere in the visible notebook.)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2. Report the class balance before resampling.
print(f'원본 데이터 클래스 분포: {Counter(data[target_column])}')

# 3. Random over-sampling, with features and target both taken from `data`.
y = data[target_column]
X = data.drop(columns=[target_column])
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# 4. Report the class balance after resampling.
print(f'오버 샘플링 후 클래스 분포: {Counter(y_resampled)}')

# Collect the resampled features and target in a single DataFrame.
resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
resampled_data[target_column] = y_resampled

In [None]:
# Display the over-sampled feature matrix as the cell output.
X_resampled

In [None]:
# Standardise the over-sampled features for the models below.
# The scaled matrix gets its own name so that `X_resampled` stays a DataFrame that can
# still be indexed by column name; `fit_transform` returns a plain ndarray.
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
# Labels as an ndarray: the fold indices produced by StratifiedKFold are positions,
# so they must not be interpreted as index labels of a Series.
y_resampled_values = np.asarray(y_resampled)

# NOTE(review): cross-validation runs on the already over-sampled data, so copies of
# the same minority-class row can fall into both the training and the test fold and
# the scores below are likely optimistic. Resampling inside each training fold would
# avoid that — confirm whether that is wanted before changing the reported numbers.

# 3. Stratified 5-fold cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4. Classifiers to compare.
classifiers = {
    'RandomForest': RandomForestClassifier(n_estimators = 5, random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# 5. For every classifier, collect out-of-fold predictions and score them.
results = []
feature_importances = {}
for name, clf in classifiers.items():
    # One slot per sample; each slot is filled by the fold in which that sample is held out.
    y_preds = np.zeros(len(y_resampled_values))

    for fold, (train_index, test_index) in enumerate(skf.split(X_resampled_scaled, y_resampled_values)):
        print(f'{name} - Fold {fold + 1}')
        X_train, X_test = X_resampled_scaled[train_index], X_resampled_scaled[test_index]
        y_train, y_test = y_resampled_values[train_index], y_resampled_values[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_preds[test_index] = y_pred
        print(f'Fold {fold + 1} predictions: {y_pred[:10]}')  # first 10 predictions

    # Final metrics over all out-of-fold predictions.
    accuracy = accuracy_score(y_resampled_values, y_preds)
    f1 = f1_score(y_resampled_values, y_preds)
    cm = confusion_matrix(y_resampled_values, y_preds)

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'F1 Score': f1
    })

    # Feature importance, taken from the model as fitted on the LAST fold.
    # Tree ensembles expose `feature_importances_`; the linear model is ranked by
    # absolute coefficient. Deciding by attribute instead of by name prevents a stale
    # `importances` from an earlier model being stored for an unmatched name.
    if hasattr(clf, 'feature_importances_'):
        importances = clf.feature_importances_
    else:
        importances = np.abs(clf.coef_[0])

    feature_importances[name] = importances

# 6. Tabulate and print the per-model metrics.
results_df = pd.DataFrame(results)
print("Model Performance:")
print(results_df)

# Print a ranked importance table and draw one bar chart for each model.
feature_names = X.columns
for name, importances in feature_importances.items():
    unsorted_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_importance_df = unsorted_importances.sort_values(by='Importance', ascending=False)

    print(f"\n{name} Feature Importances:")
    print(feature_importance_df)

    # Bar chart with features ordered from most to least important.
    plt.figure(figsize=(10, 6))
    plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.title(f'{name} Feature Importances')
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
    print()

In [None]:
# NOTE(review): `result` is not assigned anywhere in the visible notebook, so this cell
# raises NameError on a fresh "Restart & Run All" unless it is defined elsewhere.
result

In [None]:
# NOTE(review): `y_soft_vote` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh "Restart & Run All" unless it is defined elsewhere.
y_soft_vote

In [None]:
def golden_test(data, feature, column, save, y, n_estimators=5, n_splits=5, random_state=42):
    """Score a candidate feature set by cross-validated random-forest accuracy.

    The candidate set is `feature`, plus `save` when `save` is non-empty, plus
    `data[column]`, joined column-wise. It is standardised and evaluated with
    stratified K-fold cross-validation; every sample is predicted once, by the model
    trained on the folds that do not contain it.

    Parameters
    ----------
    data : pandas.DataFrame
        Source of the candidate column(s); indexed as ``data[column]``.
    feature : pandas.DataFrame
        Base columns every candidate set starts from (may be empty).
    column : label or list of labels
        Column(s) of `data` to add to the candidate set.
    save : pandas.DataFrame
        Columns selected so far; skipped when ``len(save) == 0``.
    y : array-like
        Class labels, positionally aligned with the rows of `data`. `f1_score` is
        called with its defaults, so binary labels are presumably expected.
    n_estimators : int, default 5
        Number of trees in the random forest.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed shared by the fold shuffling and the random forest.

    Returns
    -------
    float
        Accuracy of the out-of-fold predictions.
    """
    if len(save) > 0:
        feature = pd.concat([feature, save], axis=1)
    feature = pd.concat([feature, data[column]], axis=1)
    print(feature)

    feature = StandardScaler().fit_transform(feature)
    # Fold indices are positions, so index an ndarray rather than relying on the
    # labels' index happening to be 0..n-1.
    labels = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # One slot per sample, filled by the fold in which that sample is held out.
    y_preds = np.zeros(len(labels))

    for fold, (train_index, test_index) in enumerate(skf.split(feature, labels)):
        print(f'Fold {fold + 1}')
        clf.fit(feature[train_index], labels[train_index])
        y_pred = clf.predict(feature[test_index])
        y_preds[test_index] = y_pred
        print(f'Fold {fold + 1} predictions: {y_pred[:10]}')  # first 10 predictions

    # Final metrics over all out-of-fold predictions.
    accuracy = accuracy_score(labels, y_preds)
    f1 = f1_score(labels, y_preds)
    cm = confusion_matrix(labels, y_preds)

    print("Confusion Matrix:")
    print(cm)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    return accuracy
# Greedy forward feature selection: each round tries every remaining feature together
# with the features chosen so far and keeps the one with the highest golden_test accuracy.
#
# Feature columns are read from `resampled_data`, which is always a DataFrame and can be
# indexed by column name; `X_resampled` may have been rebound to a scaled ndarray by
# another cell, in which case name-based indexing fails.
column = list(X.columns)      # features not selected yet
l = len(column)
t_column = pd.DataFrame()     # empty base frame handed to golden_test on every call
b_column = pd.DataFrame()     # selected features, in the order they were chosen
b_rmse = -1                   # best score of the current round (an accuracy, despite the name)
temp = 0
b_feat = ''
for i in range(0, l):
    # One feature is removed per round, so len(column) equals l - i here.
    for r in range(0, len(column)):
        temp = golden_test(resampled_data, t_column, column[r], b_column, y_resampled)
        if temp > b_rmse:  # strict comparison: ties keep the earlier candidate
            b_rmse = temp
            b_feat = column[r]
    b_rmse = -1
    column = [col for col in column if col != b_feat]
    print(column)
    b_column = pd.concat([b_column, resampled_data[b_feat]], axis=1)
print(b_column)


In [None]:
# Display the features picked by the selection loop, in the order they were chosen.
b_column

In [None]:
# Display the per-model accuracy / F1 table as the cell output.
results_df
