# 기반조사와 8차 조사 비교하기

- 사람 수, 심뇌혈관 질환자 수

- 기존:
    - 전체: 10027
    - 심뇌혈관: 9726 301
- 예상: 전체 사람수도 줄어들고, 심뇌혈관 질환자는 거의 없을 듯
    - 전체: 6316
    - 심뇌혈관: 6145 171

In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc

# 기본 글꼴을 NanumGothic으로 변경
rc("font", family="NanumGothic")

## 종속변수 처리하기

- 1기 8기 변수 이름 다름

- 1기에서 필요한 변수 목록
  - as1_disease = AS1_06_DISEASE[["AS1_PDMI", "AS1_PDCH", "AS1_PDCD", "AS1_PDCV"]]
  - as1_treat = AS1_07_TREAT[["AS1_TRTMI", "AS1_TRTCH", "AS1_TRTCD", "AS1_TRTCV"]]
  - as1_drug = AS1_08_DRUG[['AS1_DRUGSTKCU']]

- 8기에서 필요한 변수 목록
  - disease : AS8_MI, AS8_CHF, AS8_CAD, AS8_CEVA
  - treat: 변수가 많이 달라짐
    - AS8_MICU: 1, 2, 3 (4 제외)
    - AS8_CHFCU: (4 제외)
    - AS8_CADCU
    - AS8_CEVACU
  - drug: AS8_D63A
  

In [86]:
AS1_06_DISEASE = pd.read_csv("./Dataset/raw/AS1_06_DISEASE.csv", index_col=0, encoding='utf-8', low_memory=False)
AS1_07_TREAT = pd.read_csv("./Dataset/raw/AS1_07_TREAT.csv", index_col=0, encoding='utf-8', low_memory=False)
AS1_08_DRUG = pd.read_csv("./Dataset/raw/AS1_08_DRUG.csv", index_col=0, encoding='utf-8', low_memory=False)

AS8_DISEASE = pd.read_csv("./Dataset/raw_8/AS8_06_DISEASE.csv", index_col=0, encoding='utf-8', low_memory=False)
AS8_DRUG = pd.read_csv("./Dataset/raw_8/AS8_08_DRUG.csv", index_col=0, encoding='utf-8', low_memory=False)


as1_disease = AS1_06_DISEASE[["AS1_PDMI", "AS1_PDCH", "AS1_PDCD", "AS1_PDCV"]]
as1_treat = AS1_07_TREAT[["AS1_TRTMI", "AS1_TRTCH", "AS1_TRTCD", "AS1_TRTCV"]]
as1_drug = AS1_08_DRUG[['AS1_DRUGSTKCU']]

as8_disease = AS8_DISEASE[["AS8_MI", "AS8_CHF", "AS8_CAD", "AS8_CEVA",
                           "AS8_MICU", "AS8_CHFCU", "AS8_CADCU", "AS8_CEVACU"]]
as8_drug = AS8_DRUG[['AS8_D63A']]

as1_dependent = pd.concat([as1_disease, as1_treat, as1_drug], axis=1)
as8_dependent = pd.concat([as8_disease, as8_drug], axis=1)

as1_dependent.info()
as8_dependent.info()



<class 'pandas.core.frame.DataFrame'>
Index: 10030 entries, NIH23B1298125 to NIH23B1160138
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   AS1_PDMI       10030 non-null  int64
 1   AS1_PDCH       10030 non-null  int64
 2   AS1_PDCD       10030 non-null  int64
 3   AS1_PDCV       10030 non-null  int64
 4   AS1_TRTMI      10030 non-null  int64
 5   AS1_TRTCH      10030 non-null  int64
 6   AS1_TRTCD      10030 non-null  int64
 7   AS1_TRTCV      10030 non-null  int64
 8   AS1_DRUGSTKCU  10030 non-null  int64
dtypes: int64(9)
memory usage: 783.6+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 6318 entries, NIH23B1298125 to NIH23B1160138
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   AS8_MI      6318 non-null   int64
 1   AS8_CHF     6318 non-null   int64
 2   AS8_CAD     6318 non-null   int64
 3   AS8_CEVA    6318 non-null   int64
 4   AS8_MICU    6

In [87]:
as8_dependent.head()

Unnamed: 0_level_0,AS8_MI,AS8_CHF,AS8_CAD,AS8_CEVA,AS8_MICU,AS8_CHFCU,AS8_CADCU,AS8_CEVACU,AS8_D63A
DIST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NIH23B1298125,1,1,1,1,77777,77777,77777,77777,1
NIH23B1159376,1,1,1,2,77777,77777,77777,2,2
NIH23B1785393,1,1,1,1,77777,77777,77777,77777,1
NIH23B1463054,1,1,1,1,77777,77777,77777,77777,1
NIH23B1751168,1,1,1,1,77777,77777,77777,77777,1


In [88]:
def create_cvd_column(df): # 8기 조사에 맞게 변경함
    conditions = (
        (df['AS8_MI'] == 2) |
        (df['AS8_CHF'] == 2) |
        (df['AS8_CAD'] == 2) |
        (df['AS8_CEVA'] == 2) |
        (df['AS8_MICU'] == 2) |
        (df['AS8_CHFCU'] == 2) |
        (df['AS8_CADCU'] == 2) |
        (df['AS8_CEVACU'] == 2) |
        (df['AS8_D63A'] == 2)
    )
    
    # 조건에 맞으면 1, 아니면 0 할당
    df['CVD'] = conditions.astype(int)
    
    return df

In [89]:
# 모든 값이 77777 또는 99999이면 drop
mask = ~as8_dependent.isin([77777, 99999]).all(axis=1)
dependent = as8_dependent[mask]

In [90]:
# 2명 drop함. 
dependent.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6316 entries, NIH23B1298125 to NIH23B1160138
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   AS8_MI      6316 non-null   int64
 1   AS8_CHF     6316 non-null   int64
 2   AS8_CAD     6316 non-null   int64
 3   AS8_CEVA    6316 non-null   int64
 4   AS8_MICU    6316 non-null   int64
 5   AS8_CHFCU   6316 non-null   int64
 6   AS8_CADCU   6316 non-null   int64
 7   AS8_CEVACU  6316 non-null   int64
 8   AS8_D63A    6316 non-null   int64
dtypes: int64(9)
memory usage: 493.4+ KB


In [91]:
df = create_cvd_column(dependent.copy())
print(df["CVD"].value_counts())

print(df.info())
df.head()

CVD
0    6145
1     171
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 6316 entries, NIH23B1298125 to NIH23B1160138
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   AS8_MI      6316 non-null   int64
 1   AS8_CHF     6316 non-null   int64
 2   AS8_CAD     6316 non-null   int64
 3   AS8_CEVA    6316 non-null   int64
 4   AS8_MICU    6316 non-null   int64
 5   AS8_CHFCU   6316 non-null   int64
 6   AS8_CADCU   6316 non-null   int64
 7   AS8_CEVACU  6316 non-null   int64
 8   AS8_D63A    6316 non-null   int64
 9   CVD         6316 non-null   int32
dtypes: int32(1), int64(9)
memory usage: 518.1+ KB
None


Unnamed: 0_level_0,AS8_MI,AS8_CHF,AS8_CAD,AS8_CEVA,AS8_MICU,AS8_CHFCU,AS8_CADCU,AS8_CEVACU,AS8_D63A,CVD
DIST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NIH23B1298125,1,1,1,1,77777,77777,77777,77777,1,0
NIH23B1159376,1,1,1,2,77777,77777,77777,2,2,1
NIH23B1785393,1,1,1,1,77777,77777,77777,77777,1,0
NIH23B1463054,1,1,1,1,77777,77777,77777,77777,1,0
NIH23B1751168,1,1,1,1,77777,77777,77777,77777,1,0


In [92]:
df.to_csv("./dropped_as8_dependent.csv")

In [93]:
def create_cvd_column(df):
    # 해당 열들의 값이 2인지 확인
    conditions = (
        (df['AS1_PDMI'] == 2) |
        (df['AS1_PDCH'] == 2) |
        (df['AS1_PDCD'] == 2) |
        (df['AS1_PDCV'] == 2) |
        (df['AS1_TRTMI'] == 2) |
        (df['AS1_TRTCH'] == 2) |
        (df['AS1_TRTCD'] == 2) |
        (df['AS1_TRTCV'] == 2) |
        (df['AS1_DRUGSTKCU'] == 2)
    )
    
    # 조건에 맞으면 1, 아니면 0 할당
    df['CVD'] = conditions.astype(int)
    
    return df

In [94]:
# 모든 값이 77777 또는 99999이면 drop
mask = ~as1_dependent.isin([77777, 99999]).all(axis=1)
as1_dependent = as1_dependent[mask]

In [95]:
df = create_cvd_column(as1_dependent.copy())
print(df["CVD"].value_counts())

print(df.info())
df.head()

CVD
0    9726
1     301
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 10027 entries, NIH23B1298125 to NIH23B1160138
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   AS1_PDMI       10027 non-null  int64
 1   AS1_PDCH       10027 non-null  int64
 2   AS1_PDCD       10027 non-null  int64
 3   AS1_PDCV       10027 non-null  int64
 4   AS1_TRTMI      10027 non-null  int64
 5   AS1_TRTCH      10027 non-null  int64
 6   AS1_TRTCD      10027 non-null  int64
 7   AS1_TRTCV      10027 non-null  int64
 8   AS1_DRUGSTKCU  10027 non-null  int64
 9   CVD            10027 non-null  int32
dtypes: int32(1), int64(9)
memory usage: 822.5+ KB
None


Unnamed: 0_level_0,AS1_PDMI,AS1_PDCH,AS1_PDCD,AS1_PDCV,AS1_TRTMI,AS1_TRTCH,AS1_TRTCD,AS1_TRTCV,AS1_DRUGSTKCU,CVD
DIST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NIH23B1298125,1,1,1,1,1,1,1,1,77777,0
NIH23B1159376,1,1,1,1,99999,99999,99999,99999,99999,0
NIH23B1785393,1,1,1,1,99999,99999,99999,99999,77777,0
NIH23B1463054,1,1,1,1,1,1,1,1,77777,0
NIH23B1751168,1,1,1,1,99999,99999,99999,99999,99999,0


In [96]:
df.to_csv("./dropped_as1_dependent.csv")

## 1기 8기 비교

In [97]:
as1_dependent = pd.read_csv("dropped_as1_dependent.csv", index_col=0)
as8_dependent = pd.read_csv("dropped_as8_dependent.csv", index_col=0)

In [98]:
as1_dependent.head()

Unnamed: 0_level_0,AS1_PDMI,AS1_PDCH,AS1_PDCD,AS1_PDCV,AS1_TRTMI,AS1_TRTCH,AS1_TRTCD,AS1_TRTCV,AS1_DRUGSTKCU,CVD
DIST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NIH23B1298125,1,1,1,1,1,1,1,1,77777,0
NIH23B1159376,1,1,1,1,99999,99999,99999,99999,99999,0
NIH23B1785393,1,1,1,1,99999,99999,99999,99999,77777,0
NIH23B1463054,1,1,1,1,1,1,1,1,77777,0
NIH23B1751168,1,1,1,1,99999,99999,99999,99999,99999,0


In [99]:
as8_dependent.head()

Unnamed: 0_level_0,AS8_MI,AS8_CHF,AS8_CAD,AS8_CEVA,AS8_MICU,AS8_CHFCU,AS8_CADCU,AS8_CEVACU,AS8_D63A,CVD
DIST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NIH23B1298125,1,1,1,1,77777,77777,77777,77777,1,0
NIH23B1159376,1,1,1,2,77777,77777,77777,2,2,1
NIH23B1785393,1,1,1,1,77777,77777,77777,77777,1,0
NIH23B1463054,1,1,1,1,77777,77777,77777,77777,1,0
NIH23B1751168,1,1,1,1,77777,77777,77777,77777,1,0


In [100]:
as8_dependent['CVD'].value_counts()

CVD
0    6145
1     171
Name: count, dtype: int64

In [116]:
def process_and_merge_data(X_df, as1_df, as8_df):
    """
    데이터프레임들을 처리하고 병합하는 함수입니다.
    
    Parameters:
    -----------
    X_df : pandas.DataFrame
        처리할 원본 X 데이터프레임
    as1_df : pandas.DataFrame
        as1_dependent 데이터프레임
    as8_df : pandas.DataFrame
        as8_dependent 데이터프레임
    
    Returns:
    --------
    dict : 처리 결과와 변환 과정 정보를 담은 딕셔너리
    """
    # 초기 상태 저장
    process_log = {
        '초기_상태': {
            'X_크기': len(X_df),
            'as1_크기': len(as1_df),
            'as8_크기': len(as8_df),
            'X_컬럼': X_df.columns.tolist(),
            'as1_인덱스': as1_df.index.tolist(),
            'as8_인덱스': as8_df.index.tolist()
        }
    }
    
    # 1단계: as8에 존재하는 인덱스만 필터링
    common_indices_X_as8 = X_df.index.intersection(as8_df.index)
    X_filtered_by_as8 = X_df.loc[common_indices_X_as8]
    
    process_log['as8_필터링_후'] = {
        '필터링_전_X_크기': len(X_df),
        '필터링_후_X_크기': len(X_filtered_by_as8),
        '제거된_row_수': len(X_df) - len(X_filtered_by_as8)
    }
    
    # 2단계: CVD 값이 1에서 0으로 변경된 케이스 식별
    common_indices_all = X_filtered_by_as8.index.intersection(as1_df.index).intersection(as8_df.index)
    
    cvd_changed_indices = []
    for idx in common_indices_all:
        if as1_df.loc[idx, 'CVD'] == 1 and as8_df.loc[idx, 'CVD'] == 0:
            cvd_changed_indices.append(idx)
    
    # CVD 변경 케이스 제거
    X_final = X_filtered_by_as8.drop(cvd_changed_indices)
    
    process_log['CVD_변경_필터링_후'] = {
        '필터링_전_크기': len(X_filtered_by_as8),
        '필터링_후_크기': len(X_final),
        'CVD_변경_케이스_수': len(cvd_changed_indices),
        'CVD_변경_인덱스': cvd_changed_indices
    }
    
    # 3단계: as8의 CVD 값 추가
    final_df = X_final.copy()
    final_df['as8_CVD'] = as8_df.loc[final_df.index, 'CVD']
    
    process_log['최종_결과'] = {
        '최종_데이터_크기': len(final_df),
        '최종_컬럼_목록': final_df.columns.tolist(),
        '누락값_여부': final_df['as8_CVD'].isna().sum()
    }
    
    return {
        '최종_데이터': final_df,
        '처리_과정_로그': process_log,
        'CVD_변경_인덱스': cvd_changed_indices,
        '제거된_전체_인덱스': list(set(X_df.index) - set(final_df.index))
    }

In [117]:
# 함수 실행
X = pd.read_csv("Dataset/X_240905.csv", index_col=0)
results = process_and_merge_data(X, as1_dependent, as8_dependent)

# 결과 확인
print("=== 데이터 처리 과정 ===")
print(f"1. 초기 상태:")
print(f"   - X 데이터: {results['처리_과정_로그']['초기_상태']['X_크기']:,}행")
print(f"   - as8 데이터: {results['처리_과정_로그']['초기_상태']['as8_크기']:,}행")

print("\n2. as8 기준 필터링:")
print(f"   - 제거된 행: {results['처리_과정_로그']['as8_필터링_후']['제거된_row_수']:,}개")

print("\n3. CVD 변경 케이스 필터링:")
print(f"   - 제거된 행: {results['처리_과정_로그']['CVD_변경_필터링_후']['CVD_변경_케이스_수']:,}개")

print("\n4. 최종 결과:")
print(f"   - 최종 데이터 크기: {results['처리_과정_로그']['최종_결과']['최종_데이터_크기']:,}행")

# 최종 데이터프레임 저장
final_dataset = results['최종_데이터']

=== 데이터 처리 과정 ===
1. 초기 상태:
   - X 데이터: 9,622행
   - as8 데이터: 6,316행

2. as8 기준 필터링:
   - 제거된 행: 3,543개

3. CVD 변경 케이스 필터링:
   - 제거된 행: 134개

4. 최종 결과:
   - 최종 데이터 크기: 5,945행


In [118]:
final_dataset.head()

Unnamed: 0_level_0,AS1_SEX,AS1_AGE,AS1_MARRYA,AS1_JOBB,AS1_EDUA,AS1_INCOME,AS1_DRINK,AS1_SMOKEA,AS1_HVSMAM,AS1_PHYACTL,...,DII,HAS_HYPERTENSION,HAS_DIABETES,HAS_HYPERLIPIDEMI,HOMA_IR,WAIST_AVG,BMI,RC,AI,as8_CVD
DIST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NIH23B1298125,1,53,2,2,3,5,3,1,0,1,...,2.062933,1,1,1,2.721235,91.933333,200.95488,39.6,3.416667,0
NIH23B1159376,2,44,2,5,1,2,1,0,0,7,...,2.69981,0,0,1,0.306667,87.066667,135.594778,62.0,4.2,1
NIH23B1785393,1,47,2,3,1,2,3,3,8,2,...,1.297841,0,0,1,0.911111,80.333333,170.473543,66.8,2.659091,0
NIH23B1463054,2,43,2,3,1,2,3,0,0,2,...,1.274731,0,0,1,1.274074,98.5,170.236012,21.0,3.025,0
NIH23B1751168,1,61,2,3,3,1,3,3,20,0,...,1.615857,0,0,1,0.380247,80.666667,159.671232,19.0,2.431818,0


In [119]:
final_dataset['as8_CVD'].value_counts()

as8_CVD
0    5778
1     167
Name: count, dtype: int64

In [120]:
final_dataset.to_csv("X_250213.csv")

In [None]:
y = pd.read_csv("Dataset/y_240905.csv", index_col=0)

common_indices_y = y.index.intersection(final_dataset.index)
y_filtered_by_final = y.loc[common_indices_y]

y_filtered_by_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   DIST_ID  0 non-null      object
 1   CVD      0 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 0.0+ bytes
