In [353]:
import pandas as pd
import re

### Hypertension 관련 ICD-9 CODE 가져오기
##### -> "hypertension_list"에 총 56개의 ICD-9 CODE 저장

In [355]:
df = pd.read_csv("/data/PUBLIC_DATA/MIMIC-III/D_ICD_DIAGNOSES.csv")

In [356]:
# DIAGNOSIS 데이터의 LONG_TITLE 칼럼에서 "hypertension"이라는 단어가 등장하는 경우
df1 = df[df['LONG_TITLE'].str.contains('hypertension')]
print(len(df1))

56


In [357]:
hypertension_list = df1["ICD9_CODE"].to_list()
print(len(hypertension_list))

56


### Hypertension 진단 받은 SUBJECT ID 선별
##### -> "subject_list"에 18,871개 SUBJECT ID 저장

In [372]:
df = pd.read_csv("/data/PUBLIC_DATA/MIMIC-III/DIAGNOSES_ICD.csv")

In [373]:
def add(group):
    return ' '.join(group['ICD9_CODE'].fillna(""))

df1 = df.groupby(['SUBJECT_ID', 'HADM_ID']).apply(add).reset_index(name="ICD9_CODE")

In [374]:
mask = df1['ICD9_CODE'].str.contains('|'.join(hypertension_list), na=False)
df2 = df1[mask]

In [375]:
# 동일한 환자의 경우 최초 진단 기록만 사용하기 위해 admission table 입원 시간 기록 데이터 가져와서 합치기
df = pd.read_csv("/data/PUBLIC_DATA/MIMIC-III/ADMISSIONS.csv")
df3 = df[["HADM_ID", "ADMITTIME"]]
df4 = pd.merge(df2, df3, on="HADM_ID")
df4['ADMITTIME'] = pd.to_datetime(df4['ADMITTIME'])
df4

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,ADMITTIME
0,9,150750,431 5070 4280 5849 2765 4019,2149-11-09 13:06:00
1,12,112213,1570 57410 9971 4275 99811 4019 5680 55321 E8782,2104-08-07 10:15:00
2,13,143045,41401 4111 25000 4019 2720,2167-01-08 18:43:00
3,18,188822,25080 78039 29633 V5867 E9323 V5869 47829 7805...,2167-10-02 11:18:00
4,19,109235,80502 5990 5964 E8809 8220 73300 2948 4019 44321,2108-08-05 16:25:00
...,...,...,...,...
22424,99965,101083,99811 2851 185 4019 49320,2191-07-13 19:39:00
22425,99966,167228,4260 42789 7802 9100 4019 2724 53081 V1046 V12...,2191-08-23 23:16:00
22426,99983,117390,41001 5849 41401 60001 2724 4019 V4582 78820,2193-04-26 11:35:00
22427,99992,197084,9999 56881 5772 2851 5849 5799 72992 53081 401...,2144-07-25 18:03:00


In [376]:
df5 = df4.sort_values(by=['SUBJECT_ID', 'ADMITTIME'], ascending=[True, True])
df5 = df5.drop_duplicates(subset=['SUBJECT_ID'], keep='first')
df5.sort_values(by=["SUBJECT_ID"], ascending=True, inplace=True)
df5.head(10) # 7,8번 index의 값 중 ADMITTIME이 빠른 index만 남은 것을 확인할 수 있다.

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,ADMITTIME
0,9,150750,431 5070 4280 5849 2765 4019,2149-11-09 13:06:00
1,12,112213,1570 57410 9971 4275 99811 4019 5680 55321 E8782,2104-08-07 10:15:00
2,13,143045,41401 4111 25000 4019 2720,2167-01-08 18:43:00
3,18,188822,25080 78039 29633 V5867 E9323 V5869 47829 7805...,2167-10-02 11:18:00
4,19,109235,80502 5990 5964 E8809 8220 73300 2948 4019 44321,2108-08-05 16:25:00
5,20,157681,41401 4111 25000 2724 4019,2183-04-28 09:45:00
6,22,165315,9678 9693 E9502 E9503 3488 29620 4019,2196-04-09 12:26:00
8,23,152223,41401 4111 4241 V4582 2724 4019 60000 3899,2153-09-03 07:15:00
9,25,129635,41071 25011 41401 4019,2160-11-02 02:06:00
10,30,104557,99674 4160 4111 4281 4271 41401 4019 2720 5939,2172-10-14 14:17:00


In [365]:
subject_list = df5["SUBJECT_ID"].to_list()
print(len(subject_list))

18871


### Hypertension 진단 받은 환자 사망률 확인
-> 전체 18,871명 중 생존(0)은 11,412명, 사망(1)은 7459명으로 6:4 정도의 비율 <p>
-> 성별에 따른 비는 남자 10,525명, 여자 7,459명으로 5.5:4.5 정도의 비율

In [366]:
df = pd.read_csv("/data/PUBLIC_DATA/MIMIC-III/PATIENTS.csv")

In [367]:
df1 = df[df["SUBJECT_ID"].isin(subject_list)]

In [368]:
# Hypertension 진단 받은 사람의 사망 비율
df2 = df1["EXPIRE_FLAG"].value_counts().to_frame(name="Count")
df2["ratio"] = df1["EXPIRE_FLAG"].value_counts(normalize=True)
df2

Unnamed: 0,Count,ratio
0,11412,0.604737
1,7459,0.395263


In [369]:
# Hypertension 진단 받은 사람의 성별 비율
df3 = df1["GENDER"].value_counts().to_frame(name="Gender")
df3["ratio"] = df1["GENDER"].value_counts(normalize=True)
df3

Unnamed: 0,Gender,ratio
M,10525,0.557734
F,8346,0.442266


In [370]:
# Hypertension 진단 받은 사람의 성별에 따른 사망 비율
df4 = df1.groupby(["GENDER", "EXPIRE_FLAG"]).size().unstack(fill_value=0)
df4["SUM"] = df1["GENDER"].value_counts()
df4

EXPIRE_FLAG,0,1,SUM
GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,4693,3653,8346
M,6719,3806,10525
