<a href="https://colab.research.google.com/github/Soojin-Im/Delphi/blob/main/train_data_add_labels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
# The data files read later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd

# Inspect the layout of the already-preprocessed synthetic data.
# (Original author's note: code further below converts real UKB data when it is available.)

train_path = "/content/drive/MyDrive/STUDY/Delphi/data/ukb_simulated_data/train.bin"

# The file is read as a flat stream of uint32 values and regrouped into
# rows of three: (patient_id, patient_age_days, token_id).
flat_values = np.fromfile(train_path, dtype=np.uint32)
data = flat_values.reshape(-1, 3)

column_names = ["patient_id", "patient_age_days", "token_id"]
df = pd.DataFrame(data, columns=column_names)

# First rows and overall shape.
print("원본 데이터:")
print(df.head())
print(f"\n데이터 shape: {df.shape}")

# Number of events per patient (most frequent first).
print("\npatient_id 분포:")
events_per_patient = df["patient_id"].value_counts()
print(events_per_patient.head())

# Ten most frequent tokens.
print("\ntoken_id 분포 (상위 10개):")
events_per_token = df["token_id"].value_counts()
print(events_per_token.head(10))



원본 데이터:
   patient_id  patient_age_days  token_id
0      402867              1251       602
1      402867              8870       724
2      402867             15534       714
3      402867             21457       498
4      402867             21834       265

데이터 shape: (181293, 3)

patient_id 분포:
patient_id
416380    64
422428    64
403098    63
414908    63
417552    62
Name: count, dtype: int64

token_id 분포 (상위 10개):
token_id
498    4181
833    3677
1      3639
2      3503
797    3302
265    3083
791    2918
586    2883
817    2687
438    2407
Name: count, dtype: int64


In [47]:
# Display the raw array that df was built from (NumPy abbreviates long arrays in its repr).
data

array([[402867,   1251,    602],
       [402867,   8870,    724],
       [402867,  15534,    714],
       ...,
       [427985,  21171,    312],
       [427985,  21579,    265],
       [427985,  24842,    217]], dtype=uint32)

In [30]:
# Read labels.csv: one label per line, with surrounding whitespace stripped.
labels_path = "/content/drive/MyDrive/STUDY/Delphi/data/ukb_simulated_data/labels.csv"
with open(labels_path, 'r', encoding='utf-8') as f:
    labels = [raw_line.strip() for raw_line in f]

# Row layout of labels.csv relative to token ids (per the original author's notes):
#   row 1 (Padding) and row 2 (No event) both correspond to token_id 0;
#   row 3 (Female) is token_id 1, row 4 (Male) is token_id 2, and so on.
# Token 0 is given the row-1 label (Padding) by default; the row-2 label
# (No event) is therefore not present in the mapping.
label_dict = {0: labels[0]}
# Pair labels[2:] with ids counting up from 1: labels[2] -> 1, labels[3] -> 2, ...
label_dict.update(enumerate(labels[2:], start=1))
label_dict

{0: 'Padding',
 1: 'Female',
 2: 'Male',
 3: 'BMI_low',
 4: 'BMI_mid',
 5: 'BMI_high',
 6: 'Smoking_low',
 7: 'Smoking_mid',
 8: 'Smoking_high',
 9: 'Alcohol_low',
 10: 'Alcohol_mid',
 11: 'Alcohol_high',
 12: 'A00 (cholera)',
 13: 'A01 (typhoid and paratyphoid fevers)',
 14: 'A02 (other salmonella infections)',
 15: 'A03 (shigellosis)',
 16: 'A04 (other bacterial intestinal infections)',
 17: 'A05 (other bacterial foodborne intoxications)',
 18: 'A06 (amoebiasis)',
 19: 'A07 (other protozoal intestinal diseases)',
 20: 'A08 (viral and other specified intestinal infections)',
 21: 'A09 (diarrhoea and gastro-enteritis of presumed infectious origin)',
 22: 'A15 (respiratory tuberculosis, bacteriologically and histologically confirmed)',
 23: 'A16 (respiratory tuberculosis, not confirmed bacteriologically or histologically)',
 24: 'A17 (tuberculosis of nervous system)',
 25: 'A18 (tuberculosis of other organs)',
 26: 'A19 (miliary tuberculosis)',
 27: 'A20 (plague)',
 28: 'A22 (anthrax)',

In [31]:
# Attach a human-readable label to every row by looking up its token_id in label_dict.
token_labels = df["token_id"].map(label_dict)
df["label"] = token_labels

print("\n라벨이 추가된 데이터:")
print(df.head(20))

# Token ids that are missing from label_dict come back as NaN from .map(); count them.
unlabeled_count = df["label"].isna().sum()
print(f"\n라벨이 없는 token_id 개수: {unlabeled_count}")




라벨이 추가된 데이터:
    patient_id  patient_age_days  token_id  \
0       402867              1251       602   
1       402867              8870       724   
2       402867             15534       714   
3       402867             21457       498   
4       402867             21834       265   
5       402867             21871       833   
6       402867             22127       372   
7       402867             23264       660   
8       402867             23451       382   
9       402867             23811      1197   
10      402867             24322       797   
11      402867             24386       467   
12      402867             24486       817   
13      402867             24500       835   
14      402867             25921       427   
15      402867             26032       586   
16      402867             26896      1181   
17      402867             27195       822   
18      402867             27710       508   
19      402867             27996       542   

                   

In [32]:
# Report the smallest and largest token_id present in the data.
token_ids = df["token_id"]
token_min = token_ids.min()
token_max = token_ids.max()
print(f"\ntoken_id 최소값: {token_min}")
print(f"token_id 최대값: {token_max}")
print(f"token_id 범위: {token_min} ~ {token_max}")

# Check that the patients having token_id 1 and the patients having token_id 2
# do not overlap, and that each patient appears only once per token
# (row count equal to the unique-patient count).
df_token_1 = df.loc[token_ids == 1]
df_token_2 = df.loc[token_ids == 2]

patient_ids_token_1 = set(df_token_1["patient_id"].unique())
patient_ids_token_2 = set(df_token_2["patient_id"].unique())

print("\n=== token_id 1과 2의 patient_id 겹침 확인 ===")
print(f"token_id 1인 관측치 개수: {len(df_token_1)}")
print(f"token_id 1인 unique patient_id 개수: {len(patient_ids_token_1)}")
print(f"token_id 2인 관측치 개수: {len(df_token_2)}")
print(f"token_id 2인 unique patient_id 개수: {len(patient_ids_token_2)}")

# Patients that appear under both tokens.
intersection = patient_ids_token_1.intersection(patient_ids_token_2)
overlap_count = len(intersection)
print(f"\n겹치는 patient_id 개수: {overlap_count}")
if not intersection:
    print("✓ token_id 1과 2의 patient_id는 완전히 겹치지 않습니다 (unique합니다).")
else:
    # Warn and show at most the ten smallest overlapping ids.
    print(f"⚠️ 경고: token_id 1과 2에 공통으로 나타나는 patient_id가 {overlap_count}개 있습니다!")
    print(f"겹치는 patient_id (처음 10개): {sorted(intersection)[:10]}")




token_id 최소값: 1
token_id 최대값: 1268
token_id 범위: 1 ~ 1268

=== token_id 1과 2의 patient_id 겹침 확인 ===
token_id 1인 관측치 개수: 3639
token_id 1인 unique patient_id 개수: 3639
token_id 2인 관측치 개수: 3503
token_id 2인 unique patient_id 개수: 3503

겹치는 patient_id 개수: 0
✓ token_id 1과 2의 patient_id는 완전히 겹치지 않습니다 (unique합니다).


In [46]:

# Show every event of one example patient, with age also expressed in years.
# The id lives in a single variable so the filter and the printed heading
# cannot drift apart when the example patient is changed.
example_patient_id = 402868
df_filtered = (
    df.loc[df["patient_id"] == example_patient_id]
    # .assign returns a new frame, so df itself is not modified.
    # Age in days is converted to years using 365.25 days per year.
    .assign(patient_age_years=lambda rows: rows["patient_age_days"] / 365.25)
)
print(f"\n필터링된 데이터 (patient_id == {example_patient_id}):")
print(df_filtered)


필터링된 데이터 (patient_id == 402868):
    patient_id  patient_age_days  token_id  \
26      402868                 0         1   
27      402868             11184       991   
28      402868             13919       817   
29      402868             14272       306   
30      402868             14835       815   
31      402868             16275       832   
32      402868             17010       918   
33      402868             17849       679   
34      402868             17863       576   
35      402868             17927       797   
36      402868             17936       587   
37      402868             18450       466   
38      402868             18536       472   
39      402868             18730       470   
40      402868             18975       486   
41      402868             19851       450   
42      402868             19881       737   
43      402868             20294       484   
44      402868             20451        99   
45      402868             21044       771   


In [45]:
# Distinct patient ids present in the data.
df["patient_id"].unique()


array([402867, 402868, 402869, ..., 427983, 427984, 427985], dtype=uint32)

In [52]:
# Select every event with token_id 214, which the label column renders as
# "E11 (non-insulin-dependent diabetes mellitus)".
is_dm_event = df["token_id"].eq(214)
df_token_dm = df.loc[is_dm_event]
df_token_dm

Unnamed: 0,patient_id,patient_age_days,token_id,label
112,402875,27745,214,E11 (non-insulin-dependent diabetes mellitus)
176,402884,28193,214,E11 (non-insulin-dependent diabetes mellitus)
373,402900,27480,214,E11 (non-insulin-dependent diabetes mellitus)
501,402923,28769,214,E11 (non-insulin-dependent diabetes mellitus)
544,402933,16655,214,E11 (non-insulin-dependent diabetes mellitus)
...,...,...,...,...
180588,427908,27707,214,E11 (non-insulin-dependent diabetes mellitus)
180927,427945,22981,214,E11 (non-insulin-dependent diabetes mellitus)
181070,427956,26225,214,E11 (non-insulin-dependent diabetes mellitus)
181110,427962,23852,214,E11 (non-insulin-dependent diabetes mellitus)


In [56]:
# Show the first events of one example patient, with age also expressed in years.
# The id lives in a single variable so the filter and the printed heading
# cannot drift apart when the example patient is changed.
example_patient_id = 402875
df_filtered = (
    df.loc[df["patient_id"] == example_patient_id]
    # .assign returns a new frame, so df itself is not modified.
    # Age in days is converted to years using 365.25 days per year.
    .assign(patient_age_years=lambda rows: rows["patient_age_days"] / 365.25)
)
print(f"\n필터링된 데이터 (patient_id == {example_patient_id}):")
print(df_filtered.head())


필터링된 데이터 (patient_id == 402875):
     patient_id  patient_age_days  token_id  \
98       402875                 0         2   
99       402875             22611       874   
100      402875             23084       889   
101      402875             23845       737   
102      402875             24034       833   

                                                 label  patient_age_years  
98                                                Male           0.000000  
99                       N23 (unspecified renal colic)          61.905544  
100                      N40 (hyperplasia of prostate)          63.200548  
101  L57 (skin changes due to chronic exposure to n...          65.284052  
102  M79 (other soft tissue disorders, not elsewher...          65.801506  
