### 모델 학습용 코드 구현 및 실행

- 학습별 코드 분리 (구분선 사용 및 해당 모델 이름 작성)
- 학습된 파라미터는 ./parameters 에 .pth 형식으로 저장하여 사용

In [19]:
# CSV files that make up the main dataset; passed to CancerDataset as file_paths.
input_file_path = ['./data/2022Data_part1.csv', './data/2022Data_part2.csv']

### Uncomment the lines below when running on Colab

# !rm -rf SKN19_2ND_5TEAM
# !git clone https://github.com/SKNetworks-AI19-250818/SKN19_2ND_5TEAM.git
# %cd SKN19_2ND_5TEAM

# import sys
# sys.path.append('/content/SKN19_2ND_5TEAM')
# input_file_path = ['/content/SKN19_2ND_5TEAM/data/encoded_dataset.csv']

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import random_split, DataLoader, ConcatDataset
import torch.nn as nn
import torch.optim as optim

import modules.DataAnalysis as DataAnalysis
import modules.ModelAnalysis as ModelAnalysis
import modules.DataModify as DataModify
from modules.DataSelect import DataPreprocessing

import modules.Models as Models

In [21]:

# Report the installed PyTorch version and the CUDA version it was built with.
print(torch.__version__)
print(torch.version.cuda)


2.8.0+cu126
12.6


In [22]:
# Fix the random seed so that results can be compared between runs
Models.set_seed(42)

dp = DataPreprocessing()

# Select the device (use CUDA when it is available, otherwise the CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the main dataset
dataset = DataModify.CancerDataset(
    target_column='target_label',              # target column
    time_column='Survival months_bin_3m',      # Survival months
    file_paths=input_file_path,
    transform=dp.run                           # pass None when the data has already been cleaned
)

# Load a second dataset from Suicide.csv with the same columns and the same transform
sui_input_file_path = ['./data/Suicide.csv']
sui_dataset = DataModify.CancerDataset(
    target_column='target_label',              # target column
    time_column='Survival months_bin_3m',      # Survival months
    file_paths=sui_input_file_path,
    transform=dp.run                           # pass None when the data has already been cleaned
)

# NOTE(review): called only after both datasets have gone through dp.run —
# presumably persists the category mappings collected there; confirm in modules.DataSelect.
dp.save_category()


Using device: cuda


  df = pd.read_csv(path)


In [23]:
# Choose the split sizes and split the dataset
# Total number of samples
n = len(dataset)

# Split ratios (test_ratio is informational: test_size below is the remainder)
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Size of each split
train_size = int(n * train_ratio)
val_size = int(n * val_ratio)
test_size = n - train_size - val_size  # remainder, so the three sizes add up to n exactly

# Perform the split; sui_dataset is appended to the training split only
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
train_dataset = ConcatDataset([train_dataset, sui_dataset])

batch_size = 64

# Build the data loaders
# NOTE(review): the validation loader also shuffles; the averaged losses do not
# depend on order, but shuffling draws from the global RNG on every pass.
# Switching to shuffle=False would change the seeded results — decide deliberately.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


# Model hyper-parameters
input_dim = dataset.data.shape[1]   # input dimension: number of features in the data
hidden_size = (128, 64)             # sizes of the 1st and 2nd hidden layers
time_bins = 91                     # time is split into 3-month bins -> 270+ months is treated as a single bin
num_events = 4                      # number of events

# Instantiate the model on the selected device
model = Models.DeepHitSurvWithSEBlockAnd2DCNN(input_dim, hidden_size, time_bins, num_events, dropout=.2).to(device)

# Optimizer (the loss itself is computed by Models.deephit_loss inside the train/eval functions)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [24]:
# Number of training samples: the random-split training portion plus sui_dataset.
print(len(train_dataset))

422453


In [25]:
# Convert the held-out test split to a DataFrame so it can be reused outside this notebook.
df_save = ModelAnalysis.dataset_to_dataframe(test_dataset)

# Written with pandas defaults, so the DataFrame index is stored as the first column.
df_save.to_csv("./data/test dataset.csv")

In [26]:
# Model training
def train_epoch(model, loader, optimizer, device=device):
    """Train ``model`` for one full pass over ``loader``.

    The model is put in training mode. Every batch is moved to ``device``,
    scored with ``Models.deephit_loss`` and used for one optimizer step.

    Args:
        model: module returning ``(logits, pmf, cif)`` for a feature batch.
        loader: iterable of ``(x, times, events)`` batches with a ``dataset``
            attribute that supports ``len``.
        optimizer: optimizer holding the parameters of ``model``.
        device: device the batches are moved to.

    Returns:
        Tuple of three floats: the total loss and the two component terms
        returned by ``Models.deephit_loss`` (likelihood and ranking terms,
        judging by their names), each weighted by batch size and divided by
        ``len(loader.dataset)``.
    """
    model.train()

    # Batch-size-weighted running sums: [total, second term, third term].
    running = [0.0, 0.0, 0.0]

    for features, durations, labels in loader:
        features = features.to(device)
        durations = durations.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        _, pmf, cif = model(features)
        loss, lik_term, rank_term = Models.deephit_loss(pmf, cif, durations, labels)

        loss.backward()
        optimizer.step()

        batch_n = features.size(0)
        for slot, term in enumerate((loss, lik_term, rank_term)):
            running[slot] += term.item() * batch_n

    n_samples = len(loader.dataset)
    return tuple(total / n_samples for total in running)

# Model evaluation
def evaluate(model, loader, device=device):
    """Compute the average losses of ``model`` over ``loader`` without training.

    The model is put in evaluation mode and gradients are disabled.

    Args:
        model: module returning ``(logits, pmf, cif)`` for a feature batch.
        loader: iterable of ``(x, times, events)`` batches with a ``dataset``
            attribute that supports ``len``.
        device: device the batches are moved to.

    Returns:
        ``(loss, lik, rank)`` — the three values returned by
        ``Models.deephit_loss``, each weighted by batch size and divided by
        ``len(loader.dataset)``.
    """
    model.eval()
    sum_loss = sum_lik = sum_rank = 0.0

    with torch.no_grad():
        for features, durations, labels in loader:
            features = features.to(device)
            durations = durations.to(device)
            labels = labels.to(device)

            _, pmf, cif = model(features)
            loss, lik_term, rank_term = Models.deephit_loss(pmf, cif, durations, labels)

            weight = features.size(0)
            sum_loss += loss.item() * weight
            sum_lik += lik_term.item() * weight
            sum_rank += rank_term.item() * weight

    denom = len(loader.dataset)
    return sum_loss / denom, sum_lik / denom, sum_rank / denom

def get_cif_from_model(model, loader, device=device):
    """Collect the model's CIF output for every batch in ``loader``.

    The model runs in evaluation mode with gradients disabled. CIF tensors
    are moved to the CPU; times and events are kept exactly as the loader
    yields them, in the same order as the CIF rows.

    Args:
        model: module returning ``(logits, pmf, cif)`` for a feature batch.
        loader: iterable of ``(x, times, events)`` batches.
        device: device the feature batches are moved to.

    Returns:
        ``(cif, times, events)`` concatenated along dim 0. Per the original
        author's note the CIF has shape (num_samples, num_events, time_bins).
    """
    model.eval()
    cif_chunks, time_chunks, event_chunks = [], [], []

    with torch.no_grad():
        for features, durations, labels in loader:
            _, _, cif = model(features.to(device))
            cif_chunks.append(cif.cpu())
            time_chunks.append(durations)
            event_chunks.append(labels)

    return (
        torch.cat(cif_chunks, dim=0),
        torch.cat(time_chunks, dim=0),
        torch.cat(event_chunks, dim=0),
    )

In [27]:
# Train for n_epochs epochs; after each epoch evaluate on the validation loader.
n_epochs = 20
for epoch in range(1, n_epochs+1):
    train_loss, train_lik, train_rank = train_epoch(model, train_loader, optimizer)
    val_loss, val_lik, val_rank = evaluate(model, val_loader)

    # One log line per epoch: total loss plus its two components (L, R) for train and validation.
    print(f"[{epoch:03d}] "
          f"Train Loss={train_loss:.4f} (L={train_lik:.4f}, R={train_rank:.4f}) | "
          f"Val Loss={val_loss:.4f} (L={val_lik:.4f}, R={val_rank:.4f})")


[001] Train Loss=0.8027 (L=0.7935, R=0.0183) | Val Loss=0.7405 (L=0.7333, R=0.0144)
[002] Train Loss=0.7818 (L=0.7738, R=0.0160) | Val Loss=0.7159 (L=0.7082, R=0.0152)
[003] Train Loss=0.7714 (L=0.7638, R=0.0151) | Val Loss=0.7855 (L=0.7791, R=0.0127)
[004] Train Loss=0.7823 (L=0.7747, R=0.0151) | Val Loss=0.7031 (L=0.6967, R=0.0129)
[005] Train Loss=0.7551 (L=0.7476, R=0.0150) | Val Loss=0.6950 (L=0.6881, R=0.0137)
[006] Train Loss=0.7572 (L=0.7497, R=0.0150) | Val Loss=0.6995 (L=0.6928, R=0.0133)
[007] Train Loss=0.7593 (L=0.7517, R=0.0151) | Val Loss=0.6989 (L=0.6926, R=0.0125)
[008] Train Loss=0.7544 (L=0.7470, R=0.0150) | Val Loss=0.6982 (L=0.6909, R=0.0146)
[009] Train Loss=0.7509 (L=0.7435, R=0.0149) | Val Loss=0.6938 (L=0.6877, R=0.0123)
[010] Train Loss=0.7514 (L=0.7439, R=0.0150) | Val Loss=0.7014 (L=0.6949, R=0.0130)
[011] Train Loss=0.7420 (L=0.7346, R=0.0147) | Val Loss=0.7096 (L=0.7026, R=0.0139)
[012] Train Loss=0.7536 (L=0.7462, R=0.0148) | Val Loss=0.7031 (L=0.6961, R=

In [28]:
# Persist only the learned weights (state_dict) of the model trained above.
torch.save(model.state_dict(), "./parameters/deephit_model_2D_CNN.pth")

In [29]:
# Checkpoint to restore. Note that this is a different file from the one saved
# by the training cell, and it is loaded into a different model class.
input_params_path = './parameters/deephit_model_feature.pth'

# Architecture settings used to rebuild the network for the checkpoint.
num_events = 4                      # number of events
time_bins = 91                      # 3-month time bins -> up to 270 months, plus one bin for everything after
hidden_size = (128, 64)             # sizes of the 1st and 2nd hidden layers
input_dim = dataset.data.shape[1]   # input dimension: number of features in the data

# Rebuild the model class the checkpoint was trained with, then restore its weights.
model = Models.DeepHitSurvWithSEBlock(input_dim, hidden_size, time_bins, num_events)
model.load_state_dict(torch.load(input_params_path, map_location=device))
model.to(device)
model.eval()  # evaluation mode

DeepHitSurvWithSEBlock(
  (se_block): Sequential(
    (0): Linear(in_features=17, out_features=4, bias=True)
    (1): ReLU()
    (2): Linear(in_features=4, out_features=17, bias=True)
    (3): Sigmoid()
  )
  (se_block_event): ModuleList(
    (0-3): 4 x Sequential(
      (0): Linear(in_features=64, out_features=16, bias=True)
      (1): ReLU()
      (2): Linear(in_features=16, out_features=64, bias=True)
      (3): Sigmoid()
    )
  )
  (shared): Sequential(
    (0): Linear(in_features=17, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
  )
  (heads): ModuleList(
    (0-3): 4 x Linear(in_features=64, out_features=91, bias=True)
  )
)

In [30]:
# Extract the CIF for the training set.
# train_loader shuffles, so the rows come back in shuffled order; times and events
# are collected in the same pass and therefore stay aligned with the CIF rows.
cif_train, times_train, events_train = get_cif_from_model(model, train_loader)

KeyboardInterrupt: 

In [None]:
def compute_risk_score_sigmoid(pmf, time_lambda=0.05, event_weights=None):
    """Collapse a per-event PMF into a single 0-100 risk score per sample.

    Every PMF entry is multiplied by ``exp(-time_lambda * t)`` for its time
    bin ``t`` and by the weight of its event, then summed over events and
    time bins. The mean of those sums over the batch is subtracted, and the
    result is passed through a sigmoid and scaled by 100.

    Because of the mean subtraction a score is relative to the batch it was
    computed with: the same sample can score differently in another batch.

    Args:
        pmf: torch.Tensor of shape (B, E, T) — per-event probability for
            each time bin.
        time_lambda: float, exponential decay rate of the time-bin weights.
        event_weights: optional list, array or torch.Tensor of length E with
            one weight per event. ``None`` weights every event by 1.

    Returns:
        torch.Tensor of shape (B,) with values between 0 and 100.
    """
    B, E, T = pmf.shape
    device = pmf.device

    # Time-bin weights: exp(-lambda * t) for t = 0 .. T-1.
    time_weights = torch.exp(-time_lambda * torch.arange(T, device=device))

    # Event weights.
    if event_weights is None:
        event_weights = torch.ones(E, device=device)
    else:
        # torch.tensor() on an existing tensor emits a copy-construct
        # UserWarning; as_tensor accepts lists and tensors alike. detach()
        # keeps the weights out of the autograd graph, as torch.tensor() did.
        event_weights = torch.as_tensor(
            event_weights, dtype=torch.float32, device=device
        ).detach()

    # Apply both weightings (time first, then event — same order as before).
    weighted_pmf = pmf * time_weights.view(1, 1, T) * event_weights.view(1, E, 1)

    # Weighted sum over events and time bins -> one raw value per sample.
    risk_score_raw = weighted_pmf.sum(dim=(1, 2))

    # Centre on the batch mean so that raw values can also be negative.
    risk_score_raw = risk_score_raw - risk_score_raw.mean()

    # Sigmoid, then scale to 0-100.
    return torch.sigmoid(risk_score_raw) * 100

In [None]:
# Extract the PMF from the model
def get_pmf_from_model(model, loader, device=device, max_time_bins=91):
    """Collect the model's PMF output for every batch in ``loader``.

    The model runs in evaluation mode with gradients disabled. PMF tensors
    are moved to the CPU; times and events are kept exactly as the loader
    yields them, in the same order as the PMF rows.

    Args:
        model: module returning ``(logits, pmf, cif)`` for a feature batch.
        loader: iterable of ``(x, times, events)`` batches.
        device: device the feature batches are moved to.
        max_time_bins: keep only the first ``max_time_bins`` entries along
            the last PMF axis. The default of 91 reproduces the previously
            hard-coded slice; ``None`` keeps every bin.

    Returns:
        ``(pmf, times, events)`` concatenated along dim 0. Per the original
        author's note the PMF has shape (num_samples, num_events, time_bins).
    """
    model.eval()
    all_pmf = []
    all_times = []
    all_events = []
    with torch.no_grad():
        for x, times, events in loader:
            x = x.to(device)
            _, pmf, _ = model(x)  # logits and CIF are not needed here

            if max_time_bins is not None:
                # Truncate the time axis; a no-op when the model already
                # outputs at most max_time_bins bins.
                pmf = pmf[:, :, :max_time_bins]

            all_pmf.append(pmf.cpu())
            all_times.append(times)
            all_events.append(events)
    all_pmf = torch.cat(all_pmf, dim=0)
    all_times = torch.cat(all_times, dim=0)
    all_events = torch.cat(all_events, dim=0)
    return all_pmf, all_times, all_events
 
# Extract the PMF for the training set.
# This rebinds times_train / events_train; train_loader shuffles, so the row order
# is that of this pass and matches pmf_train.
pmf_train, times_train, events_train = get_pmf_from_model(model, train_loader)

In [None]:
# Per-event weights (example values)
event_weights = [2.0, 3.0, 3.0, 7.0]

# Risk score for every training sample (sigmoid, scaled to 0-100)
risk_scores = compute_risk_score_sigmoid(
    pmf_train,
    time_lambda=0.05,
    event_weights=event_weights,
).numpy()

# Summary over all samples
print("최대값:", risk_scores.max())
print("최소값:", risk_scores.min())
print("평균값:", risk_scores.mean())
print("앞 10개 값:", risk_scores[:10])

# Summary per event label
events_np = events_train.numpy()
unique_events = np.unique(events_np)

print("=== 라벨별 Risk Score 통계 ===")
for e in unique_events:
    mask = events_np == e
    scores_e = risk_scores[mask]
    if len(scores_e) > 0:
        print(f"\nEvent {e}:")
        print(f"  개수: {len(scores_e)}")
        print(f"  최대값: {scores_e.max():.4f}")
        print(f"  최소값: {scores_e.min():.4f}")
        print(f"  평균값: {scores_e.mean():.4f}")


최대값: 99.41755
최소값: 38.345165
평균값: 49.816166
앞 10개 값: [55.928837 51.859367 38.469154 45.206867 51.962406 71.034904 40.639694
 41.435307 41.39525  58.480625]
=== 라벨별 Risk Score 통계 ===

Event -1:
  개수: 366123
  최대값: 98.9653
  최소값: 38.3452
  평균값: 48.3918

Event 0:
  개수: 41868
  최대값: 98.0343
  최소값: 38.4692
  평균값: 61.6722

Event 1:
  개수: 5857
  최대값: 98.0431
  최소값: 38.4692
  평균값: 52.0595

Event 2:
  개수: 6164
  최대값: 97.9004
  최소값: 38.4692
  평균값: 51.0170

Event 3:
  개수: 2441
  최대값: 99.4175
  최소값: 38.4692
  평균값: 51.6823


In [None]:
def _risk_target_from_cif(cif, times, events):
    """Pick, for every sample, the CIF value at its (clamped) time bin.

    Vectorised replacement for a per-sample Python loop.

    Args:
        cif: tensor indexed as (sample, event, time bin).
        times: integer tensor with one time-bin index per sample.
        events: integer tensor with one event label per sample; negative
            labels are treated as censored.

    Returns:
        float64 numpy array with one value per sample: the CIF of the
        sample's own event at its time bin, or, for censored samples, the
        sum of the CIFs of all events at that bin.
    """
    # Time indices are capped at the second-to-last bin.
    time_idx = torch.clamp(times.reshape(-1).long(), max=cif.shape[2] - 2)
    event_idx = events.reshape(-1).long()
    sample_idx = torch.arange(cif.shape[0])

    # One row per sample with the CIF of every event at that sample's bin.
    cif_at_time = cif[sample_idx, :, time_idx]

    # Censored labels are clamped to 0 only to keep gather() in range; those
    # entries are discarded by torch.where below.
    own_event = cif_at_time.gather(1, event_idx.clamp(min=0).unsqueeze(1)).squeeze(1)
    all_events = cif_at_time.sum(dim=1)
    return torch.where(event_idx >= 0, own_event, all_events).double().numpy()


# Use each event's CIF at the second-to-last time bin (index -2) as the model input
X_risk = cif_train[:, :, -2].numpy()  # (num_samples, num_events)
weights = [0.3, 0.3, 1, 3]

# NOTE(review): risk_target is computed but not passed to fit() or used anywhere
# else in the visible code — confirm whether it is still needed.
risk_target = _risk_target_from_cif(cif_train, times_train, events_train)

risk_model = Models.WeightedCoxRiskEstimator(num_events=X_risk.shape[1], weights=weights, device=device)
risk_model.fit(X_risk, times_train, events_train)

torch.save(risk_model.event_linears.state_dict(), "./data/parameters/risk_model_event_linears.pth")

  times = torch.tensor(times, dtype=torch.float32, device=self.device)
  events = torch.tensor(events, dtype=torch.float32, device=self.device)


In [None]:
# Score the same inputs the risk model was fitted on; this rebinds risk_scores.
risk_scores = risk_model.predict(X_risk)

# Summary over all samples: max, min, mean and the first ten values.
print("최대값:", np.max(risk_scores))
print("최소값:", np.min(risk_scores))
print("평균값:", np.mean(risk_scores))
print("앞 10개 값:", risk_scores[:10])

최대값: 99.9127
최소값: 83.37278
평균값: 91.87544
앞 10개 값: [94.69282 93.39063 85.93979 95.19147 96.79798 94.41105 99.55224 95.50266
 89.5379  93.62158]


In [None]:

# Convert the event labels from tensor to numpy
events_np = events_train.numpy()

# Distinct event labels (-1 means censored)
unique_events = np.unique(events_np)

print("=== 라벨별 Risk Score 통계 ===")
for e in unique_events:
    mask = events_np == e
    scores_e = risk_scores[mask]

    # Only labels with at least one sample are reported.
    if len(scores_e) > 0:
        print(f"\nEvent {e}:")
        print(f"  개수: {len(scores_e)}")
        print(f"  최대값: {scores_e.max():.4f}")
        print(f"  최소값: {scores_e.min():.4f}")
        print(f"  평균값: {scores_e.mean():.4f}")



=== 라벨별 Risk Score 통계 ===

Event -1:
  개수: 366123
  최대값: 99.9127
  최소값: 83.3728
  평균값: 91.8788

Event 0:
  개수: 41868
  최대값: 99.8873
  최소값: 84.1983
  평균값: 91.8673

Event 1:
  개수: 5857
  최대값: 99.8445
  최소값: 84.3004
  평균값: 91.7705

Event 2:
  개수: 6164
  최대값: 99.8123
  최소값: 84.2024
  평균값: 91.8592

Event 3:
  개수: 2441
  최대값: 99.8139
  최소값: 84.3451
  평균값: 91.8066
