In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.optim as optim
from torch import nn
from torch.nn.utils import weight_norm
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, recall_score
import math
import re
import random
from optuna.trial import TrialState
import optuna
from random import *
import copy
import gc



In [2]:
# Reproducibility setup: seed every RNG used by the notebook and pick the device.
# `random` is imported again here because the earlier `from random import *`
# rebinds the name `random` to the function random.random; this import restores
# the module so that `random.seed(...)` below works.
import random
# This assignment also replaces the `seed` function pulled in by the star import.
seed = 2023
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class PositionalEmbedding(nn.Module):
    """Add fixed (non-trainable) sinusoidal position encodings to a sequence batch.

    The table is stored as the buffer ``pe`` with shape (1, max_len, d_model):
    even feature indices hold sin(pos * w_k) and odd feature indices hold
    cos(pos * w_k), where w_k = exp(-2k * ln(10000) / d_model).

    Args:
        d_model: feature dimension of the inputs (odd values are supported).
        max_len: number of positions to precompute (default 6).
    """

    def __init__(self, d_model, max_len = 6):
        super().__init__()

        position = torch.arange(0, max_len).float().unsqueeze(1)  # (max_len, 1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model).float()
        pe[:, 0::2] = torch.sin(angles)  # even feature indices
        # For odd d_model there is one fewer odd index than even index, so the
        # cosine half is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd feature indices

        # A buffer is never a parameter: it receives no gradient and is not
        # touched by optimizers, but it moves with .to(device) and is saved in
        # the state_dict under the key 'pe'.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: tensor of shape (batch, seq_len, d_model).
        """
        return x + self.pe[:, :x.size(1)]

In [4]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

class ScaledDotProductAttention(nn.Module):
    """Parameter-free attention: softmax(Q K^T / sqrt(d)) applied to V."""

    def __init__(self):
        super().__init__()

    def forward(self, query, key, value, mask = None, dropout = None):
        """Compute attention-weighted values.

        Args:
            query: (batch_size, n_heads, max_len, d_q)
            key: (batch_size, n_heads, max_len, d_k)
            value: (batch_size, n_heads, max_len, d_v)
            mask: optional tensor broadcastable to the score matrix, e.g.
                (batch_size, 1, 1, max_len); entries equal to 1 are hidden.
            dropout: optional ``nn.Dropout`` applied to the attention weights.

        Returns:
            Tuple of (weighted values of shape (batch_size, n_heads, max_len, d_v),
            attention weights of shape (batch_size, n_heads, max_len, max_len)).
        """
        scale = math.sqrt(query.size(-1))  # per-head feature size
        logits = query.matmul(key.transpose(-2, -1)) / scale  # B*H*L*L

        if mask is not None:
            # A very large negative logit drives the softmax weight to zero.
            logits = logits.masked_fill(mask.eq(1), -1e20)

        weights = F.softmax(logits, dim=-1)  # B*H*L*L
        if dropout is not None:
            weights = dropout(weights)

        return weights.matmul(value), weights

class MultiHeadAttention(nn.Module):
    """Multi-head attention built from four d_model -> d_model linear layers.

    The first three layers project query, key and value; the fourth mixes the
    concatenated heads. The most recent attention weights are kept in
    ``self.attn``.
    """

    def __init__(self, n_heads, d_model, dropout):
        super().__init__()
        assert d_model % n_heads == 0
        # Value heads use the same width as key heads.
        self.d_k = d_model // n_heads
        self.h = n_heads
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.sdpa = ScaledDotProductAttention()
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask = None):
        """Run attention over all heads.

        Args:
            query: (batch_size, max_len, d_model)
            key: (batch_size, max_len, d_model)
            value: (batch_size, max_len, d_model)
            mask: optional (batch_size, max_len) tensor shared by every head.

        Returns:
            Tensor of shape (batch_size, max_len, d_model).
        """
        batch_size = query.size(0)

        if mask is not None:
            # Insert head and query axes so one mask broadcasts over all heads: B*1*1*L
            mask = mask.unsqueeze(1).unsqueeze(2)

        def split_heads(projection, tensor):
            # (B, L, d_model) -> (B, H, L, d_k)
            projected = projection(tensor)
            return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Project the inputs and split them into heads.
        q_heads = split_heads(self.linears[0], query)
        k_heads = split_heads(self.linears[1], key)
        v_heads = split_heads(self.linears[2], value)

        # 2) Attend per head; context is B x H x L x d_k.
        context, self.attn = self.sdpa(q_heads, k_heads, v_heads, mask=mask, dropout=self.dropout)

        # 3) Merge the heads back into d_model features and apply the output layer.
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.linears[-1](merged)

In [5]:
class TransformerEncoder(nn.Module):
    """Binary classifier: positional encoding -> one encoder block -> sigmoid head.

    The prediction is read from the LAST time step of the encoded sequence.

    Args:
        n_heads: number of attention heads (must divide ``d_model``).
        d_model: feature dimension of every time step.
        d_ff: hidden width of the encoder block's feed-forward network.
        max_len: number of positions the positional encoding is built for.
        dropout: dropout probability used in the block and before the head.
    """

    def __init__(self, n_heads, d_model, d_ff, max_len, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.positional_embedding = PositionalEmbedding(d_model, max_len)

        self.block1 = TransformerEncoderBlock(n_heads, d_model, d_ff, dropout)

        self.dropout = nn.Dropout(dropout)

        # The head width follows d_model (it used to be fixed at 60, which
        # broke any other model width). Attribute names are unchanged, so
        # checkpoints saved with d_model=60 still load.
        self.linear1 = nn.Linear(d_model, 1)
        self.sigmoid = nn.Sigmoid()
        self.initialize_parameters()

    def initialize_parameters(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x, mask=None):
        """Return one probability per sample.

        Args:
            x: (batch_size, seq_len, d_model) input sequences.
            mask: optional (batch_size, seq_len) tensor; positions equal to 1
                are hidden from attention.

        Returns:
            Tensor of shape (batch_size, 1) with values in (0, 1).
        """
        x = self.positional_embedding(x)
        x = self.block1(x, mask)
        x = self.dropout(x)
        # Classify from the representation of the final time step only.
        a = self.linear1(x[:, -1, :])
        out = self.sigmoid(a)
        return out


class TransformerEncoderBlock(nn.Module):
    """Post-norm encoder block: self-attention and a feed-forward network,
    each followed by dropout, a residual connection and LayerNorm."""

    def __init__(self, n_heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.multihead_attention = MultiHeadAttention(n_heads, d_model, dropout)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Encode ``x`` (batch, seq_len, d_model); ``mask`` is passed on to attention."""
        # Attention sub-layer: residual add, then normalise.
        attn_out = self.multihead_attention(x, x, x, mask=mask)
        hidden = self.norm1(x + self.dropout(attn_out))

        # Feed-forward sub-layer: residual add, then normalise.
        ff_out = self.feed_forward(hidden)
        return self.norm2(hidden + self.dropout(ff_out))


In [6]:
# Load the three visit tables; the first CSV column becomes the DataFrame index.
df1 = pd.read_csv('1_60_standardized.csv',low_memory=False,index_col=0)
df23 = pd.read_csv('2_3_60_standardized.csv',low_memory=False,index_col=0)
df45 = pd.read_csv('4_5_60_standardized.csv',low_memory=False,index_col=0)
# Sanity check for missing values.
# NOTE(review): only df45 is checked here; df1 and df23 are not inspected.
check_for_any_nan= df45.isna().values.any()
print(check_for_any_nan)
# Total number of missing cells in df45.
check_for_nan = df45.isna().sum().sum()
print(check_for_nan)

False
0


In [7]:
def dftransform(data, lenth):
    """Split a visit table by how many rows (visits) each patient has.

    Patients are identified by the "病歷號" column and are always listed in
    order of first appearance in ``data``.

    Args:
        data: DataFrame with one row per visit and a "病歷號" column.
        lenth: target number of visits. The value 1 is special: no split is
            performed and every patient is kept in a single group.

    Returns:
        If ``lenth == 1``: ``(a, p)`` where ``p`` lists every patient ID and
        ``a`` holds the rows of ``data`` belonging to those patients.

        Otherwise: ``(a, b, p, P)`` where ``P`` lists the patients with exactly
        ``lenth`` rows, ``p`` lists all remaining patients, and ``a`` / ``b``
        hold the rows of ``data`` belonging to ``p`` / ``P`` respectively.
    """
    id_col = "病歷號"
    people_list = list(data[id_col].unique())
    # Count visits in one vectorised pass. This replaces a row-by-row loop that
    # was quadratic in the table size and failed on repeated index labels.
    # Missing IDs are not counted, so they get a count of 0 below.
    visit_counts = data[id_col].value_counts().to_dict()

    if lenth == 1:
        p = people_list
        a = data[data[id_col].isin(p)]
        return a, p  # a: all rows of these patients, p: the patient list

    p = []  # patients whose visit count differs from `lenth`
    P = []  # patients with exactly `lenth` visits
    for person in people_list:
        if visit_counts.get(person, 0) == lenth:
            P.append(person)
        else:
            p.append(person)

    a = data[data[id_col].isin(p)]
    b = data[data[id_col].isin(P)]

    return a, b, p, P  # a/b: rows of each group, p/P: the matching patient lists

In [8]:
# Split each table by visit count. With lenth=1 nothing is split, so df1 is
# returned as one group. For the other tables the first DataFrame/list holds
# the patients whose visit count differs from the target and the second holds
# the patients with exactly that many visits (3 for df23, 5 for df45).
filtered_df1, p1 = dftransform(df1, 1)
filtered_df2, filtered_df3, p2, p3 = dftransform(df23, 3)
filtered_df4, filtered_df5, p4, p5 = dftransform(df45, 5)

In [13]:
def get_features(filtered_df, p): # p: list of patient IDs to process
    """Build a fixed-length, zero-padded visit sequence and a label per patient.

    For every ID in ``p`` the patient's rows are placed into five yearly
    slots that start at the age ('年齡') of the patient's first row. A slot
    with no visit becomes a row of 60 zeros. Slots that remain unused after the
    last visit are added as zero rows at the FRONT of the sequence, and a
    final row of 60 ones is appended at the end, giving six rows per patient.

    Args:
        filtered_df: DataFrame containing "病歷號" and '年齡' columns.
            Assumes the label is in the first column and that the values from
            the third column onward are the 60 features - TODO confirm against
            the CSV layout.
        p: iterable of patient IDs; each must exist in ``filtered_df``.

    Returns:
        ``(targets, features)`` as float32 NumPy arrays. ``targets`` holds one
        ``[label]`` row per patient; with 60 feature columns ``features`` has
        shape (len(p), 6, 60).

    NOTE(review): a visit whose age is not inside the five slots (more than
    four years after the first visit, a repeated age, or rows not sorted by
    age) pushes ``count`` to 5, after which ``age[count]`` raises IndexError.
    """
    features = []
    targets = []
    group = filtered_df.groupby("病歷號")
    for i in p: # i: one patient ID
        count = 0 
        age = []
        temp_df  = group.get_group(i)
        data = []
        for index, row in temp_df.iterrows():
            # First row of this patient: define the five expected ages.
            if data == []:
                for ln in range(5):
                    age.append(int(temp_df.loc[index, '年齡']+ln))
            
            # Advance through the slots until the one matching this visit's
            # age is found; every skipped slot becomes a zero row.
            for l in range(len(age)):
                if int(temp_df.loc[index, '年齡']) == age[count]:
                    count+=1
                    # Keep the values from the third column onward.
                    data.append(list(temp_df.loc[index][2:]))
                    break
                else:
                    count+=1
                    data.append([0]*60)# zero padding for a year without a visit
                    
        # Number of slots never reached; they are padded at the front.
        max_len = 5 - count 
        if max_len != 0:
            for m in range(max_len):
                data.insert(0,[0]*60)
        # Trailing row of ones; TransformerEncoder.forward classifies from the
        # last time step, so this row is the position the prediction is read from.
        data.append([1]*60)
        
        features.append(data)
        # Label: first column of the patient's first row.
        targets.append([temp_df.iloc[0][0]])
    targets = np.array(targets,np.dtype(np.float32))
    features = np.array(features,np.dtype(np.float32))

    return targets, features

In [14]:
# Build the label and feature arrays for each of the five cohorts.
targets1, features1 = get_features(filtered_df1, p1)
targets2, features2 = get_features(filtered_df2, p2)
targets3, features3 = get_features(filtered_df3, p3)
targets4, features4 = get_features(filtered_df4, p4)
targets5, features5 = get_features(filtered_df5, p5)

# Report the shapes cohort by cohort: targets first, then features.
for _cohort_arrays in (
    (targets1, features1),
    (targets2, features2),
    (targets3, features3),
    (targets4, features4),
    (targets5, features5),
):
    for _cohort_array in _cohort_arrays:
        print(_cohort_array.shape)

(1282, 1)
(1282, 6, 60)
(1156, 1)
(1156, 6, 60)
(1021, 1)
(1021, 6, 60)
(1394, 1)
(1394, 6, 60)
(3800, 1)
(3800, 6, 60)


In [11]:
def to_tensor(tensor):
    """Copy array-like ``tensor`` into a NumPy array and wrap it as a torch tensor."""
    return torch.from_numpy(np.array(tensor))

In [12]:
# Convert every cohort's labels and features from NumPy arrays to torch tensors.
targets1, targets2, targets3, targets4, targets5 = (
    to_tensor(cohort_targets)
    for cohort_targets in (targets1, targets2, targets3, targets4, targets5)
)
features1, features2, features3, features4, features5 = (
    to_tensor(cohort_features)
    for cohort_features in (features1, features2, features3, features4, features5)
)

In [13]:
def split_dataset(X, y, test_size=0.2, val_size=0.5, random_state=seed):
    """Split a dataset into stratified train, validation and test parts.

    The data is split twice: first a hold-out part of relative size
    ``test_size`` is separated from the training data, then that hold-out part
    is divided into validation and test data. Both splits are stratified on
    the labels.

    Args:
        X: feature data.
        y: target labels (also used for stratification).
        test_size: fraction of ALL samples that is held out for validation and
            test together (default 0.2).
        val_size: fraction of the HELD-OUT samples that becomes the validation
            set; the remainder becomes the test set (default 0.5).
        random_state: random seed used for both splits (defaults to the
            notebook-level ``seed``).

    Returns:
        ``X_train, X_val, X_test, y_train, y_val, y_test``.
    """
    # First separate the hold-out part from the training data.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Then divide the hold-out part. `val_size` is honoured here (the second
    # split used a fixed 0.5 before, which equals the default).
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=val_size, random_state=random_state, stratify=y_holdout
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


In [14]:
# Split every cohort separately so that each one contributes to the train,
# validation and test sets.
datasets = [(features1, targets1), (features2, targets2), (features3, targets3), (features4, targets4), (features5, targets5)]
train_size = 0.8  # fraction of each cohort kept for training
val_size = 0.5    # passed to split_dataset as its val_size argument

# Per-cohort splits, collected in cohort order.
x_train = []
x_val = []
x_test = []
y_train = []
y_val = []
y_test = []

for i, (features, targets) in enumerate(datasets):
    x_train_i, x_val_i, x_test_i, y_train_i, y_val_i, y_test_i = split_dataset(features, targets, 
                                                                               test_size=(1-train_size), 
                                                                               val_size=val_size, random_state=seed)
    x_train.append(x_train_i)
    x_val.append(x_val_i)
    x_test.append(x_test_i)
    y_train.append(y_train_i)
    y_val.append(y_val_i)
    y_test.append(y_test_i)
    # Print the train / validation / test sizes of this cohort.
    print(f"訓練集{i+1}大小:", len(x_train_i))
    print(f"驗證集{i+1}大小:", len(x_val_i))
    print(f"測試集{i+1}大小:", len(x_test_i))

訓練集1大小: 1025
驗證集1大小: 129
測試集1大小: 128
訓練集2大小: 924
驗證集2大小: 116
測試集2大小: 116
訓練集3大小: 816
驗證集3大小: 103
測試集3大小: 102
訓練集4大小: 1115
驗證集4大小: 140
測試集4大小: 139
訓練集5大小: 3040
驗證集5大小: 380
測試集5大小: 380


In [15]:
# Stack the per-cohort splits along the sample axis; np.concatenate returns
# NumPy arrays.
x_train_combined = np.concatenate(x_train, axis=0)
x_test_combined = np.concatenate(x_test, axis=0)
x_val_combined = np.concatenate(x_val, axis=0)

y_train_combined = np.concatenate(y_train, axis=0)
y_test_combined = np.concatenate(y_test, axis=0)
y_val_combined = np.concatenate(y_val, axis=0)

# Print the combined train / test / validation sizes (features, then labels).
print("整合後的訓練集大小:", len(x_train_combined), len(y_train_combined))
print("整合後的測試集大小:", len(x_test_combined), len(y_test_combined))
print("整合後的驗證集大小:", len(x_val_combined), len(y_val_combined))


整合後的訓練集大小: 6920 6920
整合後的測試集大小: 865 865
整合後的驗證集大小: 868 868


In [16]:
# Class balance of the combined training labels.
num_ones = np.sum(y_train_combined == 1)
num_zeros = np.sum(y_train_combined == 0)

# Print the number of 1 labels, then the number of 0 labels.
print("1 的數量:", num_ones)
print("0 的數量:", num_zeros)

1 的數量: 1008
0 的數量: 5912


In [17]:
# from imblearn.over_sampling import BorderlineSMOTE
# n_samples, n_time_steps, n_features = x_train_combined.shape
# x_train_combined = x_train_combined.reshape(n_samples, n_features * n_time_steps)
# smote = BorderlineSMOTE(random_state = 2023, sampling_strategy = 1)
# x_train_oversampled, y_train_oversampled = smote.fit_resample(x_train_combined, y_train_combined)
# x_train_oversampled = x_train_oversampled.reshape(-1, n_time_steps, n_features)
# y_train_oversampled = y_train_oversampled.reshape(y_train_oversampled.shape[0],1)
# # 確認過採樣後的訓練集大小
# print("過採樣後的訓練集大小:", x_train_oversampled.shape, y_train_oversampled.shape)


In [18]:
# num_ones = np.sum(y_train_oversampled == 1)
# num_zeros = np.sum(y_train_oversampled == 0)

# # 印出結果
# print("1 的數量:", num_ones)
# print("0 的數量:", num_zeros)

In [19]:
def make_mask(seq, size):
    """Build the attention padding mask for a batch of visit sequences.

    A time step is treated as padding, and masked with 1, when its feature at
    index 1 equals 0; every other time step gets 0 (visible). Zero-padded rows
    are therefore masked, while the trailing all-ones row stays visible.

    NOTE(review): a real visit whose feature 1 happens to be exactly 0 would
    also be masked - confirm that this cannot occur in the data.

    Args:
        seq: array or tensor of shape (n_samples, n_steps, n_features) with at
            least two features.
        size: number of rows of the returned mask. Rows beyond ``len(seq)``
            are filled with 0.

    Returns:
        float32 tensor of shape (size, n_steps).

    Raises:
        ValueError: if ``seq`` is not 3-dimensional.
        IndexError: if ``size`` is smaller than the number of sequences.
    """
    values = torch.as_tensor(seq)
    if values.dim() != 3:
        raise ValueError(f"make_mask expects a 3-D input, got {values.dim()} dimension(s)")

    n_samples, n_steps = values.shape[0], values.shape[1]
    if size < n_samples:
        raise IndexError(f"size ({size}) is smaller than the number of sequences ({n_samples})")

    # Start from zeros so that no entry is ever left uninitialised, and take
    # the width from the data; it is not fixed at 6.
    mask = torch.zeros(size, n_steps)
    # Vectorised form of "feature 1 == 0 -> masked".
    mask[:n_samples] = (values[:, :, 1] == 0).to(mask.dtype)
    return mask

In [20]:
# Build one padding mask per split (1 = hidden from attention, 0 = visible).
x_train_mask = make_mask(x_train_combined, x_train_combined.shape[0])
x_test_mask = make_mask(x_test_combined, x_test_combined.shape[0])
x_val_mask = make_mask(x_val_combined, x_val_combined.shape[0])
# Each mask should have as many rows as its split has samples.
print(len(x_train_mask))
print(len(x_test_mask))
print(len(x_val_mask))

6920
865
868


In [21]:
# Positional sample indices (0 .. n-1) for each split; the datasets return
# them alongside every sample.
train_ind = list(range(len(x_train_combined)))
test_ind = list(range(len(x_test_combined)))
val_ind = list(range(len(x_val_combined)))

len(train_ind)

6920

In [22]:
class MyDataset(Dataset):
    """Dataset that serves (features, target, mask, index) tuples by position.

    All four containers are expected to be aligned, i.e. element ``k`` of each
    one describes the same sample.
    """

    def __init__(self, features, targets, mask, index):
        self.features = features
        self.targets = targets
        self.masks = mask
        self.indexs = index

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as (features, target, mask, index)."""
        sample_features = self.features[idx]
        sample_target = self.targets[idx]
        sample_mask = self.masks[idx]
        sample_index = self.indexs[idx]
        return sample_features, sample_target, sample_mask, sample_index

In [23]:
# Wrap each split (features, labels, masks, indices) in a dataset.
train_dataset = MyDataset(x_train_combined, y_train_combined, x_train_mask, train_ind)
test_dataset = MyDataset(x_test_combined, y_test_combined, x_test_mask, test_ind)
val_dataset = MyDataset(x_val_combined, y_val_combined, x_val_mask, val_ind)


# The training batch size (8868) is larger than the 6920 training samples
# reported above, so every epoch is a single full batch. The test and
# validation batch sizes equal their split sizes (865 and 868).
train_loader = DataLoader(dataset=train_dataset,
              batch_size=8868,
              shuffle=True)
test_loader = DataLoader(dataset=test_dataset,
              batch_size=865,
              shuffle=False)
val_loader = DataLoader(dataset=val_dataset,
              batch_size=868,
              shuffle=False)

In [24]:
# Smoke test: print the shapes of features, labels, masks and indices for
# every training batch.
for X, Y, M, I in (train_loader):
    print(X.shape, Y.shape, M.shape, I.shape)

torch.Size([6920, 6, 60]) torch.Size([6920, 1]) torch.Size([6920, 6]) torch.Size([6920])


In [72]:
def TrainModel(model, loss_fn, optimizer, train_loader, test_loader, epochs, patience, restore_best=False):
    """Train ``model`` with early stopping on the loss of a held-out loader.

    Every epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free evaluation pass over ``test_loader``. Training stops early
    once the evaluation loss has failed to improve for ``patience``
    consecutive epochs.

    Args:
        model: module called as ``model(X, M)`` that returns predictions
            accepted by ``loss_fn``.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar.
        optimizer: optimiser built over ``model``'s parameters.
        train_loader: iterable of ``(X, Y, M, I)`` batches used for training.
        test_loader: iterable of ``(X, Y, M, I)`` batches used for the
            early-stopping loss (the validation data).
        epochs: maximum number of epochs.
        patience: number of consecutive non-improving epochs tolerated before
            stopping.
        restore_best: when True, the parameters from the epoch with the lowest
            evaluation loss are loaded back into ``model`` before returning.
            The default (False) leaves the model in its last-epoch state.

    Returns:
        dict with the per-epoch average losses under "train_loss" and
        "val_loss".

    Raises:
        ValueError: if either loader yields no batches.
    """
    # Run on whatever device the model lives on, so the function does not
    # depend on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    history = {"train_loss": [], "val_loss": []}
    best_loss = float('inf')
    best_state = None
    early_stop_counter = 0

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        t_losses = []
        for X, Y, M, I in train_loader:
            X = X.to(torch.float32).to(run_device)
            M = M.to(torch.float32).to(run_device)
            Y = Y.to(torch.float32).to(run_device)

            Y_preds = model(X, M)
            loss = loss_fn(Y_preds, Y)
            t_losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # ---- evaluation pass (no gradients, dropout disabled) ----
        model.eval()
        v_losses = []
        with torch.no_grad():
            for X, Y, M, I in test_loader:
                X = X.to(torch.float32).to(run_device)
                M = M.to(torch.float32).to(run_device)
                Y = Y.to(torch.float32).to(run_device)

                Y_preds = model(X, M)
                loss = loss_fn(Y_preds, Y)
                v_losses.append(loss.item())

        if not t_losses or not v_losses:
            raise ValueError("TrainModel needs at least one batch in both train_loader and test_loader")

        avg_t_loss = sum(t_losses) / len(t_losses)
        avg_v_loss = sum(v_losses) / len(v_losses)
        history["train_loss"].append(avg_t_loss)
        history["val_loss"].append(avg_v_loss)

        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_t_loss:.5f}, Validation Loss: {avg_v_loss:.5f}")

        # Early stopping check
        if avg_v_loss < best_loss:
            best_loss = avg_v_loss
            early_stop_counter = 0
            if restore_best:
                # Deep copy: state_dict() returns references to the live tensors.
                best_state = copy.deepcopy(model.state_dict())
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered. No improvement in validation loss.")
                break

    if restore_best and best_state is not None:
        model.load_state_dict(best_state)

    return history
    
 
        

In [73]:
def MakePredictions(model, loader):
    """Run ``model`` over ``loader`` and return labels with hard predictions.

    Model outputs strictly greater than 0.5 are mapped to class 1, everything
    else to class 0.

    Args:
        model: module called as ``model(X, M)`` returning one score per sample.
        loader: iterable of ``(X, Y, M, I)`` batches.

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays in loader order; ``y_true`` is
        float32 and ``y_pred`` is int32.
    """
    # Run on whatever device the model lives on, so the function does not
    # depend on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    Y_shuffled, Y_preds = [], []
    model.eval()
    with torch.no_grad():
        for X, Y, M, I in loader:
            X = X.to(torch.float32).to(run_device)
            M = M.to(torch.float32).to(run_device)
            Y = Y.to(torch.float32).to(run_device)
            # Threshold the scores at 0.5 to obtain class labels.
            preds = model(X, M).gt(0.5).int()
            Y_preds.append(preds)
            Y_shuffled.append(Y)
        gc.collect()
        Y_preds, Y_shuffled = torch.cat(Y_preds), torch.cat(Y_shuffled)

        return Y_shuffled.detach().cpu().numpy(), Y_preds.detach().cpu().numpy()
    



In [74]:
import numpy as np
from collections import Counter

class BalancedDataset(Dataset):
    """View of a dataset that random-undersamples every class to the size of
    the smallest class.

    ``dataset`` must yield ``(features, label, mask, index)`` tuples whose
    label supports ``.item()``. All samples of the smallest class are kept;
    from every other class ``num_minority_samples`` samples are drawn without
    replacement using NumPy's global RNG. For a two-class imbalanced dataset
    the RNG is used exactly as before, so seeded runs select the same samples.

    Attributes:
        labels: NumPy array with the scalar label of every sample.
        class_counts: ``Counter`` of label -> number of samples.
        minority_class / majority_class: smallest class and largest OTHER
            class (identical only when the dataset has a single class).
        balanced_indices: shuffled indices into ``dataset`` that make up the
            balanced view.

    Raises:
        ValueError: if ``dataset`` is empty.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        # Convert the labels into hashable scalars.
        self.labels = np.array([label.item() for _, label, _, _ in self.dataset])
        self.num_samples = len(self.labels)
        if self.num_samples == 0:
            raise ValueError("BalancedDataset requires a non-empty dataset")
        self.class_counts = Counter(self.labels)
        self.minority_class = min(self.class_counts, key=self.class_counts.get)
        # Pick the majority among the OTHER classes. With tied counts, min()
        # and max() over all classes return the same label, which used to make
        # the balanced set contain a single class twice.
        other_classes = [c for c in self.class_counts if c != self.minority_class]
        if other_classes:
            self.majority_class = max(other_classes, key=self.class_counts.get)
        else:
            self.majority_class = self.minority_class
        self.num_minority_samples = self.class_counts[self.minority_class]
        self.num_majority_samples = self.class_counts[self.majority_class]
        self.balance_indices()

    def balance_indices(self):
        """(Re)draw the undersampled index set and store it in ``balanced_indices``."""
        minority_indices = np.where(self.labels == self.minority_class)[0]
        # Largest classes first; sorted() is stable, so ties keep their
        # first-appearance order.
        other_classes = sorted(
            (c for c in self.class_counts if c != self.minority_class),
            key=self.class_counts.get,
            reverse=True,
        )
        sampled = [
            np.random.choice(
                np.where(self.labels == cls)[0], self.num_minority_samples, replace=False
            )
            for cls in other_classes
        ]
        balanced_indices = np.concatenate(sampled + [minority_indices])
        np.random.shuffle(balanced_indices)
        self.balanced_indices = balanced_indices

    def __len__(self):
        return len(self.balanced_indices)

    def __getitem__(self, idx):
        # Translate the position in the balanced view to the wrapped dataset.
        original_idx = self.balanced_indices[idx]
        return self.dataset[original_idx]
# Build the class-balanced (undersampled) training dataset.
balanced_train_dataset = BalancedDataset(train_dataset)

# Create a DataLoader over the balanced dataset; the batch size exceeds the
# dataset size, so one batch holds every balanced sample.
balanced_train_loader = DataLoader(dataset=balanced_train_dataset, batch_size=8868, shuffle=True)

# Smoke test: print the shapes of features, labels, masks and indices per batch.
for X, Y, M, I in (balanced_train_loader):
    print(X.shape, Y.shape, M.shape, I.shape)

torch.Size([2016, 6, 60]) torch.Size([2016, 1]) torch.Size([2016, 6]) torch.Size([2016])


In [95]:
# Number of base models in the ensemble.
num_base_models = 1

# Instantiate every base model on the selected device.
base_models = []
for i in range(num_base_models):
    base_model = TransformerEncoder(n_heads=4, d_model=60, d_ff=120, max_len=6).to(device)
    base_models.append(base_model)
    
for i, base_model in enumerate(base_models):
    print(f'model_{i}')
    # Each base model gets its own freshly drawn balanced subset of the
    # training data.
    balanced_train_dataset = BalancedDataset(train_dataset)
    balanced_train_loader = DataLoader(dataset=balanced_train_dataset, batch_size=2016, shuffle=True)

    # BCELoss expects probabilities; TransformerEncoder ends with a sigmoid.
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.AdamW(base_model.parameters(), lr=0.001)

    epochs = 250
    patience = 100
    
    # val_loader supplies the loss that drives early stopping.
    TrainModel(base_model, loss_fn, optimizer, balanced_train_loader, val_loader, epochs, patience)

    # Persist the trained weights so that the evaluation cell can reload them.
    torch.save(base_model.state_dict(), f"base_model_{i}.pt")


In [96]:
# Load the trained base models and collect their predictions.
predictions = []
for i in range(num_base_models):
    base_model = TransformerEncoder(n_heads=4, d_model=60, d_ff=120, max_len=6).to(device)
    base_model.load_state_dict(torch.load(f"base_model_{i}.pt"))
    base_model.eval()

    # Predict hard 0/1 labels.
    # NOTE(review): this evaluates val_loader, the same data that drove early
    # stopping during training; test_loader is not used in this cell even
    # though the printed label below says "Test Accuracy" - confirm which
    # split is intended.
    Y_shuffled, Y_preds = MakePredictions(base_model, val_loader)
    predictions.append(Y_preds)

# Majority vote: average the 0/1 predictions across models and threshold at 0.5.
ensemble_predictions = np.mean(predictions, axis=0)
final_predictions = (ensemble_predictions > 0.5).astype(int)

print(final_predictions.shape)

# Class names for the report, in label order 0, 1.
target_classes = ['沒確診','確診']
print("Test Accuracy : {}".format(accuracy_score(Y_shuffled, final_predictions)))
print("\nClassification Report : ")
print(classification_report(Y_shuffled, final_predictions, target_names=target_classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_shuffled, final_predictions))


(868, 1)
Test Accuracy : 0.7638248847926268

Classification Report : 
              precision    recall  f1-score   support

         沒確診       0.96      0.75      0.84       740
          確診       0.37      0.83      0.51       128

    accuracy                           0.76       868
   macro avg       0.66      0.79      0.68       868
weighted avg       0.87      0.76      0.80       868


Confusion Matrix : 
[[557 183]
 [ 22 106]]
