In [None]:
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize
from tqdm import tqdm
import scipy.io

In [None]:
#read csv folder
# Directory holding the per-recording CSV exports.
extract_path = '../visit_1/visit_1'
# os.listdir returns entries in arbitrary, filesystem-dependent order; sort the
# names so the 60% subset selected below is the same on every run and machine.
csv_files = sorted(f for f in os.listdir(extract_path) if f.endswith('.csv'))

# Keep only the first 60% of the (sorted) file names.
csv_files = csv_files[:int(len(csv_files)*0.6)]

In [None]:
#read excel file
# Spreadsheet that is later joined to the signal rows on its 'ID' column and
# supplies the 'nsrr_ahi_hp3r_aasm15' / 'nsrr_ahi_hp4u_aasm15' columns.
xlxs_path = '../shhs1_ahi_pruebas.xlsx'
xlxs_data = pd.read_excel(xlxs_path)

In [None]:
#process data
from joblib import Parallel, delayed

def process_csv(csv_file, extract_path):
    """Load one CSV file and tag every row with the ID parsed from its name.

    Args:
        csv_file: File name (not a full path); the ID is the run of digits
            immediately preceding ``_extraction`` in this name.
        extract_path: Directory that contains ``csv_file``.

    Returns:
        The file's contents as a DataFrame with an extra ``'ID'`` column
        holding the ID as a string, or an empty DataFrame when the file name
        does not contain ``<digits>_extraction``.
    """
    id_match = re.search(r'(\d+)_extraction', csv_file)
    if id_match is None:
        # Name does not follow the expected pattern: nothing to load.
        return pd.DataFrame()

    frame = pd.read_csv(os.path.join(extract_path, csv_file))
    frame['ID'] = id_match.group(1)
    return frame

In [None]:
# use parallel processors
num_processes = 6 #change any number of processes you want to use

# Parse every CSV in a joblib worker; each task returns one DataFrame
# (empty when the file name carries no recognisable ID).
processed_data = Parallel(n_jobs=num_processes)(
    delayed(process_csv)(csv_file, extract_path)
    for csv_file in tqdm(csv_files)
)

frames = [df for df in processed_data if not df.empty] # type: ignore
if not frames:
    # pd.concat([]) would raise a ValueError with a much less helpful message.
    raise ValueError(
        'No usable CSV data found in {!r}: check the directory and the '
        '"<digits>_extraction" file naming.'.format(extract_path)
    )

# Signal channels used as features, plus the recording ID needed for the join.
channels = ['H.R.', 'SaO2', 'ABDO RES', 'THOR RES', 'AIRFLOW','ID']

# ignore_index=True already yields a fresh 0..n-1 index, so no reset is needed
# after selecting the columns.
all_data = pd.concat(frames, ignore_index=True)[channels]

In [None]:
#create data that channel as feature, ahi as label
# Cast both ID columns to int so the join keys compare equal
# (process_csv stores the ID as a string).
xlxs_data['ID'] = xlxs_data['ID'].astype(int)
all_data['ID'] = all_data['ID'].astype(int)

# Left join: every signal row is kept and receives the spreadsheet columns of its ID.
merged_data = pd.merge(all_data, xlxs_data, on='ID', how='left')

# Give every row of an ID that ID's first non-null AHI value.
for ahi_col in ('nsrr_ahi_hp3r_aasm15', 'nsrr_ahi_hp4u_aasm15'):
    merged_data[ahi_col] = merged_data.groupby('ID')[ahi_col].transform('first')

# NOTE(review): fillna(0) zeroes missing signal samples AND missing AHI values.
# An ID that is absent from the spreadsheet therefore ends up with AHI 0, which
# categorize_ahi maps to class 0 ("no sleep apnea") -- confirm this is intended
# rather than dropping those rows.
merged_data = merged_data.fillna(0)

# Fails if the spreadsheet contributes any non-numeric column.
merged_data = merged_data.astype(float)

In [None]:
#Process features and labels
import numpy as np

# Feature matrix: the five signal channels, one row per row of merged_data.
X = merged_data[['H.R.', 'SaO2', 'ABDO RES', 'THOR RES', 'AIRFLOW']].values
# Raw target: the 'nsrr_ahi_hp3r_aasm15' value of each row; converted to class indices below.
Y = merged_data['nsrr_ahi_hp3r_aasm15'].values

def categorize_ahi(ahi):
    """Map an AHI value to a severity class index.

    Returns:
        0 (no sleep apnea) for ahi < 5, 1 (mild) for 5 <= ahi < 15,
        2 (moderate) for 15 <= ahi < 30, and 3 (severe) otherwise.
    """
    # Exclusive upper bounds of classes 0, 1 and 2, in ascending order.
    upper_bounds = (5, 15, 30)
    for severity, bound in enumerate(upper_bounds):
        if ahi < bound:
            return severity
    # Not below any bound: the last (severe) class.
    return len(upper_bounds)

# Bin every AHI value into its severity class in one vectorised call instead of
# dispatching one joblib task per row. With right=False (the default),
# np.digitize returns the number of bin edges that are <= the value, so the
# edges [5, 15, 30] reproduce categorize_ahi exactly:
#   ahi < 5 -> 0, 5 <= ahi < 15 -> 1, 15 <= ahi < 30 -> 2, ahi >= 30 -> 3.
Y = np.digitize(Y, bins=[5, 15, 30]).astype(np.int64)

In [None]:
# Persist the feature matrix and the class labels as .npy files.
# NOTE(review): the '/content/drive/MyDrive' prefix looks like a Colab Google
# Drive mount, while the input paths above are relative -- confirm the target.
directory = "/content/drive/MyDrive/transformer_data"
os.makedirs(directory, exist_ok = True)
x_dir = os.path.join(directory,"x.npy")
y_dir = os.path.join(directory,"y.npy")
np.save(x_dir, X)
np.save(y_dir, Y)

In [None]:
# Reload the saved arrays. The paths are rebuilt here rather than reused, so
# this cell does not depend on the variables of the cell that wrote the files.
directory = "/content/drive/MyDrive/transformer_data"
x_dir = os.path.join(directory,"x.npy")
y_dir = os.path.join(directory,"y.npy")
X = np.load(x_dir)
Y = np.load(y_dir)
# One-hot encode the class indices: one indicator column per class 0..3.
Y_bin = label_binarize(Y, classes=[0, 1, 2, 3])

In [None]:
#load mat data
# Second feature source: the 'data' variable stored in the .mat file.
mat_path = '../Final_Aout/sleep_heart_fractional_visit_1_12_partition_original.mat'
mat_data = scipy.io.loadmat(mat_path)
new_input_data = mat_data['data']

# NOTE(review): the scaler is fit on the full array before the train/validation
# split below, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
new_input_data = scaler.fit_transform(new_input_data)
# Regroup the scaled values into rows of 5. Scaling is applied to the columns
# of the array as stored in the file, i.e. before this reshape.
# Assumes the total element count is divisible by 5 -- TODO confirm the stored layout.
new_input_data = new_input_data.reshape(-1, 5)

In [None]:
#split train and test
# Split all three arrays in ONE call so that the same shuffled row indices are
# applied to the channel features, the .mat features and the labels.
# Slicing new_input_data sequentially (as [:len(X_train)] / [len(X_train):])
# after X and Y_bin had been shuffled paired each sample with the .mat row of
# an unrelated sample.
# Assumes row i of new_input_data describes the same sample as row i of X --
# TODO confirm against how the .mat file was produced.
if len(new_input_data) < len(X):
    raise ValueError(
        'new_input_data has {} rows but X has {}; every sample needs a row '
        'in both feature sets.'.format(len(new_input_data), len(X))
    )

# With the same random_state and sample count, X_train / X_val / Y_train / Y_val
# are the same partitions as when only X and Y_bin are passed.
(X_train, X_val,
 X_train_new, X_val_new,
 Y_train, Y_val) = train_test_split(
    X, new_input_data[:len(X)], Y_bin, test_size=0.2, random_state=42)

In [None]:
#define the Dataset
class VisitDataset(Dataset):
    """Dataset that serves two parallel feature sets and one label set.

    Item ``idx`` is the tuple
    ``(features1[idx], features2[idx], labels[idx])``; the dataset length is
    the length of ``labels``.
    """

    def __init__(self, features1, features2, labels):
        # Feature containers are stored as given (no copy, no dtype change).
        self.features1, self.features2 = features1, features2
        # Labels are stored as float32.
        self.labels = labels.astype(np.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = (self.features1[idx], self.features2[idx], self.labels[idx])
        return sample

In [None]:
# Wrap each partition as (channel features, .mat features, one-hot labels).
train_dataset = VisitDataset(X_train, X_train_new, Y_train)
# Named "test", but built from the validation partition of the split.
test_dataset = VisitDataset(X_val, X_val_new, Y_val)

In [None]:
# Batches of 20480 rows; only the training loader reshuffles between epochs.
train_loader = DataLoader(train_dataset, batch_size=20480, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=20480, shuffle=False)

In [None]:
#two-tower transformer
class TwoTowerTransformer(nn.Module):
    """Two independent transformer encoders whose outputs are concatenated and classified.

    Each tower embeds its input with its own linear layer, runs it through its
    own ``nn.TransformerEncoder`` and averages the result over the sequence
    axis. The two pooled vectors are concatenated and projected to
    ``num_classes`` logits.

    Args:
        input_dim: Size of the last axis of both inputs.
        hidden_dim: Embedding width; also used as the feed-forward width of
            every encoder layer. Must be divisible by ``num_heads``.
        num_heads: Attention heads per encoder layer.
        num_layers: Encoder layers per tower.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, num_classes):
        super().__init__()
        self.embedding1 = nn.Linear(input_dim, hidden_dim)
        self.embedding2 = nn.Linear(input_dim, hidden_dim)
        self.transformer1 = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(hidden_dim, num_heads, dim_feedforward=hidden_dim),
            num_layers=num_layers)
        self.transformer2 = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(hidden_dim, num_heads, dim_feedforward=hidden_dim),
            num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    @staticmethod
    def _encode(x, embedding, encoder):
        """Embed ``x`` (batch, seq, input_dim), encode it and pool to (batch, hidden_dim)."""
        x = embedding(x)
        # The encoder layers use the default batch_first=False, so they expect
        # (seq, batch, hidden).
        x = encoder(x.permute(1, 0, 2))
        # Mean over the sequence axis. For seq == 1 this is exactly the single
        # position's vector (what flattening produced); for longer sequences
        # the width stays hidden_dim, so self.fc still fits.
        return x.mean(dim=0)

    def forward(self, x1, x2):
        """Return logits of shape (batch, num_classes).

        Args:
            x1: Tensor of shape (batch, seq1, input_dim) for the first tower.
            x2: Tensor of shape (batch, seq2, input_dim) for the second tower.
        """
        h1 = self._encode(x1, self.embedding1, self.transformer1)
        h2 = self._encode(x2, self.embedding2, self.transformer2)
        return self.fc(torch.cat((h1, h2), dim=1))

In [None]:
# Model hyper-parameters.
input_dim = 5       # values per sample in each tower's input
hidden_dim = 512
num_heads = 2
num_layers = 2
num_classes = 4     # severity classes 0..3
device_id = 0
# Always build a torch.device, so `device` has the same type on GPU and CPU
# (previously it was a plain 'cpu' string on the fallback path).
device = torch.device('cuda:{}'.format(device_id) if torch.cuda.is_available() else 'cpu')
print(device)

model = TwoTowerTransformer(input_dim, hidden_dim, num_heads, num_layers, num_classes).to(device)
# One independent sigmoid/BCE term per class column of the one-hot labels.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Cosine decay from the initial lr to eta_min over T_max scheduler steps.
scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=0)

In [None]:
#train
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    all_outputs = []
    all_labels = []
    for inputs1, inputs2, labels in tqdm(train_loader):
        # Insert a length-1 sequence axis -> (batch, 1, features), then cast
        # and move to the target device in a single call.
        inputs1 = inputs1.unsqueeze(1).to(device, dtype=torch.float32)
        inputs2 = inputs2.unsqueeze(1).to(device, dtype=torch.float32)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs1, inputs2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Keep raw logits: ROC AUC depends only on the ranking of the scores.
        all_outputs.append(outputs.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())

    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)
    auc_score = roc_auc_score(all_labels, all_outputs, multi_class='ovr')
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}, AUC: {auc_score:.4f}')

    # Checkpoints are written after epochs 0, 5, 10, ... (0-based), so the
    # last epoch is only saved when its index is a multiple of 5.
    if epoch % 5 == 0:
        torch.save(model.state_dict(), f'temp_epoch{epoch}.pth')
        print("Model saved")

    # Advance the learning-rate schedule once per epoch. The scheduler was
    # created alongside the optimizer but never stepped, so the learning rate
    # stayed at its initial value for the whole run.
    scheduler.step()

In [None]:
#test
model.eval()
total_loss = 0.0
all_outputs = []
all_labels = []
# Inference only: disable autograd so no graph is built or kept for each batch.
with torch.no_grad():
    for inputs1, inputs2, labels in test_loader:
        # Insert a length-1 sequence axis -> (batch, 1, features), then cast
        # and move to the target device in a single call.
        inputs1 = inputs1.unsqueeze(1).to(device, dtype=torch.float32)
        inputs2 = inputs2.unsqueeze(1).to(device, dtype=torch.float32)
        labels = labels.to(device)

        outputs = model(inputs1, inputs2)
        loss = criterion(outputs, labels)

        total_loss += loss.item()
        all_outputs.append(outputs.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

all_outputs = np.concatenate(all_outputs)
all_labels = np.concatenate(all_labels)
# Raw logits are used as scores: ROC AUC depends only on their ranking.
auc_score = roc_auc_score(all_labels, all_outputs, multi_class='ovr')
print(f"Validation Loss: {total_loss / len(test_loader):.4f}, Validation AUC: {auc_score:.4f}")