# SWEEM Model Implementation

This notebook illustrates the preprocessing, training, and evaluation
stages of our model. Comments and further information are provided in each section.

## Preprocessing

Here we load the pre-split training and test data and hold out part of the training
data as a validation set. We also set up DataLoaders so the data can be fed to the
training loop in batches.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

# Read the pre-split training and testing tables from disk.
TRAIN_CSV = './Data/Multiple/train.csv'
TEST_CSV = './Data/Multiple/test.csv'
train_data, test_data = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

# Positional column ranges of each group; the same layout is applied to both tables.
# NOTE(review): the recorded shape is (380, 7961), so dropping the 3 target columns
# leaves 7958 feature columns, yet the ranges below stop at 7957 -- the column at
# position 7957 is not part of any named group. Confirm what that column is.
rna_cols = slice(0, 2708)              # 2708 rna
scna_cols = slice(2708, 5404)          # 2696 scna
mutation_cols = slice(5404, 5591)      # 187 mutation
methy_cols = slice(5591, 7957)         # 2366 methy
target_cols = slice(-3, None)          # 3 target
column_groups = (rna_cols, scna_cols, mutation_cols, methy_cols, target_cols)

# Each name below holds the column *labels* (a pandas Index) of its group, not the values.
## Training Data
rna_train, scna_train, mutation_train, methy_train, target_train = (
    train_data.columns[group] for group in column_groups
)

## Testing Data
rna_test, scna_test, mutation_test, methy_test, target_test = (
    test_data.columns[group] for group in column_groups
)

In [4]:
### Sanity Checks on Data

# Data shapes; both tables are expected to report the same number of columns.
# Recorded run: training (380, 7961), testing (48, 7961).
for caption, frame in (('Training Data Shape: ', train_data),
                       ('Testing Data Shape: ', test_data)):
    print(caption, frame.shape)
print()

# Check header information: show the first two and last two column names of
# every positional group, then the three trailing target columns.
header = train_data.columns
label_ranges = (
    ("RNA Labels:         ", 0, 2708),
    ("SCNA Labels:        ", 2708, 5404),
    ("Mutation Labels:    ", 5404, 5591),
    ("Methylation Labels: ", 5591, 7957),
)
for prefix, start, stop in label_ranges:
    print(f"{prefix}{list(header[start:start + 2])} ... {list(header[stop - 2:stop])}")
print(f"Target Labels:      {list(header[-3:])}")

Training Data Shape:  (380, 7961)
Testing Data Shape:  (48, 7961)

RNA Labels:         ['ST3GAL2_rna', 'ELOVL1_rna'] ... ['TK2_rna', 'DHODH_rna']
SCNA Labels:        ['ST3GAL2_scna', 'ELOVL1_scna'] ... ['TK2_scna', 'DHODH_scna']
Mutation Labels:    ['ABCG4_mutation', 'DAPK1_mutation'] ... ['CDH4_mutation', 'ACSL4_mutation']
Methylation Labels: ['ST3GAL2_methy', 'ELOVL1_methy'] ... ['PRKCQ_methy', 'TK2_methy']
Target Labels:      ['SAMPLE_ID', 'OS_MONTHS', 'OS_EVENT']


In [None]:
## RIGHT NOW, WE ONLY CARE ABOUT OS_MONTHS, WHICH IS THE SECOND TO LAST COLUMN

# Split the training table into train and validation sets (80/20, fixed seed).
# Features are every column except the last three; the label is the second-to-last column.
train_features, val_features, train_labels, val_labels = train_test_split(
    train_data.iloc[:, :-3], train_data.iloc[:, -2], test_size=0.2, random_state=42)

test_features, test_labels = test_data.iloc[:, :-3], test_data.iloc[:, -2]


def _to_dataset(features, labels):
    """Wrap a feature frame and a label series as a float32 TensorDataset.

    pandas hands back float64 (or int64) arrays, whereas torch modules create
    float32 parameters by default; casting here avoids a dtype mismatch in the
    forward pass and in the loss.

    Args:
        features: DataFrame of numeric feature columns, one row per sample.
        labels: Series with one numeric label per sample.

    Returns:
        TensorDataset yielding (features_row, label) pairs as float32 tensors.
    """
    return TensorDataset(
        torch.tensor(features.values, dtype=torch.float32),
        torch.tensor(labels.values, dtype=torch.float32),
    )


# Create Tensor datasets
# NOTE(review): labels are 1-D (one value per sample); confirm the model's output
# shape matches before it is handed to the loss.
train_dataset = _to_dataset(train_features, train_labels)
val_dataset   = _to_dataset(val_features, val_labels)
test_dataset  = _to_dataset(test_features, test_labels)

# Create DataLoader objects; only the training loader is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader   = DataLoader(val_dataset,   batch_size=32, shuffle=False)
test_dataloader  = DataLoader(test_dataset,  batch_size=32, shuffle=False)

## Define Self-Attention Model

In [None]:
from model import SelfAttentionModel 
import torch.optim as optim
import torch.nn as nn
import tqdm

# Input width = number of feature columns. TensorDataset has no .size() method,
# so read the shape from the first tensor it wraps (the feature matrix).
num_features = train_dataset.tensors[0].shape[1]

# NOTE(review): the meaning of the two 64s is defined by SelfAttentionModel in
# model.py, which is not visible here -- confirm before tuning.
model = SelfAttentionModel(num_features, 64, 64)

# NOTE(review): BCELoss expects predictions and targets in [0, 1], but the labels
# fed to it are the OS_MONTHS column. Confirm this is the intended objective.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prefer the first CUDA device when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on", device)
model.to(device)

In [None]:
### Training Loop
# Runs `num_epochs` passes over the training loader, followed each time by a
# gradient-free pass over the validation loader. The mean per-batch loss of each
# pass is appended to `epoch_train_losses` / `epoch_val_losses`.
num_epochs = 10
epoch_train_losses = []
epoch_val_losses   = []

for epoch in range(num_epochs):
    epoch_train_loss = 0
    epoch_val_loss   = 0
    print(f"Epoch {epoch + 1} training:")

    model.train()
    ## Training
    # `tqdm` is imported as a module, so the progress-bar class is `tqdm.tqdm`.
    # The context manager closes the bar even if the epoch is interrupted.
    with tqdm.tqdm(total=len(train_dataloader)) as progress_bar:
        for (batchX, batchY) in train_dataloader:
            # Clear gradients first so nothing left over from an earlier
            # (possibly interrupted) step is accumulated into this one.
            optimizer.zero_grad()

            # Forward pass
            outputs = model(batchX.to(device))

            # Backward pass
            loss = criterion(outputs, batchY.to(device))
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()
            progress_bar.update(1)

    model.eval()
    ## Validation
    with torch.no_grad():
        for (batchX, batchY) in val_dataloader:
            outputs = model(batchX.to(device))
            loss = criterion(outputs, batchY.to(device))
            epoch_val_loss += loss.item()

    # Save and print losses (averaged over the number of batches in each loader)
    epoch_train_loss /= len(train_dataloader)
    epoch_val_loss /= len(val_dataloader)
    epoch_train_losses.append(epoch_train_loss)
    epoch_val_losses.append(epoch_val_loss)
    print(f"Epoch {epoch + 1} training loss: {epoch_train_loss}")
    print(f"Epoch {epoch + 1} validation loss: {epoch_val_loss}")

In [None]:
### Testing/Evaluation Loop
# Reports the mean per-batch loss over the held-out test loader.

# Put the model in evaluation mode explicitly instead of relying on whatever
# state the previous cell left behind (e.g. an interrupted training epoch).
model.eval()

test_loss = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for (batchX, batchY) in test_dataloader:
        outputs = model(batchX.to(device))
        loss = criterion(outputs, batchY.to(device))
        test_loss += loss.item()

print(f"Test loss: {test_loss / len(test_dataloader)}")