# SWEEM Model Implementation

This file is used to illustrate the preprocessing, training, and evaluation 
stages of our model. Comments and more information will be provided per section.

## Preprocessing

Here we load in the data and establish our train-test split. We also set up dataloaders
for us to be able to properly use the data within our training loop.

## Define Baseline Linear

In [10]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

train_data = pd.read_csv('./Data/Multiple/train.csv')
test_data = pd.read_csv('./Data/Multiple/test.csv')

## Training Data
rna_train = train_data.columns[:2708]                       # 2708 rna
scna_train = train_data.columns[2708:5404]                  # 2696 scna
mutation_train = train_data.columns[5404:5591]              # 187 mutation
methy_train = train_data.columns[5591:7957]                 # 2366 methy
target_train = train_data.columns[-3:]                      # 3 target

## Testing Data
rna_test = test_data.columns[:2708]                         # 2708 rna
scna_test = test_data.columns[2708:5404]                    # 2696 scna
mutation_test = test_data.columns[5404:5591]                # 187 mutation
methy_test = test_data.columns[5591:7957]                   # 2366 methy
target_test = test_data.columns[-3:]                        # 3 target

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
### Sanity Checks on Data

# Data Shapes; should have same number of features
print('Training Data Shape: ', train_data.shape)    # (380, 7961)
print('Testing Data Shape: ', test_data.shape)      # (48 , 7961)
print()

# Check header information
print(f"RNA Labels:         {list(train_data.columns[0:2])} ... {list(train_data.columns[2706:2708])}")
print(f"SCNA Labels:        {list(train_data.columns[2708:2710])} ... {list(train_data.columns[5402:5404])}")
print(f"Mutation Labels:    {list(train_data.columns[5404:5406])} ... {list(train_data.columns[5589:5591])}")
print(f"Methylation Labels: {list(train_data.columns[5591:5593])} ... {list(train_data.columns[7955:7957])}")
print(f"Target Labels:      {list(train_data.columns[-3:])}")

Training Data Shape:  (380, 7961)
Testing Data Shape:  (48, 7961)

RNA Labels:         ['ST3GAL2_rna', 'ELOVL1_rna'] ... ['TK2_rna', 'DHODH_rna']
SCNA Labels:        ['ST3GAL2_scna', 'ELOVL1_scna'] ... ['TK2_scna', 'DHODH_scna']
Mutation Labels:    ['ABCG4_mutation', 'DAPK1_mutation'] ... ['CDH4_mutation', 'ACSL4_mutation']
Methylation Labels: ['ST3GAL2_methy', 'ELOVL1_methy'] ... ['PRKCQ_methy', 'TK2_methy']
Target Labels:      ['SAMPLE_ID', 'OS_MONTHS', 'OS_EVENT']


In [30]:
## RIGHT NOW, WE ONLY CARE ABOUT OS_MONTHS, WHICH IS THE SECOND TO LAST COLUMN

# Split the data into train and validation sets.
train_features, val_features, train_labels, val_labels = train_test_split(
    train_data.iloc[:, :-3], train_data.iloc[:, -2], test_size=0.2, random_state=42)

test_features, test_labels = test_data.iloc[:, :-3], test_data.iloc[:, -2]

# Create Tensor datasets
train_dataset = TensorDataset(torch.tensor(train_features.values), torch.tensor(train_labels.values))
val_dataset   = TensorDataset(torch.tensor(val_features.values),   torch.tensor(val_labels.values))
test_dataset  = TensorDataset(torch.tensor(test_features.values),  torch.tensor(test_labels.values))

# Create DataLoader objects
batch_size = 2
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)
test_dataloader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Define Self-Attention Model

In [31]:
from model import SelfAttentionModel 
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm

model = SelfAttentionModel(7961-3, 2000, 2000)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on", device)
model.to(device)


criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

Running on cpu


In [42]:
class LinearRegression(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LinearRegression, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 5000)
        self.linear3 = nn.Linear(5000, 4000)
        self.linear4 = nn.Linear(4000, 3000)
        self.linear5 = nn.Linear(3000, 2000)
        self.linearOut = nn.Linear(2000, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.linear1(x)
        out = self.relu(out)
        out = self.linear2(out)
        out = self.relu(out)
        out = self.linear3(out)
        out = self.relu(out)
        out = self.linear4(out)
        out = self.relu(out)
        out = self.linear5(out)
        out = self.relu(out)
        out = self.linearOut(out)
        return out
    
model = LinearRegression(7961-3, 6000, 1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on", device)
model.to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Running on cpu


In [43]:
### Training Loop
num_epochs = 10
epoch_train_losses = []
epoch_val_losses   = []

for epoch in range(num_epochs):
    epoch_train_loss = 0
    epoch_val_loss   = 0
    print(f"Epoch {epoch + 1} training:")
    progress_bar = tqdm(range(len(train_dataloader)))

    model.train()
    ## Training
    for (batchX, batchY) in train_dataloader:
        # Forward pass
        outputs = model(batchX.to(device).to(torch.float32))
        outputs = outputs.squeeze()

        # Backward pass
        loss = criterion(outputs, batchY.to(device).to(torch.float32))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        epoch_train_loss += loss.item()
        progress_bar.update(1)

    model.eval()
    ## Validation
    with torch.no_grad():
        for (batchX, batchY) in val_dataloader:
            outputs = model(batchX.to(device).to(torch.float32))
            outputs = outputs.squeeze()
            loss = criterion(outputs, batchY.to(device).to(torch.float32))
            epoch_val_loss += loss.item()

    # Save and print losses
    epoch_train_loss /= len(train_dataloader)
    epoch_val_loss /= len(val_dataloader)
    epoch_train_losses.append(epoch_train_loss)
    epoch_val_losses.append(epoch_val_loss)
    print(f"Epoch {epoch + 1} training loss: {epoch_train_loss}")
    print(f"Epoch {epoch + 1} validation loss: {epoch_val_loss}")

Epoch 1 training:


 25%|██▌       | 38/152 [00:19<00:59,  1.92it/s]


Epoch 1 training loss: 1679699.6676137825
Epoch 1 validation loss: 1276320.9078947369
Epoch 2 training:


100%|██████████| 152/152 [01:02<00:00,  2.42it/s]
100%|██████████| 152/152 [00:54<00:00,  2.77it/s]

Epoch 2 training loss: 1193506.3272496273
Epoch 2 validation loss: 1003827.7563605057
Epoch 3 training:


100%|██████████| 152/152 [00:56<00:00,  2.70it/s]


Epoch 3 training loss: 1123895.7756516307
Epoch 3 validation loss: 1297066.6363075657
Epoch 4 training:


100%|██████████| 152/152 [01:10<00:00,  2.15it/s]
100%|██████████| 152/152 [00:57<00:00,  2.99it/s]

Epoch 4 training loss: 1067024.1388168335
Epoch 4 validation loss: 938254.8535220497
Epoch 5 training:


100%|██████████| 152/152 [00:58<00:00,  2.60it/s]


Epoch 5 training loss: 1005734.5088179739
Epoch 5 validation loss: 922399.107421875
Epoch 6 training:


100%|██████████| 152/152 [00:53<00:00,  2.82it/s]
100%|██████████| 152/152 [00:52<00:00,  3.06it/s]

Epoch 6 training loss: 1001877.4781108656
Epoch 6 validation loss: 925122.3597058748
Epoch 7 training:


100%|██████████| 152/152 [00:54<00:00,  2.77it/s]


KeyboardInterrupt: 

In [48]:
### Sanity Checking Outputs

# Check the outputs of the model on the test set
model.eval()
sample_number = 5
for test_images, test_labels in test_dataloader: 
    outputs = model(test_images.to(device).to(torch.float32))
    print("Intended Output: ", test_labels)
    print("Actual Output: ", outputs)
    break

Intended Output:  tensor([837, 800])
Actual Output:  tensor([[1186.3157],
        [1242.8912]], grad_fn=<AddmmBackward0>)
