# Hands-on Exercise: Processing Legacy Lab Data for ML

## Duration: 90 minutes

### Overview

In this exercise, you'll learn how to: 
1. Clean and standardize heterogeneous lab data 
2. Merge data from multiple sources
3. Handle missing values and inconsistencies
4. Prepare data for ML model training
5. Build a basic ML model for antibody property prediction


### Feature Engineering and ML Preparation

You should start by cleaning the data using the `antibody_data_preprocessing` notebook.

In [3]:
def prepare_for_ml(merged_df):
    """Prepare merged data for ML modeling"""
    # Create feature matrix
    features = [
        'binding_affinity_kd',
        'thermostability_tm1_celsius',
        'asec_monomerpct'
    ]
    
    # Handle missing values
    X = merged_df[features].copy()
    X = X.fillna(X.mean())
    
    # Create a simple target variable (should be customized).
    # Here we're creating a "quality score" combining multiple properties,
    # quantifying whether a given sequence is better than the median
    y = (
        (X['thermostability_tm1_celsius'] > X['thermostability_tm1_celsius'].median()).astype(int) + 
        (X['asec_monomerpct'] > X['asec_monomerpct'].median()).astype(int) +
        (X['binding_affinity_kd'] < X['binding_affinity_kd'].median()).astype(int)
    )
    
    return X, y

### Build an LSTM to predict the target variable from antibody sequence

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from typing import List, Tuple

class AntibodyDataset(Dataset):
    def __init__(self, merged_experimental_df: pd.DataFrame):
        self.data = merged_experimental_df
        #sequences_df.merge(experimental_df, on='Sample ID')

        # Create amino acid vocabulary
        self.aa_vocab = {aa: idx for idx, aa in enumerate('ACDEFGHIKLMNPQRSTVWY')}
'''
        # Standardize experimental features
        self.scaler = StandardScaler()
        self.exp_features = ['KD (nM)', 'Tm1', 'Tm2', '% POI']
        self.exp_data = self.scaler.fit_transform(self.data[self.exp_features])
'''
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Convert sequences to numerical arrays
        vh_tensor = torch.tensor([self.aa_vocab[aa] for aa in row['sequences_hc_sequence']], dtype=torch.long)
        vl_tensor = torch.tensor([self.aa_vocab[aa] for aa in row['sequences_lc_sequence']], dtype=torch.long)

        # Get experimental features
      #  exp_tensor = torch.tensor(self.exp_data[idx], dtype=torch.float)

        # Get target variables
        y = prepare_for_ml(self.data)[1]

        return {
            'antibody_id': row['antibody_id'],
            'vh': vh_tensor,
            'vl': vl_tensor,
            'target': y
        }

class AntibodyPropertyPredictor(nn.Module):
    def __init__(self, vocab_size: int = 20, embedding_dim: int = 32,
                 hidden_dim: int = 64, num_exp_features: int = 4):
        super().__init__()

        # Sequence processing
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.vh_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.vl_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Experimental feature processing
        self.exp_linear = nn.Linear(num_exp_features, hidden_dim)

        # Combined processing
        self.combine_layer = nn.Linear(hidden_dim * 4 + hidden_dim, hidden_dim)

        # Output layers
        self.output_layer = nn.Linear(hidden_dim, 1)

    def forward(self, vh, vl):
        # Process VH sequence
        vh_emb = self.embedding(vh)
        vh_out, (vh_hidden, _) = self.vh_lstm(vh_emb)
        vh_feat = torch.cat((vh_hidden[-2,:,:], vh_hidden[-1,:,:]), dim=1)

        # Process VL sequence
        vl_emb = self.embedding(vl)
        vl_out, (vl_hidden, _) = self.vl_lstm(vl_emb)
        vl_feat = torch.cat((vl_hidden[-2,:,:], vl_hidden[-1,:,:]), dim=1)

        # Process experimental features
       # exp_feat = self.exp_linear(exp_features)

        # Combine all features
        combined = torch.cat([vh_feat, vl_feat], dim=1)
        hidden = F.relu(self.combine_layer(combined))

        # Generate prediction
        output = self.output_layer(hidden)
        return output

def train_model(model: nn.Module, train_loader: DataLoader,
                num_epochs: int = 100, learning_rate: float = 0.001):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    train_losses = []

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()

            # Forward pass
            output = model(batch['vh'], batch['vl'])
            loss = criterion(output, batch['target'].unsqueeze(1))

            # Backward pass
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_loss)

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    return train_losses

def evaluate_model(model: nn.Module, test_loader: DataLoader) -> Tuple[List[float], List[float]]:
    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in test_loader:
            output = model(batch['vh'], batch['vl'], batch['exp_features'])
            predictions.extend(output.squeeze().tolist())
            actuals.extend(batch['target'].tolist())

    return predictions, actuals

ModuleNotFoundError: No module named 'torch'

### Train the model on the example data


In [None]:

def main():
    # Load data
    seq_df = pd.read_csv('antibody_sequences.csv')
    aggregation_df = pd.read_csv('asec_data.csv')
    binding_df = pd.read_csv('binding_data.csv')
    stability_df = pd.read_csv('stability_data.csv')

    aggregation_df = clean_aggregation_data(aggregation_df)
    binding_df = clean_binding_data(binding_df)
    stability_df = clean_stability_data(stability_df)

    integrated_dataset = integrate_datasets(binding_df, stability_df, aggregation_df)
    targets_df = prepare_for_ml(integrated_dataset)[0]

    # Create dataset
    dataset = AntibodyDataset(seq_df, targets_df)

    # Split dataset
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        num_workers=4
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=4
    )

    # Initialize model
    model = AntibodyTransformer(
        vocab_size=20,
        d_model=128,
        nhead=8,
        num_layers=3,
        dropout=0.1
    )

    # Train model
    train_model(model, train_loader, val_loader)

if __name__ == "__main__":
    main()

# Transformer architecture

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from typing import List, Tuple

class AntibodyDataset(Dataset):
    def __init__(self, merged_experimental_df: pd.DataFrame):
        self.data = pd.read_csv(merged_experimental_df)
        #sequences_df.merge(experimental_df, on='Sample ID')

        # Create amino acid vocabulary
        self.aa_vocab = {aa: idx for idx, aa in enumerate('ACDEFGHIKLMNPQRSTVWY')}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Convert sequences to numerical arrays
        vh_tensor = torch.tensor([self.aa_vocab[aa] for aa in row['sequences_hc_sequence']], dtype=torch.long)
        vl_tensor = torch.tensor([self.aa_vocab[aa] for aa in row['sequences_lc_sequence']], dtype=torch.long)

        # Pad the VL to have the same length as the VH
        
        
        # Get experimental features
      #  exp_tensor = torch.tensor(self.exp_data[idx], dtype=torch.float)

        # Get target variables
        #y = prepare_for_ml(self.data)[1]
        targets = self.data[['binding_affinity_kd','thermostability_tm1_celsius','asec_monomerpct']]

        return pad_sequence([vh_tensor, vl_tensor]), torch.tensor(targets.values)

class AntibodyTransformer(nn.Module):
    def __init__(self, vocab_size=20, d_model=128, nhead=8, num_layers=3, dropout=0.1):
        super().__init__()

        # Embedding layer for amino acid sequences
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Positional encoding
        self.pos_encoder = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=512,
            dropout=dropout
        )

        # Transformer encoder
        self.transformer = nn.TransformerEncoder(
            encoder_layer=self.pos_encoder,
            num_layers=num_layers
        )

        # Output layers
        self.fc1 = nn.Linear(d_model * 2, 256)  # *2 because we have VH and VL
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 4)  # 3 outputs: aggregation, KD, Tm1

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: [batch_size, 2, seq_length]
        batch_size = x.shape[0]

        # Process VH and VL sequences separately
        vh = x[:, 0, :]  # [batch_size, seq_length]
        vl = x[:, 1, :]  # [batch_size, seq_length]

        # Embed sequences
        vh_embedded = self.embedding(vh).transpose(0, 1)  # [seq_length, batch_size, d_model]
        vl_embedded = self.embedding(vl).transpose(0, 1)  # [seq_length, batch_size, d_model]

        # Pass through transformer
        vh_encoded = self.transformer(vh_embedded)  # [seq_length, batch_size, d_model]
        vl_encoded = self.transformer(vl_embedded)  # [seq_length, batch_size, d_model]

        # Pool sequence dimension
        vh_pooled = vh_encoded.mean(dim=0)  # [batch_size, d_model]
        vl_pooled = vl_encoded.mean(dim=0)  # [batch_size, d_model]

        # Concatenate VH and VL features
        combined = torch.cat([vh_pooled, vl_pooled], dim=1)

        # Final MLP layers
        x = F.relu(self.fc1(combined))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)

        return x

def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=1e-4):
    """Training loop with validation"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5, verbose=True
    )

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for sequences, targets in train_loader:
            sequences = sequences.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for sequences, targets in val_loader:
                sequences = sequences.to(device)
                targets = targets.to(device)

                outputs = model(sequences)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        # Update learning rate
        scheduler.step(val_loss)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Training Loss: {train_loss/len(train_loader):.4f}')
        print(f'Validation Loss: {val_loss/len(val_loader):.4f}')


In [26]:
def main():
    # Create dataset
    dataset = AntibodyDataset('../data/merged_antibody_data.csv')

    # Split dataset
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=1,
        shuffle=True,
        num_workers=1
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1
    )

    # Initialize model
    model = AntibodyTransformer(
        vocab_size=20,
        d_model=128,
        nhead=8,
        num_layers=3,
        dropout=0.1
    )

    # Train model
    train_model(model, train_loader, val_loader)


In [27]:
main()

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (4) must match the size of tensor b (3) at non-singleton dimension 2

### Exercise Tasks

1.  Data Loading and Assessment
    -   Load all three data files
    -   Run initial quality assessment
    -   Identify key data issues
2.  Data Cleaning
    -   Implement temperature standardization
    -   Clean binding affinity values
    -   Handle missing values
    -   Standardize units
3.  Data Integration
    -   Merge datasets
    -   Handle duplicate measurements
    -   Create final feature matrix
4.  ML Model Development
    -   Create feature matrix
    -   Define target variable
    -   Train simple model
    -   Evaluate results

### Bonus Challenges

1.  Advanced Data Cleaning
    -   Implement outlier detection
    -   Add data validation rules
    -   Create data quality reports
2.  Feature Engineering
    -   Create derived features
    -   Implement domain-specific transformations
    -   Add sequence-based features
3.  Model Improvements
    -   Implement cross-validation
    -   Try different ML algorithms
    -   Add uncertainty quantification

### Tips for Success

1.  Data Quality
    -   Always plot your data
    -   Check for outliers
    -   Validate units
    -   Document assumptions
2.  Integration
    -   Verify sample IDs
    -   Check for duplicates
    -   Validate merged data
3.  ML Development
    -   Start simple
    -   Test assumptions
    -   Validate results
