# Hands-on Exercise: Processing Legacy Lab Data for ML

## Duration: 90 minutes

### Overview

In this exercise, you'll learn how to: 
1. Clean and standardize heterogeneous lab data 
2. Merge data from multiple sources
3. Handle missing values and inconsistencies
4. Prepare data for ML model training
5. Build a basic ML model for antibody property prediction

### Data Loading and Initial Assessment

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

# Load data files
def load_binding_data(file_path):
    """Read the binding data CSV at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

def load_stability_data(file_path):
    """Read the stability data Excel file at ``file_path`` and return it as a DataFrame."""
    return pd.read_excel(file_path)

def load_aggregation_data(file_path):
    """Read the aggregation data CSV at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

# Function to assess data quality
def assess_data_quality(df, dataset_name):
    """Print a short data-quality report for ``df``.

    The report lists the frame's shape, the number of missing values in
    each column, and the number of distinct values in each column.
    ``dataset_name`` is only used to label the report. Nothing is returned.
    """
    print(f"\nAnalyzing {dataset_name}:")
    print(f"Shape: {df.shape}")

    print("\nMissing values:")
    missing_per_column = df.isnull().sum()
    print(missing_per_column)

    print("\nUnique values per column:")
    for column in df.columns:
        distinct_count = df[column].nunique()
        print(f"{column}: {distinct_count} unique values")

### Data Cleaning and Standardization

In [2]:
def clean_binding_data(df):
    """Clean and standardize binding data.

    Returns a copy of ``df`` in which the ``Temperature`` and ``KD (nM)``
    columns are float columns; the input frame is not modified.

    Temperature rules:
      * missing values stay missing (NaN);
      * any text containing ``rt`` or ``room`` (case-insensitive) becomes 25.0;
      * otherwise the first number found in the text is used;
      * text without any number becomes NaN instead of raising.

    KD rules:
      * missing values and ``n.b.`` (any case, surrounding whitespace
        ignored; presumably "no binding") become NaN;
      * a ``<`` marker is dropped and the bound itself is kept;
      * anything else that cannot be parsed as a number becomes NaN, which
        matches the ``errors='coerce'`` behaviour of the other cleaners.
    """
    # Compiled once and shared by every row; '\.?' allows at most one
    # decimal point so inputs such as '25..5' are not swallowed whole.
    number_pattern = re.compile(r'[-+]?\d*\.?\d+')

    # Handle temperature standardization
    def standardize_temp(temp):
        if pd.isna(temp):
            return np.nan
        text = str(temp).lower()
        if 'rt' in text or 'room' in text:
            return 25.0
        match = number_pattern.search(text)
        # No digits at all (e.g. 'unknown'): report as missing, do not crash.
        return float(match.group()) if match else np.nan

    # Clean KD values
    def clean_kd(kd):
        if pd.isna(kd):
            return np.nan
        if isinstance(kd, str):
            text = kd.strip()
            if text.lower() == 'n.b.':
                return np.nan
            # '<0.1' is recorded as its bound, 0.1
            kd = text.replace('<', '')
        try:
            return float(kd)
        except (TypeError, ValueError):
            return np.nan

    df_clean = df.copy()
    # astype(float) pins the dtype even for empty or all-missing columns.
    df_clean['Temperature'] = df_clean['Temperature'].apply(standardize_temp).astype(float)
    df_clean['KD (nM)'] = df_clean['KD (nM)'].apply(clean_kd).astype(float)

    return df_clean

def clean_stability_data(df):
    """Return a copy of ``df`` with the ``Tm1`` and ``Tm2`` columns made numeric.

    Entries that cannot be parsed as numbers are replaced by NaN
    (``errors='coerce'``). The input frame is left untouched.
    """
    cleaned = df.copy()
    for tm_column in ('Tm1', 'Tm2'):
        cleaned[tm_column] = pd.to_numeric(cleaned[tm_column], errors='coerce')
    return cleaned

def clean_aggregation_data(df):
    """Clean and standardize aggregation data.

    Returns a copy of ``df`` in which ``% POI`` and ``% Aggregate`` are
    numeric. String entries may carry surrounding whitespace and a trailing
    percent sign (``'95%'``, ``' 5 % '``); both are removed before
    conversion so such values are kept rather than turned into NaN.
    Entries that still cannot be parsed become NaN. The input frame is not
    modified.
    """
    def to_percent_number(series):
        # Only string entries are touched; numbers and missing values pass
        # through unchanged so pd.to_numeric sees them as-is.
        stripped = series.map(
            lambda value: value.strip().rstrip('%').strip()
            if isinstance(value, str) else value
        )
        return pd.to_numeric(stripped, errors='coerce')

    df_clean = df.copy()
    # Convert percentage strings to floats if necessary
    for column in ('% POI', '% Aggregate'):
        df_clean[column] = to_percent_number(df_clean[column])

    return df_clean


### Data Integration 

In [7]:
def integrate_datasets(binding_df, stability_df, aggregation_df):
    """Merge all datasets on sample ID.

    The sample identifier is read from ``'Sample ID'`` (binding),
    ``'Sample'`` (stability) and ``'ID'`` (aggregation) and exposed under
    the common name ``'Sample_ID'``. Repeated binding measurements of one
    sample are averaged; stability and aggregation rows are merged as
    given, so duplicate IDs in those tables produce duplicate output rows.

    The three inputs are not modified.

    Returns:
        DataFrame with columns ``Sample_ID``, ``KD (nM)``, ``Tm1``, ``Tm2``,
        ``% POI`` and ``% Aggregate``. Outer joins are used, so a sample
        present in only some tables is kept with NaN for what is missing.
    """
    # Standardize sample ID columns. assign() returns new frames, so the
    # caller's DataFrames do not silently gain a 'Sample_ID' column.
    binding = binding_df.assign(Sample_ID=binding_df['Sample ID'])
    stability = stability_df.assign(Sample_ID=stability_df['Sample'])
    aggregation = aggregation_df.assign(Sample_ID=aggregation_df['ID'])

    # Get mean values for repeated measurements
    binding_summary = binding.groupby('Sample_ID')['KD (nM)'].mean().reset_index()

    # Merge all datasets
    merged_df = binding_summary.merge(
        stability[['Sample_ID', 'Tm1', 'Tm2']],
        on='Sample_ID',
        how='outer'
    ).merge(
        aggregation[['Sample_ID', '% POI', '% Aggregate']],
        on='Sample_ID',
        how='outer'
    )

    return merged_df

### Feature Engineering and ML Preparation

In [3]:
def prepare_for_ml(merged_df):
    """Build the feature matrix and a simple quality-score target.

    Features are the ``binding_affinity_kd``, ``thermostability_tm1_celsius``
    and ``asec_monomerpct`` columns of ``merged_df``, with missing values
    replaced by the column mean.

    The target is an integer from 0 to 3: one point each for a Tm1 above
    the median, a monomer percentage above the median, and a KD below the
    median. Medians are taken over the already-imputed features.

    Returns:
        Tuple ``(X, y)`` of the feature DataFrame and the target Series.
    """
    kd_col = 'binding_affinity_kd'
    tm_col = 'thermostability_tm1_celsius'
    monomer_col = 'asec_monomerpct'

    # Select the features on a copy, then impute gaps with column means
    X = merged_df[[kd_col, tm_col, monomer_col]].copy()
    column_means = X.mean()
    X = X.fillna(column_means)

    # One point per property that beats the median
    stable = X[tm_col] > X[tm_col].median()
    monomeric = X[monomer_col] > X[monomer_col].median()
    tight_binder = X[kd_col] < X[kd_col].median()

    y = stable.astype(int) + monomeric.astype(int) + tight_binder.astype(int)

    return X, y

### Build an LSTM to predict the target variable from antibody sequence

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from typing import List, Tuple

class AntibodyDataset(Dataset):
    """Dataset of index-encoded antibody chains with a quality-score target.

    Each item is a dict with:
      * ``antibody_id`` - value of the row's ``antibody_id`` column;
      * ``vh`` - 1-D long tensor encoding ``sequences_hc_sequence``;
      * ``vl`` - 1-D long tensor encoding ``sequences_lc_sequence``;
      * ``target`` - scalar float tensor, the row's score from
        ``prepare_for_ml``.

    ``merged_experimental_df`` must therefore also contain the feature
    columns that ``prepare_for_ml`` reads.

    NOTE: sequence tensors keep their natural length. DataLoader's default
    collation stacks tensors, so batches larger than one need equal-length
    sequences or a custom ``collate_fn``.
    """

    def __init__(self, merged_experimental_df: pd.DataFrame):
        self.data = merged_experimental_df
        #sequences_df.merge(experimental_df, on='Sample ID')

        # Create amino acid vocabulary
        self.aa_vocab = {aa: idx for idx, aa in enumerate('ACDEFGHIKLMNPQRSTVWY')}

        # Targets depend on dataset-wide medians, so compute them once here
        # rather than on every __getitem__ call. Stored as float because the
        # training loop uses a regression loss.
        _, y = prepare_for_ml(self.data)
        self.targets = torch.tensor(y.to_numpy(), dtype=torch.float)

        # Scaling of experimental features is currently disabled. It is kept
        # as '#' comments: the previous column-0 triple-quoted string ended
        # the class body and made the methods below a syntax error.
        # self.scaler = StandardScaler()
        # self.exp_features = ['KD (nM)', 'Tm1', 'Tm2', '% POI']
        # self.exp_data = self.scaler.fit_transform(self.data[self.exp_features])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sequences and target for row ``idx``.

        Raises:
            KeyError: if a sequence contains a character outside the
                20-letter amino acid vocabulary.
        """
        row = self.data.iloc[idx]

        # Convert sequences to numerical arrays
        vh_tensor = torch.tensor([self.aa_vocab[aa] for aa in row['sequences_hc_sequence']], dtype=torch.long)
        vl_tensor = torch.tensor([self.aa_vocab[aa] for aa in row['sequences_lc_sequence']], dtype=torch.long)

        # Get experimental features (disabled together with the scaler)
        # exp_tensor = torch.tensor(self.exp_data[idx], dtype=torch.float)

        return {
            'antibody_id': row['antibody_id'],
            'vh': vh_tensor,
            'vl': vl_tensor,
            # This row's own score, not the whole target Series
            'target': self.targets[idx]
        }

class AntibodyPropertyPredictor(nn.Module):
    """Bidirectional-LSTM regressor over heavy and light chain sequences.

    Both chains share one embedding table and are summarised by separate
    bidirectional LSTMs. The two summaries are concatenated with a
    projection of optional experimental features and mapped to a single
    output value per sample.

    Args:
        vocab_size: number of distinct token indices.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_exp_features: width of the optional experimental feature vector.
    """

    def __init__(self, vocab_size: int = 20, embedding_dim: int = 32,
                 hidden_dim: int = 64, num_exp_features: int = 4):
        super().__init__()

        # Sequence processing
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.vh_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.vl_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Experimental feature processing
        self.exp_linear = nn.Linear(num_exp_features, hidden_dim)

        # Combined processing: 2 chains x 2 directions x hidden_dim from the
        # LSTMs, plus hidden_dim from the experimental branch.
        self.combine_layer = nn.Linear(hidden_dim * 4 + hidden_dim, hidden_dim)

        # Output layers
        self.output_layer = nn.Linear(hidden_dim, 1)

    def _encode_chain(self, tokens, lstm):
        """Return the concatenated final forward/backward hidden states."""
        embedded = self.embedding(tokens)
        _, (hidden, _) = lstm(embedded)
        return torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

    def forward(self, vh, vl, exp_features=None):
        """Predict one value per sample.

        Args:
            vh: long tensor of heavy-chain token indices, ``(batch, length)``.
            vl: long tensor of light-chain token indices, ``(batch, length)``.
            exp_features: optional float tensor ``(batch, num_exp_features)``.
                When omitted, the experimental branch contributes zeros so
                the combined vector still matches ``combine_layer``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        # Process VH and VL sequences
        vh_feat = self._encode_chain(vh, self.vh_lstm)
        vl_feat = self._encode_chain(vl, self.vl_lstm)

        # Process experimental features
        if exp_features is None:
            # new_zeros inherits dtype and device from the sequence features
            exp_feat = vh_feat.new_zeros(vh_feat.size(0), self.exp_linear.out_features)
        else:
            exp_feat = self.exp_linear(exp_features)

        # Combine all features
        combined = torch.cat([vh_feat, vl_feat, exp_feat], dim=1)
        hidden = F.relu(self.combine_layer(combined))

        # Generate prediction
        output = self.output_layer(hidden)
        return output

def train_model(model: nn.Module, train_loader: DataLoader,
                num_epochs: int = 100, learning_rate: float = 0.001):
    """Train ``model`` with Adam and mean-squared-error loss.

    Each batch from ``train_loader`` must be a mapping with tensors under
    ``'vh'``, ``'vl'`` and ``'target'``. The model is called as
    ``model(vh, vl)``. Training runs on CUDA when available, otherwise CPU.
    Progress is printed every 10 epochs.

    Args:
        model: network to train; it is moved to the selected device.
        train_loader: iterable of training batches.
        num_epochs: number of passes over ``train_loader``.
        learning_rate: Adam learning rate.

    Returns:
        List with the average batch loss of every epoch.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    train_losses = []

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        num_batches = 0

        for batch in train_loader:
            # Inputs must be on the same device as the model.
            vh = batch['vh'].to(device)
            vl = batch['vl'].to(device)
            # MSELoss needs float targets shaped like the (batch, 1) output;
            # reshape also accepts targets that are already (batch, 1).
            target = batch['target'].to(device=device, dtype=torch.float).reshape(-1, 1)

            optimizer.zero_grad()

            # Forward pass
            output = model(vh, vl)
            loss = criterion(output, target)

            # Backward pass
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError('train_loader yielded no batches')

        avg_loss = epoch_loss / num_batches
        train_losses.append(avg_loss)

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    return train_losses

def evaluate_model(model: nn.Module, test_loader: DataLoader) -> Tuple[List[float], List[float]]:
    """Run ``model`` over ``test_loader`` without gradient tracking.

    Each batch must be a mapping with tensors under ``'vh'``, ``'vl'`` and
    ``'target'``. The model is called as ``model(vh, vl)``, the same way
    the training loop calls it.

    Returns:
        Tuple ``(predictions, actuals)`` of flat lists with one entry per
        sample, in loader order.
    """
    model.eval()

    # Run on whatever device the model already lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in test_loader:
            output = model(batch['vh'].to(device), batch['vl'].to(device))
            # reshape(-1) always gives a list; squeeze() would collapse a
            # batch of one to a bare float and break extend().
            predictions.extend(output.reshape(-1).tolist())
            actuals.extend(batch['target'].reshape(-1).float().tolist())

    return predictions, actuals

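> **Note:** the cell above requires PyTorch. The error `ModuleNotFoundError: No module named 'torch'` means PyTorch was not installed in the environment that ran this notebook; install it (for example with `pip install torch`) and re-run the cell.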

### Train the model on the example data


In [None]:

def main():
    """Run the exercise end to end: load, clean, merge, train and evaluate.

    Reads the four CSV files from the working directory, cleans and merges
    the experimental tables, joins them with the sequences, trains the LSTM
    predictor on 80% of the samples and prints the mean squared error on
    the remaining 20%.

    Raises:
        ValueError: if there are too few merged samples to train on.
    """
    # Load data
    seq_df = pd.read_csv('antibody_sequences.csv')
    aggregation_df = pd.read_csv('asec_data.csv')
    binding_df = pd.read_csv('binding_data.csv')
    stability_df = pd.read_csv('stability_data.csv')

    aggregation_df = clean_aggregation_data(aggregation_df)
    binding_df = clean_binding_data(binding_df)
    stability_df = clean_stability_data(stability_df)

    integrated_dataset = integrate_datasets(binding_df, stability_df, aggregation_df)

    # AntibodyDataset and prepare_for_ml read a different column naming than
    # integrate_datasets produces, so translate before joining.
    # NOTE(review): this mapping (in particular '% POI' -> monomer percent
    # and 'Sample_ID' -> 'antibody_id') is an assumption - confirm against
    # the real data files.
    experimental_df = integrated_dataset.rename(columns={
        'Sample_ID': 'antibody_id',
        'KD (nM)': 'binding_affinity_kd',
        'Tm1': 'thermostability_tm1_celsius',
        '% POI': 'asec_monomerpct',
    })

    # NOTE(review): assumes antibody_sequences.csv has the columns
    # 'antibody_id', 'sequences_hc_sequence' and 'sequences_lc_sequence',
    # which AntibodyDataset reads - verify.
    merged_df = seq_df.merge(experimental_df, on='antibody_id', how='inner')

    # Create dataset (it takes one merged frame and derives the targets itself)
    dataset = AntibodyDataset(merged_df)

    # Split dataset
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    if train_size == 0:
        raise ValueError(
            'Not enough merged samples to train on; check that sequence and '
            'experimental sample IDs match.'
        )
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )

    # Create data loaders
    # NOTE(review): default collation stacks tensors, so batch_size > 1
    # requires equal-length sequences or a custom collate_fn.
    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        num_workers=4
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=4
    )

    # Initialize model: the LSTM predictor defined in this notebook
    # (no AntibodyTransformer class exists here).
    model = AntibodyPropertyPredictor(vocab_size=20)

    # Train model. The third positional parameter of train_model is
    # num_epochs, so the validation loader must not be passed to it.
    train_model(model, train_loader)

    # Evaluate on the held-out split
    predictions, actuals = evaluate_model(model, val_loader)
    if predictions:
        val_mse = sum((p - a) ** 2 for p, a in zip(predictions, actuals)) / len(predictions)
        print(f'Validation MSE: {val_mse:.4f}')

if __name__ == "__main__":
    main()

### Exercise Tasks

1.  Data Loading and Assessment
    -   Load all three data files
    -   Run initial quality assessment
    -   Identify key data issues
2.  Data Cleaning
    -   Implement temperature standardization
    -   Clean binding affinity values
    -   Handle missing values
    -   Standardize units
3.  Data Integration
    -   Merge datasets
    -   Handle duplicate measurements
    -   Create final feature matrix
4.  ML Model Development
    -   Create feature matrix
    -   Define target variable
    -   Train a simple model
    -   Evaluate results

### Bonus Challenges

1.  Advanced Data Cleaning
    -   Implement outlier detection
    -   Add data validation rules
    -   Create data quality reports
2.  Feature Engineering
    -   Create derived features
    -   Implement domain-specific transformations
    -   Add sequence-based features
3.  Model Improvements
    -   Implement cross-validation
    -   Try different ML algorithms
    -   Add uncertainty quantification

### Tips for Success

1.  Data Quality
    -   Always plot your data
    -   Check for outliers
    -   Validate units
    -   Document assumptions
2.  Integration
    -   Verify sample IDs
    -   Check for duplicates
    -   Validate merged data
3.  ML Development
    -   Start simple
    -   Test assumptions
    -   Validate results
