# MoA Prediction: Data Exploration and Processing

This notebook demonstrates the data collection and processing pipeline for the MoA prediction framework.

In [5]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from moa.utils.config import Config
from moa.data.collectors import DataCollectorFactory
from moa.data.processors import DataProcessor
from moa.data.validators import DataValidator

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 1. Configuration Setup

In [7]:
# Load configuration
config = Config('../configs/config.yaml')

# Display key configuration settings
print("Configuration Summary:")
print(f"Prediction type: {config.get('scope.prediction_type')}")
print(f"Enabled modalities: {config.get('scope.modalities')}")
print(f"ChEMBL version: {config.get('chembl.version')}")
print(f"Data benchmarks: {config.get('data.benchmarks')}")

Configuration Summary:
Prediction type: multi_label
Enabled modalities: {'chemistry': True, 'perturbation': True, 'targets': True, 'pathways': True, 'structures': False}
ChEMBL version: None
Data benchmarks: ['chembl', 'drugbank', 'lincs_l1000']


## 2. Data Collection Demo

**Note**: This is a demonstration. In practice, you would run the data collection scripts to download full datasets.

In [8]:
# Create a small sample dataset for demonstration
sample_data = {
    'molecule_chembl_id': ['CHEMBL1', 'CHEMBL2', 'CHEMBL3', 'CHEMBL4', 'CHEMBL5'],
    'canonical_smiles': [
        'CCO',  # Ethanol
        'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
        'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',  # Ibuprofen
        'CN(C)CCOC1=CC=C(C=C1)C(C2=CC=CC=C2)C3=CC=CC=C3'  # Diphenhydramine
    ],
    'mechanism_of_action': [
        'CNS depressant',
        'Cyclooxygenase inhibitor',
        'Adenosine receptor antagonist',
        'Cyclooxygenase inhibitor',
        'Histamine H1 receptor antagonist'
    ],
    'target_chembl_id': ['CHEMBL1', 'CHEMBL230', 'CHEMBL1824', 'CHEMBL230', 'CHEMBL231']
}

sample_df = pd.DataFrame(sample_data)
print("Sample dataset:")
print(sample_df)

Sample dataset:
  molecule_chembl_id                                canonical_smiles  \
0            CHEMBL1                                             CCO   
1            CHEMBL2                        CC(=O)OC1=CC=CC=C1C(=O)O   
2            CHEMBL3                    CN1C=NC2=C1C(=O)N(C(=O)N2C)C   
3            CHEMBL4                   CC(C)CC1=CC=C(C=C1)C(C)C(=O)O   
4            CHEMBL5  CN(C)CCOC1=CC=C(C=C1)C(C2=CC=CC=C2)C3=CC=CC=C3   

                mechanism_of_action target_chembl_id  
0                    CNS depressant          CHEMBL1  
1          Cyclooxygenase inhibitor        CHEMBL230  
2     Adenosine receptor antagonist       CHEMBL1824  
3          Cyclooxygenase inhibitor        CHEMBL230  
4  Histamine H1 receptor antagonist        CHEMBL231  


## 3. Data Processing Pipeline

In [9]:
# Initialize data processor
processor = DataProcessor(config)

# Process SMILES
print("Processing SMILES...")
processed_df = processor.smiles_processor.process_smiles_column(sample_df)
print(f"Processed {len(processed_df)} compounds")
print(processed_df[['canonical_smiles', 'standardized_smiles']].head())

Processing SMILES...
Processed 0 compounds
Empty DataFrame
Columns: [canonical_smiles, standardized_smiles]
Index: []


In [10]:
# Process labels
print("Processing MoA labels...")
processed_df = processor.label_processor.process_moa_labels(processed_df)

# Show label columns
label_cols = [col for col in processed_df.columns if col.startswith('moa_')]
print(f"Created {len(label_cols)} label columns:")
print(label_cols[:10])  # Show first 10

if len(label_cols) > 0:
    print("\nLabel matrix:")
    print(processed_df[label_cols].head())

Processing MoA labels...
Created 3 label columns:
['moa_cleaned', 'moa_list', 'moa_classes']

Label matrix:
Empty DataFrame
Columns: [moa_cleaned, moa_list, moa_classes]
Index: []


## 4. Data Validation

In [17]:
# Initialize validator
validator = DataValidator(config)

# Run validation checks
print("Running validation checks...")

# SMILES validation
smiles_results = validator.validate_smiles(processed_df)
print("\nSMILES validation:")
for check, passed in smiles_results.items():
    status = "✓" if passed else "✗"
    print(f"  {status} {check}")

# Label validation
label_results = validator.validate_labels(processed_df)
print("\nLabel validation:")
for check, passed in label_results.items():
    status = "✓" if passed else "✗"
    print(f"  {status} {check}")

# Dataset size validation
size_results = validator.validate_dataset_size(processed_df)
print("\nDataset size validation:")
for check, passed in size_results.items():
    status = "✓" if passed else "✗"
    print(f"  {status} {check}")

Running validation checks...

SMILES validation:
  ✓ smiles_column_exists
  ✓ no_missing_smiles
  ✓ all_smiles_valid
  ✓ no_duplicate_smiles

Label validation:
  ✓ label_columns_exist
  ✓ no_missing_labels
  ✗ sufficient_samples_per_label
  ✓ no_empty_labels


TypeError: '>=' not supported between instances of 'int' and 'NoneType'

## 5. Data Visualization

In [16]:
# Visualize MoA distribution
if 'moa_cleaned' in processed_df.columns:
    plt.figure(figsize=(10, 6))
    moa_counts = processed_df['moa_cleaned'].value_counts()
    moa_counts.plot(kind='bar')
    plt.title('Distribution of Mechanisms of Action')
    plt.xlabel('Mechanism of Action')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Molecular weight distribution (if RDKit is available)
try:
    from rdkit import Chem
    from rdkit.Chem import Descriptors
    
    def get_mol_weight(smiles):
        try:
            mol = Chem.MolFromSmiles(smiles)
            return Descriptors.MolWt(mol) if mol else None
        except:
            return None
    
    processed_df['mol_weight'] = processed_df['standardized_smiles'].apply(get_mol_weight)
    
    plt.figure(figsize=(10, 6))
    processed_df['mol_weight'].dropna().hist(bins=20, alpha=0.7)
    plt.title('Distribution of Molecular Weights')
    plt.xlabel('Molecular Weight (Da)')
    plt.ylabel('Count')
    plt.show()
    
except ImportError:
    print("RDKit not available for molecular property calculations")

IndexError: index 0 is out of bounds for axis 0 with size 0

## 6. Next Steps

This notebook demonstrated the basic data processing pipeline. Next steps include:

1. **Full Data Collection**: Run the data collection scripts to download complete datasets from ChEMBL, LINCS, and other sources
2. **Feature Engineering**: Implement chemical graph features, mechanism tokens, and perturbational biology features
3. **Model Development**: Build the multi-modal architecture with hypergraph fusion
4. **Training Pipeline**: Implement the training loop with multiple objectives
5. **Evaluation**: Comprehensive evaluation on multiple splits and metrics

### Commands to run full pipeline:

```bash
# Download data
python scripts/download_data.py --sources chembl reactome

# Process data
python scripts/process_data.py --create-splits --validate

# Or use the CLI
moa-data collect --source chembl
moa-data process --create-splits
```