# Data Exploration and Initial Setup

This notebook explores the key datasets for the hybrid GNN-RNN framework and sets up a data preprocessing pipeline optimized for an M1 MacBook Pro with 16 GB of RAM.

## Objectives
1. Explore temporal data (GSE175634) for RNN training
2. Examine spatial transcriptomics data for GNN training
3. Assess data quality and memory requirements
4. Set up memory-optimized preprocessing pipeline

In [None]:
import os
import sys
import yaml
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as adata
import warnings
warnings.filterwarnings('ignore')

# Memory optimization for M1 Mac
import psutil
import gc
from memory_profiler import profile

# Configure scanpy's global settings for this notebook session.
# NOTE(review): these look like logging / plotting settings rather than memory
# controls — verbosity sets how much scanpy logs (3 is presumably the "hint"
# level; confirm against the scanpy docs) and set_figure_params only changes
# how figures are rendered.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Report the host's memory / CPU budget before any data is loaded
def check_system_resources():
    """Print available RAM, total RAM, the CPU count and the platform name.

    RAM figures are converted from bytes by dividing by 1024**3 and shown with
    two decimals. Nothing is returned; the report goes to stdout.
    """
    bytes_per_unit = 1024**3
    vm_stats = psutil.virtual_memory()
    n_cpus = psutil.cpu_count()

    report_lines = [
        f"Available RAM: {vm_stats.available / bytes_per_unit:.2f} GB",
        f"Total RAM: {vm_stats.total / bytes_per_unit:.2f} GB",
        f"CPU cores: {n_cpus}",
        f"Platform: {sys.platform}",
    ]
    for line in report_lines:
        print(line)

check_system_resources()

In [None]:
# Load dataset catalog
# The path is relative to the notebook's working directory. The encoding is
# given explicitly so parsing does not depend on the platform's default locale.
with open('../data_catalog/datasets.yaml', 'r', encoding='utf-8') as f:
    datasets_config = yaml.safe_load(f)

# List every catalog entry, grouped by category.
print("Available datasets:")
for category, datasets in datasets_config['datasets'].items():
    print(f"\n{category.upper()}:")
    # An empty category parses to None in YAML; treat it as "no datasets".
    for name, info in (datasets or {}).items():
        info = info or {}
        # Fall back to the catalog key / a placeholder so that one incomplete
        # entry does not abort the whole listing with a KeyError.
        print(f"  - {info.get('name', name)}")
        print(f"    Priority: {info.get('priority', 'n/a')}")
        print(f"    Platform: {info.get('platform', 'n/a')}")

## 1. Temporal Data Exploration (GSE175634)

This is our primary dataset for RNN training, with 7 timepoints and more than 230K cells.

In [None]:
# Locate the temporal dataset; later cells reuse `temporal_path`
temporal_path = "../data/selected_datasets/temporal_data/"

# Read the tab-separated, gzip-compressed experimental design table
exp_design = pd.read_csv(
    os.path.join(temporal_path, "GSE175634_experimental_design.txt.gz"),
    sep='\t',
)

# Shape, column names and a 10-row preview of the design table
print(
    "Experimental Design Overview:",
    f"Shape: {exp_design.shape}",
    sep="\n",
)
print("\nColumns:", exp_design.columns.tolist())
print("\nFirst 10 rows:")
print(exp_design.head(10))

# Count rows per value of the 'Day' column, ordered by that value
timepoint_counts = exp_design['Day'].value_counts().sort_index()
print("\nTimepoint distribution:")
print(timepoint_counts)

In [None]:
# Preview the cell metadata: only the first rows are parsed to keep memory low
cell_metadata = pd.read_csv(
    os.path.join(temporal_path, "GSE175634_cell_metadata.tsv.gz"),
    sep='\t',
    nrows=1000,  # exploration sample, not the whole table
)

print("Cell Metadata Overview:")
print(f"Shape (first 1000 rows): {cell_metadata.shape}")
print("\nColumns:", cell_metadata.columns.tolist())
print("\nCell types available:")
print(cell_metadata['type'].value_counts())

# Report the dtype of each expected analysis column that is actually present
print("\nKey columns for analysis:")
for col in ['diffday', 'type', 'dpt_pseudotime', 'S.Score', 'G2M.Score']:
    if col not in cell_metadata.columns:
        continue
    print(f"  - {col}: {cell_metadata[col].dtype}")

In [None]:
# Gene index exploration
# The gene index is read in full because the cardiac-marker lookup below has to
# see every gene; searching only the preview rows would almost always report
# "none found". NOTE(review): this assumes the index holds roughly one row per
# gene and is therefore small next to the count matrix — confirm the file size.
gene_index_full = pd.read_csv(
    os.path.join(temporal_path, "GSE175634_gene_indices_counts.tsv.gz"),
    sep='\t'
)
gene_indices = gene_index_full.head(100)  # First 100 genes for exploration

print("Gene Information:")
print(f"Shape (first 100 genes): {gene_indices.shape}")
print(f"Columns: {gene_indices.columns.tolist()}")
print("\nSample genes:")
print(gene_indices.head())

# Check for key cardiac genes across the whole index, not just the preview
cardiac_genes = ['TNNT2', 'MYH6', 'MYH7', 'NKX2-5', 'GATA4', 'TBX5']
if 'gene_name' in gene_index_full.columns:
    found_genes = gene_index_full[gene_index_full['gene_name'].isin(cardiac_genes)]
    print(f"\nCardiac genes found: {found_genes['gene_name'].tolist()}")
    # Also name the markers that are absent so a gap is visible at a glance
    missing_genes = sorted(set(cardiac_genes) - set(found_genes['gene_name']))
    if missing_genes:
        print(f"Cardiac genes missing: {missing_genes}")
else:
    print("\nNo 'gene_name' column in the gene index; skipping cardiac marker check")

## 2. Spatial Transcriptomics Data Exploration

Exploring spatial datasets for GNN training.

In [None]:
# Explore spatial data structure
# Directory for each spatial dataset, keyed by a short name reused in later cells
spatial_paths = {
    'space_ranger': "../data/selected_datasets/spatial_transcriptomics/Spatial Gene Expression dataset analyzed using Space Ranger 1.1.0/",
    'mi_rna': "../data/selected_datasets/spatial_transcriptomics/All-snRNA-Spatial multi-omic map of human myocardial infarction/",
    'xenium': "../data/selected_datasets/spatial_transcriptomics/In Situ Gene Expression dataset analyzed using Xenium Onboard Analysis 1.9.0/"
}

for name, path in spatial_paths.items():
    print(f"\n{name.upper()} Dataset:")
    # isdir (rather than exists) so that a stray regular file at this location
    # is reported as missing instead of making os.listdir raise.
    if os.path.isdir(path):
        # os.listdir returns entries in arbitrary order; sort them so the
        # five-file preview below is reproducible between runs.
        files = sorted(os.listdir(path))
        key_files = [f for f in files if f.endswith(('.h5', '.csv', '.tsv', '.mtx'))]
        print(f"  Files found: {len(files)}")
        print(f"  Key files: {key_files[:5]}")
    else:
        print(f"  Path not found: {path}")

In [None]:
# Inspect the Space Ranger output directory; only a small CSV is actually read
space_ranger_path = spatial_paths['space_ranger']

if os.path.exists(space_ranger_path):
    # Split the directory entries by the two extensions of interest
    h5_files = list(filter(lambda entry: entry.endswith('.h5'), os.listdir(space_ranger_path)))
    csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(space_ranger_path)))

    print(
        "Space Ranger Files:",
        f"  H5 files: {h5_files}",
        f"  CSV files: {csv_files}",
        sep="\n",
    )

    # Show the first CSV whose name mentions "metrics", when there is one
    metrics_files = [csv_name for csv_name in csv_files if 'metrics' in csv_name.lower()]
    if metrics_files:
        metrics = pd.read_csv(os.path.join(space_ranger_path, metrics_files[0]))
        print("\nMetrics Summary:")
        print(metrics.head())

## 3. Memory Optimization Strategy

Setting up memory-efficient data loading and processing strategies.

In [None]:
# Memory optimization utilities
class MemoryOptimizer:
    """Helpers that keep AnnData-style objects within a cell / gene budget.

    ``max_cells`` and ``max_genes`` are the default upper bounds that
    ``subsample_data`` applies when no explicit budget is passed.
    """

    def __init__(self, max_cells=10000, max_genes=5000):
        self.max_cells = max_cells
        self.max_genes = max_genes

    def subsample_data(self, adata, n_cells=None, n_genes=None):
        """Return ``adata`` reduced to at most ``n_cells`` cells and ``n_genes`` genes.

        Args:
            adata: Object exposing ``n_obs``, ``n_vars`` and ``var`` (an AnnData
                in practice).
            n_cells: Cell budget; defaults to ``min(self.max_cells, adata.n_obs)``.
            n_genes: Gene budget; defaults to ``min(self.max_genes, adata.n_vars)``.

        Returns:
            ``adata`` itself when it already fits both budgets. Otherwise a
            subsampled copy (too many cells) and/or the ``adata[:, mask]``
            selection of highly variable genes (too many genes).
        """
        if n_cells is None:
            n_cells = min(self.max_cells, adata.n_obs)
        if n_genes is None:
            n_genes = min(self.max_genes, adata.n_vars)

        # Subsample cells
        if adata.n_obs > n_cells:
            # copy=True draws the subsample into a new object. The previous
            # in-place call silently shrank the caller's data while the gene
            # step below returned a different object, so the two reductions
            # disagreed about what was being modified.
            adata = sc.pp.subsample(adata, n_obs=n_cells, copy=True)

        # Select highly variable genes
        if adata.n_vars > n_genes:
            # NOTE(review): this call annotates ``adata.var`` (the code reads
            # ``var.highly_variable`` right after it); when no cell subsampling
            # happened above, that is the caller's object. scanpy's default
            # HVG flavor presumably expects log-normalized input — confirm.
            sc.pp.highly_variable_genes(adata, n_top_genes=n_genes)
            adata = adata[:, adata.var.highly_variable]

        return adata

    def memory_efficient_load(self, path, backed=True):
        """Load an h5ad file, in read-only backed mode when possible.

        Args:
            path: ``str`` or ``os.PathLike`` location of the file.
            backed: When true and ``path`` ends with ``.h5ad``, the file is
                opened with ``backed='r'``; otherwise it is read normally.

        Returns:
            The loaded object, or ``None`` when loading failed (the error is
            printed rather than raised).
        """
        try:
            # Accept pathlib.Path as well as str: a Path has no ``endswith``,
            # which previously turned every Path input into a reported failure.
            path = os.fspath(path)
            if backed and path.endswith('.h5ad'):
                return sc.read_h5ad(path, backed='r')
            else:
                return sc.read_h5ad(path)
        except Exception as e:
            # Deliberately best-effort: report the problem and let the caller
            # decide what to do with a None result.
            print(f"Error loading {path}: {e}")
            return None

# Build the optimizer from the limits declared in the dataset catalog
memory_limits = datasets_config['memory_optimization']
memory_opt = MemoryOptimizer(
    max_cells=memory_limits['max_cells_per_batch'],
    max_genes=memory_limits['max_genes_per_analysis'],
)

# Echo the limits that are now in effect
print(
    "Memory optimization strategy:",
    f"  Max cells per batch: {memory_opt.max_cells}",
    f"  Max genes per analysis: {memory_opt.max_genes}",
    sep="\n",
)

In [None]:
# Test memory usage with sample data
def estimate_memory_usage(n_cells, n_genes, dtype='float32'):
    """Size of a dense ``n_cells`` x ``n_genes`` array of ``dtype``.

    The element size comes from ``np.dtype(dtype).itemsize``; the byte total is
    divided by 1024**3 before being returned.
    """
    divisor = 1024**3
    element_size = np.dtype(dtype).itemsize
    total_bytes = n_cells * n_genes * element_size
    return total_bytes / divisor

# Dense-matrix footprint for a range of (cells, genes) sizes
scenarios = [
    (10000, 5000, "Small batch"),
    (50000, 10000, "Medium batch"),
    (100000, 20000, "Large batch"),
    (230787, 25000, "Full GSE175634"),
]

print("Memory usage estimates:")
for scenario in scenarios:
    n_cells, n_genes, desc = scenario
    memory_gb = estimate_memory_usage(n_cells, n_genes)
    print(f"  {desc}: {n_cells} cells × {n_genes} genes = {memory_gb:.2f} GB")

## 4. Data Quality Assessment

Quick quality assessment of key datasets to inform preprocessing strategies.

In [None]:
# Quality control function
def quick_qc_assessment(metadata_df, qc_config):
    """Summarise label counts and missing values of a metadata table.

    Args:
        metadata_df: DataFrame to inspect.
        qc_config: Accepted for interface compatibility; not read here.

    Returns:
        dict with ``'missing_data'`` (null counts for the columns that have at
        least one null) and, when a ``'type'`` column exists, ``'cell_types'``
        (value counts of that column). Both summaries are also printed; the
        missing-data one only when it is non-empty.
    """
    summary = {}

    # Label distribution is only available when the table carries a 'type' column
    if 'type' in metadata_df.columns:
        summary['cell_types'] = metadata_df['type'].value_counts()
        print("Cell type distribution:")
        print(summary['cell_types'])

    # Keep only the columns that actually contain nulls
    null_counts = metadata_df.isnull().sum()
    summary['missing_data'] = null_counts[null_counts > 0]

    if not summary['missing_data'].empty:
        print("\nMissing data columns:")
        print(summary['missing_data'])

    return summary

# Load cell metadata for QC in chunks, keeping only the columns QC needs
print("Loading full cell metadata for QC assessment...")

try:
    # Load in chunks to manage memory
    chunk_size = 50000
    max_chunks = 5  # Limit to first 5 chunks for exploration
    qc_columns = ['cell', 'diffday', 'type', 'dpt_pseudotime']
    metadata_chunks = []

    # usecols: parse only the needed columns instead of materialising every
    # column of each chunk and discarding most of them afterwards.
    # with: the chunk reader holds the gzip file open; closing it explicitly
    # matters because the loop stops before the reader is exhausted.
    with pd.read_csv(
        os.path.join(temporal_path, "GSE175634_cell_metadata.tsv.gz"),
        sep='\t',
        usecols=qc_columns,
        chunksize=chunk_size
    ) as reader:
        for chunk in reader:
            # usecols keeps the file's column order; re-select to get the
            # order listed in qc_columns.
            metadata_chunks.append(chunk[qc_columns])
            if len(metadata_chunks) >= max_chunks:
                break

    sample_metadata = pd.concat(metadata_chunks, ignore_index=True)
    print(f"Loaded sample metadata: {sample_metadata.shape}")

    # Perform QC assessment
    qc_results = quick_qc_assessment(sample_metadata, datasets_config['qc_thresholds'])

except Exception as e:
    # Deliberately best-effort: a missing file or column should not stop the
    # rest of the exploration notebook.
    print(f"Error during QC assessment: {e}")

## 5. Next Steps

Based on this exploration, we'll proceed with:

1. **Memory-optimized preprocessing** using the strategies defined above
2. **Batch processing** for large datasets (GSE175634)
3. **Subsampling strategies** for model development phase
4. **Quality control pipelines** for all datasets

The next notebook will implement the preprocessing pipeline for each dataset type.

In [None]:
# Save exploration results for next notebook
import json

# The timepoint summary only exists if the experimental-design cell ran.
# Keys and counts are coerced to plain str / int so json.dump never has to deal
# with pandas or numpy scalar types.
if 'timepoint_counts' in globals():
    timepoints_summary = {
        str(day): int(count) for day, count in timepoint_counts.items()
    }
else:
    timepoints_summary = {}

exploration_results = {
    'temporal_data': {
        'timepoints': timepoints_summary,
        'estimated_cells': 230787,
        'key_columns': ['cell', 'diffday', 'type', 'dpt_pseudotime']
    },
    'memory_strategy': {
        'max_cells_per_batch': memory_opt.max_cells,
        'max_genes_per_analysis': memory_opt.max_genes,
        'use_subsampling': True
    },
    'next_steps': [
        'Implement preprocessing pipeline',
        'Create memory-efficient data loaders',
        'Develop GNN and RNN model architectures',
        'Set up training pipeline'
    ]
}

# Save to file
results_path = '../experiments/exploration_results.json'
# Create the output directory on first use; without this the write fails on a
# fresh checkout that has no experiments/ folder yet.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(exploration_results, f, indent=2)

print("Exploration complete! Results saved to experiments/exploration_results.json")
print("\nNext: Run preprocessing notebook (1_preprocess/)")