# In [None]:
import os
import gzip
import urllib.request
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.sparse as sp
import anndata as ad

# Download locations of the GSE289084 supplementary files, keyed by the role
# each file plays in the pipeline. All of them live in the same GEO directory.
_GEO_SUPPL_DIR = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE289nnn/GSE289084/suppl"
URLS = {
    role: f"{_GEO_SUPPL_DIR}/{filename}"
    for role, filename in (
        ("barcodes", "GSE289084_pbmc_barcodes.tsv.gz"),
        ("features", "GSE289084_pbmc_features.tsv.gz"),
        ("features_adt", "GSE289084_pbmc_features_adt.tsv.gz"),
        ("matrix", "GSE289084_pbmc_matrix.tsv.gz"),
        ("matrix_adt", "GSE289084_pbmc_matrix_adt.tsv.gz"),
        ("readme", "GSE289084_Readme_CITESeq_Ab.xlsx"),
    )
}

def download_file(url, output_path):
    """Download ``url`` to ``output_path`` unless that path already exists.

    The payload is written to a sibling ``<output_path>.part`` file first and
    only renamed to ``output_path`` after the transfer has finished. Because an
    existing ``output_path`` is treated as a completed download, writing to it
    directly would let an interrupted transfer leave a truncated file behind
    that every later run silently accepts.

    Args:
        url: Location to fetch; passed unchanged to ``urllib.request.urlretrieve``.
        output_path: Destination file path (``str`` or path-like).

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or ``os.replace``
        propagates to the caller; the partial file is removed first.
    """
    if os.path.exists(output_path):
        print(f"File already exists: {output_path}")
        return

    print(f"Downloading {url} to {output_path}")
    partial_path = f"{os.fspath(output_path)}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # The rename happens only after a complete transfer, so output_path
        # never holds a half-written file.
        os.replace(partial_path, output_path)
    finally:
        # After a successful rename the partial file is gone and this is a
        # no-op; after a failure or interrupt it cleans up the leftovers.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def download_dataset(data_dir):
    """Fetch every file listed in ``URLS`` into ``data_dir``.

    The directory is created when missing. Each file keeps the base name of
    its URL. Returns a dict mapping each ``URLS`` key to the local path of the
    corresponding file.
    """
    os.makedirs(data_dir, exist_ok=True)

    local_paths = {}
    for role, remote in URLS.items():
        destination = os.path.join(data_dir, os.path.basename(remote))
        download_file(remote, destination)
        local_paths[role] = destination

    return local_paths

def parse_barcodes(file_path):
    """Read cell barcodes from a gzipped text file, one barcode per line.

    Each line is stripped of surrounding whitespace and returned whole (it is
    not split on tabs). Blank or whitespace-only lines, such as a trailing
    empty line at the end of the file, are skipped so they do not turn into
    empty-string barcodes.

    Args:
        file_path: Path to the gzip-compressed file.

    Returns:
        List of barcode strings in file order.
    """
    with gzip.open(file_path, 'rt') as f:
        stripped_lines = (line.strip() for line in f)
        return [barcode for barcode in stripped_lines if barcode]

def parse_features(file_path):
    """Read feature names from a gzipped text file, one feature per line.

    Each line is stripped of surrounding whitespace and returned whole (it is
    not split on tabs). Blank or whitespace-only lines are skipped; otherwise a
    trailing empty line would become an empty-string feature, which adds an
    extra all-zero row to any matrix sized from this list.

    Args:
        file_path: Path to the gzip-compressed file.

    Returns:
        List of feature strings in file order.
    """
    with gzip.open(file_path, 'rt') as f:
        stripped_lines = (line.strip() for line in f)
        return [feature for feature in stripped_lines if feature]

def process_chunk(chunk, feature_to_idx, data, row_indices, col_indices):
    """Append the positive integer entries of ``chunk`` to the COO triplet lists.

    Every line is stripped and split on tabs. The first field, with surrounding
    double quotes removed, is the feature name; lines whose feature is not a
    key of ``feature_to_idx`` are ignored. For the remaining fields, each one
    that parses as an integer greater than zero is appended to ``data``, its
    feature index to ``row_indices`` and its zero-based position among the
    value fields to ``col_indices``. Fields that do not parse as integers are
    skipped. The three lists are modified in place; nothing is returned.
    """
    for raw_line in chunk:
        fields = raw_line.strip().split('\t')
        gene = fields[0].strip('"')

        if gene in feature_to_idx:
            row = feature_to_idx[gene]

            for col, token in enumerate(fields[1:]):
                try:
                    count = int(token)
                except ValueError:
                    # Not an integer literal: leave this cell out.
                    continue

                # Zero (and negative) entries are not stored.
                if count > 0:
                    data.append(count)
                    row_indices.append(row)
                    col_indices.append(col)

def parse_matrix_tsv_in_chunks(matrix_path, features_path, barcodes_path, chunk_size=1000):
    """Parse a gzipped feature-by-cell TSV matrix into a sparse CSR matrix.

    The file is read ``chunk_size`` lines at a time. Each chunk is handed to
    ``process_chunk`` and its non-zero entries are immediately compacted into
    NumPy arrays, so the temporary Python lists never hold more than one chunk.
    Peak memory still grows with the total number of stored entries.

    Header handling:
        * Only trailing whitespace is removed from the header line, so a
          leading empty cell (line starting with a tab) is preserved and the
          first real barcode is not mistaken for the row-name column.
        * If the header has exactly one field fewer than the first data row,
          it is taken to have no cell for the row-name column (as produced by
          R's ``write.table`` defaults) and every header field is a barcode.
          Otherwise the first header field is dropped.

    Args:
        matrix_path: Gzipped TSV with one header line and one row per feature.
        features_path: Gzipped feature list; defines the row order and count.
        barcodes_path: Gzipped barcode list. Used only for a sanity check of
            the number of matrix columns; the returned barcodes come from the
            matrix header.
        chunk_size: Number of matrix lines buffered before they are processed.

    Returns:
        Tuple ``(matrix, features, header)`` where ``matrix`` is a
        ``scipy.sparse.csr_matrix`` of shape ``(len(features), len(header))``,
        ``features`` is the parsed feature list and ``header`` the list of
        barcodes taken from the matrix header (double quotes removed).
    """
    import itertools  # local import: not among this file's module-level imports

    print(f"Parsing matrix from {matrix_path}")

    # Load features and barcodes
    features = parse_features(features_path)
    barcodes = parse_barcodes(barcodes_path)

    # Create feature index mapping
    feature_to_idx = {f: i for i, f in enumerate(features)}

    # Compact per-chunk triplets; concatenated once at the end.
    data_parts = []
    row_parts = []
    col_parts = []

    def flush(lines):
        """Convert one chunk of lines into compact NumPy triplet arrays."""
        values, rows, cols = [], [], []
        process_chunk(lines, feature_to_idx, values, rows, cols)
        if values:
            data_parts.append(np.asarray(values, dtype=np.int64))
            row_parts.append(np.asarray(rows, dtype=np.int32))
            col_parts.append(np.asarray(cols, dtype=np.int32))

    with gzip.open(matrix_path, 'rt') as f:
        # rstrip (not strip) keeps a leading empty header cell in place.
        header_fields = f.readline().rstrip().split('\t')

        # Peek at the first data row to learn how many fields a row has.
        first_row = f.readline()
        header_lacks_rowname_cell = (
            bool(first_row.strip())
            and header_fields != ['']
            and len(header_fields) + 1 == len(first_row.rstrip().split('\t'))
        )
        barcode_fields = header_fields if header_lacks_rowname_cell else header_fields[1:]
        header = [h.strip('"') for h in barcode_fields]

        # Put the peeked row back in front of the remaining lines.
        remaining_rows = itertools.chain([first_row], f) if first_row else f

        chunk = []
        for i, line in enumerate(remaining_rows):
            if i % 100 == 0:
                print(f"Processed {i} rows")

            chunk.append(line)

            if len(chunk) >= chunk_size:
                flush(chunk)
                chunk = []

        # Process any remaining lines
        if chunk:
            flush(chunk)

    if barcodes and len(barcodes) != len(header):
        print(
            f"Warning: {barcodes_path} lists {len(barcodes)} barcodes but the "
            f"matrix header has {len(header)} columns"
        )

    if data_parts:
        data = np.concatenate(data_parts)
        row_indices = np.concatenate(row_parts)
        col_indices = np.concatenate(col_parts)
    else:
        data = np.empty(0, dtype=np.int64)
        row_indices = np.empty(0, dtype=np.int32)
        col_indices = np.empty(0, dtype=np.int32)

    # Create a sparse matrix
    matrix = sp.csr_matrix((data, (row_indices, col_indices)),
                           shape=(len(features), len(header)))

    return matrix, features, header

def extract_metadata_from_barcodes(barcodes):
    """Derive per-cell metadata from barcode strings.

    For each barcode, ``sample_id`` is the text before the first ``-``. When
    ``sample_id`` starts with ``M``, ``patient_id`` is the text before its
    first ``_`` and ``time_point`` is ``'W0'`` if the sample id contains
    ``week0`` or ``_S1``, ``'W6'`` if it contains ``week6`` and ``'Unknown'``
    otherwise. Any other sample id gets ``'Unknown'`` for both fields.

    The result has exactly one row per input barcode, in input order, and
    always carries the three columns. Building it from a dict keyed by barcode
    would collapse repeated barcodes (so the row count could differ from the
    number of cells) and would return a column-less frame for empty input.

    Args:
        barcodes: Iterable of barcode strings.

    Returns:
        ``pandas.DataFrame`` indexed by barcode with columns ``patient_id``,
        ``time_point`` and ``sample_id``.
    """
    columns = ['patient_id', 'time_point', 'sample_id']
    barcodes = list(barcodes)

    records = []
    for barcode in barcodes:
        sample_id = barcode.split('-')[0]
        patient_id = 'Unknown'
        time_point = 'Unknown'

        if sample_id.startswith('M'):
            patient_id = sample_id.split('_')[0]
            if 'week0' in sample_id or '_S1' in sample_id:
                time_point = 'W0'
            elif 'week6' in sample_id:
                time_point = 'W6'

        records.append((patient_id, time_point, sample_id))

    return pd.DataFrame(records, index=barcodes, columns=columns)

def process_dataset(data_dir):
    """Download (if needed) and parse GSE289084 into two AnnData objects.

    Steps: fetch the files, parse the gene and protein (ADT) matrices, derive
    per-cell metadata from each matrix's own barcodes, attach the antibody
    sheet to the protein features, and restrict both objects to the cells
    present in both matrices.

    Args:
        data_dir: Directory holding (or receiving) the downloaded files.

    Returns:
        Tuple ``(adata_gene, adata_protein)``: independent (non-view) AnnData
        objects with cells as observations, sharing the same cell order (the
        order in which the barcodes appear in the gene matrix header).
    """
    # Download the dataset if needed
    file_paths = download_dataset(data_dir)

    # Parse matrices
    print("Parsing gene expression matrix...")
    gene_matrix, gene_features, gene_barcodes = parse_matrix_tsv_in_chunks(
        file_paths['matrix'], file_paths['features'], file_paths['barcodes']
    )

    print("Parsing protein expression matrix...")
    protein_matrix, protein_features, protein_barcodes = parse_matrix_tsv_in_chunks(
        file_paths['matrix_adt'], file_paths['features_adt'], file_paths['barcodes']
    )

    # Each modality gets metadata built from its own header. Reusing the gene
    # metadata for the protein matrix would mislabel cells (or fail on a
    # length mismatch) whenever the two headers differ in order or content.
    print("Extracting metadata from barcodes...")
    gene_metadata = extract_metadata_from_barcodes(gene_barcodes)
    protein_metadata = extract_metadata_from_barcodes(protein_barcodes)

    # Read the README file to get antibody information
    print("Reading antibody information...")
    antibody_info = pd.read_excel(file_paths['readme'])

    # Drop the first row after read_excel's own header handling.
    # NOTE(review): presumably the sheet carries a second header line and has
    # exactly eight columns -- confirm against the actual file.
    antibody_info = antibody_info.iloc[1:, :]
    antibody_info.columns = ['Category', 'Barcode_ID', 'Target', 'Clone', 'Species',
                            'Barcode', 'Reference', 'Company']

    # Matrices are feature x cell; AnnData expects cell x feature.
    print("Creating AnnData objects...")
    adata_gene = ad.AnnData(gene_matrix.T, obs=gene_metadata)
    adata_gene.var_names = gene_features

    adata_protein = ad.AnnData(protein_matrix.T, obs=protein_metadata)
    adata_protein.var_names = protein_features

    # Add antibody information to protein AnnData; rows whose Target is not a
    # protein feature are ignored, unmatched features keep missing values.
    known_proteins = set(protein_features)
    protein_var_df = pd.DataFrame(index=protein_features)
    for _, row in antibody_info.iterrows():
        if row['Target'] in known_proteins:
            for col in antibody_info.columns:
                protein_var_df.loc[row['Target'], col] = row[col]

    adata_protein.var = protein_var_df

    # Keep only cells that have both gene and protein data. The gene header
    # order is preserved so the output order is reproducible between runs
    # (iterating a set of strings is not).
    protein_barcode_set = set(protein_barcodes)
    common_barcodes = [
        barcode for barcode in dict.fromkeys(gene_barcodes)
        if barcode in protein_barcode_set
    ]
    print(f"Found {len(common_barcodes)} cells with both gene and protein data")

    # Subsetting yields views; copy so callers can modify obs/uns directly.
    adata_gene = adata_gene[common_barcodes, :].copy()
    adata_protein = adata_protein[common_barcodes, :].copy()

    return adata_gene, adata_protein

def update_metadata(adata):
    """Annotate ``adata`` in place with study-level metadata and return it.

    Adds constant ``obs`` columns (organism, cell type, CRISPR type, cancer
    type, perturbation name), a ``condition`` column derived from the existing
    ``time_point`` column, and dataset id, name and description entries in
    ``uns``.
    """
    condition_by_time_point = {
        'W0': 'Baseline',
        'W6': 'Post-treatment',
        'Unknown': 'Unknown',
    }

    # Constant per-cell annotations, written in this column order.
    for column, constant in (
        ('organism', 'Homo sapiens'),
        ('cell_type', 'PBMC'),
        ('crispr_type', 'None'),
        ('cancer_type', 'Melanoma'),
    ):
        adata.obs[column] = constant

    adata.obs['condition'] = adata.obs['time_point'].map(condition_by_time_point)
    adata.obs['perturbation_name'] = 'nivolumab + ipilimumab'

    # Dataset-level information goes into the unstructured annotations.
    description = (
        'CITE-seq PBMC from 8 advanced melanoma patients taken at baseline (Week 0, W0) '
        'and 6 weeks after the initiation of treatment (Week 6, W6) with nivolumab (1mg/kg) '
        '+ ipilimumab (3mg/kg).'
    )
    for key, entry in (
        ('dataset_id', 'GSE289084'),
        ('dataset_name', 'MELANFα clinical study'),
        ('dataset_description', description),
    ):
        adata.uns[key] = entry

    return adata

def harmonize_data(adata_gene, adata_protein):
    """Combine both modalities into one object built on a copy of the RNA data.

    The protein matrix is stored under ``obsm['protein_expression']`` and the
    protein feature names under ``uns['protein_features']``. When the protein
    object has a non-empty ``var`` table, it is stored under
    ``uns['protein_var']``. ``adata_gene`` itself is left untouched; the
    returned object is its copy.
    """
    combined = adata_gene.copy()

    # Protein counts ride along as an extra per-cell matrix.
    combined.obsm['protein_expression'] = adata_protein.X
    combined.uns['protein_features'] = adata_protein.var_names.tolist()

    # Nothing more to attach when there is no (or an empty) protein var table.
    if not hasattr(adata_protein, 'var') or adata_protein.var.empty:
        return combined

    combined.uns['protein_var'] = adata_protein.var
    return combined

def main():
    """Run the whole pipeline and write three ``.h5ad`` files.

    Works inside a ``GSE289084`` directory under the current working
    directory: processes the dataset, annotates both modalities, builds the
    combined object, and saves the combined, gene and protein objects there.
    """
    data_dir = os.path.join(os.getcwd(), 'GSE289084')
    os.makedirs(data_dir, exist_ok=True)

    def in_data_dir(filename):
        """Return the path of ``filename`` inside the working directory."""
        return os.path.join(data_dir, filename)

    print(f"Processing GSE289084 dataset in {data_dir}")

    # Parse the raw files, then annotate each modality.
    rna, adt = process_dataset(data_dir)
    rna = update_metadata(rna)
    adt = update_metadata(adt)

    # RNA is the main modality; protein data is attached to it.
    combined = harmonize_data(rna, adt)

    harmonized_path = in_data_dir('GSE289084_harmonized.h5ad')
    gene_output_path = in_data_dir('GSE289084_gene_expression.h5ad')
    protein_output_path = in_data_dir('GSE289084_protein_expression.h5ad')

    print(f"Saving harmonized data to {harmonized_path}")
    combined.write(harmonized_path)

    print(f"Saving gene expression data to {gene_output_path}")
    rna.write(gene_output_path)

    print(f"Saving protein expression data to {protein_output_path}")
    adt.write(protein_output_path)

    print("Processing complete!")

# Run the pipeline as soon as this cell/module is executed. The call is
# unconditional, so importing this file also triggers the download and
# processing.
main()
