In [None]:
import os
import urllib.request
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

# Seed NumPy's global random number generator so that any NumPy-based
# randomness later in this session is repeatable.
np.random.seed(42)

# Keep scanpy quiet: verbosity 0 reports errors only (warnings, info and
# hints are suppressed).
sc.settings.verbosity = 0

def download_file(url, dest_path):
    """Download a file from a URL to a destination path.

    The payload is first written to a sibling ``<dest_path>.part`` file and is
    only moved onto ``dest_path`` once the transfer has finished. An
    interrupted or failed download therefore never leaves a truncated file at
    ``dest_path`` that a later "does it exist?" check would mistake for a
    complete one.

    Args:
        url: Address to fetch; anything ``urllib.request.urlretrieve`` accepts.
        dest_path: Target location (``str`` or ``pathlib.Path``).

    Returns:
        True if the file was downloaded and moved into place, False if any
        error occurred (the error is printed, not raised).
    """
    print(f"Downloading {url} to {dest_path}")
    partial_path = f"{dest_path}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Atomic on the same filesystem: dest_path is either absent or complete.
        os.replace(partial_path, dest_path)
        return True
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # decide what to do with the False result.
        print(f"Error downloading {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written (or it is already gone) - nothing to clean up.
            pass
        return False

def ensure_data_files(data_dir):
    """Ensure all necessary data files are present, downloading them if needed.

    Creates ``data_dir`` (including parents) when it does not exist, then
    checks for each expected GEO supplementary file and calls
    ``download_file`` for every one that is missing.

    Args:
        data_dir: Directory (``str`` or ``pathlib.Path``) holding the raw files.

    Returns:
        True if every expected file already existed or was downloaded
        successfully; False if at least one download failed.
    """
    data_dir = Path(data_dir)
    data_dir.mkdir(exist_ok=True, parents=True)

    # (local filename, source URL) pairs. The URLs are the percent-encoded GEO
    # FTP links and are kept verbatim.
    files = [
        ("GSM8561441_sample1_barcodes.tsv.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561441/suppl/GSM8561441%5Fsample1%5Fbarcodes%2Etsv%2Egz"),
        ("GSM8561441_sample1_features.tsv.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561441/suppl/GSM8561441%5Fsample1%5Ffeatures%2Etsv%2Egz"),
        ("GSM8561441_sample1_matrix.mtx.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561441/suppl/GSM8561441%5Fsample1%5Fmatrix%2Emtx%2Egz"),
        ("GSM8561442_sample2_barcodes.tsv.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561442/suppl/GSM8561442%5Fsample2%5Fbarcodes%2Etsv%2Egz"),
        ("GSM8561442_sample2_features.tsv.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561442/suppl/GSM8561442%5Fsample2%5Ffeatures%2Etsv%2Egz"),
        ("GSM8561442_sample2_matrix.mtx.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561442/suppl/GSM8561442%5Fsample2%5Fmatrix%2Emtx%2Egz"),
        ("GSM8561443_sample5_barcodes.tsv.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561443/suppl/GSM8561443%5Fsample5%5Fbarcodes%2Etsv%2Egz"),
        ("GSM8561443_sample5_features.tsv.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561443/suppl/GSM8561443%5Fsample5%5Ffeatures%2Etsv%2Egz"),
        ("GSM8561443_sample5_matrix.mtx.gz", "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8561nnn/GSM8561443/suppl/GSM8561443%5Fsample5%5Fmatrix%2Emtx%2Egz"),
        ("GSE279098_feature_reference.csv.gz", "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279098/suppl/GSE279098%5Ffeature%5Freference%2Ecsv%2Egz"),
    ]

    all_present = True
    for filename, url in files:
        file_path = data_dir / filename
        if file_path.exists():
            print(f"File {filename} already exists.")
            continue

        print(f"File {filename} not found. Downloading...")
        # download_file reports failures via its return value; keep going so
        # that every missing file gets an attempt, but remember the failure.
        if not download_file(url, file_path):
            all_present = False

    return all_present

def add_metadata(adata, sample_id):
    """Add standardized metadata to the AnnData object.

    Writes a fixed set of study-level columns plus the sample-specific
    condition, perturbation and treatment into ``adata.obs``.

    Args:
        adata: Object whose ``obs`` supports column assignment
            (``adata.obs[name] = value``); it is modified in place.
        sample_id: Key identifying the sample; one of ``'441'``, ``'442'``
            or ``'443'`` (the last three digits of the GSM accession as used
            by the caller in this file).

    Returns:
        The same ``adata`` object, for call chaining.

    Raises:
        KeyError: If ``sample_id`` is not one of the known keys. The check
            happens before anything is written, so ``adata`` is left
            untouched in that case.
    """
    # Sample-specific metadata
    sample_metadata = {
        '441': {
            'condition': 'native',
            'perturbation_name': 'None',
            'treatment': 'mock'
        },
        '442': {
            'condition': 'control',
            'perturbation_name': 'sgSCR',
            'treatment': 'AAV-sgSCR'
        },
        '443': {
            'condition': 'BAAT-knockout',
            'perturbation_name': 'sgBaat',
            'treatment': 'AAV-sgBaat'
        }
    }

    # Validate up front so a bad ID cannot leave adata half-annotated.
    try:
        sample_info = sample_metadata[sample_id]
    except KeyError:
        known = ", ".join(sorted(sample_metadata))
        raise KeyError(
            f"Unknown sample_id {sample_id!r}; expected one of: {known}"
        ) from None

    # Common metadata for all samples
    adata.obs['organism'] = 'Mus musculus'
    adata.obs['cell_type'] = 'Liver immune cells'  # Based on study description
    adata.obs['crispr_type'] = 'CRISPR KO'  # Based on study description
    adata.obs['cancer_type'] = 'Liver cancer'  # Based on study description

    # Sample-specific metadata
    adata.obs['condition'] = sample_info['condition']
    adata.obs['perturbation_name'] = sample_info['perturbation_name']
    adata.obs['treatment'] = sample_info['treatment']
    # NOTE: the label is built from the last character of sample_id, so '443'
    # yields 'sample3' even though its source files are named "sample5".
    adata.obs['sample_id'] = f'sample{sample_id[-1]}'

    return adata

def _load_sample(data_dir, gsm_id, sample_num):
    """Read one sample's matrix/features/barcodes and split it by feature type.

    Returns a ``(gene_adata, antibody_adata)`` pair, each with unique
    ``var_names`` and the metadata added by ``add_metadata``.
    """
    prefix = f"GSM8561{gsm_id}_sample{sample_num}"
    matrix_file = data_dir / f"{prefix}_matrix.mtx.gz"
    features_file = data_dir / f"{prefix}_features.tsv.gz"
    barcodes_file = data_dir / f"{prefix}_barcodes.tsv.gz"

    # Fail early with a clear message if a download did not succeed.
    missing = [
        str(path)
        for path in (matrix_file, features_file, barcodes_file)
        if not path.exists()
    ]
    if missing:
        raise FileNotFoundError(
            f"Missing input file(s) for GSM8561{gsm_id}: {', '.join(missing)}"
        )

    # The matrix is transposed after reading so that rows line up with the
    # barcodes (obs) and columns with the features (var) assigned below.
    adata = sc.read_mtx(matrix_file).transpose()

    # features.tsv columns as used here: 0 = ID, 1 = name, 2 = feature type.
    features_df = pd.read_csv(features_file, sep='\t', header=None)
    gene_ids = features_df[0].values
    gene_names = features_df[1].values
    feature_types = features_df[2].values

    barcodes = pd.read_csv(barcodes_file, sep='\t', header=None)[0].values

    adata.obs_names = pd.Index(barcodes)
    adata.var_names = pd.Index(gene_names)
    adata.var['feature_types'] = feature_types
    adata.var['gene_ids'] = gene_ids

    # Split into gene expression and antibody capture features.
    gene_adata = adata[:, feature_types == 'Gene Expression'].copy()
    antibody_adata = adata[:, feature_types == 'Antibody Capture'].copy()

    # Feature names may repeat; make them unique within each modality.
    gene_adata.var_names_make_unique()
    antibody_adata.var_names_make_unique()

    gene_adata = add_metadata(gene_adata, gsm_id)
    antibody_adata = add_metadata(antibody_adata, gsm_id)

    return gene_adata, antibody_adata


def harmonize_dataset(data_dir):
    """Harmonize the GSE279098 dataset into h5ad format.

    Makes sure the raw files are available in ``data_dir``, loads the three
    samples, splits each into gene-expression and antibody-capture features,
    concatenates the samples per modality and writes two gzip-compressed
    ``.h5ad`` files into ``data_dir``.

    Args:
        data_dir: Directory (``str`` or ``pathlib.Path``) that holds the raw
            files and receives the outputs.

    Returns:
        Tuple ``(gene_adata_combined, protein_adata_combined)``.

    Raises:
        FileNotFoundError: If an input file for a sample is still missing
            after the download step.
    """
    data_dir = Path(data_dir)

    # Ensure data files are present
    ensure_data_files(data_dir)

    # Process each sample - note that sample 3 is actually named sample5 in the files
    sample_mapping = {
        '441': '1',  # sample1 - native liver (mock)
        '442': '2',  # sample2 - control (AAV-sgSCR)
        '443': '5'   # sample5 - BAAT-knockout (AAV-sgBaat)
    }

    gene_adatas = []
    protein_adatas = []

    for gsm_id, sample_num in sample_mapping.items():
        print(f"Processing GSM8561{gsm_id} (sample{sample_num})...")
        gene_adata, antibody_adata = _load_sample(data_dir, gsm_id, sample_num)
        gene_adatas.append(gene_adata)
        protein_adatas.append(antibody_adata)

    # Concatenate all samples
    print("Concatenating samples...")
    # One key per sample, in the same order as the lists above. concat writes
    # these into obs['sample_id'] (label=...) and appends them to the cell
    # names (index_unique='-').
    sample_keys = [f'sample{sample_num}' for sample_num in sample_mapping.values()]

    # merge='same' keeps var columns (gene_ids, feature_types) that agree
    # across samples; with the default merge=None they would be dropped from
    # the harmonized output.
    gene_adata_combined = ad.concat(
        gene_adatas,
        join='outer',
        merge='same',
        label='sample_id',
        keys=sample_keys,
        index_unique='-'
    )

    protein_adata_combined = ad.concat(
        protein_adatas,
        join='outer',
        merge='same',
        label='sample_id',
        keys=sample_keys,
        index_unique='-'
    )

    # Ensure var_names are unique
    gene_adata_combined.var_names_make_unique()
    protein_adata_combined.var_names_make_unique()

    # Remove the "TotalSeq_" prefix from protein names in var_names
    protein_adata_combined.var_names = protein_adata_combined.var_names.str.replace("TotalSeq_", "", regex=False)

    # Save the harmonized data
    output_gene_file = data_dir / "GSE279098_gene_expression_harmonized.h5ad"
    output_protein_file = data_dir / "GSE279098_protein_expression_harmonized.h5ad"

    print(f"Saving gene expression data to {output_gene_file}")
    gene_adata_combined.write(output_gene_file, compression='gzip')

    print(f"Saving protein expression data to {output_protein_file}")
    protein_adata_combined.write(output_protein_file, compression='gzip')

    print("Harmonization complete!")

    return gene_adata_combined, protein_adata_combined

def _print_summary(title, adata, feature_label):
    """Print cell/feature counts and, for sparse matrices, the sparsity."""
    print(f"\n{title}:")
    print(f"Number of cells: {adata.n_obs}")
    print(f"Number of {feature_label}: {adata.n_vars}")
    # Only matrices exposing ``nnz`` (stored-entry count) get a sparsity line.
    if hasattr(adata.X, "nnz"):
        total_entries = adata.n_obs * adata.n_vars
        if total_entries:
            # Sparsity is the share of entries that are NOT stored;
            # nnz / total alone would be the density.
            sparsity = 1.0 - adata.X.nnz / total_entries
            print(f"Sparsity: {sparsity:.4f}")
        else:
            # Empty matrix (e.g. no features of this type): avoid dividing by 0.
            print("Sparsity: n/a (empty matrix)")


def main(data_dir=None):
    """Main function to process the GSE279098 dataset.

    Args:
        data_dir: Directory containing (or receiving) the raw files and the
            harmonized outputs. Defaults to the current working directory.

    Returns:
        Tuple ``(gene_adata, protein_adata)`` as returned by
        ``harmonize_dataset``.
    """
    # Use provided data_dir or default to the current working directory
    if data_dir is None:
        data_dir = os.getcwd()

    print(f"Processing GSE279098 dataset in directory: {data_dir}")

    # Harmonize the dataset
    gene_adata, protein_adata = harmonize_dataset(data_dir)

    # Print summary statistics
    _print_summary("Gene Expression Data Summary", gene_adata, "genes")
    _print_summary("Protein Expression Data Summary", protein_adata, "proteins")

    return gene_adata, protein_adata

# Notebook entry point: run the pipeline with no data_dir, so main() falls
# back to the current working directory, and keep both results as
# notebook-level variables for further inspection.
gene_adata, protein_adata = main()
