In [None]:
import os
import urllib.request
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from scipy import sparse
from scipy.io import mmread
import warnings
import gzip

# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Every supplementary file of the series lives under the same GEO location and
# carries the same "GSE270629_" prefix, so the table is derived from the names.
_GEO_SUPPL_PREFIX = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE270nnn/GSE270629/suppl/GSE270629_"

# One barcodes/features/matrix triplet per timepoint, in timepoint order
_TIMEPOINT_FILES = [
    f"{day}_{part}"
    for day in ("D00", "D04", "D08", "D14")
    for part in ("barcodes.tsv.gz", "features.tsv.gz", "matrix.mtx.gz")
]

# Short file name (without the "GSE270629_" prefix) -> download URL
DATA_URLS = {
    short_name: _GEO_SUPPL_PREFIX + short_name
    for short_name in _TIMEPOINT_FILES + ["feature_README.csv"]
}

def download_data(data_dir):
    """
    Download the data files if they don't exist.

    Each file is first written to "<target>.part" and only renamed to its
    final name once the transfer has finished. An interrupted download
    therefore never leaves a truncated file that a later run would mistake
    for a complete one and skip.

    Args:
        data_dir: Path to the directory where the data will be stored.
            It is created if it does not exist.

    Raises:
        Whatever urllib.request.urlretrieve raises on a failed transfer;
        the partial file is removed before the error propagates.
    """
    os.makedirs(data_dir, exist_ok=True)

    for filename, url in DATA_URLS.items():
        file_path = os.path.join(data_dir, f"GSE270629_{filename}")
        if os.path.exists(file_path):
            print(f"File {filename} already exists, skipping download.")
            continue

        print(f"Downloading {filename}...")
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Rename within the same directory so the final path only ever
            # appears once the file is complete.
            os.replace(partial_path, file_path)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Downloaded {filename}")

def read_gzipped_tsv(file_path):
    """
    Load a gzip-compressed, header-less, tab-separated file.

    Args:
        file_path: Path to the gzipped TSV file.

    Returns:
        Pandas DataFrame with one row per line; columns are numbered
        0..n-1 because the file is read without a header row.
    """
    with gzip.open(file_path, mode='rt') as text_stream:
        # read_table is read_csv with a tab as the default separator
        table = pd.read_table(text_stream, header=None)
    return table

def convert_protein_name(name):
    """
    Normalize a protein feature name.

    A leading "CITE_" is dropped, then everything from the first hyphen
    onwards is discarded.

    Args:
        name: Original protein name.

    Returns:
        Converted protein name.
    """
    prefix = "CITE_"
    stripped = name[len(prefix):] if name.startswith(prefix) else name
    # partition returns the whole string as the head when no hyphen is present
    head, _, _ = stripped.partition('-')
    return head

def _call_hashtags(hto_counts, hto_feature_names):
    """
    Assign one hashtag (HTO) label to every cell from its HTO counts.

    Each cell receives the hashtag with the largest count; ties go to the
    first hashtag column. Cells without any HTO count are labelled
    'Unknown'. No doublet or background detection is attempted.

    Args:
        hto_counts: Dense cells x hashtags array of HTO counts.
        hto_feature_names: Feature names of the hashtag columns, in column
            order.

    Returns:
        Tuple of (labels, counts): an object array with the per-cell hashtag
        label and an array with the count of the winning hashtag.
    """
    names = pd.Series(list(hto_feature_names), dtype=object)
    # NOTE(review): presumably the feature names embed the 'M-HTO-<n>' label
    # used as key of the treatment map (possibly with a prefix/suffix) --
    # confirm against feature_README.csv. Names without such a label are kept
    # verbatim and end up with treatment 'Unknown'.
    canonical = names.str.extract(r'(M-HTO-\d+)', expand=False)
    labels = canonical.fillna(names).to_numpy(dtype=object)

    hto_counts = np.asarray(hto_counts)
    winner = hto_counts.argmax(axis=1)
    winner_counts = hto_counts.max(axis=1)

    cell_labels = labels[winner]
    cell_labels[winner_counts <= 0] = 'Unknown'
    return cell_labels, winner_counts


def process_timepoint(data_dir, timepoint):
    """
    Process data for a specific timepoint.

    Reads the features/barcodes/matrix triplet of the timepoint, orients the
    matrix as cells x features and splits it into gene expression and
    protein (antibody capture, excluding hashtags) AnnData objects. Both
    objects get the obs columns 'hto', 'hto_count', 'timepoint', 'treatment'
    and 'sample_id'; the gene object additionally carries the protein counts
    as a DataFrame in obsm['protein_expression'].

    The hashtag of a cell is derived from the HTO counts in the matrix (the
    hashtag with the highest count), and 'hto_count' is that count. When the
    timepoint has no HTO features, 'hto' is 'Unknown' and 'hto_count' is 0.

    Args:
        data_dir: Path to the directory containing the data files.
        timepoint: Timepoint identifier (e.g., 'D00', 'D04', etc.).

    Returns:
        Tuple of (gene_adata, protein_adata) containing the processed data.

    Raises:
        ValueError: If the matrix dimensions match neither
            features x barcodes nor barcodes x features.
    """
    print(f"Processing {timepoint} data...")

    # File paths
    matrix_file = os.path.join(data_dir, f"GSE270629_{timepoint}_matrix.mtx.gz")
    features_file = os.path.join(data_dir, f"GSE270629_{timepoint}_features.tsv.gz")
    barcodes_file = os.path.join(data_dir, f"GSE270629_{timepoint}_barcodes.tsv.gz")

    # Read features
    print(f"Reading features for {timepoint}...")
    features = read_gzipped_tsv(features_file)
    features.columns = ['feature_id', 'feature_name', 'feature_type']

    # Read barcodes
    print(f"Reading barcodes for {timepoint}...")
    barcodes = read_gzipped_tsv(barcodes_file)
    barcodes.columns = ['barcode']

    # Read matrix
    print(f"Reading matrix for {timepoint}...")
    with gzip.open(matrix_file, 'rb') as f:
        matrix = mmread(f)

    # The matrix must end up with cells as rows and features as columns.
    n_cells, n_features = len(barcodes), len(features)
    if matrix.shape == (n_features, n_cells):
        # Stored as features x cells, so flip it
        matrix = matrix.transpose()
    elif matrix.shape != (n_cells, n_features):
        raise ValueError(
            f"Matrix shape {matrix.shape} for {timepoint} matches neither "
            f"{n_features} features x {n_cells} barcodes nor the transpose"
        )
    matrix = sparse.csr_matrix(matrix)

    # Separate features by type
    has_hto_name = features['feature_name'].astype(str).str.contains('HTO', regex=False)
    gene_mask = features['feature_type'] == 'Gene Expression'
    protein_mask = (features['feature_type'] == 'Antibody Capture') & ~has_hto_name
    # A gene whose symbol happens to contain 'HTO' is not a hashtag
    hto_mask = has_hto_name & ~gene_mask

    # Get indices for each feature type
    gene_indices = np.where(gene_mask)[0]
    protein_indices = np.where(protein_mask)[0]
    hto_indices = np.where(hto_mask)[0]

    # Create gene expression AnnData
    print(f"Creating gene expression AnnData for {timepoint}...")
    gene_adata = ad.AnnData(
        X=matrix[:, gene_indices],
        obs=pd.DataFrame(index=barcodes['barcode']),
        var=pd.DataFrame(index=features.loc[gene_mask, 'feature_name'].values)
    )

    # Add gene IDs
    gene_adata.var['ensembl_id'] = features.loc[gene_mask, 'feature_id'].values

    # Convert protein names using the conversion function
    protein_feature_names = features.loc[protein_mask, 'feature_name'].apply(convert_protein_name).values

    # Create protein expression AnnData with converted protein names
    print(f"Creating protein expression AnnData for {timepoint}...")
    protein_adata = ad.AnnData(
        X=matrix[:, protein_indices],
        obs=pd.DataFrame(index=barcodes['barcode']),
        var=pd.DataFrame(index=protein_feature_names)
    )

    # Derive the hashtag of every cell from the measured HTO counts
    if len(hto_indices) > 0:
        print(f"Processing HTO data for {timepoint}...")
        hto_values, hto_counts = _call_hashtags(
            matrix[:, hto_indices].toarray(),
            features.loc[hto_mask, 'feature_name'],
        )
    else:
        print(f"No HTO data found for {timepoint}")
        hto_values, hto_counts = 'Unknown', 0

    hto_treatment_map = {
        'M-HTO-1': 'Control',
        'M-HTO-2': 'ABX',
        'M-HTO-3': 'Control',
        'M-HTO-4': 'ABX',
        'M-HTO-5': 'Control',
        'M-HTO-6': 'ABX'
    }

    # Annotate both AnnData objects identically
    for adata in (gene_adata, protein_adata):
        adata.obs['hto'] = hto_values
        adata.obs['hto_count'] = hto_counts
        adata.obs['timepoint'] = timepoint
        # Hashtags missing from the map (including 'Unknown') become 'Unknown'
        adata.obs['treatment'] = adata.obs['hto'].map(hto_treatment_map).fillna('Unknown')
        adata.obs['sample_id'] = adata.obs['timepoint'] + '_' + adata.obs['treatment']

    # Ensure the protein variable names are unique before creating the DataFrame
    protein_adata.var_names_make_unique()

    # Store protein data in gene expression AnnData
    protein_df = pd.DataFrame(
        protein_adata.X.toarray(),
        index=protein_adata.obs_names,
        columns=protein_adata.var_names
    )
    gene_adata.obsm['protein_expression'] = protein_df

    return gene_adata, protein_adata

def harmonize_metadata(adata):
    """
    Add the harmonized metadata columns to ``adata.obs``.

    Constant columns ('organism', 'cell_type', 'crispr_type') are filled
    with fixed values; 'cancer_type', 'condition' and 'days_post_injection'
    are looked up from the existing 'timepoint' column, and
    'perturbation_name' from the existing 'treatment' column. The object is
    modified in place.

    Args:
        adata: AnnData object to harmonize; its obs must already contain
            'timepoint' and 'treatment'.

    Returns:
        The same AnnData object with harmonized metadata.
    """
    fixed_columns = (
        ('organism', 'Mus musculus'),
        ('cell_type', 'Brain Immune Cell'),
        ('crispr_type', 'None'),
    )
    for column, value in fixed_columns:
        adata.obs[column] = value

    # Timepoint -> disease state
    adata.obs['cancer_type'] = adata.obs['timepoint'].map({
        'D00': 'Non-Cancer',
        'D04': 'Brain Metastasis',
        'D08': 'Brain Metastasis',
        'D14': 'Brain Metastasis'
    })

    # Timepoint description followed by the treatment, separated by a space
    timepoint_description = adata.obs['timepoint'].map({
        'D00': 'Naive',
        'D04': '4 days post injection',
        'D08': '8 days post injection',
        'D14': '14 days post injection'
    })
    adata.obs['condition'] = timepoint_description + ' ' + adata.obs['treatment']

    # Treatment -> spelled-out perturbation name
    perturbation_names = {
        'Control': 'Control',
        'ABX': 'Antibiotics',
        'Unknown': 'Unknown'
    }
    adata.obs['perturbation_name'] = adata.obs['treatment'].map(perturbation_names)

    # Timepoint -> number of days after injection
    days_by_timepoint = {'D00': 0, 'D04': 4, 'D08': 8, 'D14': 14}
    adata.obs['days_post_injection'] = adata.obs['timepoint'].map(days_by_timepoint)

    return adata

def process_dataset(data_dir):
    """
    Process the entire dataset and create harmonized AnnData objects.

    Every timepoint is processed separately, the per-timepoint objects are
    concatenated along the cell axis (outer join on features) and the
    harmonized metadata columns are added.

    Args:
        data_dir: Path to the directory containing the data files.

    Returns:
        Tuple of (gene_adata, protein_adata) containing the harmonized data.
    """
    timepoints = ['D00', 'D04', 'D08', 'D14']

    gene_adatas = []
    protein_adatas = []

    for timepoint in timepoints:
        gene_adata, protein_adata = process_timepoint(data_dir, timepoint)
        gene_adata.var_names_make_unique()
        protein_adata.var_names_make_unique()
        gene_adatas.append(gene_adata)
        protein_adatas.append(protein_adata)

    print("Concatenating data from all timepoints...")
    # ad.concat drops all var columns unless a merge strategy is given;
    # merge='first' keeps the per-gene 'ensembl_id' annotation.
    # NOTE(review): barcodes may repeat across timepoints, which would leave
    # non-unique obs names -- consider keys=/index_unique= if that matters.
    combined_gene_adata = ad.concat(gene_adatas, join='outer', merge='first')
    combined_protein_adata = ad.concat(protein_adatas, join='outer', merge='first')

    print("Harmonizing metadata...")
    combined_gene_adata = harmonize_metadata(combined_gene_adata)
    combined_protein_adata = harmonize_metadata(combined_protein_adata)

    combined_gene_adata.var_names_make_unique()
    combined_protein_adata.var_names_make_unique()

    # Keep the count matrices sparse regardless of what concat returned
    if not sparse.issparse(combined_gene_adata.X):
        combined_gene_adata.X = sparse.csr_matrix(combined_gene_adata.X)

    if not sparse.issparse(combined_protein_adata.X):
        combined_protein_adata.X = sparse.csr_matrix(combined_protein_adata.X)

    return combined_gene_adata, combined_protein_adata

def main(data_dir='./GSE270629'):
    """
    Run the whole pipeline: download, process, save and summarize.

    Three files are written to "<data_dir>/harmonized": the gene expression
    object, the protein expression object, and a combined object that is a
    copy of the gene expression data with the protein matrix in
    obsm['protein_expression'] and the protein names in uns['protein_names'].

    Args:
        data_dir: Directory where the data will be downloaded and processed.
    """
    download_data(data_dir)
    gene_adata, protein_adata = process_dataset(data_dir)

    output_dir = os.path.join(data_dir, 'harmonized')
    os.makedirs(output_dir, exist_ok=True)

    # Save the two single-modality objects
    per_modality = (
        ("gene expression", gene_adata, 'GSE270629_gene_expression_harmonized.h5ad'),
        ("protein expression", protein_adata, 'GSE270629_protein_expression_harmonized.h5ad'),
    )
    for modality, modality_adata, file_name in per_modality:
        target = os.path.join(output_dir, file_name)
        print(f"Saving {modality} data to {target}")
        modality_adata.write(target, compression='gzip')

    # Combined object: gene data plus the raw protein matrix and its names
    multiome_adata = gene_adata.copy()
    multiome_adata.obsm['protein_expression'] = protein_adata.X
    multiome_adata.uns['protein_names'] = protein_adata.var_names.tolist()

    multiome_output_path = os.path.join(output_dir, 'GSE270629_multiome_harmonized.h5ad')
    print(f"Saving multiome data to {multiome_output_path}")
    multiome_adata.write(multiome_output_path, compression='gzip')

    print("Harmonization complete!")
    shape_report = (
        ("Gene expression", gene_adata),
        ("Protein expression", protein_adata),
        ("Multiome", multiome_adata),
    )
    for title, reported in shape_report:
        print(f"{title} data shape: {reported.shape}")

    print("\nMetadata summary:")
    summary_columns = (
        ("Organism", 'organism'),
        ("Cell types", 'cell_type'),
        ("Cancer types", 'cancer_type'),
        ("Conditions", 'condition'),
        ("Perturbation names", 'perturbation_name'),
        ("CRISPR types", 'crispr_type'),
    )
    for title, column in summary_columns:
        print(f"{title}: {gene_adata.obs[column].unique()}")

    print("\nSample counts:")
    print(gene_adata.obs['sample_id'].value_counts())

# Run the main function (using the default data directory) only when executed
# as a script or notebook cell, so importing this module does not trigger the
# download and processing.
if __name__ == "__main__":
    main()
