# In [None]:
import os
import gzip
import re
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.io
import anndata as ad
import urllib.request

# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(42)

# Download locations of the GSE269478 supplementary files, keyed by role.
# Insertion order is the order in which the files are fetched.
URLS = dict(
    matrix='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/GSE269478/suppl/GSE269478_mono_RNA_protein_15.mtx.gz',
    obs='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/GSE269478/suppl/GSE269478_mono_RNA_protein_obs.txt.gz',
    var='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/GSE269478/suppl/GSE269478_mono_RNA_protein_var.txt.gz',
    readme='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/GSE269478/suppl/GSE269478_readme.txt',
)

def download_files(data_dir):
    """
    Download dataset files if they don't exist.

    Each file is first written to a sibling ``<name>.part`` path and only
    moved to its final name once the transfer has finished, so an
    interrupted download never leaves a truncated file that later runs
    would mistake for a complete one.

    Args:
        data_dir (Path): Directory to save the downloaded files.
    """
    data_dir.mkdir(exist_ok=True, parents=True)

    for name, url in URLS.items():
        file_path = data_dir / os.path.basename(url)
        if file_path.exists():
            print(f"File {file_path} already exists, skipping download.")
            continue

        print(f"Downloading {name} file from {url}...")
        partial_path = file_path.with_name(file_path.name + '.part')
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Only a fully transferred file ever appears under the final name.
            os.replace(partial_path, file_path)
        finally:
            # After a successful move the partial file is gone; after a
            # failure, remove the leftover so it does not accumulate.
            if partial_path.exists():
                partial_path.unlink()
        print(f"Downloaded {file_path}")

def read_mtx_file(file_path):
    """
    Read a gzipped MTX file.

    The matrix is transposed after loading, so the rows of the returned
    matrix correspond to the columns stored in the file.

    Args:
        file_path (Path): Path to the gzipped MTX file.

    Returns:
        scipy.sparse.csr_matrix: Transposed sparse matrix from the MTX file.
    """
    print(f"Reading matrix file {file_path}...")
    # Use a context manager so the gzip handle is closed even if parsing fails.
    with gzip.open(file_path, 'rb') as handle:
        loaded = scipy.io.mmread(handle)
    return loaded.T.tocsr()

def read_txt_file(file_path):
    """
    Read a gzipped, tab-separated text file.

    Args:
        file_path (Path): Path to the gzipped text file.

    Returns:
        pandas.DataFrame: DataFrame containing the file contents.
    """
    print(f"Reading text file {file_path}...")
    # Use a context manager so the gzip handle is closed even if parsing fails.
    with gzip.open(file_path, 'rt') as handle:
        return pd.read_csv(handle, sep='\t')

def extract_sample_info(cell_ids):
    """
    Extract sample information from cell IDs.

    A cell ID is split into three parts:

    * ``sample_id``  - the text before the first underscore
      (e.g. ``RPM211A_AAACCCAAGTTCCGGC-1`` -> ``RPM211A``).
    * ``subject_id`` - the leading run of uppercase letters followed by
      digits in the sample ID (e.g. ``RPM211A`` -> ``RPM211``).
    * ``library_id`` - whatever follows the subject ID inside the sample ID
      (e.g. ``RPM211A`` -> ``A``).

    A sample ID that does not start with uppercase letters followed by
    digits is used unchanged as the subject ID, with an empty library ID,
    rather than aborting the whole run.

    Args:
        cell_ids (pandas.Index): Cell IDs (strings) from the dataset.

    Returns:
        pandas.DataFrame: DataFrame indexed by ``cell_ids`` with the columns
        ``sample_id``, ``subject_id`` and ``library_id``.
    """
    subject_pattern = re.compile(r'[A-Z]+\d+')

    sample_ids, subject_ids, library_ids = [], [], []
    for cell_id in cell_ids:
        sample_id = cell_id.partition('_')[0]

        prefix = subject_pattern.match(sample_id)
        subject_id = prefix.group(0) if prefix else sample_id

        sample_ids.append(sample_id)
        subject_ids.append(subject_id)
        # Slice off only the leading subject ID; str.replace would also strip
        # any later repetition of it inside the sample ID.
        library_ids.append(sample_id[len(subject_id):])

    return pd.DataFrame({
        'sample_id': sample_ids,
        'subject_id': subject_ids,
        'library_id': library_ids
    }, index=cell_ids)

def _clean_protein_name(name):
    """Return the part of an antibody feature name before its first dash."""
    # 1) Remove the trailing "-ADT" if present
    name_no_adt = re.sub(r'-ADT$', '', name)
    # 2) Keep only the substring before the first dash,
    #    e.g. "CD102-ICAM-2-ADT" -> "CD102"
    return name_no_adt.split('-', 1)[0]


def process_dataset(data_dir):
    """
    Process the GSE269478 dataset.

    Reads the combined matrix and its obs/var tables from ``data_dir``,
    builds one AnnData object, splits it by the ``feature_type`` column of
    the var table into gene-expression and antibody-capture features, adds
    constant harmonized metadata columns to ``obs`` and shortens the protein
    feature names.

    Shortening protein names to the text before the first dash can map
    different antibodies to the same name, so the resulting ``var_names``
    are made unique in the same way as the gene names. The shortened
    (possibly repeated) name is kept in ``var['protein_name']``.

    Args:
        data_dir (Path): Directory containing the dataset files.

    Returns:
        tuple: (gene_adata, protein_adata) - AnnData objects for gene expression and protein data.
    """
    # Read matrix and metadata
    matrix = read_mtx_file(data_dir / 'GSE269478_mono_RNA_protein_15.mtx.gz')
    obs_df = read_txt_file(data_dir / 'GSE269478_mono_RNA_protein_obs.txt.gz')
    var_df = read_txt_file(data_dir / 'GSE269478_mono_RNA_protein_var.txt.gz')

    print(f"Matrix shape: {matrix.shape}")
    print(f"Observations: {obs_df.shape}")
    print(f"Variables: {var_df.shape}")

    # Index the tables by their identifier columns
    obs_df.set_index('cell_id', inplace=True)
    var_df.set_index('gene', inplace=True)

    # Append the sample/subject/library columns derived from the cell IDs
    sample_info = extract_sample_info(obs_df.index)
    obs_df = pd.concat([obs_df, sample_info], axis=1)

    # Create AnnData
    adata = ad.AnnData(X=matrix, obs=obs_df, var=var_df)

    # Split out gene expression vs protein
    gene_mask = var_df['feature_type'] == 'Gene Expression'
    protein_mask = var_df['feature_type'] == 'Antibody Capture'

    gene_adata = adata[:, gene_mask].copy()
    protein_adata = adata[:, protein_mask].copy()

    print(f"Gene expression data shape: {gene_adata.shape}")
    print(f"Protein data shape: {protein_adata.shape}")

    # Check for duplicate gene names and make unique if needed
    if gene_adata.var_names.duplicated().any():
        print(f"Found {gene_adata.var_names.duplicated().sum()} duplicate gene names. Making them unique.")
        gene_adata.var_names_make_unique()

    # Add harmonized metadata
    for cur_adata in [gene_adata, protein_adata]:
        cur_adata.obs['organism'] = 'Homo sapiens'
        cur_adata.obs['cell_type'] = 'Monocytes'
        cur_adata.obs['crispr_type'] = 'None'
        cur_adata.obs['cancer_type'] = 'Non-Cancer'
        cur_adata.obs['condition'] = 'Cardiovascular Disease Risk Study'
        cur_adata.obs['perturbation_name'] = 'None'

    # Ensure both have the same cells
    common_cells = gene_adata.obs_names.intersection(protein_adata.obs_names)
    print(f"Common cells between gene and protein data: {len(common_cells)}")

    gene_adata = gene_adata[common_cells].copy()
    protein_adata = protein_adata[common_cells].copy()

    print(f"Final gene expression data shape: {gene_adata.shape}")
    print(f"Final protein data shape: {protein_adata.shape}")

    # Shorten protein feature names, keeping the shortened name as a column
    cleaned_protein_names = [
        _clean_protein_name(name) for name in protein_adata.var_names
    ]
    protein_adata.var_names = cleaned_protein_names
    protein_adata.var['protein_name'] = cleaned_protein_names

    # Shortening can make distinct antibodies share a name; keep var_names
    # unique, mirroring the handling of gene names above.
    duplicated_proteins = protein_adata.var_names.duplicated()
    if duplicated_proteins.any():
        print(f"Found {duplicated_proteins.sum()} duplicate protein names after cleaning. Making them unique.")
        protein_adata.var_names_make_unique()

    # Optionally, add gene names in gene_adata.var
    gene_adata.var['gene_name'] = gene_adata.var_names

    return gene_adata, protein_adata

def main(data_dir=None):
    """
    Download, process and save the GSE269478 dataset.

    The two resulting AnnData objects are written as gzip-compressed
    ``.h5ad`` files into a ``processed`` sub-directory of the data directory.

    Args:
        data_dir (str, optional): Path to the data directory. When None,
            the relative directory ``GSE269478`` is used.
    """
    data_dir = Path('GSE269478') if data_dir is None else Path(data_dir)

    # Fetch any missing raw files, then build the two AnnData objects
    download_files(data_dir)
    gene_adata, protein_adata = process_dataset(data_dir)

    # Destination for the processed outputs
    output_dir = data_dir / 'processed'
    gene_output_path = output_dir / 'GSE269478_gene_expression.h5ad'
    protein_output_path = output_dir / 'GSE269478_protein_expression.h5ad'
    output_dir.mkdir(exist_ok=True, parents=True)

    print(f"Saving gene expression data to {gene_output_path}")
    gene_adata.write(gene_output_path, compression='gzip')

    print(f"Saving protein expression data to {protein_output_path}")
    protein_adata.write(protein_output_path, compression='gzip')

    print("Processing complete!")

# Run the pipeline when executed as a script or in a Jupyter notebook cell
# (where __name__ is "__main__"), but not when this file is imported.
if __name__ == "__main__":
    main()
