# In [None]:
import os
import gzip
import urllib.request
import pandas as pd
import numpy as np
import anndata as ad
from scipy import sparse
from scipy.io import mmread
from pathlib import Path
import re

def download_data(data_dir):
    """
    Download the GSE269574 dataset if not already present.

    Each file is first written to a temporary ``<filename>.part`` file in the
    same directory and only renamed to its final name once the transfer has
    finished. An interrupted download therefore never leaves a truncated file
    under the final name, which a later run would otherwise treat as complete
    and skip.

    Args:
        data_dir: Path to the directory where data should be stored. The
            directory (and any missing parents) is created if needed.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer;
        the partial file is removed before the exception propagates.
    """
    data_dir = Path(data_dir)
    data_dir.mkdir(exist_ok=True, parents=True)

    # Define files to download
    files = {
        "GSM8322394_barcodes.tsv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8322nnn/GSM8322394/suppl/GSM8322394_barcodes.tsv.gz",
        "GSM8322394_features.tsv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8322nnn/GSM8322394/suppl/GSM8322394_features.tsv.gz",
        "GSM8322394_matrix.mtx.gz": "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8322nnn/GSM8322394/suppl/GSM8322394_matrix.mtx.gz",
        "GSM8322396_filtered_contig_annotations.csv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8322nnn/GSM8322396/suppl/GSM8322396_filtered_contig_annotations.csv.gz",
        "GSE269574_CITE-Seq_Cell_Surface_Protein_Antibody_Panel.xlsx": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/GSE269574/suppl/GSE269574_CITE-Seq_Cell_Surface_Protein_Antibody_Panel.xlsx"
    }

    for filename, url in files.items():
        file_path = data_dir / filename
        if file_path.exists():
            print(f"{filename} already exists, skipping download")
            continue

        print(f"Downloading {filename}...")
        partial_path = data_dir / f"{filename}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Same-directory rename: the final name only ever appears for a
            # fully downloaded file.
            partial_path.replace(file_path)
        finally:
            # Still present only if the download or the rename failed.
            if partial_path.exists():
                partial_path.unlink()
        print(f"Downloaded {filename}")

def read_10x_data(data_dir):
    """
    Load the barcode, feature and matrix files of sample GSM8322394.

    Args:
        data_dir: Path to the directory containing the gzipped data files

    Returns:
        tuple: (matrix, features, barcodes) where ``matrix`` is the transposed
        Matrix Market matrix (cells as rows, features as columns),
        ``features`` is a list of tab-split rows from the features file and
        ``barcodes`` is a list of whitespace-stripped lines from the barcodes
        file
    """
    root = Path(data_dir)

    def _stripped_lines(name):
        # One whitespace-stripped string per line of the gzipped text file.
        with gzip.open(root / name, 'rt') as handle:
            return [raw.strip() for raw in handle]

    barcodes = _stripped_lines("GSM8322394_barcodes.tsv.gz")
    features = [
        row.split('\t')
        for row in _stripped_lines("GSM8322394_features.tsv.gz")
    ]

    # Transpose so that cells become rows and features become columns.
    matrix_path = root / "GSM8322394_matrix.mtx.gz"
    matrix = mmread(str(matrix_path)).transpose()

    return matrix, features, barcodes

def read_tcr_data(data_dir):
    """
    Load the filtered contig annotation table of sample GSM8322396.

    Args:
        data_dir: Path to the directory containing the data files

    Returns:
        pandas.DataFrame: contents of the gzipped contig annotations CSV
    """
    contig_file = Path(data_dir) / "GSM8322396_filtered_contig_annotations.csv.gz"
    return pd.read_csv(contig_file)

# Matches a name that ends in one or more "_<clone code>" suffixes; group 1 is
# the name with all of those trailing suffixes removed. Compiled once instead
# of once per protein name.
_CLONE_SUFFIX_PATTERN = re.compile(
    r"^(.*?)(?:_(?:(?:A\d{3,4})|(?:C\d{3,4})|mh|(?:Mouse_[kl])|(?:Rat_[kl])|Hamster|FcRL3))+$"
)

# Contig annotation columns that are collapsed to one value per barcode.
_TCR_COLUMNS = ('chain', 'v_gene', 'j_gene', 'cdr3', 'raw_clonotype_id')

# Constant annotations written to ``obs`` of both AnnData objects, in this
# column order.
_STUDY_METADATA = {
    'organism': 'Homo sapiens',
    'cell_type': 'T Cells',
    'crispr_type': 'None',
    'cancer_type': 'Non-Cancer',
    'condition': 'ThymoSphere week 9',
    'perturbation_name': 'None',
    'cell_fraction': 'CD3+ TCRγδ-',
    'study_id': 'GSE269574',
    'sample_id': 'GSM8322394',
}


def _make_names_unique(names):
    """Return ``names`` as a list where repeats get ``_1``, ``_2``, ... appended."""
    repeats_seen = {}
    unique_names = []
    for name in names:
        if name in repeats_seen:
            repeats_seen[name] += 1
            unique_names.append(f"{name}_{repeats_seen[name]}")
        else:
            # The first occurrence keeps its original name.
            repeats_seen[name] = 0
            unique_names.append(name)
    return unique_names


def _join_unique(values):
    """Join the distinct non-null values as a sorted, comma-separated string."""
    present = {str(val) for val in values if pd.notna(val)}
    return ','.join(sorted(present))


def _aggregate_tcr(tcr_data):
    """Collapse the contig table to one row per (whitespace-stripped) barcode."""
    # ``assign`` returns a new frame, so the caller's table is not modified.
    stripped = tcr_data.assign(barcode=tcr_data['barcode'].str.strip())
    return stripped.groupby('barcode').agg(
        {col: _join_unique for col in _TCR_COLUMNS}
    )


def _attach_tcr(adata, tcr_grouped):
    """Add one ``tcr_<col>`` obs column per TCR column; '' where a cell has no TCR row."""
    for col in tcr_grouped.columns:
        # A plain dict gives one cheap lookup per cell instead of one
        # ``DataFrame.loc`` assignment per cell and column.
        per_barcode = tcr_grouped[col].to_dict()
        adata.obs[f'tcr_{col}'] = [
            per_barcode.get(barcode, '') for barcode in adata.obs_names
        ]


def _clean_protein_name(name):
    """Strip trailing clone-code suffixes from ``name``; return it unchanged if none match."""
    m = _CLONE_SUFFIX_PATTERN.match(name)
    if m:
        return m.group(1)
    return name


def process_data(data_dir):
    """
    Process the GSE269574 dataset.

    Reads the 10x matrix and the TCR contig table from ``data_dir``, splits
    the matrix columns into 'Gene Expression' and 'Antibody Capture' features,
    and builds one AnnData object for each. Both objects receive the per-cell
    TCR summary (``tcr_*`` columns, empty string for cells without a TCR row)
    and the fixed study metadata in ``obs``. Duplicate gene names are made
    unique with a numeric suffix, and clone-code suffixes are stripped from
    the protein names.

    Args:
        data_dir: Path to the directory containing the data files

    Returns:
        tuple: (gene_adata, protein_adata)
    """
    data_dir = Path(data_dir)

    # Read data
    matrix, features, barcodes = read_10x_data(data_dir)
    tcr_data = read_tcr_data(data_dir)

    # Split gene expression and protein features
    feature_df = pd.DataFrame(features, columns=['id', 'name', 'feature_type'])
    gene_mask = feature_df['feature_type'] == 'Gene Expression'
    protein_mask = feature_df['feature_type'] == 'Antibody Capture'
    gene_features = feature_df[gene_mask]
    protein_features = feature_df[protein_mask]

    # Convert to CSR so the columns can be selected with a boolean mask
    matrix_csr = matrix.tocsr()

    # Check for duplicate gene names and make them unique if needed
    gene_names = gene_features['name'].values
    gene_ids = gene_features['id'].values
    n_duplicates = len(gene_names) - len(set(gene_names))
    if n_duplicates:
        print(f"Found {n_duplicates} duplicate gene names. Making them unique...")
        gene_names = np.array(_make_names_unique(gene_names))

    gene_adata = ad.AnnData(
        X=matrix_csr[:, gene_mask.values],
        var=pd.DataFrame(index=gene_names, data={'gene_ids': gene_ids}),
        obs=pd.DataFrame(index=barcodes)
    )

    protein_adata = ad.AnnData(
        X=matrix_csr[:, protein_mask.values],
        var=pd.DataFrame(index=protein_features['name'].values),
        obs=pd.DataFrame(index=barcodes)
    )

    # Add TCR data, then standardized study metadata, to both AnnData objects
    tcr_grouped = _aggregate_tcr(tcr_data)
    for adata in [gene_adata, protein_adata]:
        _attach_tcr(adata, tcr_grouped)
        for column, value in _STUDY_METADATA.items():
            adata.obs[column] = value

    # Remove clone suffixes from protein names.
    # NOTE(review): nothing here guarantees the cleaned names stay unique if
    # two antibodies differ only by clone suffix - confirm against the panel.
    protein_adata.var.index = protein_adata.var.index.map(_clean_protein_name)

    return gene_adata, protein_adata

def main(data_dir="GSE269574"):
    """
    Download, process and save the GSE269574 dataset.

    The gene and protein AnnData objects are written as ``.h5ad`` files into
    ``data_dir`` and their shapes are printed.

    Args:
        data_dir: Directory to store and process data.

    Returns:
        tuple: Processed gene and protein AnnData objects.
    """
    # Fetch any files that are not on disk yet
    download_data(data_dir)

    print("Processing data...")
    gene_adata, protein_adata = process_data(data_dir)

    print("Saving processed data...")
    outputs = (
        (gene_adata, "GSE269574_gene_expression.h5ad"),
        (protein_adata, "GSE269574_protein_expression.h5ad"),
    )
    for adata, out_name in outputs:
        adata.write_h5ad(os.path.join(data_dir, out_name))

    print(f"Processed data saved to {data_dir}")
    print(f"Gene expression data shape: {gene_adata.shape}")
    print(f"Protein expression data shape: {protein_adata.shape}")

    return gene_adata, protein_adata

# Run the full pipeline (download, process, save) with the default data
# directory as soon as this cell/module executes, and keep the resulting
# AnnData objects as module-level names.
gene_adata, protein_adata = main()
