# In [None]:
import os
import sys
import gzip
import shutil
import requests
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy import sparse
from scipy.io import mmread
from pathlib import Path
from tqdm import tqdm
import time
import re  # Needed for regex in simplify_protein_name

# Constants
GEO_ACCESSION = "GSE269140"
BASE_URL = f"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE269nnn/{GEO_ACCESSION}/suppl"

# Supplementary files of the series: the feature reference followed by one
# barcodes/features/matrix triplet per lane (Pool1: lanes 1-5, Pool2: lanes 1-4).
FILE_PATTERNS = [f"{GEO_ACCESSION}_feature_reference.csv.gz"] + [
    f"{GEO_ACCESSION}_Pool{pool}-lane{lane}_{kind}.{ext}.gz"
    for pool, lane_count in ((1, 5), (2, 4))
    for lane in range(1, lane_count + 1)
    for kind, ext in (("barcodes", "tsv"), ("features", "tsv"), ("matrix", "mtx"))
]

def simplify_protein_name(name: str) -> str:
    """
    Reduce an antibody feature name to its bare marker name.

    Strips a leading 'TotalSeqC<digits>_' tag and a leading 'anti-<species>'
    token (e.g. 'anti-human', 'anti-mouse/human'), then collapses every run of
    underscores/spaces into a single space.

    Args:
        name: Raw feature name, e.g. 'TotalSeqC0072_anti-human_CD4'

    Returns:
        str: Simplified name, e.g. 'CD4'; names without the known prefixes
             are returned with only the separators normalized
    """
    # Trim first so stray leading whitespace cannot defeat the '^' anchors below.
    name = name.strip()

    # 1) Remove 'TotalSeqC' plus digits + underscore(s)
    name = re.sub(r'^TotalSeqC\d+_+', '', name)

    # 2) Remove the leading 'anti-<species>' token and the separators after it
    name = re.sub(r'^anti-[^_ ]+[_ ]*', '', name)

    # 3) Collapse repeated underscores/spaces into one space so that
    #    'HLA__DR' becomes 'HLA DR' rather than 'HLA  DR'
    return re.sub(r'[_ ]+', ' ', name).strip()

def download_file(url, output_path, max_retries=3, timeout=60):
    """
    Download a file from a URL with retry logic

    The payload is streamed into a temporary '<output_path>.part' file which is
    renamed to output_path only after the transfer has finished. An interrupted
    download therefore never leaves a truncated file that a later call would
    report as "already exists".

    Args:
        url: URL to download from
        output_path: Path to save the file
        max_retries: Maximum number of download attempts
        timeout: Seconds to wait for the server (connect and between reads)
                 before the attempt is treated as failed

    Returns:
        bool: True if the file already existed or was downloaded,
              False otherwise
    """
    if os.path.exists(output_path):
        print(f"File already exists: {output_path}")
        return True

    print(f"Downloading {url} to {output_path}")

    partial_path = f"{os.fspath(output_path)}.part"
    block_size = 1024 * 1024  # 1 MiB chunks keep per-chunk overhead low on large files

    for attempt in range(1, max_retries + 1):
        try:
            # The context manager returns the connection to the pool even on errors.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))

                with open(partial_path, 'wb') as f, tqdm(
                    desc=f"Attempt {attempt}/{max_retries}",
                    total=total_size or None,  # unknown size -> open-ended bar
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as bar:
                    for data in response.iter_content(block_size):
                        bar.update(len(data))
                        f.write(data)

            # Publish the file only once it is complete.
            os.replace(partial_path, output_path)
            return True

        # Deliberately broad: the contract is "report failure as False", so any
        # error during an attempt is logged and retried rather than propagated.
        except Exception as e:
            if os.path.exists(partial_path):
                os.remove(partial_path)

            print(f"Error downloading file (attempt {attempt}/{max_retries}): {e}")
            if attempt < max_retries:
                print("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                print(f"Failed to download {url} after {max_retries} attempts")
                return False

    # Only reached when max_retries < 1, i.e. no attempt was made.
    return False

def download_dataset(data_dir):
    """
    Fetch every file listed in FILE_PATTERNS into data_dir and index the
    per-lane files.

    Args:
        data_dir: Directory to save the files (created if missing)

    Returns:
        dict: Maps each pool/lane key (e.g. "Pool1-lane1") to a dict of
              file type ("barcodes", "features", "matrix") -> local path.
              Files that failed to download and files without "Pool" in
              their name are not included.
    """
    os.makedirs(data_dir, exist_ok=True)

    files_by_lane = {}

    for filename in FILE_PATTERNS:
        destination = os.path.join(data_dir, filename)
        downloaded = download_file(f"{BASE_URL}/{filename}", destination)

        # Only successfully fetched per-lane files are indexed
        if not downloaded or "Pool" not in filename:
            continue

        tokens = filename.split("_")
        lane_key = tokens[1]  # e.g., "Pool1-lane1"
        kind = tokens[2].split(".")[0]  # e.g., "barcodes", "features", "matrix"
        files_by_lane.setdefault(lane_key, {})[kind] = destination

    return files_by_lane

def get_existing_files(data_dir):
    """
    Index the per-lane files from FILE_PATTERNS that are already on disk.

    Args:
        data_dir: Directory containing the files

    Returns:
        dict: Maps each pool/lane key (e.g. "Pool1-lane1") to a dict of
              file type ("barcodes", "features", "matrix") -> local path,
              covering only the files that exist in data_dir
    """
    found = {}

    for filename in FILE_PATTERNS:
        # Files without "Pool" in their name are not tied to a lane
        if "Pool" not in filename:
            continue

        candidate = os.path.join(data_dir, filename)
        if not os.path.exists(candidate):
            continue

        tokens = filename.split("_")
        lane_key = tokens[1]  # e.g., "Pool1-lane1"
        kind = tokens[2].split(".")[0]  # e.g., "barcodes", "features", "matrix"
        found.setdefault(lane_key, {})[kind] = candidate

    return found

def read_10x_mtx(matrix_file, features_file, barcodes_file):
    """
    Load a gzipped 10X Genomics matrix triplet

    Args:
        matrix_file: Path to the matrix.mtx.gz file
        features_file: Path to the features.tsv.gz file
        barcodes_file: Path to the barcodes.tsv.gz file

    Returns:
        tuple: (matrix, features_df, barcodes) where matrix is the transposed
               Matrix Market content in CSR form (rows = barcodes, columns =
               features), features_df has the columns 'gene_ids',
               'feature_names' and 'feature_types', and barcodes is a list
               of strings

    Raises:
        FileNotFoundError: If any of the three files is missing
        ValueError: If the matrix shape disagrees with the number of
                    barcodes or features
    """
    # Fail early with a clear message if an input is missing
    for required_path in (matrix_file, features_file, barcodes_file):
        if not os.path.exists(required_path):
            raise FileNotFoundError(f"File not found: {required_path}")

    # Matrix: transposed after reading so that rows line up with barcodes
    try:
        with gzip.open(matrix_file, 'rb') as handle:
            X = mmread(handle).T.tocsr()
    except Exception as e:
        print(f"Error reading matrix file {matrix_file}: {e}")
        raise

    # Features: tab-separated id, name and optional feature type
    with gzip.open(features_file, 'rt') as handle:
        rows = [line.strip().split('\t') for line in handle]

    features_df = pd.DataFrame({
        'gene_ids': [row[0] for row in rows],
        'feature_names': [row[1] for row in rows],
        # Rows without a third column default to gene expression
        'feature_types': [
            row[2] if len(row) > 2 else "Gene Expression" for row in rows
        ],
    })

    # Barcodes: one per line
    with gzip.open(barcodes_file, 'rt') as handle:
        barcodes = [line.strip() for line in handle]

    # Validate dimensions
    if X.shape != (len(barcodes), len(features_df)):
        raise ValueError(
            f"Dimension mismatch: Matrix shape {X.shape}, "
            f"barcodes length {len(barcodes)}, "
            f"features length {len(features_df)}"
        )

    return X, features_df, barcodes

def process_lane(pool_lane, files, data_dir):
    """
    Build the gene-expression and protein AnnData objects for one lane

    Args:
        pool_lane: Pool and lane identifier (e.g., "Pool1-lane1")
        files: Dictionary with the 'barcodes', 'features' and 'matrix' paths
               for this lane
        data_dir: Data directory (not used by this function)

    Returns:
        tuple: (gene_adata, protein_adata) AnnData objects for gene and
               protein expression, or (None, None) if anything went wrong
               while processing the lane (the error is printed)
    """
    print(f"Processing {pool_lane}")

    try:
        # All three files of the triplet must be present
        missing = next(
            (kind for kind in ('barcodes', 'features', 'matrix') if kind not in files),
            None,
        )
        if missing is not None:
            raise ValueError(f"Missing {missing} file for {pool_lane}")

        X, features_df, barcodes = read_10x_mtx(
            files['matrix'],
            files['features'],
            files['barcodes'],
        )

        # One AnnData per modality, selected by feature type
        feature_types = features_df['feature_types']
        modalities = []
        for feature_type in ('Gene Expression', 'Antibody Capture'):
            selected = feature_types == feature_type
            modalities.append(ad.AnnData(
                X=X[:, selected],
                obs=pd.DataFrame(index=barcodes),
                var=features_df[selected].reset_index(drop=True),
            ))
        gene_adata, protein_adata = modalities

        # Per-cell annotations, identical for both modalities; the dict order
        # is the resulting obs column order
        pieces = pool_lane.split('-')
        obs_annotations = {
            'batch': pool_lane.lower(),
            'pool': pieces[0].replace('Pool', ''),
            'lane': pieces[1].replace('lane', ''),
            'organism': 'Homo sapiens',
            'cell_type': 'T Cells',
            'cell_subtype': 'CD8+ T cells',
            'crispr_type': 'None',
            'cancer_type': 'Non-Cancer',
            'condition': 'SStim',
            'perturbation_name': 'ROR1-CAR',
            'geo_accession': GEO_ACCESSION,
            'study_title': (
                'Engineering potent chimeric antigen receptor T cells by programming '
                'signaling during T-cell activation [CITE-seq]'
            ),
        }

        for adata in modalities:
            for column, value in obs_annotations.items():
                adata.obs[column] = value

            # Promote the feature names to var_names and drop the now
            # redundant column
            adata.var_names = adata.var['feature_names'].values
            adata.var.drop(columns=['feature_names'], inplace=True)

        # Strip the TotalSeqC/anti-human prefixes from the protein names
        protein_adata.var_names = [
            simplify_protein_name(raw) for raw in protein_adata.var_names
        ]

        # Make var names unique within this lane
        gene_adata.var_names_make_unique()
        protein_adata.var_names_make_unique()

        return gene_adata, protein_adata

    except Exception as e:
        print(f"Error processing {pool_lane}: {e}")
        print("Skipping this lane and continuing with others.")
        return None, None

def merge_lanes(data_files, data_dir):
    """
    Merge data from multiple lanes

    Each lane is processed with process_lane; lanes that fail are skipped.
    Cell barcodes get a '-<pool_lane>' suffix (lower-cased) so they stay
    unique across lanes.

    Args:
        data_files: Dictionary of files by pool and lane
        data_dir: Data directory (passed through to process_lane)

    Returns:
        tuple: (gene_adata, protein_adata) Merged AnnData objects restricted
               to the cells present in both, in the same cell order

    Raises:
        ValueError: If no lane could be processed
    """
    gene_adatas = []
    protein_adatas = []

    for pool_lane, files in data_files.items():
        gene_adata, protein_adata = process_lane(pool_lane, files, data_dir)

        if gene_adata is None or protein_adata is None:
            continue

        # Add pool-lane identifier to cell barcodes to make them unique
        suffix = pool_lane.lower()
        for adata in (gene_adata, protein_adata):
            adata.obs_names = [f"{bc}-{suffix}" for bc in adata.obs_names]

        gene_adatas.append(gene_adata)
        protein_adatas.append(protein_adata)

    if not gene_adatas or not protein_adatas:
        raise ValueError("No valid data found in any pool/lane combination.")

    # merge='same' keeps the var columns (e.g. gene_ids, feature_types) that
    # are identical in every lane; with the default merge=None, concat
    # discards all var annotations.
    print("Merging gene expression data...")
    merged_gene_adata = ad.concat(gene_adatas, join='outer', merge='same')

    print("Merging protein expression data...")
    merged_protein_adata = ad.concat(protein_adatas, join='outer', merge='same')

    # Make sure var_names are unique after merging
    merged_gene_adata.var_names_make_unique()
    merged_protein_adata.var_names_make_unique()

    # Keep only cells that have both gene expression and protein data.
    # The gene-expression order is preserved so the result is reproducible;
    # a list built from a set intersection would be ordered by string hashes,
    # which change from run to run.
    protein_cells = set(merged_protein_adata.obs_names)
    common_cells = [
        cell for cell in merged_gene_adata.obs_names if cell in protein_cells
    ]
    print(f"Found {len(common_cells)} cells with both gene expression and protein data.")

    merged_gene_adata = merged_gene_adata[common_cells]
    merged_protein_adata = merged_protein_adata[common_cells]

    return merged_gene_adata, merged_protein_adata

def run_pipeline(data_dir=None):
    """
    Download, merge and save the dataset in a Jupyter-compatible manner.

    The merged gene and protein AnnData objects are written to
    '<GEO_ACCESSION>_gene_expression.h5ad' and
    '<GEO_ACCESSION>_protein_expression.h5ad' inside data_dir. If the first
    pass fails for any reason, a second pass is attempted using only the
    files already present in data_dir; if that fails too, the function
    calls sys.exit(1).

    Args:
        data_dir: Directory in which to download and process data.
                  If None, uses the current working directory.
    """
    if data_dir is None:
        data_dir = os.getcwd()

    print(f"Processing {GEO_ACCESSION} dataset")
    print(f"Data directory: {data_dir}")

    def merge_and_save(lane_files):
        """Merge the given lane files, write both h5ad outputs and report shapes."""
        gene_adata, protein_adata = merge_lanes(lane_files, data_dir)

        gene_output_path = os.path.join(data_dir, f"{GEO_ACCESSION}_gene_expression.h5ad")
        protein_output_path = os.path.join(data_dir, f"{GEO_ACCESSION}_protein_expression.h5ad")

        print(f"Saving gene expression data to {gene_output_path}")
        gene_adata.write(gene_output_path)

        print(f"Saving protein expression data to {protein_output_path}")
        protein_adata.write(protein_output_path)

        print("Processing complete!")
        print(f"Gene expression data shape: {gene_adata.shape}")
        print(f"Protein expression data shape: {protein_adata.shape}")

    try:
        # First pass: download (or reuse) every file, then merge and save
        merge_and_save(download_dataset(data_dir))

    except Exception as e:
        print(f"Error processing dataset: {e}")

        # Second pass: work with whatever is already on disk
        print("Attempting to process using existing files...")
        try:
            merge_and_save(get_existing_files(data_dir))

        except Exception as e2:
            print(f"Error processing existing files: {e2}")
            sys.exit(1)

# USAGE in Jupyter:
# 1) Paste this entire code block in a cell
# 2) Run the cell; the run_pipeline(...) call below starts the pipeline
#    as soon as the cell is executed
# 3) To process a different directory, change the path in the call below


# --- USAGE EXAMPLE IN JUPYTER NOTEBOOK ---
# Downloads and processes the dataset under /content/GSE269140:
run_pipeline("/content/GSE269140")
#
# If you prefer to use the current directory, do:
# run_pipeline()
