# In [None]:
#%% [code]
import os
import sys
import glob
import gzip
import time
import shutil
import tarfile
import logging
import urllib.request
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Union

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy import sparse, io
from tqdm import tqdm

# Constants
GEO_ACCESSION = "GSE279914"
GEO_URL = f"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/{GEO_ACCESSION}/suppl/{GEO_ACCESSION}_RAW.tar"

def setup_logger():
    """
    Set up root logging and hand back the logger used by this pipeline.

    The root logger is configured through ``logging.basicConfig`` at INFO
    level with a single stream handler that writes to ``sys.stdout``.

    Returns:
        The logger named "GSE279914_processor"
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    record_layout = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        format=record_layout,
        handlers=[stdout_handler],
        level=logging.INFO,
    )
    return logging.getLogger("GSE279914_processor")

def download_data(data_dir: Path, logger) -> Path:
    """
    Download the dataset from GEO if not already present.

    The archive is streamed into a sibling ``*.part`` file and only renamed to
    its final name once the transfer finished and (when the server reported a
    Content-Length) the byte count matches. An interrupted or truncated
    download therefore never leaves a file that a later run would mistake for
    a complete archive.

    On HTTP 403 the function asks the user to place the file manually and
    polls until it appears (Ctrl+C exits the process with status 1).

    Args:
        data_dir: Directory where data will be stored
        logger: Logger object

    Returns:
        Path to the downloaded tar file

    Raises:
        urllib.error.HTTPError: For HTTP errors other than 403
        IOError: If fewer/more bytes than the announced Content-Length arrived
    """
    tar_path = data_dir / f"{GEO_ACCESSION}_RAW.tar"
    
    if not tar_path.exists():
        logger.info(f"Downloading {GEO_ACCESSION} dataset from GEO...")
        data_dir.mkdir(parents=True, exist_ok=True)
        # Stage the download next to the final path so the rename stays on one filesystem.
        partial_path = tar_path.with_name(tar_path.name + ".part")
        
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            req = urllib.request.Request(GEO_URL, headers=headers)
            
            try:
                with urllib.request.urlopen(req) as response:
                    # 0 means the server did not announce a length.
                    total_size = int(response.info().get('Content-Length', 0))
                    block_size = 1024 * 1024  # 1MB chunks
                    
                    with open(partial_path, 'wb') as out_file:
                        downloaded = 0
                        for chunk in tqdm(
                            iter(lambda: response.read(block_size), b''),
                            total=total_size // block_size + 1,
                            unit='MB',
                            desc="Downloading"
                        ):
                            out_file.write(chunk)
                            downloaded += len(chunk)
                
                if total_size and downloaded != total_size:
                    raise IOError(
                        f"Incomplete download of {GEO_URL}: "
                        f"got {downloaded} of {total_size} bytes"
                    )
                
                # Publish the file under its final name only after a full transfer.
                partial_path.replace(tar_path)
            finally:
                # After a successful rename the partial file no longer exists;
                # on any failure this removes the unusable remainder.
                if partial_path.exists():
                    partial_path.unlink()
            
            logger.info(f"Download complete: {tar_path}")
        except urllib.error.HTTPError as e:
            if e.code == 403:
                logger.error("Access forbidden (HTTP 403). The dataset may require authentication or is not publicly available.")
                logger.info("Please download the dataset manually from GEO and place it in the data directory.")
                logger.info(f"Download URL: {GEO_URL}")
                logger.info(f"Expected file path: {tar_path}")
                
                # Show richer instructions when running inside IPython/Jupyter.
                try:
                    from IPython.display import display, HTML
                    display(HTML(f"""
                    <div style="background-color: #ffffcc; padding: 10px; border: 1px solid #ffcc00; border-radius: 5px;">
                        <h3>Manual Download Required</h3>
                        <p>The dataset cannot be downloaded automatically due to access restrictions.</p>
                        <p>Please download the dataset manually from GEO:</p>
                        <ol>
                            <li>Go to <a href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={GEO_ACCESSION}" target="_blank">GEO {GEO_ACCESSION}</a></li>
                            <li>Download the RAW.tar file</li>
                            <li>Place it at: <code>{tar_path}</code></li>
                        </ol>
                    </div>
                    """))
                except ImportError:
                    pass
                
                # Poll once a minute until the user has dropped the file in place.
                while not tar_path.exists():
                    logger.info("Waiting for manual download... Press Ctrl+C to cancel.")
                    try:
                        for _ in tqdm(range(60), desc="Waiting", unit="s"):
                            time.sleep(1)
                    except KeyboardInterrupt:
                        logger.error("Process canceled by user.")
                        sys.exit(1)
                
                logger.info(f"Found manually downloaded file: {tar_path}")
            else:
                raise
    else:
        logger.info(f"Using existing download: {tar_path}")
    
    return tar_path

def extract_data(tar_path: Path, data_dir: Path, logger) -> Path:
    """
    Extract the downloaded tar file.

    Extraction is skipped when the target directory already exists and is
    non-empty. Archive members whose path (or link target) would resolve
    outside the extraction directory are skipped with a warning instead of
    being written, so a crafted archive cannot overwrite unrelated files.

    Args:
        tar_path: Path to the downloaded tar file
        data_dir: Directory where data will be extracted
        logger: Logger object

    Returns:
        Path to the extracted directory
    """
    extract_dir = data_dir / "extracted"
    
    if not extract_dir.exists() or not any(extract_dir.iterdir()):
        logger.info(f"Extracting {tar_path} to {extract_dir}...")
        extract_dir.mkdir(parents=True, exist_ok=True)
        root = extract_dir.resolve()
        
        def _stays_inside(candidate: Path) -> bool:
            """Return True when *candidate* resolves to root or a path below it."""
            resolved = candidate.resolve()
            return resolved == root or root in resolved.parents
        
        with tarfile.open(tar_path, 'r') as tar:
            members = tar.getmembers()
            for member in tqdm(members, desc="Extracting files"):
                # Joining with an absolute member name discards root, which the check rejects.
                destination = root / member.name
                safe = _stays_inside(destination)
                if safe and member.issym():
                    # Symlink targets are relative to the link's own directory.
                    safe = _stays_inside(destination.parent / member.linkname)
                elif safe and member.islnk():
                    # Hard-link targets are relative to the archive root.
                    safe = _stays_inside(root / member.linkname)
                
                if not safe:
                    logger.warning(f"Skipping unsafe archive member: {member.name}")
                    continue
                
                tar.extract(member, path=extract_dir)
        
        logger.info(f"Extraction complete: {extract_dir}")
    else:
        logger.info(f"Using existing extracted data: {extract_dir}")
    
    return extract_dir

def find_10x_data_files(extract_dir: Path, logger) -> List[Dict[str, Union[str, Path]]]:
    """
    Find all sets of 10X Genomics data files.

    A set is a ``*matrix.mtx.gz`` file together with the ``features.tsv.gz``
    and ``barcodes.tsv.gz`` files that share its directory and name prefix.
    Matrix files are visited in sorted path order so the result is the same
    on every run; matrices lacking a companion file are skipped with a warning.

    Args:
        extract_dir: Directory containing extracted data
        logger: Logger object

    Returns:
        List of dictionaries with the keys 'sample_id' (text before the first
        underscore of the matrix file name), 'matrix_file', 'features_file'
        and 'barcodes_file'
    """
    matrix_suffix = 'matrix.mtx.gz'
    
    data_sets = []
    for matrix_file in sorted(extract_dir.glob(f"**/*{matrix_suffix}")):
        # Strip only the trailing suffix; the prefix is shared by the companion files.
        base_name = matrix_file.name[:-len(matrix_suffix)]
        base_dir = matrix_file.parent
        
        features_file = base_dir / f"{base_name}features.tsv.gz"
        barcodes_file = base_dir / f"{base_name}barcodes.tsv.gz"
        
        missing = [p.name for p in (features_file, barcodes_file) if not p.exists()]
        if missing:
            logger.warning(f"Skipping {matrix_file.name}: missing companion file(s) {missing}")
            continue
        
        data_sets.append({
            'sample_id': matrix_file.name.split('_')[0],
            'matrix_file': matrix_file,
            'features_file': features_file,
            'barcodes_file': barcodes_file
        })
    
    logger.info(f"Found {len(data_sets)} 10X data sets")
    return data_sets

def split_gene_and_protein_data(adata: ad.AnnData, logger) -> Tuple[ad.AnnData, ad.AnnData]:
    """
    Separate the features of an AnnData object by their 'feature_types' label.

    When ``adata.var`` has no 'feature_types' column, one is added in place:
    every feature starts as 'Gene Expression' and names matching the regex
    '^(Hash|CD|HLA|IgG)' are relabelled 'Antibody Capture'.

    Args:
        adata: AnnData object containing both gene expression and protein data
        logger: Logger object

    Returns:
        Tuple of (gene_adata, protein_adata), each an independent copy
    """
    if 'feature_types' not in adata.var.columns:
        # Name-based fallback used only when the input carries no labels.
        name_hits = adata.var_names.str.match('^(Hash|CD|HLA|IgG)')
        adata.var['feature_types'] = 'Gene Expression'
        adata.var.loc[name_hits, 'feature_types'] = 'Antibody Capture'
    
    labels = adata.var['feature_types']
    gene_mask = labels == 'Gene Expression'
    protein_mask = labels == 'Antibody Capture'
    
    logger.info(f"Feature types: {labels.unique()}")
    logger.info(f"Number of Gene Expression features: {gene_mask.sum()}")
    logger.info(f"Number of Antibody Capture features: {protein_mask.sum()}")
    
    gene_data = adata[:, gene_mask].copy()
    protein_data = adata[:, protein_mask].copy()
    
    logger.info(f"Gene data shape: {gene_data.shape}")
    logger.info(f"Protein data shape: {protein_data.shape}")
    
    no_proteins = protein_data.shape[1] == 0
    if no_proteins:
        logger.warning("No protein features found in the data")
    
    return gene_data, protein_data

def process_10x_data(data_dir: Path, logger) -> Tuple[Optional[ad.AnnData], Optional[ad.AnnData]]:
    """
    Process 10X Genomics data and split into gene expression and protein data.

    Every file set found under ``data_dir / "extracted"`` is read straight
    from its gzip files, annotated with fixed per-cell metadata, split by
    feature type and finally concatenated across samples. A sample that fails
    to load is logged with its traceback and skipped.

    Args:
        data_dir: Directory containing extracted data
        logger: Logger object

    Returns:
        Tuple of (gene_adata, protein_adata); each element is None when no
        sample could be processed
    """
    extract_dir = data_dir / "extracted"
    data_sets = find_10x_data_files(extract_dir, logger)
    
    gene_adatas = []
    protein_adatas = []
    
    for data_set in tqdm(data_sets, desc="Processing 10X data"):
        sample_id = data_set['sample_id']
        matrix_file = data_set['matrix_file']
        features_file = data_set['features_file']
        barcodes_file = data_set['barcodes_file']
        
        try:
            logger.info(f"Reading data for {sample_id} using direct approach")
            
            # The matrix is transposed after reading so that rows line up with
            # the barcodes (obs) and columns with the features (var) below.
            with gzip.open(matrix_file, 'rb') as f:
                X = io.mmread(f).T.tocsr()
            
            features = pd.read_csv(features_file, sep='\t', header=None, names=['id', 'name', 'feature_type'])
            barcodes = pd.read_csv(barcodes_file, sep='\t', header=None)[0].values
            
            adata = ad.AnnData(X=X, obs=pd.DataFrame(index=barcodes), var=pd.DataFrame(index=features['name'].values))
            adata.var['feature_types'] = features['feature_type'].values
            
            logger.info(f"Feature types in {sample_id}: {adata.var['feature_types'].unique()}")
            logger.info(f"Number of Gene Expression features: {(adata.var['feature_types'] == 'Gene Expression').sum()}")
            logger.info(f"Number of Antibody Capture features: {(adata.var['feature_types'] == 'Antibody Capture').sum()}")
            
            if adata.var_names.duplicated().any():
                logger.warning(f"Found duplicate var names in {sample_id}, making them unique")
                adata.var_names_make_unique()
            
            adata.obs['sample_id'] = sample_id
            
            gene_data, protein_data = split_gene_and_protein_data(adata, logger)
            
            # sample_id is only the text before the first underscore, so look
            # for the tag in the whole file name (a superset of sample_id).
            is_cell_line = 'CellLineMix' in matrix_file.name
            
            # A name without an underscore has no lane field; fall back to ''.
            name_parts = matrix_file.name.split('_')
            lane = name_parts[1].replace('matrix.mtx.gz', '') if len(name_parts) > 1 else ''
            
            gene_data.obs['organism'] = 'Homo sapiens'
            gene_data.obs['cell_type'] = 'bone marrow cells'
            gene_data.obs['cancer_type'] = 'Mixed Cell Lines' if is_cell_line else 'Non-Cancer'
            gene_data.obs['condition'] = 'diagnosis'
            gene_data.obs['perturbation_name'] = 'None'
            gene_data.obs['disease'] = 'myelodysplastic syndrome'
            gene_data.obs['tissue'] = 'bone marrow'
            gene_data.obs['crispr_type'] = 'None'
            gene_data.obs['donor'] = sample_id
            gene_data.obs['is_cell_line'] = is_cell_line
            gene_data.obs['lane'] = lane
            gene_data.obs['batch'] = 'batch1'
            gene_data.obs['title'] = f"GSE279914_{sample_id}"
            
            # Both modalities describe the same cells, so they share one obs table.
            protein_data.obs = gene_data.obs.copy()
            
            gene_adatas.append(gene_data)
            protein_adatas.append(protein_data)
            
            logger.info(f"Processed sample {sample_id}: {gene_data.shape} genes, {protein_data.shape} proteins")
        except Exception as e:
            # Best effort: keep going with the remaining samples, but keep the traceback.
            logger.exception(f"Error processing data from {sample_id}: {e}")
    
    # No label= here: with no keys given, anndata would overwrite the existing
    # 'sample_id' column with the positional keys "0", "1", ...  index_unique
    # still appends that positional key to the barcodes to keep obs_names unique.
    if gene_adatas:
        gene_adata = ad.concat(gene_adatas, join='outer', index_unique='-')
        logger.info(f"Combined gene expression data: {gene_adata.shape}")
    else:
        gene_adata = None
        logger.warning("No gene expression data found")
    
    if protein_adatas:
        protein_adata = ad.concat(protein_adatas, join='outer', index_unique='-')
        logger.info(f"Combined protein data: {protein_adata.shape}")
    else:
        protein_adata = None
        logger.warning("No protein data found")
    
    return gene_adata, protein_adata

def check_for_duplicate_genes(adata: ad.AnnData, logger) -> ad.AnnData:
    """
    Ensure the var_names of an AnnData object are unique.

    Args:
        adata: AnnData object to check
        logger: Logger object

    Returns:
        The input object itself when no name repeats; otherwise a copy on
        which ``var_names_make_unique()`` has been applied
    """
    repeat_mask = adata.var_names.duplicated()
    repeat_count = repeat_mask.sum()
    
    # Nothing to fix: hand the caller's object back untouched.
    if repeat_count == 0:
        logger.info("No duplicate gene names found")
        return adata
    
    logger.warning(f"Found {repeat_count} duplicate gene names")
    deduplicated = adata.copy()
    repeated_names = adata.var_names[repeat_mask].tolist()
    logger.info(f"Duplicate genes: {repeated_names[:10]}...")
    deduplicated.var_names_make_unique()
    return deduplicated

def create_paired_datasets(
    gene_adata: Optional[ad.AnnData],
    protein_adata: Optional[ad.AnnData],
    logger,
) -> Tuple[Optional[ad.AnnData], Optional[ad.AnnData]]:
    """
    Create paired datasets where gene and protein data share the same cells.

    Both inputs are subset to the barcodes they have in common, in the sorted
    order returned by ``np.intersect1d``, so row i of one output is the same
    cell as row i of the other.

    Args:
        gene_adata: AnnData object containing gene expression data (may be None)
        protein_adata: AnnData object containing protein data (may be None)
        logger: Logger object

    Returns:
        Tuple of (paired_gene_adata, paired_protein_adata), or (None, None)
        when an input is missing or no barcode is shared
    """
    if gene_adata is None or protein_adata is None:
        logger.warning("Cannot create paired datasets: missing gene or protein data")
        return None, None
    
    common_barcodes = np.intersect1d(gene_adata.obs_names, protein_adata.obs_names)
    logger.info(f"Found {len(common_barcodes)} cells with both gene and protein data")
    
    if len(common_barcodes) == 0:
        logger.warning("No common cells found between gene and protein data")
        return None, None
    
    # Indexing both objects with the same label array already yields identical
    # row order, so no further re-indexing (and extra full copy) is needed.
    paired_gene_adata = gene_adata[common_barcodes].copy()
    paired_protein_adata = protein_adata[common_barcodes].copy()
    
    return paired_gene_adata, paired_protein_adata

def process_data(data_dir: Path, logger, min_genes: int = 200) -> None:
    """
    Process the GSE279914 dataset.

    Steps: download and extract the archive, load all 10X samples, make gene
    names unique, pair gene and protein data on shared cells, drop paired
    cells with too few detected genes, shorten protein names, and write the
    results as .h5ad files under ``data_dir / GEO_ACCESSION``.

    Args:
        data_dir: Directory where data will be stored
        logger: Logger object
        min_genes: Minimum 'n_genes_by_counts' a paired cell needs to pass
            QC filtering (default 200)
    """
    tar_path = download_data(data_dir, logger)
    extract_data(tar_path, data_dir, logger)
    
    gene_adata, protein_adata = process_10x_data(data_dir, logger)
    
    if gene_adata is not None:
        gene_adata = check_for_duplicate_genes(gene_adata, logger)
    
    paired_gene_adata, paired_protein_adata = create_paired_datasets(gene_adata, protein_adata, logger)
    
    # --- QC Filtering Step ---
    if paired_gene_adata is not None and paired_protein_adata is not None:
        logger.info("Performing QC filtering on paired gene expression data...")
        logger.info(f"Number of cells before QC filtering: {paired_gene_adata.n_obs}")
        sc.pp.calculate_qc_metrics(paired_gene_adata, inplace=True)
        passes_qc = paired_gene_adata.obs['n_genes_by_counts'] >= min_genes
        paired_gene_adata = paired_gene_adata[passes_qc].copy()
        logger.info(f"Number of cells after QC filtering: {paired_gene_adata.n_obs}")
        
        # Keep the protein data restricted to, and ordered like, the surviving cells.
        paired_protein_adata = paired_protein_adata[paired_gene_adata.obs_names].copy()
    # --- End QC Filtering Step ---
    
    # --- Clean Protein var_names: Remove suffix after '-' ---
    if paired_protein_adata is not None:
        logger.info("Cleaning protein var_names: removing suffix")
        # For each protein name, keep only the part before the first hyphen
        cleaned_var_names = [name.split('-')[0] for name in paired_protein_adata.var_names]
        
        # Truncation can map distinct features to the same name; surface that
        # instead of writing ambiguous names silently.
        n_collisions = len(cleaned_var_names) - len(set(cleaned_var_names))
        if n_collisions:
            logger.warning(f"Cleaning protein var_names produced {n_collisions} duplicate name(s)")
        
        paired_protein_adata.var_names = cleaned_var_names
    # --- End Cleaning Protein var_names ---
    
    output_dir = data_dir / GEO_ACCESSION
    output_dir.mkdir(parents=True, exist_ok=True)
    
    if gene_adata is not None:
        gene_adata.write_h5ad(output_dir / f"{GEO_ACCESSION}_gene_expression.h5ad")
        logger.info(f"Saved all gene expression data: {gene_adata.shape}")
    
    if protein_adata is not None:
        protein_adata.write_h5ad(output_dir / f"{GEO_ACCESSION}_protein_expression.h5ad")
        logger.info(f"Saved all protein expression data: {protein_adata.shape}")
    
    if paired_gene_adata is not None and paired_protein_adata is not None:
        paired_gene_adata.write_h5ad(output_dir / f"{GEO_ACCESSION}_paired_gene_expression.h5ad")
        paired_protein_adata.write_h5ad(output_dir / f"{GEO_ACCESSION}_paired_protein_expression.h5ad")
        logger.info(f"Saved paired gene expression data: {paired_gene_adata.shape}")
        logger.info(f"Saved paired protein expression data: {paired_protein_adata.shape}")

#%% [code]
def run_processing(data_dir: Union[str, Path] = None):
    """
    Entry point: configure logging and run the full GSE279914 pipeline.

    Args:
        data_dir: Optional directory to store data, given as a string or a
            Path. Defaults to "./GSE279914".
    """
    logger = setup_logger()
    
    # Normalise the argument: fill in the default first, then convert strings.
    target_dir = data_dir
    if target_dir is None:
        target_dir = Path("./GSE279914")
    if isinstance(target_dir, str):
        target_dir = Path(target_dir)
    
    for message in (
        f"Processing {GEO_ACCESSION} dataset",
        f"Data directory: {target_dir}",
    ):
        logger.info(message)
    
    process_data(target_dir, logger)
    logger.info("Processing complete")

# Run the processing pipeline only when executed directly (script or notebook
# cell, where __name__ is "__main__"), not as a side effect of importing this file.
if __name__ == "__main__":
    run_processing()
