# In [None]:
import os
import gzip
import urllib.request
import pandas as pd
import numpy as np
import scanpy as sc
from scipy import sparse
from pathlib import Path
import anndata as ad

def download_files(data_dir):
    """
    Download the dataset files if they don't exist.

    Each file is first fetched to a temporary ``<filename>.part`` path and is
    only moved to its final name once the transfer has finished. An
    interrupted or failed download therefore never leaves a truncated file
    that a later run would mistake for a complete one and skip.

    Args:
        data_dir (str): Directory to save the downloaded files

    Returns:
        bool: Always True. Download errors are not swallowed; whatever
            ``urllib.request.urlretrieve`` raises propagates to the caller.
    """
    # Create directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Define the files to download
    files = {
        "GSM8569459_HYST22_RSEC_Mols.csv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8569nnn/GSM8569459/suppl/GSM8569459_HYST22_RSEC_Mols.csv.gz",
        "GSE279408_HYST22_SampleTagCalls.csv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279408/suppl/GSE279408_HYST22_SampleTagCalls.csv.gz",
        "GSE279408_HYST22_feature_reference.csv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279408/suppl/GSE279408_HYST22_feature_reference.csv.gz"
    }

    # Download files if they don't exist
    for filename, url in files.items():
        file_path = os.path.join(data_dir, filename)
        if os.path.exists(file_path):
            print(f"{filename} already exists, skipping download")
            continue

        print(f"Downloading {filename}...")
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Same-directory rename: the final name appears only when complete.
            os.replace(partial_path, file_path)
        finally:
            # After a successful replace the .part file is gone; this only
            # removes leftovers from a failed or interrupted transfer.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Downloaded {filename}")

    return True

def read_sample_tags(data_dir):
    """
    Read sample tag information.

    The file starts with a variable number of metadata lines. They are
    scanned one at a time (without loading the whole file into memory) until
    the column-header line is found; the table is then parsed from there.

    Args:
        data_dir (str): Directory containing the dataset files

    Returns:
        pd.DataFrame: DataFrame containing sample tag information
    """
    sample_tags_file = os.path.join(data_dir, "GSE279408_HYST22_SampleTagCalls.csv.gz")

    # Find the line with the column headers. If it is never found the index
    # stays 0 and the file is parsed from its first line.
    header_line_idx = 0
    with gzip.open(sample_tags_file, 'rt') as f:
        for i, line in enumerate(f):
            if line.startswith("Cell_Index,Sample_Tag,Sample_Name"):
                header_line_idx = i
                break

    # Read the CSV data starting from the header line
    return pd.read_csv(sample_tags_file, skiprows=header_line_idx)

def read_feature_reference(data_dir):
    """
    Load the feature reference table for the dataset.

    Args:
        data_dir (str): Directory containing the dataset files

    Returns:
        pd.DataFrame: The contents of the gzipped feature reference CSV
    """
    return pd.read_csv(
        os.path.join(data_dir, "GSE279408_HYST22_feature_reference.csv.gz")
    )

def read_expression_data(data_dir):
    """
    Read expression data.

    The file begins with metadata lines starting with '#'. Their number is
    counted from the file itself and used as the ``skiprows`` value, so the
    parsed table always starts at the real header line regardless of how many
    metadata lines there are. Feature columns are then classified from the
    column names pandas actually parsed: names containing '|' are treated as
    protein features, every other column except 'Cell_Index' as a gene.

    Args:
        data_dir (str): Directory containing the dataset files

    Returns:
        tuple: (gene_data, protein_data, cell_barcodes, gene_names, protein_names)
    """
    expression_file = os.path.join(data_dir, "GSM8569459_HYST22_RSEC_Mols.csv.gz")

    # Count the leading metadata lines; the first line that does not start
    # with '#' is the header line with the feature names.
    n_metadata_lines = 0
    with gzip.open(expression_file, 'rt') as f:
        for line in f:
            if not line.startswith('#'):
                break
            n_metadata_lines += 1

    # Read the data
    print("Reading expression data...")
    df = pd.read_csv(expression_file, skiprows=n_metadata_lines)

    # Separate protein and gene features using the parsed column names so the
    # selections below can never disagree with the table's own header.
    protein_cols = [col for col in df.columns if '|' in col]
    gene_cols = [col for col in df.columns if '|' not in col and col != 'Cell_Index']

    # Extract cell barcodes
    cell_barcodes = df['Cell_Index'].values

    # Extract gene expression data
    gene_data = df[gene_cols].values

    # Extract protein expression data
    protein_data = df[protein_cols].values

    print(f"Found {len(cell_barcodes)} cells, {len(gene_cols)} genes, and {len(protein_cols)} proteins")

    return gene_data, protein_data, cell_barcodes, gene_cols, protein_cols

def process_protein_names(protein_cols):
    """
    Reduce protein column labels to their gene symbols.

    Labels are '|'-separated (e.g. CD103|ITGAE|AHS0001|pAbO); the second
    field is taken as the gene symbol. A label without any '|' is returned
    unchanged.

    Args:
        protein_cols (list): List of protein column names

    Returns:
        list: One processed name per input label, in the same order
    """
    def gene_symbol(label):
        fields = label.split('|')
        # Fewer than two fields means there is no symbol to extract.
        return fields[1] if len(fields) > 1 else label

    return [gene_symbol(label) for label in protein_cols]

def create_anndata_objects(gene_data, protein_data, cell_barcodes, gene_names, protein_names, sample_tags):
    """
    Build one AnnData object per modality (genes and proteins).

    Both objects share the same cell order and observation names, keep a
    snapshot of the counts in ``.raw``, and get 'sample' and 'tissue'
    columns in ``.obs``.

    Args:
        gene_data (np.ndarray): Gene expression data
        protein_data (np.ndarray): Protein expression data
        cell_barcodes (np.ndarray): Cell barcodes
        gene_names (list): Gene names
        protein_names (list): Protein names
        sample_tags (pd.DataFrame): Sample tag information

    Returns:
        tuple: (gene_adata, protein_adata)
    """
    def assemble(counts, feature_labels):
        # Sparse counts matrix wrapped in AnnData, labelled on both axes.
        adata = ad.AnnData(X=sparse.csr_matrix(counts))
        adata.var_names = pd.Index(feature_labels)
        adata.obs_names = pd.Index([str(bc) for bc in cell_barcodes])
        # Snapshot of the counts, taken before any obs columns are added.
        adata.raw = adata.copy()
        return adata

    def tissue_of(sample_name):
        # Substring match on the sample name (HYST22_EM or HYST22_ECX).
        if 'EM' in sample_name:
            return 'endometrium'
        if 'ECX' in sample_name:
            return 'endocervix'
        return 'Unknown'

    gene_adata = assemble(gene_data, gene_names)
    protein_adata = assemble(protein_data, process_protein_names(protein_names))

    # Barcode -> sample name lookup; barcodes missing from the sample tag
    # table are labelled 'Unknown'.
    barcode_to_sample = sample_tags.set_index('Cell_Index')['Sample_Name'].to_dict()

    for adata in (gene_adata, protein_adata):
        adata.obs['sample'] = [barcode_to_sample.get(bc, 'Unknown') for bc in cell_barcodes]
        adata.obs['tissue'] = adata.obs['sample'].apply(tissue_of)

    return gene_adata, protein_adata

def harmonize_metadata(adata, dataset_type):
    """
    Attach the standardized metadata fields to an AnnData object.

    Adds fixed-value columns to ``adata.obs`` (with 'condition' copied from
    the existing 'tissue' column) and dataset-level entries to ``adata.uns``.
    The object is modified in place and also returned.

    Args:
        adata (AnnData): AnnData object to harmonize
        dataset_type (str): Type of dataset ('gene' or 'protein')

    Returns:
        AnnData: Harmonized AnnData object
    """
    # Per-cell fields, added in a fixed column order.
    adata.obs['organism'] = 'Homo sapiens'
    adata.obs['condition'] = adata.obs['tissue']  # tissue doubles as condition
    for absent_field in ('perturbation_name', 'crispr_type'):
        # Neither perturbations nor CRISPR are part of this dataset.
        adata.obs[absent_field] = 'None'
    adata.obs['cancer_type'] = 'Non-Cancer'

    # Dataset-level fields.
    dataset_fields = (
        ('dataset_id', 'GSE279408'),
        ('dataset_name', 'Aging modifies endometrial dendritic cell function and unconventional double negative T cells in the human genital mucosa'),
        ('dataset_type', dataset_type),
        ('dataset_description', 'Single-cell RNA-seq and CITE-seq data from human endometrium and endocervix samples'),
    )
    for key, value in dataset_fields:
        adata.uns[key] = value

    # Placeholder until cells are annotated.
    adata.obs['cell_type'] = 'Unknown'

    return adata

def filter_paired_data(gene_adata, protein_adata):
    """
    Filter data to keep only cells that have both gene and protein expression data.

    The shared barcodes are kept in the order in which they appear in
    ``gene_adata``, so the row order of both results is deterministic and
    identical between the two objects. Independent copies are returned
    rather than views, so later in-place edits do not act on a view of the
    unfiltered input.

    Args:
        gene_adata (AnnData): Gene expression data
        protein_adata (AnnData): Protein expression data

    Returns:
        tuple: (filtered_gene_adata, filtered_protein_adata)
    """
    # Get common cell barcodes. A set gives fast membership tests;
    # dict.fromkeys drops repeats while preserving first-seen order.
    protein_barcodes = set(protein_adata.obs_names)
    common_barcodes = list(dict.fromkeys(
        bc for bc in gene_adata.obs_names if bc in protein_barcodes
    ))
    print(f"Found {len(common_barcodes)} cells with both gene and protein expression data")

    # Filter data
    gene_adata = gene_adata[common_barcodes, :].copy()
    protein_adata = protein_adata[common_barcodes, :].copy()

    return gene_adata, protein_adata

def check_duplicate_genes(adata):
    """
    Ensure the variable names of an AnnData object are unique.

    When repeats are present, the number of surplus names is reported and
    ``adata.var_names_make_unique()`` is called; otherwise the object is
    left untouched.

    Args:
        adata (AnnData): AnnData object to check

    Returns:
        AnnData: The same object that was passed in
    """
    n_total = len(adata.var_names)
    n_distinct = len(set(adata.var_names))

    # Nothing to do when every name already occurs exactly once.
    if n_total == n_distinct:
        return adata

    print(f"Found {n_total - n_distinct} duplicate gene names")
    adata.var_names_make_unique()
    print("Made gene names unique")

    return adata

def main(data_dir):
    """
    Run the full pipeline: download, parse, pair, harmonize and save.

    The two resulting AnnData objects are written as gzip-compressed .h5ad
    files into a 'processed' subdirectory of ``data_dir``.

    Args:
        data_dir (str): Directory containing the dataset files
    """
    # Fetch whatever input files are missing.
    download_files(data_dir)

    # Sample tag calls.
    sample_tags = read_sample_tags(data_dir)
    print(f"Found {len(sample_tags)} sample tag entries")

    # Feature reference table (only its size is reported).
    feature_ref = read_feature_reference(data_dir)
    print(f"Found {len(feature_ref)} feature reference entries")

    # The reader returns (gene_data, protein_data, cell_barcodes, gene_names,
    # protein_names), which is the leading argument order of the builder.
    expression = read_expression_data(data_dir)
    gene_adata, protein_adata = create_anndata_objects(*expression, sample_tags)

    # Restrict both modalities to the cells they share.
    gene_adata, protein_adata = filter_paired_data(gene_adata, protein_adata)

    # Make feature names unique, then attach the standard metadata.
    gene_adata = check_duplicate_genes(gene_adata)
    protein_adata = check_duplicate_genes(protein_adata)
    gene_adata = harmonize_metadata(gene_adata, 'gene_expression')
    protein_adata = harmonize_metadata(protein_adata, 'protein_expression')

    # Write the results.
    output_dir = os.path.join(data_dir, 'processed')
    os.makedirs(output_dir, exist_ok=True)
    gene_output_path = os.path.join(output_dir, 'gene_expression.h5ad')
    protein_output_path = os.path.join(output_dir, 'protein_expression.h5ad')

    print(f"Saving gene expression data to {gene_output_path}")
    gene_adata.write(gene_output_path, compression='gzip')

    print(f"Saving protein expression data to {protein_output_path}")
    protein_adata.write(protein_output_path, compression='gzip')

    print("Processing complete!")

    # Final report.
    n_gene_cells, n_genes = gene_adata.shape[0], gene_adata.shape[1]
    n_protein_cells, n_proteins = protein_adata.shape[0], protein_adata.shape[1]
    print("\nSummary:")
    print(f"Gene expression data: {n_gene_cells} cells, {n_genes} genes")
    print(f"Protein expression data: {n_protein_cells} cells, {n_proteins} proteins")
    print(f"Output files: {gene_output_path}, {protein_output_path}")

# Notebook entry point: executing this cell runs the whole pipeline at once.
# main() downloads any missing input files into data_dir and writes the two
# .h5ad outputs to a 'processed' subdirectory of it.
data_dir = "GSE279408"  # Directory for downloads and outputs; change if needed
main(data_dir)
