# In [None]:  (Jupyter notebook cell marker left over from export)
import os
import sys
import gzip
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from scipy import sparse
import urllib.request
import tarfile
import re
from pathlib import Path

# Seed NumPy's global random number generator so that any NumPy-based
# randomness executed after this point is repeatable between runs.
# NOTE(review): none of the functions visible in this file draw random
# numbers — confirm whether the seed is still needed.
np.random.seed(42)

def download_files(data_dir):
    """
    Download the GSE279774 input files and unpack the RAW archive.

    Files that already exist in ``data_dir`` are not downloaded again, and
    archive members that already exist on disk are not extracted again.

    Parameters:
    -----------
    data_dir : str
        Path to the data directory. It is created if it does not exist.

    Returns:
    --------
    None

    Raises:
    -------
    urllib.error.URLError
        Propagated from ``urllib.request.urlretrieve`` when a download fails.
    tarfile.TarError
        Propagated when the RAW archive cannot be read or a member is rejected.
    """
    # Define files to download
    files = {
        "GSE279774_HYST35_SampleTagCalls.csv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279774/suppl/GSE279774_HYST35_SampleTagCalls.csv.gz",
        "GSE279774_HYST35_feature_reference.csv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279774/suppl/GSE279774_HYST35_feature_reference.csv.gz",
        "GSE279774_RAW.tar": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279774/suppl/GSE279774_RAW.tar",
        "GSM8579964_HYST35_RSEC_mols.csv.gz": "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM8579nnn/GSM8579964/suppl/GSM8579964_HYST35_RSEC_mols.csv.gz"
    }

    # urlretrieve does not create missing parent directories
    os.makedirs(data_dir, exist_ok=True)

    # Download files if they don't exist
    for filename, url in files.items():
        file_path = os.path.join(data_dir, filename)
        if os.path.exists(file_path):
            print(f"File {filename} already exists")
            continue

        print(f"Downloading {filename}...")
        # Download to a temporary name first so that an interrupted transfer
        # never leaves a truncated file that a later run would mistake for a
        # complete one.
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Downloaded {filename}")

    # Extract tar file if needed
    tar_file = os.path.join(data_dir, "GSE279774_RAW.tar")
    if os.path.exists(tar_file):
        with tarfile.open(tar_file) as tar:
            # Decide per member instead of guessing from file extensions:
            # only members that are not on disk yet get extracted.
            missing = [
                member for member in tar.getmembers()
                if not os.path.exists(os.path.join(data_dir, member.name))
            ]
            if missing:
                print("Extracting tar file...")
                # Use the 'data' extraction filter where the running Python
                # provides it, so members cannot escape data_dir.
                extract_kwargs = (
                    {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
                )
                tar.extractall(path=data_dir, members=missing, **extract_kwargs)
                print("Extracted tar file")

def read_csv_with_headers(file_path, compression='gzip'):
    """
    Read a CSV file that may start with '#' comment lines.

    Leading lines starting with '#' are skipped, the next line is used as the
    header, and all remaining non-blank lines become data rows. Fields are
    parsed with the ``csv`` module, so quoted fields may contain commas. No
    type inference is performed: every cell is returned as a string.

    Parameters:
    -----------
    file_path : str
        Path to the CSV file
    compression : str
        Compression type. 'gzip' opens the file with ``gzip``; any other
        value (e.g. None) opens it as plain text.

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing the CSV data, with string-valued cells
    """
    # Local import: csv is not among the module-level imports of this file.
    import csv

    # Both variants are parsed identically; only the opener differs.
    opener = gzip.open if compression == 'gzip' else open

    with opener(file_path, 'rt') as f:
        # Skip comment lines
        line = f.readline()
        while line.startswith('#'):
            line = f.readline()

        # Read header; an empty file yields a single unnamed column
        header = next(csv.reader([line.strip()]), None) or ['']

        # Read data; blank lines parse to an empty row and are dropped
        data = [row for row in csv.reader(raw.strip() for raw in f) if row]

    # Create DataFrame
    return pd.DataFrame(data, columns=header)

def process_gene_expression_data(data_dir):
    """
    Load the per-cell molecule count table and build the gene AnnData object.

    The table is read from ``GSM8579964_HYST35_RSEC_mols.csv.gz`` inside
    ``data_dir``. Columns whose name contains '|' are treated as protein
    columns; every other column except ``Cell_Index`` is treated as a gene.

    Parameters:
    -----------
    data_dir : str
        Path to the data directory

    Returns:
    --------
    adata_gene : anndata.AnnData
        Gene expression data (cells x genes) stored as a sparse CSR matrix
    protein_cols : list
        Names of the columns of ``df`` that contain '|'
    df : pandas.DataFrame
        The full table as read from disk, including ``Cell_Index``
    """
    print("Processing gene expression data...")

    # Read gene expression data
    gene_expr_file = os.path.join(data_dir, "GSM8579964_HYST35_RSEC_mols.csv.gz")

    # Count every leading '#' line, however many there are, so that the
    # header row is always the first row pandas parses.
    skip_rows = 0
    with gzip.open(gene_expr_file, 'rt') as f:
        for line in f:
            if not line.strip().startswith('#'):
                break
            skip_rows += 1

    # Read the data
    df = pd.read_csv(gene_expr_file, skiprows=skip_rows)

    # Extract cell barcodes
    # NOTE(review): AnnData keeps obs names as strings, so numeric cell
    # indices presumably end up as their string form — confirm.
    cell_barcodes = df['Cell_Index'].values

    # Separate gene and protein columns ('Cell_Index' contains no '|', so it
    # only needs to be excluded from the gene list)
    gene_cols = [col for col in df.columns if '|' not in col and col != 'Cell_Index']
    protein_cols = [col for col in df.columns if '|' in col]

    # Create gene expression matrix and convert to sparse matrix
    gene_expr_sparse = sparse.csr_matrix(df[gene_cols].values)

    # Create AnnData object for gene expression
    adata_gene = ad.AnnData(
        X=gene_expr_sparse,
        obs=pd.DataFrame(index=cell_barcodes),
        var=pd.DataFrame(index=gene_cols)
    )

    # Add metadata
    adata_gene.uns['dataset'] = 'GSE279774'
    adata_gene.uns['description'] = 'Multi-omic analysis of dendritic cell populations in the female genital tract'

    print(f"Gene expression data: {adata_gene.shape[0]} cells, {adata_gene.shape[1]} genes")

    return adata_gene, protein_cols, df

def process_protein_expression_data(df, protein_cols, cell_barcodes):
    """
    Build the protein AnnData object from the protein columns of ``df``.

    Each protein column name is split on '|' into up to four fields
    (protein name, gene name, antibody id, antibody type); fields that are
    absent are stored as empty strings.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the expression data
    protein_cols : list
        List of protein column names
    cell_barcodes : list
        List of cell barcodes

    Returns:
    --------
    anndata.AnnData
        AnnData object containing protein expression data
    """
    print("Processing protein expression data...")

    # Sparse cells x proteins matrix taken straight from the table
    counts = sparse.csr_matrix(df[protein_cols].values)

    # Order in which the '|'-separated fields appear in a column name
    field_names = ('protein_name', 'gene_name', 'antibody_id', 'antibody_type')

    protein_info = []
    for column in protein_cols:
        pieces = column.split('|')
        # Pad with "" so missing trailing fields are empty; ignore extras
        padded = (pieces + [""] * 3)[:4]
        protein_info.append(dict(zip(field_names, padded)))

    protein_metadata = pd.DataFrame(protein_info)

    # Re-index the metadata by the full column names
    var_table = pd.DataFrame(
        protein_metadata.values,
        index=protein_cols,
        columns=protein_metadata.columns,
    )
    obs_table = pd.DataFrame(index=cell_barcodes)

    adata_protein = ad.AnnData(X=counts, obs=obs_table, var=var_table)

    # Dataset-level metadata
    adata_protein.uns['dataset'] = 'GSE279774'
    adata_protein.uns['description'] = 'Multi-omic analysis of dendritic cell populations in the female genital tract - Protein Expression'

    n_cells, n_proteins = adata_protein.shape[0], adata_protein.shape[1]
    print(f"Protein expression data: {n_cells} cells, {n_proteins} proteins")

    return adata_protein

def process_sample_tags(data_dir, adata_gene, adata_protein):
    """
    Process sample tag information and add to AnnData objects.

    Reads ``GSE279774_HYST35_SampleTagCalls.csv.gz`` from ``data_dir`` and
    adds two columns to ``.obs`` of both objects, in place:

    * ``sample`` – the ``Sample_Name`` of the cell, or "Unknown" when the
      cell is not listed or has no sample name
    * ``tissue`` – derived from substrings of the sample name:
      'ECX' -> Ectocervix, 'END' -> Endocervix, 'EM' -> Endometrium,
      otherwise "Unknown"

    Parameters:
    -----------
    data_dir : str
        Path to the data directory
    adata_gene : anndata.AnnData
        AnnData object containing gene expression data
    adata_protein : anndata.AnnData
        AnnData object containing protein expression data

    Returns:
    --------
    anndata.AnnData, anndata.AnnData
        The same two objects that were passed in, updated in place
    """
    print("Processing sample tag information...")

    # Read sample tag data
    sample_tag_file = os.path.join(data_dir, "GSE279774_HYST35_SampleTagCalls.csv.gz")

    # Skip header lines that start with #
    with gzip.open(sample_tag_file, 'rt') as f:
        skip_rows = 0
        for line in f:
            if line.startswith('#'):
                skip_rows += 1
            else:
                break

    # Read the data
    sample_tags = pd.read_csv(sample_tag_file, skiprows=skip_rows)

    # Map cell indices to sample names. Keys are normalised to str because
    # pandas parses numeric Cell_Index values as integers while AnnData obs
    # names are strings; without this every lookup misses.
    cell_to_sample = dict(zip(
        sample_tags['Cell_Index'].astype(str),
        sample_tags['Sample_Name'].fillna("Unknown").astype(str),
    ))

    # Checked in this order; the first marker found in the name wins
    tissue_markers = (
        ('ECX', 'Ectocervix'),
        ('END', 'Endocervix'),
        ('EM', 'Endometrium'),
    )

    def tissue_of(sample):
        for marker, tissue in tissue_markers:
            if marker in sample:
                return tissue
        return 'Unknown'

    # Add sample information to AnnData objects
    for adata in [adata_gene, adata_protein]:
        sample_info = [cell_to_sample.get(str(cell), "Unknown") for cell in adata.obs.index]
        adata.obs['sample'] = sample_info

        # Extract tissue information from sample names
        adata.obs['tissue'] = [tissue_of(sample) for sample in sample_info]

    return adata_gene, adata_protein

def harmonize_data(adata_gene, adata_protein):
    """
    Attach the standard annotation columns and de-duplicate gene names.

    The same constant value is written for every cell in ``.obs`` of both
    objects. If the gene object has repeated variable names, they are made
    unique in place.

    Parameters:
    -----------
    adata_gene : anndata.AnnData
        AnnData object containing gene expression data
    adata_protein : anndata.AnnData
        AnnData object containing protein expression data

    Returns:
    --------
    anndata.AnnData, anndata.AnnData
        Harmonized AnnData objects (the inputs, modified in place)
    """
    print("Harmonizing data...")

    # Column name -> constant value. Per the original author's notes these
    # are homeostatic, non-cancer dendritic cells from the female genital
    # tract, with no CRISPR or other perturbation applied.
    standard_annotations = (
        ('organism', 'Homo sapiens'),
        ('cell_type', 'Dendritic cells'),
        ('crispr_type', 'None'),
        ('cancer_type', 'Non-Cancer'),
        ('condition', 'Homeostatic'),
        ('perturbation_name', 'None'),
    )

    for dataset in (adata_gene, adata_protein):
        for column, value in standard_annotations:
            dataset.obs[column] = value

    # Gene names are left as they are (the original author's note says they
    # are already gene symbols); only repeated names are dealt with here.
    gene_names = adata_gene.var_names
    has_duplicates = len(set(gene_names)) != len(gene_names)
    if has_duplicates:
        print("Warning: Duplicate gene names found. Resolving...")
        adata_gene.var_names_make_unique()

    return adata_gene, adata_protein

def filter_paired_data(adata_gene, adata_protein):
    """
    Keep only the cells present in both objects, in matching order.

    Parameters:
    -----------
    adata_gene : anndata.AnnData
        AnnData object containing gene expression data
    adata_protein : anndata.AnnData
        AnnData object containing protein expression data

    Returns:
    --------
    anndata.AnnData, anndata.AnnData
        Filtered copies; the protein object's cells follow the order of the
        gene object's cells
    """
    print("Filtering paired data...")

    gene_index = adata_gene.obs.index
    protein_index = adata_protein.obs.index

    # Barcodes found in both modalities
    shared = np.intersect1d(gene_index, protein_index)

    print(f"Common cells: {len(shared)} out of {adata_gene.shape[0]} gene expression cells and {adata_protein.shape[0]} protein expression cells")

    # Subset each object with a boolean mask over its own cells
    gene_subset = adata_gene[gene_index.isin(shared)].copy()
    protein_subset = adata_protein[protein_index.isin(shared)].copy()

    # Reorder the protein cells to follow the gene object's cell order
    protein_subset = protein_subset[gene_subset.obs.index].copy()

    return gene_subset, protein_subset

def main(data_dir=None):
    """
    Run the full GSE279774 pipeline: download, parse, annotate, filter, save.

    The two resulting AnnData objects are written as ``.h5ad`` files into a
    ``harmonized`` sub-directory of ``data_dir``.

    Parameters:
    -----------
    data_dir : str, optional
        Path to the data directory. Defaults to the current working directory.
    """
    data_dir = os.getcwd() if data_dir is None else data_dir

    print(f"Processing GSE279774 dataset in directory: {data_dir}")

    # Fetch any inputs that are not on disk yet
    download_files(data_dir)

    # Build the gene and protein objects from the same table
    adata_gene, protein_cols, df = process_gene_expression_data(data_dir)
    barcodes = df['Cell_Index'].values
    adata_protein = process_protein_expression_data(df, protein_cols, barcodes)

    # Annotate cells with sample/tissue, then keep cells present in both
    adata_gene, adata_protein = process_sample_tags(data_dir, adata_gene, adata_protein)
    adata_gene, adata_protein = filter_paired_data(adata_gene, adata_protein)

    # Shorten protein names to the part before the first '|'; shortening can
    # create repeats, so make the names unique afterwards
    short_names = [full_name.split('|')[0] for full_name in adata_protein.var_names]
    adata_protein.var_names = short_names
    adata_protein.var_names_make_unique()

    # Add the standard annotation columns
    adata_gene, adata_protein = harmonize_data(adata_gene, adata_protein)

    # Write both objects next to the inputs
    output_dir = os.path.join(data_dir, "harmonized")
    os.makedirs(output_dir, exist_ok=True)
    gene_path = os.path.join(output_dir, "GSE279774_gene_expression.h5ad")
    protein_path = os.path.join(output_dir, "GSE279774_protein_expression.h5ad")

    adata_gene.write_h5ad(gene_path)
    adata_protein.write_h5ad(protein_path)

    print("Data processing and harmonization complete.")
    print(f"Gene expression data saved to: {gene_path}")
    print(f"Protein expression data saved to: {protein_path}")

# Run the pipeline as soon as this cell/script is executed. Called without an
# argument, main() works in the current working directory; pass a path to use
# a different data directory.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this file as a module also starts the pipeline — confirm that is intended.
main()  # or use main('/path/to/your/data_dir') if you want to specify a directory
