# In [None]:
import os
import glob
import gzip
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from scipy import sparse
import requests
import tarfile
from tqdm import tqdm
import re

# Seed NumPy's global random number generator so that any NumPy-based
# randomness used later in this session is reproducible between runs.
np.random.seed(42)

def download_data(data_dir, url="https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE279775&format=file"):
    """
    Download the GSE279775 dataset if not already present, then extract it.

    The archive is streamed to a temporary ``.part`` file and only renamed to
    its final name once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated ``GSE279775_RAW.tar`` behind
    that a later run would mistake for a complete archive.

    Args:
        data_dir: Directory to save the data (created if it does not exist)
        url: URL to download the data from

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    os.makedirs(data_dir, exist_ok=True)
    tar_file = os.path.join(data_dir, "GSE279775_RAW.tar")

    def extract_archive():
        # NOTE(review): members are extracted without any path filtering;
        # consider tarfile extraction filters if the source is not trusted.
        with tarfile.open(tar_file) as tar:
            tar.extractall(path=data_dir)
        print("Extraction complete.")

    if os.path.exists(tar_file):
        print(f"Found existing data at {tar_file}")
        # Check if files are already extracted
        if glob.glob(os.path.join(data_dir, "GSM*.csv.gz")):
            print("Files already extracted.")
        else:
            print("Extracting files...")
            extract_archive()
        return

    print(f"Downloading GSE279775 dataset to {tar_file}...")
    partial_file = tar_file + ".part"
    try:
        # (connect timeout, read timeout) in seconds so a stalled server
        # cannot hang the pipeline forever.
        with requests.get(url, stream=True, timeout=(10, 300)) as response:
            # Fail loudly instead of saving an HTML error page as the archive.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(partial_file, 'wb') as f, \
                    tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Only a fully transferred file gets the final archive name.
        os.replace(partial_file, tar_file)
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete download.
        if os.path.exists(partial_file):
            os.remove(partial_file)

    print("Download complete. Extracting files...")
    extract_archive()

def read_feature_reference(file_path):
    """
    Load the gzip-compressed feature reference CSV (protein information).

    Args:
        file_path: Path to the feature reference file

    Returns:
        DataFrame with one row per feature, exactly as parsed from the CSV
    """
    file_name = os.path.basename(file_path)
    print(f"Reading feature reference from {file_name}...")

    # Decompress as text and let pandas parse the open stream.
    with gzip.open(file_path, mode='rt') as handle:
        table = pd.read_csv(handle)

    n_features = len(table)
    print(f"Found {n_features} features")
    return table

def read_sample_tags(file_path):
    """
    Read the sample tag file to get cell metadata.

    Any lines preceding the table are skipped; the table is taken to start at
    the first line that begins with ``Cell_Index``.

    Args:
        file_path: Path to the gzip-compressed sample tag file

    Returns:
        DataFrame with cell metadata

    Raises:
        ValueError: If no line starting with ``Cell_Index`` is found.
    """
    print(f"Reading sample tags from {os.path.basename(file_path)}...")

    # Locate the 1-based line number of the column header.
    header_lines = None
    with gzip.open(file_path, 'rt') as f:
        for line_number, line in enumerate(f, start=1):
            if line.startswith("Cell_Index"):
                header_lines = line_number
                break

    if header_lines is None:
        # Without this check the last line of the file would silently be
        # treated as the header and an empty table returned.
        raise ValueError(
            f"No 'Cell_Index' header line found in {file_path}"
        )

    print(f"Found header at line {header_lines}")

    # Skip everything before the header line; the header itself is kept so
    # pandas uses it for the column names. Compression is stated explicitly to
    # match the gzip scan above instead of relying on the file extension.
    sample_tags = pd.read_csv(file_path, compression='gzip', skiprows=header_lines - 1)

    print(f"Found {len(sample_tags)} sample tags")
    return sample_tags

def read_expression_data(file_path):
    """
    Read the expression data file.

    The table is taken to start at the first line beginning with
    ``Cell_Index``. Columns whose header contains at least three ``|``
    characters are treated as proteins; every other column except the first
    (the cell index) is treated as a gene.

    Args:
        file_path: Path to the gzip-compressed expression data file

    Returns:
        Tuple of (gene_expression_matrix, protein_expression_matrix, gene_names, protein_names, cell_indices)

    Raises:
        ValueError: If no ``Cell_Index`` header line is found or the file
            contains no cell rows after the header.
    """
    print(f"Reading expression data from {os.path.basename(file_path)}...")

    # Locate the header line (1-based line number) and keep its text.
    header_lines = 0
    header_line = None
    with gzip.open(file_path, 'rt') as f:
        for line_number, line in enumerate(f, start=1):
            if line.startswith("Cell_Index"):
                header_lines = line_number
                header_line = line
                break

    if header_line is None:
        raise ValueError(f"No 'Cell_Index' header line found in {file_path}")

    print(f"Found header at line {header_lines}")

    # Read the header to get gene and protein names
    header_parts = header_line.strip().split(',')

    # Identify protein columns (they have a specific format with | characters)
    protein_pattern = re.compile(r'.*\|.*\|.*\|.*')
    protein_indices = [i for i, col in enumerate(header_parts) if protein_pattern.match(col)]

    # The rest are gene columns (excluding the first Cell_Index column).
    # A set keeps the membership test constant-time for wide tables.
    protein_index_set = set(protein_indices)
    gene_indices = [i for i in range(1, len(header_parts)) if i not in protein_index_set]

    # Get protein and gene names:
    # First, split on "|" to get the primary protein name,
    # then split on ":" to remove any trailing parts.
    protein_names = [header_parts[i].split('|')[0].split(':')[0] for i in protein_indices]
    gene_names = [header_parts[i] for i in gene_indices]

    print(f"Found {len(protein_names)} proteins and {len(gene_names)} genes")

    # Read the data in chunks to handle large files.
    # `skiprows=header_lines` skips everything up to and including the header
    # line, so `header=None` is required: otherwise pandas would consume the
    # first data row as column names and that cell would be lost.
    print("Reading data in chunks...")
    chunks = pd.read_csv(
        file_path,
        compression='gzip',
        skiprows=header_lines,
        header=None,
        chunksize=1000,
    )

    # Initialize empty lists to store data
    cell_indices = []
    gene_data = []
    protein_data = []

    for i, chunk in enumerate(chunks):
        if i % 10 == 0:
            print(f"Processing chunk {i}...")

        # Column 0 holds the cell index; the others are selected by position.
        cell_indices.extend(chunk.iloc[:, 0].values)
        gene_data.append(chunk.iloc[:, gene_indices].values)
        protein_data.append(chunk.iloc[:, protein_indices].values)

    if not gene_data:
        raise ValueError(f"No cell rows found after the header in {file_path}")

    # Combine chunks
    print("Combining chunks...")
    gene_expression = np.vstack(gene_data)
    protein_expression = np.vstack(protein_data)

    print(f"Processed {len(cell_indices)} cells")

    return gene_expression, protein_expression, gene_names, protein_names, cell_indices

def _infer_tissue(tag):
    """Return the tissue label encoded in a sample-tag name, or 'Unknown'."""
    # A missing sample name may arrive as NaN (a float) rather than a string,
    # and substring tests on it would raise TypeError.
    if not isinstance(tag, str):
        return 'Unknown'
    if 'ECX' in tag:
        return 'Ectocervix'
    if 'END' in tag:
        return 'Endocervix'
    if 'EM' in tag:
        return 'Endometrium'
    return 'Unknown'


def process_dataset(data_dir):
    """
    Process the GSE279775 dataset.

    For every ``*RSEC_mols.csv.gz`` file in ``data_dir`` the matching
    ``*SampleTagCalls.csv.gz`` and ``*feature_reference.csv.gz`` files are
    looked up by sample ID (the file name up to ``_RSEC``). Samples lacking
    either companion file are skipped with a warning. Each remaining sample is
    turned into one gene and one protein AnnData object sharing the same cell
    metadata, and all samples are concatenated.

    Args:
        data_dir: Directory containing the data

    Returns:
        Tuple of (gene_anndata, protein_anndata)

    Raises:
        ValueError: If no sample in ``data_dir`` could be processed.
    """
    # glob returns files in arbitrary order; sorting makes the order of
    # samples (and therefore of cells in the result) deterministic.
    rsec_files = sorted(glob.glob(os.path.join(data_dir, "*RSEC_mols.csv.gz")))
    tag_files = sorted(glob.glob(os.path.join(data_dir, "*SampleTagCalls.csv.gz")))
    feature_files = sorted(glob.glob(os.path.join(data_dir, "*feature_reference.csv.gz")))

    all_gene_adatas = []
    all_protein_adatas = []

    # Process each sample
    for rsec_file in rsec_files:
        sample_id = os.path.basename(rsec_file).split('_RSEC')[0]
        print(f"\nProcessing sample: {sample_id}")

        # Match on the file name only so that directory names containing the
        # sample ID cannot produce a false match.
        tag_file = next((f for f in tag_files if sample_id in os.path.basename(f)), None)
        feature_file = next((f for f in feature_files if sample_id in os.path.basename(f)), None)

        if not tag_file or not feature_file:
            print(f"Warning: Missing metadata files for {sample_id}")
            continue

        # Read data
        gene_expr, protein_expr, gene_names, protein_names, cell_indices = read_expression_data(rsec_file)
        sample_tags = read_sample_tags(tag_file)
        feature_ref = read_feature_reference(feature_file)

        # Observation names combine sample ID and cell index so they stay
        # unique after all samples are concatenated.
        unique_cell_ids = [f"{sample_id}_{idx}" for idx in cell_indices]
        cell_metadata = pd.DataFrame(index=unique_cell_ids)

        # Look up each cell's sample name; cells absent from the tag file
        # are labelled 'Unknown'.
        sample_tags_dict = dict(zip(sample_tags['Cell_Index'], sample_tags['Sample_Name']))
        sample_names = [sample_tags_dict.get(idx, 'Unknown') for idx in cell_indices]
        cell_metadata['sample'] = sample_names

        # donor_id is the part of the sample ID before the first underscore;
        # any sample whose ID does not contain 'Control' is labelled 'HIV'.
        cell_metadata['donor_id'] = sample_id.split('_')[0]
        cell_metadata['condition'] = 'Control' if 'Control' in sample_id else 'HIV'

        # Tissue is derived from the per-cell sample name.
        cell_metadata['tissue'] = [_infer_tissue(name) for name in sample_names]

        # Protein annotations: copy Sequence / Feature_Type from the first
        # feature-reference row whose Name equals the protein name. The
        # reference is indexed once instead of being rescanned per protein.
        protein_var = pd.DataFrame(index=protein_names)
        if protein_names:
            reference = feature_ref.drop_duplicates(subset='Name').set_index('Name')
            for protein in protein_names:
                if protein in reference.index:
                    protein_var.loc[protein, 'Sequence'] = reference.at[protein, 'Sequence']
                    protein_var.loc[protein, 'Feature_Type'] = reference.at[protein, 'Feature_Type']

        # Create AnnData objects using the unique cell IDs
        gene_adata = ad.AnnData(
            X=sparse.csr_matrix(gene_expr),
            obs=cell_metadata.copy(),
            var=pd.DataFrame(index=gene_names)
        )

        protein_adata = ad.AnnData(
            X=sparse.csr_matrix(protein_expr),
            obs=cell_metadata.copy(),
            var=protein_var
        )

        all_gene_adatas.append(gene_adata)
        all_protein_adatas.append(protein_adata)

    if not all_gene_adatas:
        raise ValueError(
            f"No sample with expression, sample-tag and feature-reference files found in {data_dir}"
        )

    # Combine all samples
    combined_gene_adata = ad.concat(all_gene_adatas, join='outer', merge='same')
    combined_protein_adata = ad.concat(all_protein_adatas, join='outer', merge='same')

    return combined_gene_adata, combined_protein_adata

def _add_standard_obs(adata):
    """Add the fixed, dataset-wide annotation columns to ``adata.obs`` in place."""
    adata.obs['organism'] = 'Homo sapiens'
    adata.obs['cell_type'] = 'Dendritic cells'
    adata.obs['crispr_type'] = 'None'
    adata.obs['cancer_type'] = 'Non-Cancer'

    # Ensure 'condition' is a string
    adata.obs['condition'] = adata.obs['condition'].astype(str)

    # Add perturbation name based on condition
    adata.obs['perturbation_name'] = adata.obs['condition'].apply(
        lambda x: 'HIV-1' if x == 'HIV' else 'None'
    )


def harmonize_data(gene_adata, protein_adata):
    """
    Harmonize the data according to specified standards.

    Both objects are restricted to the cells present in both of them, get the
    same set of standard ``obs`` columns (organism, cell_type, crispr_type,
    cancer_type, perturbation_name) and have their variable names made unique.
    The inputs are not modified.

    Args:
        gene_adata: AnnData object with gene expression data
        protein_adata: AnnData object with protein expression data

    Returns:
        Tuple of (harmonized_gene_adata, harmonized_protein_adata)
    """
    # Filter for cells that are in both gene and protein data. Subsetting
    # yields a view; copying it right away gives independent objects, leaves
    # the inputs untouched, and avoids copying cells that are dropped anyway.
    common_cells = gene_adata.obs_names.intersection(protein_adata.obs_names)
    gene_adata = gene_adata[common_cells].copy()
    protein_adata = protein_adata[common_cells].copy()

    # Add standardized metadata fields
    for adata in (gene_adata, protein_adata):
        _add_standard_obs(adata)

    # Make variable names unique if needed
    if gene_adata.var_names.duplicated().any():
        print("Found duplicate gene names. Making them unique...")
        gene_adata.var_names_make_unique()
    if protein_adata.var_names.duplicated().any():
        print("Found duplicate protein names. Making them unique...")
        protein_adata.var_names_make_unique()

    return gene_adata, protein_adata

def main(data_dir="GSE279775"):
    """
    Execute every pipeline stage in order: download, process, harmonize, save.

    The two harmonized objects are written as ``.h5ad`` files into
    ``data_dir`` and a short summary is printed.

    Args:
        data_dir: Directory to store or locate the dataset

    Returns:
        Tuple of (harmonized gene AnnData, harmonized protein AnnData)
    """
    # Stage 1: make sure the raw files are available locally.
    download_data(data_dir)

    # Stage 2 and 3: build the combined objects, then harmonize them.
    raw_gene, raw_protein = process_dataset(data_dir)
    gene_adata, protein_adata = harmonize_data(raw_gene, raw_protein)

    # Stage 4: persist both modalities, gene expression first.
    outputs = (
        (gene_adata, "GSE279775_gene_expression.h5ad"),
        (protein_adata, "GSE279775_protein_expression.h5ad"),
    )
    destinations = [os.path.join(data_dir, file_name) for _, file_name in outputs]
    for (adata, _), destination in zip(outputs, destinations):
        adata.write_h5ad(destination)

    print(f"\nGene expression data shape: {gene_adata.shape}")
    print(f"Protein expression data shape: {protein_adata.shape}")
    print(f"Harmonized data saved to {data_dir}")

    # Per-modality summary.
    gene_obs = gene_adata.obs
    print("\nGene expression data summary:")
    print(f"Number of cells: {gene_adata.n_obs}")
    print(f"Number of genes: {gene_adata.n_vars}")
    print(f"Conditions: {gene_obs['condition'].unique()}")
    print(f"Tissues: {gene_obs['tissue'].unique()}")

    print("\nProtein expression data summary:")
    print(f"Number of cells: {protein_adata.n_obs}")
    print(f"Number of proteins: {protein_adata.n_vars}")

    return gene_adata, protein_adata

# Run the full pipeline with the default data directory ("GSE279775") and keep
# the two harmonized AnnData objects as notebook-level variables.
gene_adata, protein_adata = main()
