# In [None]:
import os
import gzip
import urllib.request
from pathlib import Path
import numpy as np
import pandas as pd
import scipy.sparse as sp
import anndata as ad

# Base URL that every supplementary file name below is appended to
GEO_URL_BASE = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE277nnn/GSE277081/suppl/"

# One barcodes/features/matrix file per sample, grouped by file kind:
# GSE277081_<kind>-<sample>.<extension>.gz
FILES_TO_DOWNLOAD = [
    f"GSE277081_{kind}-{sample}.{extension}.gz"
    for kind, extension in (
        ("barcodes", "tsv"),
        ("features", "tsv"),
        ("matrix", "mtx"),
    )
    for sample in ("Aarm", "Aflu", "Barm", "Bflu")
]

def download_data(data_dir):
    """
    Download the dataset files if they don't exist.

    Each file is first written to ``<file name>.part`` and only renamed to
    its final name once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file behind that a later run
    would skip as "already exists".

    Args:
        data_dir (Path): Directory to download the data to

    Returns:
        Path: The ``raw_data`` sub-directory of ``data_dir`` holding the files

    Raises:
        Exception: Whatever ``urllib.request.urlretrieve`` raises for a failed
            transfer is propagated unchanged.
    """
    raw_data_dir = data_dir / "raw_data"
    raw_data_dir.mkdir(exist_ok=True, parents=True)

    for file_name in FILES_TO_DOWNLOAD:
        file_path = raw_data_dir / file_name
        if file_path.exists():
            print(f"File {file_path} already exists, skipping download")
            continue

        url = f"{GEO_URL_BASE}{file_name}"
        print(f"Downloading {url} to {file_path}")

        partial_path = raw_data_dir / f"{file_name}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Publish under the final name only after a complete transfer
            partial_path.replace(file_path)
        finally:
            # Only still present if the transfer or the rename failed
            if partial_path.exists():
                partial_path.unlink()

    return raw_data_dir

def read_10x_mtx(matrix_file, features_file, barcodes_file):
    """
    Read 10x data from gzipped mtx, features, and barcodes files.

    Args:
        matrix_file (str): Path to the gzipped Matrix Market coordinate file
        features_file (str): Path to the gzipped, tab-separated features file.
            Columns are id, name and (optionally) feature type; rows without
            a third column are given the type ``'Gene Expression'``.
        barcodes_file (str): Path to the gzipped barcodes file (one per line)

    Returns:
        tuple: (matrix, features_df, barcodes) where ``matrix`` is a
        features x barcodes ``scipy.sparse.csr_matrix``, ``features_df`` has
        the columns ``feature_id``, ``feature_name`` and ``feature_type``,
        and ``barcodes`` is a list of strings.

    Raises:
        ValueError: If the matrix file has no valid dimension line or ends
            before the announced number of entries has been read.
    """
    with gzip.open(matrix_file, 'rt') as f:
        # Skip the '%'-prefixed banner/comment lines; the first line that
        # follows them carries "<n_features> <n_barcodes> <n_entries>".
        header = f.readline()
        while header.startswith('%'):
            header = f.readline()

        dims = header.split()
        if len(dims) < 3:
            raise ValueError(f"{matrix_file}: missing or malformed dimension line")
        n_features, n_barcodes, n_entries = int(dims[0]), int(dims[1]), int(dims[2])

        # Preallocated arrays instead of Python lists: the entry count is
        # known up front, so this avoids one boxed int per matrix entry.
        row_indices = np.empty(n_entries, dtype=np.int64)
        col_indices = np.empty(n_entries, dtype=np.int64)
        data = np.empty(n_entries, dtype=np.int64)
        for entry in range(n_entries):
            fields = f.readline().split()
            if len(fields) < 3:
                raise ValueError(
                    f"{matrix_file}: expected {n_entries} entries, "
                    f"but entry {entry + 1} is missing or malformed"
                )
            row_indices[entry] = int(fields[0]) - 1  # 1-based to 0-based indexing
            col_indices[entry] = int(fields[1]) - 1  # 1-based to 0-based indexing
            data[entry] = int(fields[2])

    matrix = sp.csr_matrix((data, (row_indices, col_indices)), shape=(n_features, n_barcodes))

    feature_ids = []
    feature_names = []
    feature_types = []
    with gzip.open(features_file, 'rt') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank (e.g. trailing) lines
            parts = stripped.split('\t')
            feature_ids.append(parts[0])
            feature_names.append(parts[1])  # Use gene symbol as feature name
            # Two-column (legacy genes.tsv style) files have no type column
            feature_types.append(parts[2] if len(parts) > 2 else 'Gene Expression')

    features_df = pd.DataFrame({
        'feature_id': feature_ids,
        'feature_name': feature_names,
        'feature_type': feature_types
    })

    with gzip.open(barcodes_file, 'rt') as f:
        # Blank lines would otherwise become empty-string barcodes
        barcodes = [barcode for barcode in (line.strip() for line in f) if barcode]

    return matrix, features_df, barcodes

def _make_unique_names(names):
    """Return ``names`` as a list where every repeated name gets a ``_1``, ``_2``, ... suffix."""
    totals = {}
    for name in names:
        totals[name] = totals.get(name, 0) + 1

    n_repeated = sum(1 for count in totals.values() if count > 1)
    if not n_repeated:
        return list(names)

    print(f"Found {n_repeated} duplicated gene names. Appending unique identifiers.")
    seen = {}
    unique_names = []
    for name in names:
        if totals[name] > 1:
            # Number the occurrences of a repeated name in order of appearance
            seen[name] = seen.get(name, 0) + 1
            unique_names.append(f"{name}_{seen[name]}")
        else:
            unique_names.append(name)
    return unique_names


def _build_sample_adata(matrix, features_df, feature_type, barcodes, sample,
                        id_column, name_column, *, dedupe_names=False):
    """Build the AnnData for one feature type of one sample, or return None if it has no such features."""
    mask = (features_df['feature_type'] == feature_type).to_numpy()
    if not mask.any():
        return None

    names = features_df.loc[mask, 'feature_name'].tolist()
    ids = features_df.loc[mask, 'feature_id'].tolist()
    if dedupe_names:
        names = _make_unique_names(names)

    var_df = pd.DataFrame({id_column: ids, name_column: names}, index=names)

    # Prefix the barcodes here, with the sample the data really belongs to,
    # so cell names stay unique and correct once samples are concatenated.
    obs_df = pd.DataFrame(index=[f"{sample}_{barcode}" for barcode in barcodes])

    adata = ad.AnnData(
        X=matrix[np.flatnonzero(mask), :].T,  # features x cells -> cells x features
        obs=obs_df,
        var=var_df
    )

    adata.obs['sample'] = sample
    adata.obs['replicate'] = sample[0]  # A or B
    adata.obs['condition'] = 'LCMV Armstrong' if 'arm' in sample.lower() else 'Influenza PR8'
    return adata


def process_dataset(data_dir):
    """
    Process the GSE277081 dataset.

    Downloads the raw files if needed, reads each of the four samples, splits
    the features into 'Gene Expression' and 'Antibody Capture', concatenates
    the samples per modality, restricts both modalities to the cells they
    share (in the same order) and finally adds the protein-derived and
    harmonized metadata columns.

    Args:
        data_dir (Path): Directory containing the data

    Returns:
        tuple: (gene_expression_adata, protein_expression_adata)

    Raises:
        ValueError: If no sample contains gene expression features, or no
            sample contains antibody capture features.
    """
    raw_data_dir = download_data(data_dir)

    samples = ['Aarm', 'Aflu', 'Barm', 'Bflu']
    gene_adatas = []
    protein_adatas = []

    for sample in samples:
        print(f"Processing sample {sample}")

        matrix_file = raw_data_dir / f"GSE277081_matrix-{sample}.mtx.gz"
        features_file = raw_data_dir / f"GSE277081_features-{sample}.tsv.gz"
        barcodes_file = raw_data_dir / f"GSE277081_barcodes-{sample}.tsv.gz"

        matrix, features_df, barcodes = read_10x_mtx(matrix_file, features_file, barcodes_file)

        # Gene symbols can repeat, so they are made unique before being used as var names
        gene_adata = _build_sample_adata(
            matrix, features_df, 'Gene Expression', barcodes, sample,
            'gene_id', 'gene_name', dedupe_names=True,
        )
        if gene_adata is not None:
            gene_adatas.append(gene_adata)

        protein_adata = _build_sample_adata(
            matrix, features_df, 'Antibody Capture', barcodes, sample,
            'protein_id', 'protein_name',
        )
        if protein_adata is not None:
            protein_adatas.append(protein_adata)

    if not gene_adatas or not protein_adatas:
        raise ValueError(
            "No sample contained both 'Gene Expression' and 'Antibody Capture' features"
        )

    print("Combining all samples")

    # merge='first' keeps the per-feature annotation columns (ids/names);
    # with the default merge=None, concat drops all var columns.
    gene_adata = ad.concat(gene_adatas, join='outer', merge='first')
    protein_adata = ad.concat(protein_adatas, join='outer', merge='first')

    # Find common barcodes between gene and protein data
    common_barcodes = np.intersect1d(gene_adata.obs.index, protein_adata.obs.index)
    print(f"Found {len(common_barcodes)} common barcodes between gene and protein data")

    # Subset the protein data to the shared cells, then select the gene data
    # by those names so both objects hold the same cells in the same order.
    protein_adata = protein_adata[protein_adata.obs.index.isin(common_barcodes)].copy()
    gene_adata = gene_adata[protein_adata.obs.index].copy()

    # Extract additional metadata from protein expression
    extract_metadata_from_proteins(gene_adata, protein_adata)

    # Harmonize metadata
    harmonize_metadata(gene_adata, protein_adata)

    return gene_adata, protein_adata

def extract_metadata_from_proteins(gene_adata, protein_adata):
    """
    Copy selected protein measurements into per-cell metadata columns.

    For every marker listed below that occurs in ``protein_adata.var_names``,
    the marker's column of ``protein_adata.X`` is stored under the mapped
    column name in ``obs`` of both objects. Markers that are not present are
    skipped. Both objects are modified in place; nothing is returned.

    Args:
        gene_adata (AnnData): Gene expression data
        protein_adata (AnnData): Protein expression data
    """
    # Marker name (as found in var_names) -> obs column that receives its values
    marker_to_column = {
        'CD103': 'tissue_residency',
        'CD69': 'activation',
        'LY6C': 'effector_memory',
        'CD127': 'memory',
        'CD62L': 'central_memory',
        'KLRG1': 'terminal_effector',
        'CX3CR1': 'migration',
        'IV Ab': 'intravascular',
        'HOST': 'host_origin',
        'DONOR': 'donor_origin'
    }

    for marker, obs_column in marker_to_column.items():
        if marker not in protein_adata.var_names:
            continue

        column_position = protein_adata.var_names.get_loc(marker)
        measurements = protein_adata.X[:, column_position]
        if sp.issparse(protein_adata.X):
            # A sparse slice is still 2-D; turn it into a flat dense vector
            measurements = measurements.toarray().flatten()

        gene_adata.obs[obs_column] = measurements
        protein_adata.obs[obs_column] = measurements

def harmonize_metadata(gene_adata, protein_adata):
    """
    Add the same standard metadata columns to both datasets, in place.

    Constant columns are written to ``obs``, ' infection' is appended to the
    existing ``condition`` column, a ``tissue_resident`` flag is derived when
    a ``tissue_residency`` column exists, and the descriptive columns are
    converted to the pandas ``category`` dtype. Nothing is returned.

    Args:
        gene_adata (AnnData): Gene expression data
        protein_adata (AnnData): Protein expression data
    """
    # Constant columns written before 'condition' is updated
    leading_constants = {
        'organism': 'Mus musculus',
        'cell_type': 'CD8+ T cell',   # all cells are CD8+ T cells
        'crispr_type': 'None',        # not applicable for this dataset
        'cancer_type': 'Non-Cancer',  # not applicable for this dataset
    }
    categorical_columns = (
        'organism', 'cell_type', 'crispr_type', 'cancer_type',
        'condition', 'perturbation_name', 'sample', 'replicate',
    )

    for dataset in (gene_adata, protein_adata):
        for column, constant in leading_constants.items():
            dataset.obs[column] = constant

        # 'condition' is expected to exist already; extend its label
        dataset.obs['condition'] = dataset.obs['condition'].astype(str) + ' infection'

        # Perturbation name - not applicable for this dataset
        dataset.obs['perturbation_name'] = 'None'

        if 'tissue_residency' in dataset.obs:
            # Binary status stored as text: 'True' where the value is above the median
            residency = dataset.obs['tissue_residency']
            above_median = residency > residency.median()
            dataset.obs['tissue_resident'] = above_median.astype(str)

        for column in categorical_columns:
            if column not in dataset.obs:
                continue
            dataset.obs[column] = dataset.obs[column].astype('category')

def save_anndata(adata, output_file):
    """
    Write an AnnData object to an h5ad file with gzip compression.

    The target path is printed before writing.

    Args:
        adata (AnnData): AnnData object to save
        output_file (Path): Output file path
    """
    announcement = f"Saving {output_file}"
    print(announcement)

    write_options = {"compression": "gzip"}
    adata.write_h5ad(output_file, **write_options)

def main():
    """
    Run the whole pipeline from the current working directory.

    Processes the dataset, prints a short summary of the gene and protein
    data, and writes both as h5ad files into the working directory.
    """
    data_root = Path.cwd()
    print(f"Using data root path: {data_root}")

    gene_adata, protein_adata = process_dataset(data_root)

    # (section title, label of the feature count, data) for each summary
    summaries = (
        ("\nGene expression data summary:", "Number of genes", gene_adata),
        ("\nProtein expression data summary:", "Number of proteins", protein_adata),
    )
    for title, feature_label, adata in summaries:
        print(title)
        print(f"Number of cells: {adata.n_obs}")
        print(f"{feature_label}: {adata.n_vars}")
        print(f"Metadata columns: {list(adata.obs.columns)}")

    # Output file name for each dataset, written next to the raw data folder
    outputs = (
        (gene_adata, "GSE277081_gene_expression_harmonized.h5ad"),
        (protein_adata, "GSE277081_protein_expression_harmonized.h5ad"),
    )
    for adata, file_name in outputs:
        save_anndata(adata, data_root / file_name)

    print("\nDone!")

# Run the pipeline when executed as a script or in a notebook cell (where
# __name__ is "__main__"), but not when this file is merely imported, so an
# import no longer triggers downloads and file writes.
if __name__ == "__main__":
    main()
