In [None]:
import os
import sys
import gzip
import numpy as np
import pandas as pd
import anndata as ad
from scipy.io import mmread
from scipy.sparse import csr_matrix
import tarfile
import urllib.request
import json

# Dataset-specific metadata, keyed by GEO series accession.
# Each entry provides the series "title", "organism", the "download_url" of the
# raw archive, and a "samples" mapping keyed by GEO sample accession.
# Per sample, "name" is combined with the sample accession to build the expected
# file names ("<sample_id>_<name>_barcodes.tsv.gz", etc.); the remaining fields
# are copied onto every cell of that sample by create_anndata().
DATASET_METADATA = {
    "GSE263524": {
        "title": "Inhibition of MBTPS1 reprograms cold into inflamed tumors and potentiates anti-PD-1 immunotherapy [scRNA-seq]",
        "organism": "Mus musculus",
        "download_url": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE263524&format=file",
        "samples": {
            # Control sample
            "GSM8193978": {
                "name": "NC", 
                "description": "MC38 tumor, WT", 
                "condition": "Control", 
                "perturbation_name": "Non-targeting",
                "cell_type": "MC38 tumor cell",
                "crispr_type": "CRISPR KO",
                "cancer_type": "Colorectal cancer"
            },
            # Perturbed sample (perturbation target recorded as Mbtps1)
            "GSM8193979": {
                "name": "SHM", 
                "description": "MC38 tumor, SHM", 
                "condition": "Knockout", 
                "perturbation_name": "Mbtps1",
                "cell_type": "MC38 tumor cell",
                "crispr_type": "CRISPR KO",
                "cancer_type": "Colorectal cancer"
            }
        }
    }
}

def download_data(geo_accession, data_dir):
    """
    Download and extract the dataset if not already present.

    The archive is downloaded to a temporary ".part" file and only renamed to
    its final name once the transfer has finished, so an interrupted download
    is never mistaken for a complete archive on the next run.

    Args:
        geo_accession: GEO accession number.
        data_dir: Directory to store the downloaded data.

    Raises:
        ValueError: If geo_accession is not a key of DATASET_METADATA.
        FileNotFoundError: If expected 10X files are still missing after the
            archive has been extracted.
    """
    if geo_accession not in DATASET_METADATA:
        raise ValueError(f"Dataset {geo_accession} is not supported")

    metadata = DATASET_METADATA[geo_accession]
    download_url = metadata["download_url"]
    samples = metadata["samples"]

    tar_path = os.path.join(data_dir, f"{geo_accession}_RAW.tar")

    if not os.path.exists(tar_path):
        print(f"Downloading {geo_accession} dataset...")
        partial_path = tar_path + ".part"
        try:
            urllib.request.urlretrieve(download_url, partial_path)
            # Atomic rename: tar_path only ever refers to a complete download.
            os.replace(partial_path, tar_path)
        finally:
            # Only still present if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Every sample needs its barcodes, features and matrix files.
    required_files = [
        os.path.join(data_dir, f"{sample_id}_{sample_info['name']}{suffix}")
        for sample_id, sample_info in samples.items()
        for suffix in ("_barcodes.tsv.gz", "_features.tsv.gz", "_matrix.mtx.gz")
    ]

    if not all(os.path.exists(path) for path in required_files):
        print("Extracting tar file...")
        with tarfile.open(tar_path, 'r') as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse absolute paths, "..", links escaping data_dir, etc.
                tar.extractall(path=data_dir, filter="data")
            else:
                # Older Python without extraction filters.
                tar.extractall(path=data_dir)

        missing = [path for path in required_files if not os.path.exists(path)]
        if missing:
            raise FileNotFoundError(
                f"Files missing after extracting {tar_path}: {', '.join(missing)}"
            )

    print("Data files are ready.")

def read_10x_data(data_dir, sample_id, sample_info):
    """
    Load the barcodes, features and count matrix of one 10X-formatted sample.

    The three gzip-compressed files are expected in ``data_dir`` and named
    ``<sample_id>_<name>_barcodes.tsv.gz``, ``..._features.tsv.gz`` and
    ``..._matrix.mtx.gz``, where ``<name>`` is ``sample_info['name']``.

    Args:
        data_dir: Directory containing the data files.
        sample_id: GEO sample ID (e.g., GSM8193978).
        sample_info: Dictionary with sample metadata.

    Returns:
        Dictionary with keys 'barcodes' and 'features' (DataFrames), 'matrix'
        (CSR sparse matrix as stored in the file), 'sample_id' and
        'sample_info'.
    """
    # All three files share the same "<dir>/<sample_id>_<name>" stem.
    stem = os.path.join(data_dir, f"{sample_id}_{sample_info['name']}")

    def load_table(path, column_names):
        # Header-less, tab-separated, gzip-compressed text table.
        with gzip.open(path, 'rt') as handle:
            return pd.read_csv(handle, sep='\t', header=None, names=column_names)

    barcode_table = load_table(stem + "_barcodes.tsv.gz", ['barcode'])
    feature_table = load_table(
        stem + "_features.tsv.gz",
        ['gene_id', 'gene_symbol', 'feature_type'],
    )

    # Reading the matrix can be memory-intensive.
    print(f"Reading matrix for {sample_id}...")
    with gzip.open(stem + "_matrix.mtx.gz", 'rb') as handle:
        counts = mmread(handle).tocsr()

    print(f"Matrix shape: {counts.shape}")

    loaded = {}
    loaded['barcodes'] = barcode_table
    loaded['features'] = feature_table
    loaded['matrix'] = counts
    loaded['sample_id'] = sample_id
    loaded['sample_info'] = sample_info
    return loaded

def make_gene_symbols_unique(gene_symbols):
    """
    Make gene symbols unique by appending a suffix to duplicates.

    The first occurrence of a symbol is kept unchanged; later occurrences get
    "_1", "_2", ... appended. A suffix is skipped when the resulting name is
    already used by another symbol in the input (e.g. a real "A_1" next to a
    duplicated "A"), so the output is guaranteed to contain no duplicates.

    Args:
        gene_symbols: Array of gene symbols.

    Returns:
        The input object itself when it already has no duplicates, otherwise
        a NumPy array of unique gene symbols in the same order.
    """
    original_names = set(gene_symbols)
    if len(gene_symbols) == len(original_names):
        return gene_symbols

    print(f"Found {len(gene_symbols) - len(original_names)} duplicate gene symbols. Making them unique...")

    last_suffix = {}   # symbol -> highest numeric suffix handed out so far
    emitted = set()    # names already placed in the output
    unique_gene_symbols = []

    for symbol in gene_symbols:
        if symbol not in emitted:
            name = symbol
        else:
            counter = last_suffix.get(symbol, 0) + 1
            name = f"{symbol}_{counter}"
            # Never reuse a name that exists in the input or was already
            # generated; otherwise the result would still contain duplicates.
            while name in original_names or name in emitted:
                counter += 1
                name = f"{symbol}_{counter}"
            last_suffix[symbol] = counter
        emitted.add(name)
        unique_gene_symbols.append(name)

    return np.array(unique_gene_symbols)

def create_anndata(data, geo_accession):
    """
    Build an AnnData object for one sample from its loaded 10X data.

    Args:
        data: Dictionary containing the loaded data (as returned by
            read_10x_data).
        geo_accession: GEO accession number; used to look up the dataset-level
            organism and title in DATASET_METADATA.

    Returns:
        AnnData object with harmonized metadata: one obs row per barcode, one
        var row per (uniquified) gene symbol.
    """
    dataset_meta = DATASET_METADATA[geo_accession]
    sample_meta = data['sample_info']
    feature_table = data['features']
    cell_barcodes = data['barcodes']['barcode'].values

    # 10X stores genes x cells; AnnData wants cells x genes.
    cells_by_genes = data['matrix'].T

    # Per-cell annotations: the same sample-level value is repeated for every cell.
    obs = pd.DataFrame(index=cell_barcodes)
    cell_annotations = {
        'organism': dataset_meta["organism"],
        'cell_type': sample_meta['cell_type'],
        'crispr_type': sample_meta['crispr_type'],
        'cancer_type': sample_meta['cancer_type'],
        'condition': sample_meta['condition'],
        'perturbation_name': sample_meta['perturbation_name'],
        'sample_id': data['sample_id'],
        'description': sample_meta['description'],
    }
    for column, value in cell_annotations.items():
        obs[column] = value

    # Per-gene annotations, indexed by de-duplicated gene symbols.
    var = pd.DataFrame(index=make_gene_symbols_unique(feature_table['gene_symbol'].values))
    for column, source in (
        ('gene_id', 'gene_id'),
        ('feature_type', 'feature_type'),
        ('original_gene_symbol', 'gene_symbol'),
    ):
        var[column] = feature_table[source].values

    # Unstructured, sample-level provenance.
    provenance = {
        'sample_id': data['sample_id'],
        'description': sample_meta['description'],
        'geo_accession': geo_accession,
        'title': dataset_meta["title"],
    }

    return ad.AnnData(X=cells_by_genes, obs=obs, var=var, uns=provenance)

def process_sample(geo_accession, data_dir, sample_id, sample_info):
    """
    Convert one sample's 10X files into an h5ad file.

    The result is written to ``<data_dir>/processed/<sample_id>.h5ad``; the
    "processed" directory is created if needed.

    Args:
        geo_accession: GEO accession number.
        data_dir: Directory containing the data files.
        sample_id: GEO sample ID (e.g., GSM8193978).
        sample_info: Dictionary with sample metadata.

    Returns:
        Path to the output h5ad file.
    """
    print(f"Processing sample {sample_id} ({sample_info['description']})...")

    # Load the raw files and wrap them in an AnnData object.
    adata = create_anndata(
        read_10x_data(data_dir, sample_id, sample_info),
        geo_accession,
    )

    processed_dir = os.path.join(data_dir, "processed")
    os.makedirs(processed_dir, exist_ok=True)

    h5ad_path = os.path.join(processed_dir, f"{sample_id}.h5ad")
    adata.write(h5ad_path)
    print(f"Created h5ad file: {h5ad_path}")

    return h5ad_path

def run_processing(geo_accession='GSE263524', data_dir=None):
    """
    Run the processing of the dataset in a Jupyter Notebook environment.

    Downloads/extracts the raw data into ``<data_dir>/<geo_accession>``,
    writes one h5ad file per sample plus a JSON copy of the dataset metadata
    into its "processed" subdirectory, and prints the created paths.

    Args:
        geo_accession: GEO accession number (default: GSE263524).
        data_dir: Directory to store the data. Defaults to the current working
            directory at call time (not at function-definition time, so a
            later ``os.chdir`` / ``%cd`` is honoured).
    """
    if data_dir is None:
        data_dir = os.getcwd()

    if geo_accession not in DATASET_METADATA:
        print(f"Error: Dataset {geo_accession} is not supported")
        print("Supported datasets:")
        for accession in DATASET_METADATA:
            print(f"  - {accession}: {DATASET_METADATA[accession]['title']}")
        return

    print(f"Processing dataset: {geo_accession}")
    print(f"Using data directory: {data_dir}")

    # Create a directory for the dataset if it doesn't exist
    data_dir = os.path.join(data_dir, geo_accession)
    os.makedirs(data_dir, exist_ok=True)

    download_data(geo_accession, data_dir)

    output_files = [
        process_sample(geo_accession, data_dir, sample_id, sample_info)
        for sample_id, sample_info in DATASET_METADATA[geo_accession]["samples"].items()
    ]

    # Keep a machine-readable copy of the metadata next to the h5ad files.
    metadata_path = os.path.join(data_dir, "processed", f"{geo_accession}_metadata.json")
    with open(metadata_path, 'w', encoding="utf-8") as f:
        json.dump(DATASET_METADATA[geo_accession], f, indent=2)

    print("\nProcessing complete!")
    print("Output files:")
    for file_path in output_files:
        print(f"  - {file_path}")
    print(f"  - {metadata_path}")

# Run the full pipeline with the default arguments when this cell is executed.
# Pass geo_accession=... and/or data_dir=... to process a different supported
# dataset or to write somewhere other than the default directory.
run_processing()


In [None]:
import anndata as ad
import numpy as np

# Locations of the per-sample h5ad files to load
file1 = '/content/GSE263524/processed/GSM8193978.h5ad'
file2 = '/content/GSE263524/processed/GSM8193979.h5ad'

# Read both samples into memory, in order
adata1, adata2 = (ad.read_h5ad(h5ad_path) for h5ad_path in (file1, file2))

def compute_total_counts(adata):
    """
    Compute the total counts per cell.

    Args:
        adata: Object whose ``X`` attribute is a cells x genes matrix (sparse
            or dense).

    Returns:
        1-D NumPy array with the sum of each row of ``adata.X``.
    """
    # Row sums come back as a 1-D array for ndarrays but as an (n, 1) matrix
    # for scipy sparse matrices and np.matrix; normalize to a flat array.
    return np.asarray(adata.X.sum(axis=1)).ravel()

def compute_mito_percentage(adata):
    """
    Compute the percentage of counts from mitochondrial genes.

    Assumes mitochondrial genes are annotated in the 'original_gene_symbol'
    column of ``adata.var`` and start with "MT-" (case-insensitive). Genes
    with a missing symbol are treated as non-mitochondrial.

    Args:
        adata: Object with a cells x genes matrix ``X`` (sparse or dense) and
            a ``var`` table containing 'original_gene_symbol'.

    Returns:
        1-D float NumPy array with one percentage per cell. Cells with zero
        total counts get 0.0 rather than NaN.
    """
    # na=False keeps the mask strictly boolean when a symbol is missing.
    is_mito = (
        adata.var['original_gene_symbol']
        .str.upper()
        .str.startswith('MT-', na=False)
        .to_numpy(dtype=bool)
    )

    # The same expressions work for sparse and dense X; flatten the result
    # because sparse row sums are returned as an (n, 1) matrix.
    mito_counts = np.asarray(adata.X[:, is_mito].sum(axis=1)).ravel()
    total_counts = np.asarray(adata.X.sum(axis=1)).ravel()

    # Divide only where there is something to divide by, so empty cells do not
    # trigger a divide-by-zero warning or produce NaN.
    mito_pct = np.zeros(total_counts.shape, dtype=float)
    np.divide(mito_counts, total_counts, out=mito_pct, where=total_counts > 0)
    mito_pct *= 100
    return mito_pct

def compute_n_genes(adata):
    """
    Compute the number of genes detected per cell (counts greater than zero).

    Args:
        adata: Object whose ``X`` attribute is a cells x genes matrix (sparse
            or dense).

    Returns:
        1-D NumPy array with the number of positive entries in each row.
    """
    # Comparing a sparse matrix with 0 yields a sparse boolean matrix, so the
    # full cells x genes matrix is never densified just to count entries.
    detected = adata.X > 0
    return np.asarray(detected.sum(axis=1)).ravel()

# Define QC thresholds
min_total_counts = 500
max_mito_pct = 10
max_n_genes = 7500


def _annotate_and_filter(adata):
    """Add QC metrics to ``adata.obs`` in place and return a copy of the cells passing QC."""
    adata.obs['total_counts'] = compute_total_counts(adata)
    adata.obs['mito_pct'] = compute_mito_percentage(adata)
    adata.obs['n_genes'] = compute_n_genes(adata)

    # A cell is kept only if it passes all three thresholds.
    passes_qc = (
        (adata.obs['total_counts'] > min_total_counts)
        & (adata.obs['mito_pct'] < max_mito_pct)
        & (adata.obs['n_genes'] < max_n_genes)
    )
    return adata[passes_qc].copy()


# Add QC metrics and apply filtering for adata1
initial_cells_1 = adata1.n_obs
adata1_qc = _annotate_and_filter(adata1)
filtered_cells_1 = adata1_qc.n_obs

# Add QC metrics and apply filtering for adata2
initial_cells_2 = adata2.n_obs
adata2_qc = _annotate_and_filter(adata2)
filtered_cells_2 = adata2_qc.n_obs

# Print cell counts before and after QC for each sample
print("GSM8193978:")
print("  Original cell count:", initial_cells_1)
print("  After QC cell count:", filtered_cells_1)

print("\nGSM8193979:")
print("  Original cell count:", initial_cells_2)
print("  After QC cell count:", filtered_cells_2)


# Combine the QC filtered datasets
combined_adata = ad.concat(
    [adata1_qc, adata2_qc],
    join='outer',          # include all features from both datasets
    merge='unique',        # keep var annotations that have a single value across the inputs
    label='sample',        # adds a column indicating sample of origin
    keys=['GSM8193978', 'GSM8193979'],
    index_unique='-'       # append the sample key so barcodes shared by both samples stay unique
)


combined_adata.write_h5ad('/content/GSE263524.h5ad', compression="gzip")


