# In [None]:
import os
import sys
import glob
import gzip
import shutil
import urllib.request
import tarfile
import pandas as pd
import numpy as np
import anndata as ad
from scipy import io
from pathlib import Path

def download_and_extract(data_dir):
    """
    Download and extract the GSE205310 dataset if not already present.

    The archive is fetched only when ``GSE205310_RAW.tar`` is missing from
    ``data_dir``; it is unpacked only when the marker file
    ``GSM6210116_dual.matrix.mtx.gz`` is missing from ``data_dir``.

    Args:
        data_dir: Path to the directory where the data should be stored.
            The directory is created if it does not exist.
    """
    # Create directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    tar_path = os.path.join(data_dir, "GSE205310_RAW.tar")
    if not os.path.exists(tar_path):
        print(f"Downloading GSE205310_RAW.tar to {tar_path}...")
        url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE205310&format=file"
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated archive that a later
        # run would mistake for a complete download.
        partial_path = tar_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, tar_path)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # The first sample's matrix file doubles as the "already extracted" marker
    marker_path = os.path.join(data_dir, "GSM6210116_dual.matrix.mtx.gz")
    if not os.path.exists(marker_path):
        print(f"Extracting files from {tar_path}...")
        with tarfile.open(tar_path) as tar:
            # Prefer the restrictive "data" extraction filter (rejects members
            # that would escape data_dir) when this Python version offers it.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=data_dir, filter="data")
            else:
                tar.extractall(path=data_dir)

    print("Download and extraction complete.")

def parse_guide_identity(guide_identity):
    """
    Parse the guide identity string to extract perturbation information.

    Args:
        guide_identity: Guide identifier. Anything that is not a non-blank
            string (e.g. None or a NaN read from a CSV) is treated as missing.

    Returns:
        Dictionary with two keys:
            'perturbation_name': 'Non-targeting' when the identifier contains
                "non-targeting" (case-insensitive); otherwise the text before
                the first underscore (the whole identifier when it has no
                underscore); None when the identifier is missing.
            'condition': 'control' for non-targeting guides, 'test' for
                targeting guides, None when the identifier is missing.
    """
    # Missing values would otherwise crash on .lower(); report them as empty
    # so the caller can decide how to label such cells.
    if not isinstance(guide_identity, str) or not guide_identity.strip():
        return {'perturbation_name': None, 'condition': None}

    # Check if it's a non-targeting control
    if 'non-targeting' in guide_identity.lower():
        return {'perturbation_name': 'Non-targeting', 'condition': 'control'}

    # The target gene is the leading underscore-delimited token; partition
    # returns the whole string when no underscore is present.
    gene, _, _ = guide_identity.partition('_')
    return {'perturbation_name': gene, 'condition': 'test'}

def _build_cell_metadata(obs_names, cell_identities, sample_id):
    """
    Build the per-cell metadata table, aligned row-for-row with ``obs_names``.

    Args:
        obs_names: Index of cell barcodes, in the order of the expression matrix rows.
        cell_identities: DataFrame indexed by cell barcode with at least a
            'guide_identity' column (and optionally 'read_count' / 'UMI_count').
        sample_id: Sample identifier stored in the 'sample_id' column.

    Returns:
        DataFrame indexed by ``obs_names``. Cells without a guide assignment
        get 'Unknown' for 'perturbation_name', 'condition' and 'guide_identity',
        and NaN for the count columns.
    """
    metadata = pd.DataFrame(index=obs_names)
    metadata['sample_id'] = sample_id

    # reindex() needs one row per barcode; keep the first record of any duplicate.
    cell_identities = cell_identities[~cell_identities.index.duplicated(keep='first')]

    # Guide per cell in matrix order; NaN where the barcode has no assignment.
    guides = cell_identities['guide_identity'].reindex(metadata.index)

    # Parse each distinct guide once instead of once per cell.
    parsed = {guide: parse_guide_identity(guide) for guide in guides.dropna().unique()}

    def field_for(guide, field):
        value = parsed.get(guide, {}).get(field)
        return 'Unknown' if value is None else value

    for field in ('perturbation_name', 'condition'):
        # Positional assignment: `guides` is already in metadata row order.
        metadata[field] = [field_for(guide, field) for guide in guides]

    # Add standard harmonized metadata fields
    metadata['organism'] = 'Homo sapiens'
    metadata['cell_type'] = 'K562'
    metadata['crispr_type'] = 'CRISPRi'
    metadata['cancer_type'] = 'Chronic myelogenous leukemia'

    # Add original guide identity
    metadata['guide_identity'] = ['Unknown' if pd.isna(guide) else guide for guide in guides]

    # Add read count and UMI count if available
    for column in ('read_count', 'UMI_count'):
        if column in cell_identities.columns:
            metadata[column] = cell_identities[column].reindex(metadata.index).to_numpy()

    # Add library type
    if 'dual' in sample_id:
        metadata['library_type'] = 'dual_sgRNA'
    elif 'dolcetto' in sample_id:
        metadata['library_type'] = 'dolcetto'

    return metadata


def process_sample(sample_id, data_dir):
    """
    Process a single sample from the GSE205310 dataset.

    Reads ``<sample_id>.matrix.mtx.gz``, ``<sample_id>.features.tsv.gz``,
    ``<sample_id>.barcodes.tsv.gz`` and ``<sample_id>_cell_identities.csv.gz``
    from ``data_dir`` and combines them into one annotated object.

    Args:
        sample_id: Sample identifier (e.g., 'GSM6210116_dual').
        data_dir: Path to the directory containing the data.

    Returns:
        AnnData object whose rows are the barcodes and whose columns are the
        features; ``var`` is indexed by de-duplicated gene symbol with a
        'gene_ids' column, and ``obs`` carries the harmonized metadata.
    """
    print(f"Processing {sample_id}...")

    # File paths
    matrix_file = os.path.join(data_dir, f"{sample_id}.matrix.mtx.gz")
    features_file = os.path.join(data_dir, f"{sample_id}.features.tsv.gz")
    barcodes_file = os.path.join(data_dir, f"{sample_id}.barcodes.tsv.gz")
    cell_identities_file = os.path.join(data_dir, f"{sample_id}_cell_identities.csv.gz")

    # Read the expression data; transposed so rows line up with the barcode
    # list and columns with the feature list passed to AnnData below.
    with gzip.open(matrix_file, 'rb') as f:
        X = io.mmread(f).T.tocsr()

    # Read the features (genes): column 0 is the gene id, column 1 the symbol
    gene_ids = []
    gene_symbols = []
    with gzip.open(features_file, 'rt') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A stray blank line would otherwise fail on fields[1]
                continue
            fields = stripped.split('\t')
            gene_ids.append(fields[0])
            gene_symbols.append(fields[1])

    # Read the barcodes
    with gzip.open(barcodes_file, 'rt') as f:
        barcodes = [line.strip() for line in f if line.strip()]

    # Create a DataFrame for var with gene symbols as index
    var_df = pd.DataFrame(index=gene_symbols)
    var_df['gene_ids'] = gene_ids

    # Make gene symbols unique: the first occurrence keeps the plain symbol,
    # later occurrences get a _1, _2, ... suffix.
    occurrence = var_df.groupby(level=0).cumcount().values
    var_df.index = [
        f"{gene}_{i}" if i > 0 else gene
        for gene, i in zip(var_df.index, occurrence)
    ]

    # Create the AnnData object
    adata = ad.AnnData(X=X, obs=pd.DataFrame(index=barcodes), var=var_df)

    # Read cell identities and attach them as per-cell metadata
    cell_identities = pd.read_csv(cell_identities_file).set_index('cell_barcode')
    adata.obs = _build_cell_metadata(adata.obs.index, cell_identities, sample_id)

    print(f"Number of genes: {adata.n_vars}")
    print(f"Number of cells: {adata.n_obs}")

    return adata

def harmonize_dataset(data_dir):
    """
    Harmonize the GSE205310 dataset.

    Downloads/extracts the raw data if needed, processes each sample, writes
    one ``<sample_id>_harmonized.h5ad`` per sample and a combined
    ``GSE205310_combined_harmonized.h5ad`` (restricted to genes shared by all
    samples and to cells with a known perturbation) into ``data_dir``.

    Args:
        data_dir: Path to the directory containing the data.

    Returns:
        Dictionary of AnnData objects, one for each sample, keyed by sample id.
        These are the per-sample objects as written to disk; building the
        combined dataset does not modify them.
    """
    # Ensure data is downloaded and extracted
    download_and_extract(data_dir)

    # Process each sample
    sample_ids = [
        'GSM6210116_dual',
        'GSM6210117_dolcetto',
    ]
    adatas = {sample_id: process_sample(sample_id, data_dir) for sample_id in sample_ids}

    # Save each AnnData object
    for sample_id, adata in adatas.items():
        output_file = os.path.join(data_dir, f"{sample_id}_harmonized.h5ad")
        print(f"Saving harmonized data to {output_file}...")
        adata.write(output_file)

    # Create a combined dataset
    print("Creating combined dataset...")

    # Find common genes across datasets
    common_genes = set.intersection(*(set(adata.var_names) for adata in adatas.values()))
    print(f"Number of common genes across datasets: {len(common_genes)}")

    # Take the gene order from the first sample: iterating the set directly
    # would give an arbitrary order that can change from run to run.
    first_adata = next(iter(adatas.values()))
    ordered_genes = [gene for gene in first_adata.var_names if gene in common_genes]

    # Work on copies so the per-sample objects returned to the caller keep
    # their original barcodes and full gene set.
    to_combine = []
    for sample_id, adata in adatas.items():
        subset = adata[:, ordered_genes].copy()
        # Prefix barcodes with the sample id. ad.concat also appends
        # "-<sample_id>" via index_unique; both tags are kept so observation
        # names match files written by earlier versions of this function.
        subset.obs_names = [f"{sample_id}-{obs}" for obs in subset.obs_names]
        to_combine.append(subset)

    # Concatenate datasets. All inputs already share the same genes, so the
    # join is an intersection; merge='same' keeps var columns (e.g. gene_ids)
    # that are identical across samples instead of dropping them.
    combined = ad.concat(
        to_combine,
        join='inner',
        merge='same',
        label='sample_id',
        keys=list(adatas.keys()),
        index_unique='-',
    )

    # Filter out cells where perturbation_name is 'Unknown'
    combined = combined[combined.obs['perturbation_name'] != 'Unknown'].copy()

    output_file = os.path.join(data_dir, "GSE205310_combined_harmonized.h5ad")
    print(f"Saving combined harmonized data to {output_file}...")
    combined.write(output_file)

    return adatas

def main(data_dir=None):
    """
    Run the full harmonization pipeline and print a per-sample summary.

    Args:
        data_dir: (Optional) Directory where the data will be stored.
                  Falls back to the current working directory when None.
    """
    data_dir = os.getcwd() if data_dir is None else data_dir
    print(f"Using data directory: {data_dir}")

    adatas = harmonize_dataset(data_dir)
    print("Harmonization complete.")

    # Report cell/gene counts and perturbation coverage for every sample
    for sample_id in adatas:
        adata = adatas[sample_id]
        print(f"\nSummary for {sample_id}:")
        print(f"  Number of cells: {adata.n_obs}")
        print(f"  Number of genes: {adata.n_vars}")
        print(f"  Perturbation targets: {adata.obs['perturbation_name'].nunique()}")
        print(f"  Conditions: {', '.join(adata.obs['condition'].unique())}")

# Entry point: runs the whole pipeline as soon as this cell (or script) is
# executed. No data_dir is passed, so main() falls back to the current
# working directory.
main()
