In [None]:
import os
import gzip
import tarfile
import requests
import numpy as np
import pandas as pd
import anndata as ad
from tqdm import tqdm
from pathlib import Path
from scipy import sparse, io

def download_file(url, destination):
    """
    Download a file from a URL to a destination path.

    The payload is streamed into a temporary ``<destination>.part`` file and
    only moved to ``destination`` once the transfer has completed. An
    interrupted download therefore never leaves a truncated file behind that
    the "already exists" check would accept on the next run.

    Args:
        url: URL to download from
        destination: Path to save the file to

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if os.path.exists(destination):
        print(f"File {destination} already exists, skipping download")
        return

    print(f"Downloading {url} to {destination}")
    partial_path = f"{destination}.part"
    chunk_size = 1024 * 1024  # 1 MiB chunks keep the Python-level loop cheap

    try:
        # (connect timeout, read timeout) in seconds: without a timeout a
        # stalled connection would block forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()

            # A missing/zero content-length means "unknown size" for tqdm.
            total_size = int(response.headers.get('content-length', 0)) or None

            with open(partial_path, 'wb') as f, tqdm(
                total=total_size, unit='B', unit_scale=True, unit_divisor=1024
            ) as progress:
                for data in response.iter_content(chunk_size):
                    f.write(data)
                    progress.update(len(data))

        # Publish the file only after the whole body has been written.
        os.replace(partial_path, destination)
    finally:
        # After a successful os.replace the partial file is gone and this is a
        # no-op; after a failure it removes the truncated leftovers.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def download_and_extract_gse263747(data_dir):
    """
    Download and extract GSE263747 dataset files if they don't exist.

    The archive ``GSE263747_RAW.tar`` is fetched into ``data_dir`` (which is
    created when missing) and unpacked there. Extraction is skipped when the
    day-5 GEX matrix file is already present in ``data_dir``.

    Args:
        data_dir: Directory to save the files to
    """
    base_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE263nnn/GSE263747/suppl"
    tar_file = os.path.join(data_dir, "GSE263747_RAW.tar")

    # Make sure the target directory exists before writing into it.
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)

    # Download the tar file if it doesn't exist
    if not os.path.exists(tar_file):
        download_file(f"{base_url}/GSE263747_RAW.tar", tar_file)

    # Extract files if they don't exist
    if not os.path.exists(os.path.join(data_dir, "GSM8197841_SNU_D5_GEX_matrix.mtx.gz")):
        print(f"Extracting {tar_file} to {data_dir}")
        with tarfile.open(tar_file) as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: the "data" filter rejects
                # members that would escape data_dir (absolute paths, "..",
                # links pointing outside) and special files.
                tar.extractall(path=data_dir, filter="data")
            else:
                # Older Python without extraction-filter support.
                tar.extractall(path=data_dir)

def read_10x_mtx(matrix_file, features_file, barcodes_file):
    """
    Read 10x data in MTX format.

    Args:
        matrix_file: Path to the matrix.mtx.gz file
        features_file: Path to the features.tsv.gz file
        barcodes_file: Path to the barcodes.tsv.gz file

    Returns:
        X: Sparse CSR matrix (cells x genes)
        var_df: DataFrame with gene information. The first three columns are
            named ``gene_ids``, ``gene_symbols`` and ``feature_types`` (as far
            as present); any further columns are named ``extra_<position>``.
        obs_df: DataFrame with cell information (indexed by barcode)
    """
    # Read the matrix. The file stores genes x cells; transpose first and
    # convert afterwards so the result really is row-oriented (CSR) by cell.
    with gzip.open(matrix_file, 'rb') as f:
        X = io.mmread(f).T.tocsr()

    # Read features (genes)
    with gzip.open(features_file, 'rt') as f:
        var_df = pd.read_csv(f, sep='\t', header=None)

    # Name as many leading columns as the table has; a fixed three-name list
    # would raise on tables with more than three columns.
    known_names = ['gene_ids', 'gene_symbols', 'feature_types']
    n_cols = var_df.shape[1]
    var_df.columns = known_names[:n_cols] + [
        f'extra_{position}' for position in range(len(known_names), n_cols)
    ]
    if 'gene_symbols' not in var_df.columns:
        # Single-column feature table: fall back to the IDs as symbols.
        var_df['gene_symbols'] = var_df['gene_ids']

    # Read barcodes (cells)
    with gzip.open(barcodes_file, 'rt') as f:
        barcodes = pd.read_csv(f, sep='\t', header=None)[0].values

    # Create observation DataFrame
    obs_df = pd.DataFrame(index=barcodes)

    return X, var_df, obs_df

def read_sgRNA_data(matrix_file, features_file, barcodes_file):
    """
    Load an sgRNA count matrix stored as gzipped MTX plus its TSV side files.

    Args:
        matrix_file: Path to the matrix.mtx.gz file
        features_file: Path to the features.tsv.gz file
        barcodes_file: Path to the barcodes.tsv.gz file

    Returns:
        sgRNA_df: Dense DataFrame of counts, one row per barcode and one
            column per entry of the first features column
        features_df: The features table as read (no header, integer column labels)
    """
    # Counts: transposed after loading so rows correspond to barcodes
    with gzip.open(matrix_file, 'rb') as mtx_handle:
        counts = io.mmread(mtx_handle).tocsr().T

    # sgRNA annotation table
    with gzip.open(features_file, 'rt') as feature_handle:
        features_df = pd.read_csv(feature_handle, sep='\t', header=None)

    # Cell barcodes (first column only)
    with gzip.open(barcodes_file, 'rt') as barcode_handle:
        barcode_table = pd.read_csv(barcode_handle, sep='\t', header=None)

    cell_ids = barcode_table[0].values
    guide_ids = features_df[0].values

    # Densify into a labelled table
    sgRNA_df = pd.DataFrame(counts.toarray(), index=cell_ids, columns=guide_ids)
    return sgRNA_df, features_df

def parse_sgRNA_target(sgRNA_info):
    """
    Parse sgRNA information to extract target gene.

    Control guides are recognised either by an explicit keyword anywhere in
    the name ("non-targeting" and spelling variants, "control") or by a
    stand-alone ``NT``/``NTC`` token, optionally prefixed with ``sg`` and
    followed by digits (``NT_1``, ``NTC-12``, ``sgNT3``). Tokens are the
    pieces between ``_``, ``-`` and spaces. A bare substring test for "nt"
    is deliberately not used: it would classify every gene whose symbol
    contains "nt" (CNTN1, ANTXR1, NT5E, ...) as non-targeting.

    Args:
        sgRNA_info: String with sgRNA information

    Returns:
        "Non-targeting" for control guides, otherwise the text before the
        first underscore (the whole string if there is none); "unknown" when
        the input is not a string.
    """
    if not isinstance(sgRNA_info, str):
        return "unknown"

    lowered = sgRNA_info.lower()

    # Explicit control labels. "nontargeting" and "control" were matched by
    # the former substring test as well, so they remain controls.
    control_keywords = (
        "non-targeting", "non_targeting", "non targeting", "nontargeting", "control",
    )
    if any(keyword in lowered for keyword in control_keywords):
        return "Non-targeting"

    # Abbreviated labels must match a whole token (ignoring trailing digits).
    control_tokens = {"nt", "ntc", "sgnt", "sgntc"}
    tokens = lowered.replace('-', '_').replace(' ', '_').split('_')
    if any(token.rstrip('0123456789') in control_tokens for token in tokens):
        return "Non-targeting"

    # Extract gene name from sgRNA info: everything before the first underscore.
    return sgRNA_info.split('_', 1)[0]

def _top_sgRNA_per_cell(sgRNA_df):
    """Return the highest-count sgRNA for each cell, NaN where no guide was detected."""
    top_guide = sgRNA_df.idxmax(axis=1)
    # idxmax reports the first column for an all-zero row, which would
    # silently assign an arbitrary guide to cells without any sgRNA reads.
    return top_guide.where(sgRNA_df.max(axis=1) > 0)


def _build_sgRNA_gene_map(feature_tables):
    """Map sgRNA IDs (column 0) to target genes parsed from column 1 of each table."""
    sgRNA_to_gene = {}
    for features_df in feature_tables:
        sgRNA_ids = features_df[0]
        if features_df.shape[1] > 1:
            sgRNA_infos = features_df[1]
        else:
            sgRNA_infos = ["unknown"] * len(sgRNA_ids)
        for sgRNA_id, sgRNA_info in zip(sgRNA_ids, sgRNA_infos):
            # The first table that mentions a guide wins.
            if sgRNA_id not in sgRNA_to_gene:
                sgRNA_to_gene[sgRNA_id] = parse_sgRNA_target(sgRNA_info)
    return sgRNA_to_gene


def _assemble_adata(X, var_df, obs_df):
    """Wrap the counts and annotations of one timepoint in an AnnData object."""
    adata = ad.AnnData(X=X, obs=obs_df, var=pd.DataFrame(index=var_df['gene_symbols'].values))
    adata.var['gene_ids'] = var_df['gene_ids'].values
    if 'feature_types' in var_df.columns:
        adata.var['feature_types'] = var_df['feature_types'].values
    # Gene symbols are not guaranteed to be unique; non-unique var names
    # trigger AnnData warnings and can break later joins on var_names.
    adata.var_names_make_unique()
    return adata


def process_gse263747(data_dir):
    """
    Process GSE263747 dataset.

    For each timepoint the GEX matrix and the sgRNA matrix are read, every
    cell is assigned the sgRNA with the highest count, and the result is
    wrapped in an AnnData object whose ``obs`` carries ``sgRNA``,
    ``perturbation_name`` and ``timepoint``. Cells whose barcode is absent
    from the sgRNA table, or that have no sgRNA counts at all, get NaN for
    ``sgRNA`` and ``perturbation_name``.

    Args:
        data_dir: Directory containing the dataset files

    Returns:
        Dictionary of AnnData objects for each timepoint
    """
    # File name prefixes (GEX sample, sgRNA sample) per timepoint
    sample_prefixes = {
        'day5': ("GSM8197841_SNU_D5_GEX", "GSM8197843_SNU_D5_sgRNA"),
        'day10': ("GSM8197842_SNU_D10_GEX", "GSM8197844_SNU_D10_sgRNA"),
    }

    def triplet(prefix):
        # matrix / features / barcodes paths for one sample
        return (
            os.path.join(data_dir, f"{prefix}_matrix.mtx.gz"),
            os.path.join(data_dir, f"{prefix}_features.tsv.gz"),
            os.path.join(data_dir, f"{prefix}_barcodes.tsv.gz"),
        )

    # Read the raw tables of every timepoint
    loaded = {}
    for timepoint, (gex_prefix, sgRNA_prefix) in sample_prefixes.items():
        print(f"Processing {timepoint} data...")
        X, var_df, obs_df = read_10x_mtx(*triplet(gex_prefix))
        sgRNA_df, sgRNA_features_df = read_sgRNA_data(*triplet(sgRNA_prefix))
        loaded[timepoint] = (X, var_df, obs_df, sgRNA_df, sgRNA_features_df)

    # Create sgRNA to target gene mapping from all timepoints, so a guide
    # that only appears in a later feature table is still resolved.
    sgRNA_to_gene = _build_sgRNA_gene_map(
        [tables[4] for tables in loaded.values()]
    )

    adatas = {}
    for timepoint, (X, var_df, obs_df, sgRNA_df, _) in loaded.items():
        # For each cell, find the sgRNA with the highest count
        sgRNA_assignments = _top_sgRNA_per_cell(sgRNA_df)

        # Add sgRNA and target gene information (aligned on the barcode index)
        obs_df['sgRNA'] = sgRNA_assignments
        obs_df['perturbation_name'] = sgRNA_assignments.map(sgRNA_to_gene)
        obs_df['timepoint'] = timepoint

        adatas[timepoint] = _assemble_adata(X, var_df, obs_df)

    return adatas

def harmonize_gse263747(data_dir):
    """
    Harmonize GSE263747 dataset into h5ad format.

    Downloads/extracts the raw files when needed, builds one AnnData object
    per timepoint, adds standardized ``obs`` metadata, and writes
    ``GSE263747_<timepoint>_harmonized.h5ad`` for each timepoint plus
    ``GSE263747_combined_harmonized.h5ad`` into ``data_dir``. A summary is
    printed at the end.

    Args:
        data_dir: Directory containing the dataset files
    """
    organism = 'Homo sapiens'
    cell_type = 'SNU-761 liver cancer cells'
    crispr_type = 'CRISPR KO'
    cancer_type = 'Liver Cancer'

    # Download and extract data if needed
    download_and_extract_gse263747(data_dir)

    # Process data
    adatas = process_gse263747(data_dir)

    # Harmonize metadata and save individual timepoint datasets
    for timepoint, adata in adatas.items():
        # Add standardized metadata
        adata.obs['organism'] = organism
        adata.obs['cell_type'] = cell_type
        adata.obs['crispr_type'] = crispr_type
        adata.obs['cancer_type'] = cancer_type

        # For each cell, set condition to "Control" if perturbation_name is exactly
        # "Non-targeting", otherwise keep the timepoint. Done as one vectorized
        # comparison instead of a Python call per row (missing values compare
        # unequal and therefore keep the timepoint).
        is_control = (adata.obs['perturbation_name'] == "Non-targeting").to_numpy()
        adata.obs['condition'] = np.where(is_control, "Control", timepoint)

        # Save harmonized data for this timepoint
        output_file = os.path.join(data_dir, f"GSE263747_{timepoint}_harmonized.h5ad")
        print(f"Saving harmonized data to {output_file}")
        adata.write(output_file)

    # Create a combined dataset. merge="same" keeps var annotations (gene_ids,
    # feature_types) that are identical in every timepoint; with the default
    # merge=None the combined object would carry no var columns.
    combined_adata = ad.concat(
        list(adatas.values()),
        label="timepoint",
        keys=list(adatas.keys()),
        index_unique='-',
        merge="same",
    )

    # Save combined dataset
    output_file = os.path.join(data_dir, "GSE263747_combined_harmonized.h5ad")
    print(f"Saving combined harmonized data to {output_file}")
    combined_adata.write(output_file)

    print("\nHarmonization complete!")
    print("Dataset summary:")
    print(f"  - Number of cells: {combined_adata.n_obs}")
    print(f"  - Number of genes: {combined_adata.n_vars}")
    print(f"  - Organism: {organism}")
    print(f"  - Cell type: {cell_type}")
    print(f"  - CRISPR type: {crispr_type}")
    print(f"  - Cancer type: {cancer_type}")
    print(f"  - Timepoints: {', '.join(adatas)}")
    print(f"  - Number of perturbations: {combined_adata.obs['perturbation_name'].nunique()}")


# Define the data directory. You can change this to a desired path.
data_dir = os.getcwd()  # or set to another directory, e.g., "/path/to/data"

# Run the harmonization process only when executed directly (script or
# notebook cell), so importing this module does not start a download.
if __name__ == "__main__":
    harmonize_gse263747(data_dir)
