In [None]:
# Install any required packages if not already installed (uncomment if needed)
# !pip install anndata requests bs4 tqdm scipy pandas numpy

import os
import sys
import glob
import gzip
import re
import gc
import requests
import tarfile
from typing import Dict, List, Tuple, Optional, Union
from pathlib import Path
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.sparse as sp
from tqdm import tqdm
import anndata
from bs4 import BeautifulSoup

# Constants
# GEO series accession identifying the dataset handled by this notebook.
GEO_ACCESSION = "GSE273271"
# URL that download_data() requests to fetch the archive for GEO_ACCESSION.
GEO_URL = f"https://www.ncbi.nlm.nih.gov/geo/download/?acc={GEO_ACCESSION}&format=file"
# File name (inside the data directory) under which download_data() stores the archive.
DOWNLOAD_FILENAME = f"{GEO_ACCESSION}_RAW.tar"

def download_data(data_dir: str) -> None:
    """
    Download the dataset archive if it is missing and extract it if needed.

    The archive is streamed to a temporary ``.part`` file and only renamed to
    its final name once the transfer has finished, so an interrupted or failed
    download is never mistaken for a complete archive on the next run.

    Args:
        data_dir: Directory to download the data to (created if absent)

    Raises:
        requests.RequestException: If the HTTP request fails or the server
            answers with an error status.
        RuntimeError: If the archive contains a member that would be
            extracted outside of ``data_dir``.
    """
    os.makedirs(data_dir, exist_ok=True)
    tar_path = os.path.join(data_dir, DOWNLOAD_FILENAME)

    if not os.path.exists(tar_path):
        print(f"Downloading {GEO_ACCESSION} dataset...")
        partial_path = tar_path + ".part"
        chunk_size = 1024 * 1024  # 1 MiB; far fewer Python-level iterations than 1 KiB

        try:
            # The timeout applies to connecting and to each read, not to the
            # whole transfer, so large archives still download fine.
            with requests.get(GEO_URL, stream=True, timeout=60) as response:
                # Fail loudly instead of saving an HTML error page as the tar.
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                with open(partial_path, 'wb') as file, \
                        tqdm(total=total_size, unit='iB', unit_scale=True) as progress_bar:
                    for data in response.iter_content(chunk_size):
                        file.write(data)
                        progress_bar.update(len(data))

            # Publish the archive only after it has been written completely.
            os.replace(partial_path, tar_path)
        except BaseException:
            # Also covers KeyboardInterrupt: never leave a truncated file behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

        print(f"Download complete: {tar_path}")
    else:
        print(f"Dataset already downloaded: {tar_path}")

    # Extract the tar file if needed (presence of any barcode file means "done")
    sample_files = glob.glob(os.path.join(data_dir, "GSM*_*_barcodes.tsv.gz"))
    if not sample_files:
        print(f"Extracting {tar_path}...")
        extract_root = os.path.realpath(data_dir)
        with tarfile.open(tar_path) as tar:
            # The archive comes from the network: refuse members whose path
            # would escape the target directory (e.g. "../x" or "/etc/x").
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(extract_root, member.name))
                if os.path.commonpath([extract_root, target]) != extract_root:
                    raise RuntimeError(
                        f"Refusing to extract {member.name!r} outside of {data_dir}"
                    )
            tar.extractall(path=data_dir)
        print("Extraction complete.")
    else:
        print("Files already extracted.")

def _parse_sample_title(title: str) -> Dict[str, Optional[str]]:
    """Pull time point, genotype, sex and replicate out of a GEO sample title.

    Each field is ``None`` when its pattern does not occur in ``title``.
    """
    time_point_match = re.search(r'E(\d+\.\d+|\d+)', title)
    genotype_match = re.search(r'(wild type|Chd8 het)', title, re.IGNORECASE)
    sex_match = re.search(r'(male|female)', title, re.IGNORECASE)
    rep_match = re.search(r'rep(\d+)', title)

    return {
        'time_point': time_point_match.group(0) if time_point_match else None,
        'genotype': genotype_match.group(0) if genotype_match else None,
        'sex': sex_match.group(0) if sex_match else None,
        # Only the replicate number is kept, not the "rep" prefix.
        'replicate': rep_match.group(1) if rep_match else None,
    }


def get_sample_metadata(sample_ids: List[str]) -> pd.DataFrame:
    """
    Extract metadata for each sample from the sample IDs and titles.

    The sample name is taken from the second ``_``-separated field of the
    sample's barcode file name (looked up relative to the current working
    directory); the remaining fields are parsed from the title shown on the
    sample's GEO page. A sample whose page cannot be fetched or parsed still
    gets a row containing only its ID and name.

    Args:
        sample_ids: List of sample IDs (GSM numbers)

    Returns:
        DataFrame with one row per sample and the columns ``sample_id``,
        ``sample_name``, ``title``, ``time_point``, ``genotype``, ``sex`` and
        ``replicate``. The columns are always present, even when no sample
        could be processed.
    """
    columns = ['sample_id', 'sample_name', 'title',
               'time_point', 'genotype', 'sex', 'replicate']
    sample_metadata = []

    for sample_id in tqdm(sample_ids, desc="Fetching sample metadata"):
        # Find the corresponding barcode file to get the sample name
        barcode_files = glob.glob(f"{sample_id}_*_barcodes.tsv.gz")
        if barcode_files:
            sample_name = os.path.basename(barcode_files[0]).split('_')[1]
        else:
            # Keep going: one missing file should not abort the whole run.
            print(f"Warning: no barcode file found for {sample_id}")
            sample_name = None

        # Get the title from the GEO website
        url = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={sample_id}"
        try:
            response = requests.get(url, timeout=30)
            # Do not try to parse an HTTP error page as a sample record.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            title_td = soup.find('td', string='Title')
            title = title_td.find_next('td').text.strip() if title_td else ""

            record = {
                'sample_id': sample_id,
                'sample_name': sample_name,
                'title': title,
            }
            record.update(_parse_sample_title(title))
            sample_metadata.append(record)
        except Exception as e:
            # Best effort: record what is known locally and carry on.
            print(f"Error processing {sample_id}: {e}")
            sample_metadata.append({
                'sample_id': sample_id,
                'sample_name': sample_name
            })

    # Passing `columns` guarantees the full schema even if every fetch failed
    # (or sample_ids is empty), so callers can always select these columns.
    return pd.DataFrame(sample_metadata, columns=columns)

def read_10x_data(matrix_file: str, features_file: str, barcodes_file: str) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load one 10X Genomics matrix/features/barcodes triplet.

    Args:
        matrix_file: Path to the matrix.mtx.gz file
        features_file: Path to the features.tsv.gz file
        barcodes_file: Path to the barcodes.tsv.gz file

    Returns:
        A 4-tuple ``(matrix, gene_ids, gene_names, barcodes)`` where
        ``matrix`` is the transpose of the stored matrix in CSR format,
        ``gene_ids`` / ``gene_names`` are the first / second column of the
        features table, and ``barcodes`` is the first column of the barcodes
        table.
    """
    # Shared options for both tab-separated, header-less tables.
    tsv_options = {'sep': '\t', 'header': None}

    # Cell barcodes: first column of the barcodes table
    with gzip.open(barcodes_file, 'rt') as handle:
        barcode_table = pd.read_csv(handle, **tsv_options)
    barcodes = barcode_table[0].values

    # Gene identifiers and names: first two columns of the features table
    with gzip.open(features_file, 'rt') as handle:
        feature_table = pd.read_csv(handle, **tsv_options)
    gene_ids = feature_table[0].values
    gene_names = feature_table[1].values

    # Expression matrix: transpose what is stored on disk, then convert to CSR
    stored = sio.mmread(matrix_file)
    matrix = stored.transpose().tocsr()

    return matrix, gene_ids, gene_names, barcodes

def _find_sample_file(sample_id: str, suffix: str) -> str:
    """Return the first file named ``<sample_id>_*_<suffix>`` in the working directory."""
    matches = glob.glob(f"{sample_id}_*_{suffix}")
    if not matches:
        raise FileNotFoundError(
            f"No file matching {sample_id}_*_{suffix} in {os.getcwd()}"
        )
    return matches[0]


def process_sample(sample_id: str, metadata_df: pd.DataFrame) -> anndata.AnnData:
    """
    Process a single sample and create an AnnData object.

    The sample's barcodes, features and matrix files are looked up relative
    to the current working directory.

    Args:
        sample_id: Sample ID (GSM number)
        metadata_df: DataFrame with sample metadata; must contain the columns
            ``sample_id`` and ``sample_name``

    Returns:
        AnnData object for the sample with float32 counts, gene names as
        variable names, gene IDs in ``var['gene_ids']`` and the sample-level
        metadata repeated on every cell in ``obs``

    Raises:
        FileNotFoundError: If one of the three input files is missing.
        KeyError: If ``metadata_df`` has no row for ``sample_id``.
    """
    # Find the files for this sample
    barcode_file = _find_sample_file(sample_id, "barcodes.tsv.gz")
    features_file = _find_sample_file(sample_id, "features.tsv.gz")
    matrix_file = _find_sample_file(sample_id, "matrix.mtx.gz")

    # Read the data
    matrix, gene_ids, gene_names, barcodes = read_10x_data(matrix_file, features_file, barcode_file)

    # Cast X explicitly: the `dtype=` constructor argument is deprecated in
    # recent anndata releases, whereas astype() works on every version.
    adata = anndata.AnnData(
        X=matrix.astype(np.float32),
        var=pd.DataFrame(index=gene_names),
        obs=pd.DataFrame(index=barcodes),
    )

    # Make variable names unique
    adata.var_names_make_unique()

    # Add gene IDs as a column in var (assigned by position, so the renaming
    # above does not affect the alignment)
    adata.var['gene_ids'] = gene_ids

    # Get sample metadata
    sample_rows = metadata_df[metadata_df['sample_id'] == sample_id]
    if sample_rows.empty:
        raise KeyError(f"No metadata row found for sample {sample_id}")
    sample_meta = sample_rows.iloc[0].to_dict()

    # Add sample metadata to each cell (scalar values are repeated on every row)
    for key, value in sample_meta.items():
        if key not in ['sample_id', 'sample_name', 'title']:
            adata.obs[key] = value

    # Add sample ID and name
    adata.obs['sample_id'] = sample_id
    adata.obs['sample_name'] = sample_meta['sample_name']

    return adata

def harmonize_metadata(adata: anndata.AnnData) -> anndata.AnnData:
    """
    Harmonize metadata according to the required format.

    Adds the columns ``organism``, ``cell_type``, ``crispr_type``,
    ``cancer_type``, ``condition`` and ``perturbation_name`` to ``adata.obs``.
    ``condition`` and ``perturbation_name`` are derived from the existing
    ``genotype`` column; the comparison ignores letter case and surrounding
    whitespace, because genotypes are captured from sample titles with a
    case-insensitive pattern and may therefore arrive as e.g. "Wild Type".

    Args:
        adata: AnnData object with raw metadata; ``adata.obs`` must contain a
            ``genotype`` column

    Returns:
        The same AnnData object (modified in place) with harmonized metadata.
        Cells whose genotype is missing or unrecognised get a missing
        ``condition`` and a ``perturbation_name`` of 'None'.

    Raises:
        KeyError: If ``adata.obs`` has no ``genotype`` column.
    """
    def _normalize(value):
        # Leave missing values (None / NaN) untouched; only strings are folded.
        return value.strip().lower() if isinstance(value, str) else value

    # Create standardized metadata fields

    # 1. organism
    adata.obs['organism'] = 'Mus musculus'

    # 2. cell_type
    adata.obs['cell_type'] = 'Neural Cells'  # Default

    # 3. crispr_type
    adata.obs['crispr_type'] = 'None'

    # 4. cancer_type
    adata.obs['cancer_type'] = 'Non-Cancer'

    # 5. condition: Map genotype to condition (keys are lower-case on purpose)
    genotype = adata.obs['genotype'].map(_normalize)
    condition_map = {
        'wild type': 'Control',
        'chd8 het': 'Test'
    }
    adata.obs['condition'] = genotype.map(condition_map)

    # 6. perturbation_name
    adata.obs['perturbation_name'] = 'None'
    adata.obs.loc[genotype == 'chd8 het', 'perturbation_name'] = 'Chd8'

    return adata

def process_and_save_individual_samples(sample_ids: List[str], metadata_df: pd.DataFrame, data_dir: str) -> None:
    """
    Process each sample individually and save to separate files.

    Every sample is converted to an AnnData object, harmonized and written to
    ``<data_dir>/individual_samples/<sample_id>.h5ad``. Samples that fail are
    reported and skipped. Afterwards three files are written to ``data_dir``:
    the concatenated per-cell metadata (CSV), the union of variable names
    (one per line, sorted) and a short text summary.

    Args:
        sample_ids: List of sample IDs
        metadata_df: DataFrame with sample metadata
        data_dir: Directory to save the results
    """
    # Create a directory for individual sample files
    samples_dir = os.path.join(data_dir, "individual_samples")
    os.makedirs(samples_dir, exist_ok=True)

    # Process each sample individually so only one is in memory at a time
    sample_files = []
    for sample_id in tqdm(sample_ids, desc="Processing samples"):
        try:
            adata = harmonize_metadata(process_sample(sample_id, metadata_df))

            # Save this sample
            sample_file = os.path.join(samples_dir, f"{sample_id}.h5ad")
            adata.write_h5ad(sample_file)
            sample_files.append(sample_file)

            # Clear memory
            del adata
            gc.collect()
        except Exception as e:
            # Best effort: a broken sample must not stop the remaining ones.
            print(f"Error processing sample {sample_id}: {e}")

    if not sample_files:
        # pd.concat() raises on an empty list; there is nothing to summarise.
        print("No samples were processed successfully; skipping metadata and summary files.")
        return

    # Create a combined metadata file without loading the full data
    print("Creating combined metadata file...")

    all_obs = []
    var_name_set = set()
    total_cells = 0

    for sample_file in tqdm(sample_files, desc="Collecting metadata"):
        adata = anndata.read_h5ad(sample_file, backed='r')
        try:
            obs = adata.obs.copy()
            obs['_sample_file'] = os.path.basename(sample_file)
            all_obs.append(obs)
            var_name_set.update(adata.var_names)
            total_cells += adata.n_obs
        finally:
            # Backed mode keeps the file open; release the handle explicitly
            # instead of waiting for garbage collection.
            adata.file.close()
        del adata
        gc.collect()

    combined_obs = pd.concat(all_obs, axis=0)
    # Sorted so the output file is reproducible between runs.
    all_var_names = sorted(var_name_set)

    metadata_file = os.path.join(data_dir, f"{GEO_ACCESSION}_metadata.csv")
    combined_obs.to_csv(metadata_file)

    var_names_file = os.path.join(data_dir, f"{GEO_ACCESSION}_var_names.txt")
    with open(var_names_file, 'w') as f:
        f.writelines(f"{var_name}\n" for var_name in all_var_names)

    summary_file = os.path.join(data_dir, f"{GEO_ACCESSION}_summary.txt")
    with open(summary_file, 'w') as f:
        f.write(f"Dataset: {GEO_ACCESSION}\n")
        f.write(f"Total samples: {len(sample_files)}\n")
        f.write(f"Total cells: {total_cells}\n")
        f.write(f"Total genes: {len(all_var_names)}\n")
        f.write(f"Metadata fields: {list(combined_obs.columns)}\n")

    print("Processing complete!")
    print(f"Individual sample files saved to: {samples_dir}")
    print(f"Metadata saved to: {metadata_file}")
    print(f"Variable names saved to: {var_names_file}")
    print(f"Summary saved to: {summary_file}")
    print(f"Total samples: {len(sample_files)}")
    print(f"Total cells: {total_cells}")
    print(f"Total genes: {len(all_var_names)}")
    print(f"Metadata fields: {list(combined_obs.columns)}")

def main(data_dir: Optional[str] = None) -> None:
    """
    Main function to process and harmonize the dataset.

    Downloads and extracts the archive if needed, discovers the samples from
    the extracted barcode files, fetches their metadata and writes one
    harmonized file per sample plus the summary files.

    Args:
        data_dir: Directory containing the data files. If None, uses the current directory.
    """
    # Resolve to an absolute path *before* changing directory. Otherwise a
    # relative path such as "data" would be interpreted a second time from
    # inside the data directory and the outputs would land in "data/data".
    data_dir = os.path.abspath(os.getcwd() if data_dir is None else data_dir)

    # Download and extract data if needed
    download_data(data_dir)

    # The sample-level helpers look files up relative to the working
    # directory, so switch to the data directory for the duration of the run.
    previous_cwd = os.getcwd()
    os.chdir(data_dir)
    try:
        # Get all sample files; sorted so the processing order is reproducible
        barcode_files = glob.glob("GSM*_*_barcodes.tsv.gz")
        sample_ids = sorted({os.path.basename(f).split('_')[0] for f in barcode_files})

        print(f"Found {len(sample_ids)} samples")
        if not sample_ids:
            print("No sample files found; nothing to process.")
            return

        # Get sample metadata
        metadata_df = get_sample_metadata(sample_ids)
        print("Sample metadata:")
        # reindex() tolerates columns that are absent (e.g. when every
        # metadata lookup failed) instead of raising KeyError.
        preview_columns = ['sample_id', 'time_point', 'genotype', 'sex', 'replicate']
        print(metadata_df.reindex(columns=preview_columns).head())

        # Process each sample individually to manage memory usage
        process_and_save_individual_samples(sample_ids, metadata_df, data_dir)
    finally:
        # Do not leave the notebook kernel in a different directory.
        os.chdir(previous_cwd)

# Cell entry point: executing this cell runs the whole download/processing
# pipeline by calling main() with the directory chosen below.
# For example, to use a folder named 'data' in the current directory:
data_dir = "data"  # Change this as needed, or set to None to use the current directory
main(data_dir)


In [None]:
import os
import glob
import gc
import pandas as pd
import numpy as np
import anndata
from tqdm import tqdm

# Constants
# GEO series accession; used below to build the metadata and combined-output file names.
GEO_ACCESSION = "GSE273271"

def combine_samples(data_dir: str) -> None:
    """
    Combine individual sample h5ad files into a single h5ad file.

    Reads every ``GSM*.h5ad`` file from ``<data_dir>/individual_samples``,
    concatenates them with an outer join on the variables and writes the
    result to ``<data_dir>/<GEO_ACCESSION>_combined.h5ad``. Files that cannot
    be loaded are reported and skipped. Nothing is written when the input
    directory is missing, empty, or no file could be loaded.

    Args:
        data_dir: Directory containing the individual sample h5ad files
    """
    # Path to the directory containing individual sample files
    samples_dir = os.path.join(data_dir, "individual_samples")

    # Check if the directory exists
    if not os.path.exists(samples_dir):
        print(f"Error: Directory {samples_dir} does not exist.")
        print("Please run the harmonize_data notebook cell first to create individual sample files.")
        return

    # Get all sample files; sorted so the cell order of the output is reproducible
    sample_files = sorted(glob.glob(os.path.join(samples_dir, "GSM*.h5ad")))

    if not sample_files:
        print(f"Error: No sample files found in {samples_dir}.")
        return

    print(f"Found {len(sample_files)} sample files.")

    # The metadata CSV is not needed for combining (each sample file carries
    # its own per-cell metadata), so it is no longer loaded into memory; its
    # absence is still reported.
    metadata_file = os.path.join(data_dir, f"{GEO_ACCESSION}_metadata.csv")
    if not os.path.exists(metadata_file):
        print(f"Warning: Metadata file {metadata_file} not found.")

    # Combine samples in batches to manage memory
    print("Combining samples...")
    combined_adata = None
    batch_size = 4  # adjust the batch size if needed
    n_batches = (len(sample_files) + batch_size - 1) // batch_size

    for start in range(0, len(sample_files), batch_size):
        batch_files = sample_files[start:start + batch_size]
        batch_num = start // batch_size + 1

        print(f"Processing batch {batch_num}/{n_batches}")

        # Load the samples in this batch
        batch_adatas = []
        for sample_file in tqdm(batch_files, desc=f"Batch {batch_num}"):
            try:
                batch_adatas.append(anndata.read_h5ad(sample_file))
            except Exception as e:
                # Best effort: skip unreadable files and keep the rest.
                print(f"Error loading {sample_file}: {e}")

        # Nothing loadable in this batch
        if not batch_adatas:
            continue

        # Concatenate samples in this batch
        batch_adata = anndata.concat(batch_adatas, join='outer')

        # Combine with previous batches
        if combined_adata is None:
            combined_adata = batch_adata
        else:
            combined_adata = anndata.concat([combined_adata, batch_adata], join='outer')

        # Clear memory
        del batch_adatas, batch_adata
        gc.collect()

    if combined_adata is None:
        # Every file failed to load; the calls below would fail on None.
        print("Error: None of the sample files could be loaded; nothing to save.")
        return

    # Make variable names unique
    combined_adata.var_names_make_unique()

    # Save the combined dataset
    output_file = os.path.join(data_dir, f"{GEO_ACCESSION}_combined.h5ad")
    print(f"Saving combined dataset to {output_file}...")
    combined_adata.write_h5ad(output_file)

    print("Combination complete!")
    print(f"Final dataset shape: {combined_adata.shape}")
    print(f"Metadata fields: {list(combined_adata.obs.columns)}")

def main(data_dir: str = None) -> None:
    """
    Entry point for merging the per-sample h5ad files into one dataset.

    Args:
        data_dir: Directory containing the individual sample h5ad files;
            the current working directory is used when this is None
    """
    # Fall back to the current working directory when no directory is given
    target_dir = os.getcwd() if data_dir is None else data_dir
    combine_samples(target_dir)

# Cell entry point: executing this cell combines the per-sample files by
# calling main() with the directory chosen below.
# For example, to use a folder named 'data' in the current directory:
data_dir = "data"  # Change this as needed, or set to None to use the current directory
main(data_dir)
