In [None]:
"""
GSE120861 Dataset Harmonizer for Jupyter Notebook

This notebook cell processes the GSE120861 dataset and harmonizes it into h5ad format.
It downloads the required files (if not already present), processes metadata, and (optionally)
downloads and processes a large expression matrix.

Usage:
    Call run_harmonizer() with optional parameters:
      - data_dir: Directory to store/download files (default: 'GSE120861')
      - download_expr_matrix: Set to True to download the large expression matrix automatically,
                              False to skip, or leave as None to be prompted.
"""

import gzip
import os
import shutil
import subprocess
import urllib.request
import warnings
import zlib

import anndata
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
warnings.filterwarnings('ignore')

# Common prefix of every supplementary file of the GSE120861 series.
_GEO_SUPPL_BASE = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE120nnn/GSE120861/suppl/'

# Required files: local file name -> download URL (remote name is the local
# name prefixed with "GSE120861_").
FILE_URLS = {
    local_name: f'{_GEO_SUPPL_BASE}GSE120861_{local_name}'
    for local_name in (
        'at_scale_screen.cells.txt.gz',
        'at_scale_screen.genes.txt.gz',
        'at_scale_screen.phenoData.txt.gz',
        'grna_groups.at_scale.txt.gz',
        'gene_gRNAgroup_pair_table.at_scale.txt.gz',
    )
}

# Optional large expression matrix file (8.9GB)
EXPRESSION_MATRIX_URL = f'{_GEO_SUPPL_BASE}GSE120861_at_scale_screen.exprs.mtx.gz'

def _discard_partial_download(path):
    """Delete *path* if it exists, so a failed download is never mistaken for a finished one."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def download_file(url, destination):
    """Download *url* to *destination*, trying wget first and urllib second.

    Parameters:
      url (str): Address of the file to fetch.
      destination (str): Local path the file is written to.

    Raises:
      Exception: whatever ``urllib.request.urlretrieve`` raised when both
      wget and the urllib fallback failed. In that case any partially
      written *destination* is removed before the exception propagates.
    """
    print(f"Downloading {url} to {destination}...")
    try:
        subprocess.run(['wget', '-q', '--show-progress', '-O', destination, url], check=True)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: wget ran but exited non-zero.
        # OSError (e.g. FileNotFoundError): wget is not installed or cannot be
        # executed. Either way the urllib fallback should still be attempted.
        print(f"Failed to download with wget. Trying urllib...")
        _discard_partial_download(destination)
        try:
            urllib.request.urlretrieve(url, destination)
        except Exception as e:
            print(f"Error downloading {url}: {e}")
            # Callers decide whether to download by checking that the file
            # exists, so a truncated file must not be left behind.
            _discard_partial_download(destination)
            raise
    print(f"Downloaded {destination}")

def ensure_files_exist(data_dir):
    """Create *data_dir* if needed and download every FILE_URLS entry missing from it."""
    os.makedirs(data_dir, exist_ok=True)
    for filename, url in FILE_URLS.items():
        target = os.path.join(data_dir, filename)
        if os.path.exists(target):
            print(f"File {filename} already exists.")
            continue
        download_file(url, target)

def read_gzipped_text(file_path):
    """Return the lines of a gzipped text file, each stripped of surrounding whitespace."""
    with gzip.open(file_path, 'rt') as handle:
        stripped_lines = list(map(str.strip, handle))
    return stripped_lines

def read_mtx_file(file_path):
    """Read a coordinate-format Matrix Market file into a sparse matrix.

    The file is read in a single pass: ``%`` comment lines are skipped, the
    first remaining line gives ``n_genes n_cells n_entries``, and every
    following non-blank line is one ``row col value`` triple with 1-based
    indices and an integer value.

    Parameters:
      file_path (str): Path to the uncompressed ``.mtx`` file.

    Returns:
      A SciPy sparse matrix of shape ``(n_cells, n_genes)``, i.e. the
      transpose of the stored genes x cells matrix.

    Raises:
      ValueError: if the size line is missing or malformed, or if the number
      of entries in the file differs from the count declared on the size line.
    """
    print(f"Reading matrix file {file_path}...")
    with open(file_path, 'r') as f:
        # Locate the size line: the first line that is neither blank nor a comment.
        size_fields = None
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith('%'):
                continue
            size_fields = stripped.split()
            break
        if size_fields is None or len(size_fields) < 3:
            raise ValueError(f"No Matrix Market size line found in {file_path}")
        n_genes, n_cells, n_entries = (int(value) for value in size_fields[:3])

        # Pre-sized arrays keep memory proportional to the declared entry
        # count instead of growing three Python lists of boxed integers.
        row_indices = np.empty(n_entries, dtype=np.int64)
        col_indices = np.empty(n_entries, dtype=np.int64)
        data = np.empty(n_entries, dtype=np.int64)

        # The same file handle continues right after the size line, so the
        # first data entry is the next line read (nothing is skipped).
        count = 0
        for line in f:
            parts = line.split()
            if not parts:
                continue
            if count == n_entries:
                raise ValueError(
                    f"{file_path} contains more entries than the {n_entries} declared"
                )
            row_indices[count] = int(parts[0]) - 1  # convert to 0-based index
            col_indices[count] = int(parts[1]) - 1
            data[count] = int(parts[2])
            count += 1

    if count != n_entries:
        raise ValueError(
            f"{file_path} declares {n_entries} entries but contains {count}"
        )

    # Build the matrix as stored (genes x cells) ...
    matrix = sparse.csr_matrix((data, (row_indices, col_indices)), shape=(n_genes, n_cells))
    return matrix.T  # ... and transpose to have cells as rows

def parse_phenodata(file_path):
    """Load the gzipped, whitespace-separated phenoData file as a DataFrame.

    Each line is split on whitespace. Lines with fewer fields than there are
    column names are dropped, and extra trailing fields are discarded. The
    count/score columns are converted to numbers, with unparseable values
    becoming NaN; all other columns stay as strings.
    """
    print(f"Parsing phenoData file {file_path}...")
    columns = [
        'sample_id', 'cell_id', 'umi_count', 'size_factor',
        'perturbations', 'perturbation_ids', 'grna_sequences',
        'detected_genes', 'detected_umis', 'frip', 'n_grnas',
        'batch', 'grna_library', 'prep_batch', 'chip_batch',
        'lane_batch', 'doublet_score'
    ]
    width = len(columns)
    with gzip.open(file_path, 'rt') as handle:
        split_lines = (raw.strip().split() for raw in handle)
        rows = [fields[:width] for fields in split_lines if len(fields) >= width]

    df = pd.DataFrame(rows, columns=columns)
    numeric_cols = ('umi_count', 'size_factor', 'detected_genes', 'detected_umis',
                    'frip', 'n_grnas', 'doublet_score')
    converted = {name: pd.to_numeric(df[name], errors='coerce') for name in numeric_cols}
    return df.assign(**converted)

def parse_grna_groups(file_path):
    """Load the header-less, tab-separated gRNA groups table and label each group.

    The two columns are named ``group_name`` and ``sequence``. A ``category``
    column is added based on substrings of the group name: 'TSS', 'NTC',
    'enhancer' (name contains 'chr'), or 'unknown' when nothing matches.
    """
    print(f"Parsing gRNA groups file {file_path}...")
    df = pd.read_csv(file_path, sep='\t', header=None)
    df.columns = ['group_name', 'sequence']
    df['category'] = 'unknown'
    # Rules are applied in order, so a later rule wins when a name matches several.
    category_rules = (('TSS', 'TSS'), ('NTC', 'NTC'), ('chr', 'enhancer'))
    for pattern, label in category_rules:
        matches = df['group_name'].str.contains(pattern)
        df.loc[matches, 'category'] = label
    return df

def parse_gene_grna_pairs(file_path):
    """Load the tab-separated gene-gRNA pair table (first line is the header) as a DataFrame."""
    print(f"Parsing gene-gRNA pair table {file_path}...")
    return pd.read_csv(file_path, sep='\t', low_memory=False)

# Column order of the frame returned by extract_perturbation_info.
_PERTURBATION_INFO_COLUMNS = [
    'cell_id', 'tss_targets', 'enhancer_targets',
    'is_control', 'n_perturbations', 'perturbation_type',
]


def _scan_tss_targets(tokens):
    """Return every token that is immediately followed by the token 'TSS'."""
    targets = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and tokens[i + 1] == 'TSS':
            targets.append(tokens[i])
            i += 2  # skip the consumed 'TSS' marker
        else:
            i += 1
    return targets


def _scan_enhancer_targets(tokens):
    """Return every 'chr...' token followed by 'top' or 'second' and then 'two'."""
    targets = []
    i = 0
    while i < len(tokens):
        if (i + 2 < len(tokens)
                and tokens[i].startswith('chr')
                and tokens[i + 1] in ('top', 'second')
                and tokens[i + 2] == 'two'):
            targets.append(tokens[i])
            i += 3  # skip the consumed rank and 'two' tokens
        else:
            i += 1
    return targets


def extract_perturbation_info(phenodata_df, grna_groups_df):
    """Derive per-cell perturbation targets from the phenoData 'perturbations' strings.

    Each 'perturbations' value is split on '_' and scanned twice: once for
    ``<name>_TSS`` pairs and once for ``chr..._(top|second)_two`` triples. A
    cell with neither kind of match is flagged as a control.

    Parameters:
      phenodata_df (DataFrame): must provide 'cell_id' and 'perturbations'.
      grna_groups_df (DataFrame): accepted for interface compatibility; the
          classification does not currently read it.

    Returns:
      DataFrame with one row per input row and the columns cell_id,
      tss_targets, enhancer_targets (comma-joined, or the string 'None'),
      is_control, n_perturbations and perturbation_type ('None', 'TSS',
      'enhancer' or 'mixed'). The columns are present even for empty input so
      that a later merge on 'cell_id' still works.
    """
    print("Extracting perturbation information...")
    records = []
    # Iterating the two needed columns avoids building a Series per row,
    # which is what made iterrows() slow on large tables.
    for cell_id, perturbations in zip(phenodata_df['cell_id'], phenodata_df['perturbations']):
        tokens = perturbations.split('_')
        tss_targets = _scan_tss_targets(tokens)
        enhancer_targets = _scan_enhancer_targets(tokens)
        is_control = not tss_targets and not enhancer_targets

        if is_control:
            perturbation_type = 'None'
        elif not enhancer_targets:
            perturbation_type = 'TSS'
        elif not tss_targets:
            perturbation_type = 'enhancer'
        else:
            perturbation_type = 'mixed'

        records.append({
            'cell_id': cell_id,
            'tss_targets': ','.join(tss_targets) if tss_targets else 'None',
            'enhancer_targets': ','.join(enhancer_targets) if enhancer_targets else 'None',
            'is_control': is_control,
            'n_perturbations': len(tss_targets) + len(enhancer_targets),
            'perturbation_type': perturbation_type,
        })
    return pd.DataFrame(records, columns=_PERTURBATION_INFO_COLUMNS)

def create_harmonized_metadata(phenodata_df, perturbation_df, gene_grna_pairs_df):
    """Join cell metadata with perturbation info and add the harmonized columns.

    Parameters:
      phenodata_df (DataFrame): per-cell metadata containing 'cell_id'.
      perturbation_df (DataFrame): must contain 'cell_id', 'is_control',
          'perturbation_type', 'tss_targets' and 'enhancer_targets'.
      gene_grna_pairs_df: accepted for interface compatibility; not read here.

    Returns:
      DataFrame holding the inner join of both inputs on 'cell_id' plus the
      constant columns organism, cell_type, crispr_type and cancer_type, a
      'condition' column ('Control' / 'Test') and a 'perturbation_name' column.
    """
    print("Creating harmonized metadata...")
    metadata = pd.merge(phenodata_df, perturbation_df, on='cell_id')
    metadata['organism'] = 'Homo sapiens'
    metadata['cell_type'] = 'K562'
    metadata['crispr_type'] = 'CRISPRi'
    metadata['cancer_type'] = 'Chronic myelogenous leukemia'
    metadata['condition'] = metadata['is_control'].map({True: 'Control', False: 'Test'})

    def format_perturbation_name(is_control, perturbation_type, tss_targets, enhancer_targets):
        """Pick the display name for one cell from its perturbation fields."""
        if is_control:
            return 'Non-targeting'
        if perturbation_type == 'TSS':
            return tss_targets
        if perturbation_type == 'enhancer':
            return enhancer_targets
        return f"{tss_targets} + {enhancer_targets}"

    # Built from the four needed columns rather than DataFrame.apply(axis=1):
    # apply returns a DataFrame (not a Series) for an empty frame, which makes
    # the column assignment fail, and it is much slower on large frames.
    metadata['perturbation_name'] = [
        format_perturbation_name(*fields)
        for fields in zip(
            metadata['is_control'],
            metadata['perturbation_type'],
            metadata['tss_targets'],
            metadata['enhancer_targets'],
        )
    ]
    return metadata

def create_h5ad_metadata_only(data_dir, output_file):
    """Write the cell/gene names and per-cell metadata to *output_file* (no expression data).

    Downloads any missing input files into *data_dir*, parses them, and stores
    the result with h5py: one dataset per metadata column under ``obs``
    (harmonized columns keep their name, all others get an ``original_``
    prefix), the cell and gene names as ``_index`` under ``obs`` and ``var``,
    and descriptive file attributes.
    """
    print(f"Creating h5ad metadata file {output_file}...")
    ensure_files_exist(data_dir)

    def in_data_dir(name):
        return os.path.join(data_dir, name)

    cells = read_gzipped_text(in_data_dir('at_scale_screen.cells.txt.gz'))
    genes = read_gzipped_text(in_data_dir('at_scale_screen.genes.txt.gz'))
    print(f"Found {len(cells)} cells and {len(genes)} genes")

    phenodata_df = parse_phenodata(in_data_dir('at_scale_screen.phenoData.txt.gz'))
    grna_groups_df = parse_grna_groups(in_data_dir('grna_groups.at_scale.txt.gz'))
    gene_grna_pairs_df = parse_gene_grna_pairs(
        in_data_dir('gene_gRNAgroup_pair_table.at_scale.txt.gz')
    )

    perturbation_df = extract_perturbation_info(phenodata_df, grna_groups_df)
    metadata_df = create_harmonized_metadata(phenodata_df, perturbation_df, gene_grna_pairs_df)

    harmonized_cols = ('organism', 'cell_type', 'crispr_type', 'cancer_type',
                       'condition', 'perturbation_name')
    file_attrs = {
        'dataset_id': 'GSE120861',
        'title': 'A genome-wide framework for mapping gene regulation via cellular genetic screens',
        'description': 'CRISPRi screen in K562 cells targeting candidate enhancers',
        'note': 'This file contains metadata only. Expression matrix not included due to size constraints.',
    }

    with h5py.File(output_file, 'w') as h5:
        obs_group = h5.create_group('obs')
        var_group = h5.create_group('var')
        for col in metadata_df.columns:
            series = metadata_df[col]
            if col in harmonized_cols:
                # Harmonized columns are always stored as byte strings.
                obs_group.create_dataset(col, data=series.values.astype('S'))
                continue
            values = series.values
            if series.dtype == 'object':
                # Object columns are converted to byte strings before storing.
                values = values.astype('S')
            obs_group.create_dataset(f"original_{col}", data=values)
        obs_group.create_dataset('_index', data=np.array(cells, dtype='S'))
        var_group.create_dataset('_index', data=np.array(genes, dtype='S'))
        for attr_name, attr_value in file_attrs.items():
            h5.attrs[attr_name] = attr_value

    print(f"Created h5ad metadata file {output_file}")

def download_expression_matrix(data_dir):
    """Make sure the uncompressed expression matrix exists in *data_dir*.

    Downloads ``at_scale_screen.exprs.mtx.gz`` when neither it nor the
    decompressed ``at_scale_screen.exprs.mtx`` is present, then decompresses
    it. A downloaded file that does not start with the gzip magic bytes is
    assumed to be plain text already and is renamed to the ``.mtx`` name.

    Parameters:
      data_dir (str): Directory holding (or receiving) the matrix files.

    Returns:
      bool: True when the ``.mtx`` file is available afterwards, False when
      decompression failed.
    """
    print("Downloading expression matrix file...")
    exprs_gz_file = os.path.join(data_dir, 'at_scale_screen.exprs.mtx.gz')
    exprs_file = os.path.join(data_dir, 'at_scale_screen.exprs.mtx')

    # Download if neither the gzipped nor decompressed file exists.
    if not os.path.exists(exprs_gz_file) and not os.path.exists(exprs_file):
        download_file(EXPRESSION_MATRIX_URL, exprs_gz_file)
    else:
        print("Expression matrix file already exists.")

    if os.path.exists(exprs_file):
        return True

    # Check the first two bytes to decide whether decompression is needed.
    with open(exprs_gz_file, 'rb') as f:
        magic = f.read(2)
    if magic != b'\x1f\x8b':
        print(f"File {exprs_gz_file} is not gzipped. Assuming it is already in plain text.")
        os.rename(exprs_gz_file, exprs_file)
        return True

    print(f"Decompressing {exprs_gz_file}...")
    # Decompress into a temporary name and move it into place only on success,
    # so an interrupted run never leaves a truncated .mtx that a later run
    # would accept as complete.
    partial_file = exprs_file + '.part'
    try:
        with gzip.open(exprs_gz_file, 'rb') as f_in, open(partial_file, 'wb') as f_out:
            # Stream in chunks: the decompressed matrix is far too large to
            # hold in memory with a single read().
            shutil.copyfileobj(f_in, f_out, 16 * 1024 * 1024)
        os.replace(partial_file, exprs_file)
    except (OSError, EOFError, zlib.error) as e:
        print(f"Error decompressing file: {e}")
        if os.path.exists(partial_file):
            os.remove(partial_file)
        return False
    print(f"Decompressed to {exprs_file}")
    return True


def create_full_h5ad(data_dir, metadata_file, output_file):
    """Combine the expression matrix with the stored metadata into a full h5ad file.

    Reads cell names, gene names and the harmonized obs columns from
    *metadata_file*, loads ``at_scale_screen.exprs.mtx`` from *data_dir*,
    and writes the resulting AnnData object to *output_file*.

    Returns:
      bool: False when the matrix file is missing, True after a successful write.
    """
    print(f"Creating full h5ad file {output_file}...")

    exprs_file = os.path.join(data_dir, 'at_scale_screen.exprs.mtx')
    if not os.path.exists(exprs_file):
        print(f"Expression matrix file {exprs_file} not found. Please download it first.")
        return False

    def as_text(dataset):
        # Datasets were stored as byte strings; decode them back to str.
        return [raw.decode('utf-8') for raw in dataset[:]]

    harmonized_keys = ('organism', 'cell_type', 'crispr_type', 'cancer_type',
                       'condition', 'perturbation_name')
    with h5py.File(metadata_file, 'r') as h5:
        cells = as_text(h5['obs']['_index'])
        genes = as_text(h5['var']['_index'])
        obs_data = {key: as_text(h5['obs'][key]) for key in harmonized_keys}

    # Sparse matrix with cells as rows
    matrix = read_mtx_file(exprs_file)

    adata = anndata.AnnData(
        X=matrix,
        obs=pd.DataFrame(obs_data, index=cells),
        var=pd.DataFrame(index=genes),
    )

    # Dataset-level annotations
    dataset_info = {
        'dataset_id': 'GSE120861',
        'title': 'A genome-wide framework for mapping gene regulation via cellular genetic screens',
        'description': 'CRISPRi screen in K562 cells targeting candidate enhancers',
    }
    for info_key, info_value in dataset_info.items():
        adata.uns[info_key] = info_value

    adata.write_h5ad(output_file)
    print(f"Created full h5ad file {output_file}")
    return True

def run_harmonizer(data_dir='GSE120861', download_expr_matrix=None):
    """
    Build the metadata-only file and, if requested, the full h5ad file.

    Parameters:
      data_dir (str): Directory for dataset files (default: 'GSE120861')
      download_expr_matrix (bool or None):
          True to download the large expression matrix automatically,
          False to skip,
          None to ask interactively (only the answer 'y' means yes).
    """
    metadata_file = os.path.join(data_dir, 'GSE120861_harmonized_metadata.h5ad')
    full_h5ad_file = os.path.join(data_dir, 'GSE120861_harmonized.h5ad')

    create_h5ad_metadata_only(data_dir, metadata_file)

    if download_expr_matrix is None:
        answer = input("\nThe expression matrix file is large (8.9GB). Do you want to download it? (y/n): ")
        download_expr_matrix = answer.strip().lower() == 'y'

    if not download_expr_matrix:
        # Tell the user how to finish the job manually later.
        print("\nSkipping expression matrix download.")
        print("To download and process the expression matrix later, run:")
        print(f"!wget -O {data_dir}/at_scale_screen.exprs.mtx.gz {EXPRESSION_MATRIX_URL}")
        print(f"!gunzip {data_dir}/at_scale_screen.exprs.mtx.gz")
        print(f"Then, call create_full_h5ad('{data_dir}', '{metadata_file}', '{full_h5ad_file}')")
        return

    matrix_ready = download_expression_matrix(data_dir)
    if matrix_ready:
        create_full_h5ad(data_dir, metadata_file, full_h5ad_file)

# Run the harmonizer with its defaults: files go to 'GSE120861' and the user is
# prompted before the large expression matrix is downloaded. Pass data_dir /
# download_expr_matrix to run_harmonizer() to change either.
run_harmonizer()


In [None]:
# Set your input and output file paths.
# The default input is the full h5ad that run_harmonizer() writes when it is
# called with its default data_dir and the expression matrix is processed.
input_h5ad = "GSE120861/GSE120861_harmonized.h5ad"      # Path to your original h5ad file
output_h5ad = "filtered_output.h5ad"  # Update with your desired output path

import anndata
import pandas as pd
import re

# Read the input file; any failure is reported as a RuntimeError naming the path.
try:
    adata = anndata.read_h5ad(input_h5ad)
    print(f"Loaded h5ad file with {adata.n_obs} cells.")
except Exception as load_error:
    raise RuntimeError(f"Error loading {input_h5ad}: {load_error}")

# The cleaning step below needs obs['perturbation_name']; stop early without it.
has_perturbation_column = "perturbation_name" in adata.obs.columns
if not has_perturbation_column:
    raise ValueError("'perturbation_name' column not found in the AnnData object.")

def clean_perturbation(name):
    """
    Drop 'chr...' entries from a perturbation name.

    The name is split on commas and on plus signs (with optional surrounding
    whitespace). Empty pieces and pieces starting with 'chr' (case-insensitive)
    are discarded, and the rest are joined with '+'. The result is an empty
    string when nothing is left.
    """
    pieces = (piece.strip() for piece in re.split(r'\s*\+\s*|,', name))
    kept = [piece for piece in pieces if piece and not piece.lower().startswith("chr")]
    # Joining an empty list already yields "".
    return "+".join(kept)

# Rewrite every perturbation name with clean_perturbation
cleaned_names = adata.obs["perturbation_name"].apply(clean_perturbation)
adata.obs["perturbation_name"] = cleaned_names

# Show which names exist after cleaning
print("Unique perturbation names after cleaning:")
print(adata.obs["perturbation_name"].unique())

# Keep only the cells whose cleaned name is non-empty
initial_count = adata.n_obs
has_name = adata.obs["perturbation_name"] != ""
adata = adata[has_name].copy()
filtered_count = adata.n_obs
print(f"Removed {initial_count - filtered_count} cells with an empty perturbation name; {filtered_count} cells remain.")

# Write the filtered object; any failure is reported as a RuntimeError naming the path
try:
    adata.write_h5ad(output_h5ad)
    print(f"Modified h5ad file saved to: {output_h5ad}")
except Exception as save_error:
    raise RuntimeError(f"Error saving {output_h5ad}: {save_error}")
