In [None]:
# GSE153056 Dataset Harmonizer for Jupyter Notebook

import os
import gzip
import shutil
import requests
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from pathlib import Path
from tqdm import tqdm
from scipy import sparse

# Seed NumPy's global random number generator with a fixed value so that any
# later draws from np.random in this session are repeatable between runs.
# NOTE(review): none of the code visible in this cell draws random numbers
# itself — confirm whether a downstream step relies on this seed.
np.random.seed(42)

class GSE153056Harmonizer:
    """
    Class to download and harmonize the GSE153056 dataset.

    ``run`` chains the steps: download the supplementary files, load the
    ECCITE-seq cDNA/ADT/GDO counts and per-cell metadata into one AnnData
    object, add standardized annotation columns, and write the result to
    ``<data_dir>/GSE153056_harmonized.h5ad``.
    """

    # Names of the files read from the data directory after extraction.
    _RNA_COUNTS_FILE = "GSM4633614_ECCITE_cDNA_counts.tsv.gz"
    _ADT_COUNTS_FILE = "GSM4633615_ECCITE_ADT_counts.tsv.gz"
    _GDO_COUNTS_FILE = "GSM4633618_ECCITE_GDO_counts.tsv.gz"
    _METADATA_FILE = "GSE153056_ECCITE_metadata.tsv.gz"

    def __init__(self, data_dir):
        """
        Initialize the harmonizer with the data directory.

        Parameters:
        -----------
        data_dir : str
            Path to the directory where the data will be downloaded and processed.
            It is created (including parents) if it does not exist.
        """
        self.data_dir = Path(data_dir)
        self.accession = "GSE153056"
        self.download_urls = {
            "raw_tar": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE153nnn/GSE153056/suppl/GSE153056_RAW.tar",
            "eccite_metadata": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE153nnn/GSE153056/suppl/GSE153056_ECCITE_metadata.tsv.gz"
        }

        # Create the data directory if it doesn't exist
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def download_files(self):
        """
        Download the dataset files if they don't exist and unpack the RAW tar.

        The tar archive is unpacked whenever the cDNA count file is missing,
        including when the archive itself was left over from an earlier run.

        Raises:
        -------
        requests.HTTPError
            If the server answers a download request with an error status.
        """
        print("Checking and downloading required files...")

        for file_key, url in self.download_urls.items():
            file_name = url.split('/')[-1]
            file_path = self.data_dir / file_name

            if file_path.exists():
                print(f"{file_name} already exists, skipping download.")
            else:
                print(f"Downloading {file_name}...")
                self._download(url, file_path)

            # Extraction is checked independently of the download so that an
            # archive fetched by a previous (interrupted) run still gets unpacked.
            if file_key == "raw_tar" and not (self.data_dir / self._RNA_COUNTS_FILE).exists():
                print(f"Extracting {file_name}...")
                shutil.unpack_archive(file_path, self.data_dir)

    def _download(self, url, file_path):
        """Stream ``url`` to ``file_path``, writing to a '.part' file first."""
        # A truncated download must never be mistaken for a finished file, so
        # the data lands under a temporary name and is renamed only on success.
        partial_path = file_path.with_name(file_path.name + ".part")

        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly instead of saving an HTML error page as the dataset.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(partial_path, 'wb') as f, tqdm(
                desc=file_path.name,
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    bar.update(f.write(chunk))

        partial_path.replace(file_path)

    def read_count_matrix_sparse(self, file_path):
        """
        Read a gzipped, tab-separated count matrix file into a sparse matrix.

        Each data row is ``<row name>\\t<count>\\t<count>...``. The header may
        either carry a leading corner field (empty or named) or list only the
        column names; both layouts are detected from the field counts.
        Values are parsed as floats and truncated to integers; only entries
        greater than zero are stored. Blank lines are skipped.

        Parameters:
        -----------
        file_path : str or Path
            Path to the gzipped count matrix file.

        Returns:
        --------
        tuple
            A tuple containing (sparse_matrix, genes, cell_barcodes), where
            sparse_matrix is a CSR matrix of shape (len(genes), len(cell_barcodes)).

        Raises:
        -------
        ValueError
            If a row has a different number of fields than the first data row,
            if the header does not line up with the data rows, or if a value
            cannot be parsed as a number.
        """
        print(f"Reading {file_path} as sparse matrix...")

        genes = []
        data_chunks = []
        index_chunks = []
        indptr = [0]
        n_values = None  # number of count fields per data row
        stored = 0

        with gzip.open(file_path, 'rt') as f:
            # Only trailing whitespace is removed: a leading tab marks the empty
            # corner field and stripping it would shift every barcode by one.
            header = f.readline().rstrip().split('\t')

            for i, line in enumerate(f):
                if i % 1000 == 0:
                    print(f"Processed {i} rows...")

                line = line.rstrip()
                if not line:
                    continue

                parts = line.split('\t')
                if n_values is None:
                    n_values = len(parts) - 1
                elif len(parts) - 1 != n_values:
                    raise ValueError(
                        f"Row {i + 1} of {file_path} has {len(parts) - 1} values, "
                        f"expected {n_values}"
                    )

                genes.append(parts[0])

                # Convert counts to sparse format (float parse, then truncate)
                row_counts = np.array(parts[1:], dtype=np.float64).astype(np.int64)
                nonzero = np.flatnonzero(row_counts > 0)
                data_chunks.append(row_counts[nonzero])
                index_chunks.append(nonzero)
                stored += nonzero.size
                indptr.append(stored)

        # Decide whether the header's first field is a corner cell or a barcode.
        if n_values is None or len(header) == n_values + 1:
            cell_barcodes = header[1:]
        elif len(header) == n_values:
            cell_barcodes = header
        else:
            raise ValueError(
                f"Header of {file_path} has {len(header)} fields but data rows "
                f"have {n_values} values"
            )

        if data_chunks:
            data = np.concatenate(data_chunks)
            indices = np.concatenate(index_chunks)
        else:
            data = np.empty(0, dtype=np.int64)
            indices = np.empty(0, dtype=np.int64)

        # Create sparse matrix
        X = sparse.csr_matrix((data, indices, indptr), shape=(len(genes), len(cell_barcodes)))
        print(f"Finished reading {file_path}, shape: {X.shape}")

        return X, genes, cell_barcodes

    @staticmethod
    def _align_counts_to_cells(counts, cell_index):
        """Return ``counts`` as a cells x features frame ordered like ``cell_index``."""
        # The axis sharing more labels with the cell barcodes is the cell axis;
        # on a tie the table is treated as features x cells, like the cDNA file.
        cells_in_columns = counts.columns.isin(cell_index).sum()
        cells_in_rows = counts.index.isin(cell_index).sum()
        if cells_in_columns >= cells_in_rows:
            counts = counts.T

        # reindex needs unique source labels; keep the first row per barcode.
        if not counts.index.is_unique:
            counts = counts[~counts.index.duplicated(keep='first')]

        # Cells absent from the table get zero counts.
        return counts.reindex(cell_index).fillna(0)

    @staticmethod
    def _match_barcodes_by_substring(cells, metadata_labels):
        """Map each cell to the first metadata label that contains it or is contained in it."""
        # NOTE: this is quadratic in the number of barcodes; it only runs when
        # no barcode matches the metadata index exactly.
        labels = list(metadata_labels)
        matches = {}
        for cell in cells:
            found = next((label for label in labels if cell in label or label in cell), None)
            if found is not None:
                matches[cell] = found
        return matches

    def process_eccite_seq_data(self):
        """
        Process the ECCITE-seq data from the dataset.

        Loads the cDNA counts into ``X`` (cells x genes), stores the ADT and
        GDO counts aligned to the cells in ``obsm['protein_counts']`` and
        ``obsm['guide_counts']`` (their column labels go to
        ``uns['protein_names']`` / ``uns['guide_names']``), and copies the
        metadata columns into ``obs`` for every cell found in the metadata.

        Returns:
        --------
        anndata.AnnData
            The processed ECCITE-seq data as an AnnData object.
        """
        print("Processing ECCITE-seq data...")

        # Read RNA counts as sparse matrix
        rna_matrix, genes, cell_barcodes = self.read_count_matrix_sparse(
            self.data_dir / self._RNA_COUNTS_FILE
        )

        # Read ADT and GDO counts (these are smaller, so we can use pandas).
        # pandas decompresses .gz paths itself, so no file handle is left open.
        print("Reading ADT counts...")
        adt_counts = pd.read_csv(self.data_dir / self._ADT_COUNTS_FILE, sep='\t', index_col=0)
        print(f"ADT counts shape: {adt_counts.shape}")

        print("Reading GDO counts...")
        gdo_counts = pd.read_csv(self.data_dir / self._GDO_COUNTS_FILE, sep='\t', index_col=0)
        print(f"GDO counts shape: {gdo_counts.shape}")

        print("Reading metadata...")
        metadata = pd.read_csv(self.data_dir / self._METADATA_FILE, sep='\t', index_col=0)
        print(f"Metadata shape: {metadata.shape}")

        print("Creating AnnData object...")
        # Create AnnData object with RNA counts (transposed to cells x genes)
        adata = ad.AnnData(
            X=rna_matrix.T,
            obs=pd.DataFrame(index=cell_barcodes),
            var=pd.DataFrame(index=genes)
        )
        print(f"AnnData shape: {adata.shape}")

        # Attach the measured ADT and GDO counts, one row per cell in adata
        print("Adding protein and guide counts...")
        protein_counts = self._align_counts_to_cells(adt_counts, adata.obs.index)
        guide_counts = self._align_counts_to_cells(gdo_counts, adata.obs.index)
        adata.obsm['protein_counts'] = protein_counts.to_numpy(dtype=float)
        adata.obsm['guide_counts'] = guide_counts.to_numpy(dtype=float)

        # Add metadata for cells that exist in both the count matrix and metadata
        print("Adding metadata to AnnData...")
        common_cells = adata.obs.index.intersection(metadata.index)
        print(f"Common cells: {len(common_cells)} out of {adata.n_obs} cells")

        if len(common_cells) == 0:
            # If no common cells found, try to match by removing the prefix
            print("No common cells found. Trying to match by removing prefix...")
            cell_to_metadata = self._match_barcodes_by_substring(adata.obs.index, metadata.index)
            print(f"Found {len(cell_to_metadata)} matches after removing prefix")

            # Re-label the matched metadata rows with the AnnData barcodes
            matched = metadata.loc[list(cell_to_metadata.values())]
            matched = matched.set_axis(list(cell_to_metadata.keys()), axis=0)
        else:
            matched = metadata

        # One aligned frame keeps each column's own dtype; unmatched cells get NaN.
        aligned_metadata = matched.reindex(adata.obs.index)
        for col in metadata.columns:
            adata.obs[col] = aligned_metadata[col].to_numpy()

        # Names describing the columns of the obsm matrices above
        adata.uns['protein_names'] = [str(name) for name in protein_counts.columns]
        adata.uns['guide_names'] = [str(name) for name in guide_counts.columns]

        # Add experiment type
        adata.obs['experiment_type'] = 'ECCITE-seq'

        print("ECCITE-seq processing complete")
        return adata

    def harmonize_data(self, adata):
        """
        Harmonize the data according to the specified format.

        Adds the constant columns ``organism``, ``cell_type``, ``crispr_type``
        and ``cancer_type``; derives ``condition`` from the ``con`` column
        ('tx' -> 'Test', 'nt' -> 'Control', anything else 'Unknown') and
        ``perturbation_name`` from the ``gene`` column ('NT' ->
        'Non-targeting', missing -> 'Unknown'); normalizes column dtypes; and
        stores study information in ``uns['study']``. ``adata`` is modified in
        place.

        Parameters:
        -----------
        adata : anndata.AnnData
            The AnnData object to harmonize.

        Returns:
        --------
        anndata.AnnData
            The harmonized AnnData object (the same object that was passed in).
        """
        print("Harmonizing ECCITE-seq data...")

        # Create standardized metadata columns
        adata.obs['organism'] = 'Homo sapiens'
        adata.obs['cell_type'] = 'THP-1'  # THP-1 monocyte cell line
        adata.obs['crispr_type'] = 'CRISPR KO'
        adata.obs['cancer_type'] = 'Leukemia'  # THP-1 is a leukemia cell line

        # Set condition based on metadata
        if 'con' in adata.obs.columns:
            # Convert to string first to avoid type issues; unmapped values
            # (including missing ones) come back as NaN and become 'Unknown'.
            condition = adata.obs['con'].astype(str).map({'tx': 'Test', 'nt': 'Control'})
            adata.obs['condition'] = condition.fillna('Unknown')
        else:
            adata.obs['condition'] = 'Unknown'

        # Set perturbation name based on guide information
        if 'gene' in adata.obs.columns:
            gene = adata.obs['gene']
            names = gene.astype(str)
            # astype(str) turns missing values into the text 'nan', so the
            # missing mask has to come from the untouched column.
            names = names.where(gene.notna(), 'Unknown')
            # Cells with gene='NT' are non-targeting controls
            names = names.mask(names == 'NT', 'Non-targeting')
            adata.obs['perturbation_name'] = names
        else:
            adata.obs['perturbation_name'] = 'Unknown'

        # Fix data types for all columns to ensure compatibility with h5ad
        for col in adata.obs.columns:
            # Convert all object columns to string
            if adata.obs[col].dtype == 'object' or isinstance(adata.obs[col].dtype, pd.CategoricalDtype):
                adata.obs[col] = adata.obs[col].astype(str)
            # Convert all float columns with NaNs to float
            elif pd.api.types.is_float_dtype(adata.obs[col]):
                adata.obs[col] = adata.obs[col].astype(float)
            # Convert all int columns to int
            elif pd.api.types.is_integer_dtype(adata.obs[col]):
                adata.obs[col] = adata.obs[col].astype(int)

        # Add study information
        adata.uns['study'] = {
            'accession': self.accession,
            'title': 'Characterizing the molecular regulation of inhibitory immune checkpoints with multi-modal single-cell screens',
            'authors': 'Papalexi E, et al.',
            'description': 'ECCITE-seq study of immune checkpoint regulation in THP-1 cells'
        }

        return adata

    def run(self):
        """
        Run the harmonization process.

        Downloads the files if needed, processes and harmonizes the
        ECCITE-seq data, and writes it to
        ``<data_dir>/<accession>_harmonized.h5ad``.

        Returns:
        --------
        str
            Path to the harmonized h5ad file.
        """
        # Download files if needed
        self.download_files()

        # Process ECCITE-seq data
        eccite_seq_data = self.process_eccite_seq_data()

        # Harmonize data
        eccite_seq_data_harmonized = self.harmonize_data(eccite_seq_data)

        # Save harmonized data
        output_path = self.data_dir / f"{self.accession}_harmonized.h5ad"
        print(f"Saving harmonized data to {output_path}")
        eccite_seq_data_harmonized.write_h5ad(output_path)

        print("Harmonization complete!")
        return str(output_path)

# Helper function to run the harmonizer in Jupyter Notebook
def run_harmonizer(data_dir='./GSE153056'):
    """
    Build a GSE153056Harmonizer for ``data_dir``, run it, and print where the
    harmonized file was written.

    Parameters:
    -----------
    data_dir : str
        Directory passed to GSE153056Harmonizer (default './GSE153056').
    """
    saved_to = GSE153056Harmonizer(data_dir).run()
    print(f"Harmonized data saved to: {saved_to}")

# Execute the harmonizer (you can change the data_dir if needed).
# The guard stops a plain `import` of this code from starting the download and
# processing; in a notebook cell or a script run __name__ is "__main__", so the
# call still executes there.
if __name__ == "__main__":
    run_harmonizer()
