# In [None]:
# Make sure to install the required packages if you haven't already:
# !pip install scanpy anndata pandas numpy scipy matplotlib seaborn

import os
import urllib.request
import gzip
import shutil

import scanpy as sc
import pandas as pd
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import seaborn as sns
from anndata import AnnData

class GSE254100Processor:
    """
    Processor for the GSE254100 dataset.
    """
    
    def __init__(self, data_dir):
        """
        Initialize the processor.
        
        Parameters:
        -----------
        data_dir : str
            Path to the directory containing the dataset files.
        """
        self.data_dir = data_dir
        self.output_dir = os.path.join(data_dir, "processed")
        self.plots_dir = os.path.join(data_dir, "plots")
        
        # Create output directories
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.plots_dir, exist_ok=True)
        
        # File paths
        self.matrix_path = os.path.join(data_dir, "GSE254100_matrix.mtx")
        self.features_path = os.path.join(data_dir, "GSE254100_features.tsv")
        self.barcodes_path = os.path.join(data_dir, "GSE254100_barcodes.tsv")
        self.h5ad_path = os.path.join(self.output_dir, "GSE254100_harmonized.h5ad")
        
        # URLs for downloading the dataset
        self.base_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE254nnn/GSE254100/suppl/"
        self.matrix_url = f"{self.base_url}GSE254100_matrix.mtx.gz"
        self.features_url = f"{self.base_url}GSE254100_features.tsv.gz"
        self.barcodes_url = f"{self.base_url}GSE254100_barcodes.tsv.gz"
    
    def download_data(self):
        """
        Download the dataset files if they don't exist.
        """
        print("Checking for dataset files...")
        
        # Check if files already exist
        if (os.path.exists(self.matrix_path) and
            os.path.exists(self.features_path) and
            os.path.exists(self.barcodes_path)):
            print("Dataset files already exist. Skipping download.")
            return
        
        # Download and extract files
        files_to_download = [
            (self.matrix_url, self.matrix_path),
            (self.features_url, self.features_path),
            (self.barcodes_url, self.barcodes_path)
        ]
        
        for url, path in files_to_download:
            if not os.path.exists(path):
                gz_path = f"{path}.gz"
                print(f"Downloading {url}...")
                urllib.request.urlretrieve(url, gz_path)
                
                print(f"Extracting {gz_path}...")
                with gzip.open(gz_path, 'rb') as f_in:
                    with open(path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                
                # Remove the gzipped file
                os.remove(gz_path)
        
        print("Download complete.")
    
    def load_data(self):
        """
        Load the dataset from the raw matrix, features and barcodes files.

        The Matrix Market file is transposed so that rows pair with barcodes
        (cells) and columns with features (genes). Gene names become the
        variable index and are made unique. The sample identifier of each
        cell is the text after the last '-' in its barcode; barcodes without
        a '-' get the sample identifier 'unknown'.

        Returns:
        --------
        adata : AnnData
            AnnData object with 'gene_ids' and 'feature_types' in var and
            'sample_id' in obs.

        Raises:
        -------
        ValueError
            If the matrix dimensions do not match the number of barcodes
            and features.
        """
        print(f"Loading data from: {self.data_dir}")

        # Read the matrix; transposed so rows line up with barcodes below.
        matrix = scipy.io.mmread(self.matrix_path).T.tocsr()

        # Read features (genes). Columns beyond the first are optional:
        # a missing name column falls back to the IDs, a missing type column
        # to 'Gene Expression'.
        features = pd.read_csv(self.features_path, sep='\t', header=None)
        gene_ids = features[0].values
        gene_names = features[1].values if features.shape[1] > 1 else gene_ids
        if features.shape[1] > 2:
            feature_types = features[2].values
        else:
            feature_types = np.array(['Gene Expression'] * len(gene_ids))

        # Read barcodes
        barcodes = pd.read_csv(self.barcodes_path, sep='\t', header=None)[0].values

        # Fail early with a readable message instead of deep inside AnnData.
        expected_shape = (len(barcodes), len(gene_ids))
        if matrix.shape != expected_shape:
            raise ValueError(
                f"Matrix shape {matrix.shape} does not match "
                f"{len(barcodes)} barcodes x {len(gene_ids)} features"
            )

        # Extract sample information from barcodes
        # Format: AAACCCAGTCTTCAAG-1 where -1 is the sample identifier
        def sample_id_of(barcode):
            _, sep, suffix = str(barcode).rpartition('-')
            return suffix if sep else 'unknown'

        sample_ids = np.array([sample_id_of(bc) for bc in barcodes])

        # Create AnnData object
        adata = AnnData(X=matrix,
                        obs=pd.DataFrame(index=barcodes),
                        var=pd.DataFrame(index=gene_names))

        # Make variable names unique
        adata.var_names_make_unique()

        # Add gene information
        adata.var['gene_ids'] = gene_ids
        adata.var['feature_types'] = feature_types

        # Add sample IDs
        adata.obs['sample_id'] = sample_ids

        print(f"Created AnnData object with {adata.n_obs} cells and {adata.n_vars} genes")

        return adata
    
    def harmonize_metadata(self, adata):
        """
        Harmonize the metadata in the AnnData object.
        
        Parameters:
        -----------
        adata : AnnData
            AnnData object to harmonize.
        
        Returns:
        --------
        adata : AnnData
            Harmonized AnnData object.
        """
        print("Harmonizing metadata...")
        
        # Map sample IDs to experimental conditions based on metadata
        sample_mapping = {
            '1': 'WT_trachea',   # Assuming sample 1 is wild-type
            '2': 'BGH_trachea',  # Assuming sample 2 is BGH
            '3': 'CFAP_trachea', # Assuming sample 3 is CFAP
            '4': 'NM_trachea'    # Assuming sample 4 is NM
        }
        
        # Add sample information to obs
        adata.obs['sample'] = [sample_mapping.get(s, 'unknown') for s in adata.obs['sample_id']]
        
        # Add genotype information
        genotype_mapping = {
            'WT_trachea': 'wild-type',
            'BGH_trachea': 'bgh',
            'CFAP_trachea': 'Cfap54gt/gt',
            'NM_trachea': 'nm1054'
        }
        adata.obs['genotype'] = [genotype_mapping.get(s, 'unknown') for s in adata.obs['sample']]
        
        # Add mutation gene information
        mutation_gene_mapping = {
            'WT_trachea': 'none',
            'BGH_trachea': 'Spef2',
            'CFAP_trachea': 'Cfap54',
            'NM_trachea': 'Cfap221'
        }
        adata.obs['mutation_gene'] = [mutation_gene_mapping.get(s, 'unknown') for s in adata.obs['sample']]
        
        # Add standardized metadata for harmonization
        adata.obs['organism'] = 'Mus musculus'
        adata.obs['cell_type'] = 'Tracheal Epithelial Cells'  # General cell type
        adata.obs['condition'] = ['control' if g == 'wild-type' else 'test' for g in adata.obs['genotype']]
        adata.obs['perturbation_name'] = ['none' if g == 'wild-type' else m for g, m in zip(adata.obs['genotype'], adata.obs['mutation_gene'])]
        adata.obs['cancer_type'] = 'Non-Cancer'  # This is not a cancer study
        adata.obs['crispr_type'] = 'none'  # This is not a CRISPR study
        
        # Add tissue information
        adata.obs['tissue'] = 'trachea'
        
        # Add experiment information
        adata.uns['dataset_id'] = 'GSE254100'
        adata.uns['dataset_name'] = 'Cellular responses in the airway ciliary microenvironment from mouse models of primary ciliary dyskinesia with central pair apparatus defects'
        adata.uns['dataset_description'] = 'Single-cell RNA sequencing of tracheal epithelial cells from mouse models with mutations in CPA genes'
        
        return adata
    
    def preprocess_data(self, adata):
        """
        Compute QC metrics and snapshot the data; nothing is filtered out.

        Parameters:
        -----------
        adata : AnnData
            AnnData object to preprocess (modified in place).

        Returns:
        --------
        adata : AnnData
            The same object, with QC metrics added and adata.raw set to a
            copy taken at this point.
        """
        print("Preprocessing data (without filtering)...")

        # QC metrics are written back onto the object (inplace=True).
        sc.pp.calculate_qc_metrics(adata, inplace=True)

        # Keep a copy of the current state under .raw before later steps
        # modify the matrix.
        snapshot = adata.copy()
        adata.raw = snapshot

        # Deliberately no sc.pp.filter_cells / sc.pp.filter_genes here.
        n_cells, n_genes = adata.n_obs, adata.n_vars
        print(f"Data contains {n_cells} cells and {n_genes} genes")

        return adata

    
    def analyze_data(self, adata):
        """
        Normalize, embed, cluster and rank marker genes per genotype.

        Steps, all applied to `adata` in place: total-count normalization to
        1e4, log1p, highly-variable-gene selection, PCA, neighborhood graph,
        UMAP, Leiden clustering (resolution 0.8) and a Wilcoxon marker-gene
        ranking grouped by adata.obs['genotype'].

        Parameters:
        -----------
        adata : AnnData
            AnnData object to analyze; must have a 'genotype' column in obs.

        Returns:
        --------
        adata : AnnData
            The same object with the analysis results attached.
        """
        print("Analyzing data...")

        # Normalize data
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # Identify highly variable genes
        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

        # Run PCA
        sc.pp.pca(adata, svd_solver='arpack')

        # Use up to 40 components, but never more than PCA produced
        # (small datasets yield fewer).
        n_pcs = min(40, adata.obsm['X_pca'].shape[1])

        # Compute neighborhood graph
        sc.pp.neighbors(adata, n_neighbors=10, n_pcs=n_pcs)

        # Run UMAP for visualization
        sc.tl.umap(adata)

        # Run Leiden clustering
        sc.tl.leiden(adata, resolution=0.8)

        # Identify marker genes for each genotype.
        # use_raw=False is required: by default scanpy ranks on adata.raw
        # when it is set, and in this pipeline .raw holds the un-normalized
        # counts rather than the log-normalized matrix the test expects.
        sc.tl.rank_genes_groups(adata, 'genotype', method='wilcoxon', use_raw=False)

        return adata
    
    def create_visualizations(self, adata):
        """
        Save the standard set of plots and the marker-gene table.

        Writes into self.plots_dir: three UMAPs (colored by genotype, sample
        and Leiden cluster), a QC violin plot, a per-genotype cell count bar
        chart, the marker-gene ranking plot and the marker-gene table as CSV.

        Every figure that is opened is also closed. UMAP and count plots are
        drawn on an explicitly created axes so the requested figure size is
        actually applied. The violin and marker-gene plots build their own
        figure inside scanpy, so no figure is pre-created for them.

        Parameters:
        -----------
        adata : AnnData
            Analyzed AnnData object (needs UMAP coordinates, 'leiden',
            QC metrics and rank_genes_groups results).
        """
        print("Creating visualizations...")

        def save_and_close(fig, filename):
            """Write fig into the plots directory and release it."""
            fig.savefig(os.path.join(self.plots_dir, filename))
            plt.close(fig)

        # UMAPs colored by genotype, sample and Leiden clusters
        umap_panels = (
            ('genotype', {}),
            ('sample', {}),
            ('leiden', {'legend_loc': 'on data'}),
        )
        for color, extra in umap_panels:
            fig, ax = plt.subplots(figsize=(10, 8))
            sc.pl.umap(adata, color=color, ax=ax, show=False, **extra)
            save_and_close(fig, f"umap_{color}.png")

        # QC metrics visualization (scanpy creates the multi-panel figure itself)
        sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
                     jitter=0.4, multi_panel=True, show=False)
        save_and_close(plt.gcf(), "qc_metrics.png")

        # Distribution of cells by genotype
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.countplot(data=adata.obs, x='genotype', ax=ax)
        ax.set_title('Cell Distribution by Genotype')
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()
        save_and_close(fig, "cell_distribution_genotype.png")

        # Plot top marker genes (scanpy creates the panel grid itself)
        sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, show=False)
        save_and_close(plt.gcf(), "marker_genes_genotype.png")

        # Save marker genes to CSV
        marker_genes = sc.get.rank_genes_groups_df(adata, group=None)
        marker_genes.to_csv(os.path.join(self.plots_dir, "marker_genes_genotype.csv"))

        print(f"Visualizations saved to {self.plots_dir}")
    
    def create_summary(self, adata):
        """
        Create a summary of the dataset.
        
        Parameters:
        -----------
        adata : AnnData
            AnnData object to summarize.
        
        Returns:
        --------
        summary : dict
            Dictionary containing summary information.
        """
        summary = {
            'n_cells': adata.n_obs,
            'n_genes': adata.n_vars,
            'organism': 'Mus musculus',
            'tissue': 'trachea',
            'condition': adata.obs['condition'].value_counts().to_dict(),
            'genotype': adata.obs['genotype'].value_counts().to_dict(),
            'cell_type': adata.obs['cell_type'].value_counts().to_dict()
        }
        
        print("\nDataset Summary:")
        for key, value in summary.items():
            print(f"{key}: {value}")
        
        return summary
    
    def process(self, analyze=True, visualize=True):
        """
        Run the whole pipeline from download to summary.

        The harmonized (pre-analysis) object is always written to
        self.h5ad_path. When `analyze` is true the analyzed object is also
        written to "GSE254100_analyzed.h5ad" in the output directory.

        Parameters:
        -----------
        analyze : bool, optional
            Whether to perform analysis on the data (default is True).
        visualize : bool, optional
            Whether to create visualizations (default is True). Plots are
            only produced when `analyze` is true as well.

        Returns:
        --------
        adata : AnnData
            Processed AnnData object.
        """
        # Fetch raw files if needed, then load -> harmonize -> preprocess.
        self.download_data()
        adata = self.preprocess_data(
            self.harmonize_metadata(
                self.load_data()
            )
        )

        # Persist the harmonized state before any analysis touches it.
        print(f"Saving harmonized dataset to {self.h5ad_path}")
        adata.write(self.h5ad_path)

        if analyze:
            adata = self.analyze_data(adata)

            analyzed_path = os.path.join(self.output_dir, "GSE254100_analyzed.h5ad")
            print(f"Saving analyzed dataset to {analyzed_path}")
            adata.write(analyzed_path)

            # Plots depend on analysis results, hence nested under `analyze`.
            if visualize:
                self.create_visualizations(adata)

        self.create_summary(adata)

        print("\nDone!")

        return adata

# ============================
# Running the processor in Jupyter Notebook
# ============================

# Directory for the raw GSE254100 files. Missing files are downloaded into it,
# and the "processed" and "plots" sub-directories are created beneath it.
data_dir = "/content/GSE254100"  # <-- Change this to your data directory

# Run the full pipeline: download (if needed), load, harmonize, preprocess,
# analyze and plot. `adata` is the AnnData object returned by process().
processor = GSE254100Processor(data_dir)
adata = processor.process(analyze=True, visualize=True)
