In [1]:
#!/usr/bin/env python3
"""
Script to convert separate CSV, MTX, and gene files into AnnData H5AD format
Specifically for Karaayvas2018 dataset
"""

import pandas as pd
import numpy as np
import anndata as ad
from scipy.io import mmread
from scipy.sparse import csr_matrix
import os

def create_h5ad_from_files(cells_csv_path, mtx_path, genes_txt_path, output_path,
                           source='Karaayvas2018', data_type='TPM'):
    """
    Convert separate metadata, matrix and gene files to AnnData and save as H5AD

    Parameters:
    -----------
    cells_csv_path : str
        Path to cells.csv file (cell metadata); the first column is used as the index
    mtx_path : str
        Path to Exp_data_TPM.mtx file (expression matrix), in either
        genes x cells or cells x genes orientation
    genes_txt_path : str
        Path to gene.txt file (one gene name/ID per line)
    output_path : str
        Path where to save the output H5AD file
    source : str, optional
        Value stored in adata.uns['source']
    data_type : str, optional
        Value stored in adata.uns['data_type']

    Returns:
    --------
    The AnnData object that was written to output_path, with a CSR matrix
    of shape (cells, genes) as X.

    Raises:
    -------
    ValueError
        If the matrix shape matches the cell and gene counts in neither
        orientation. Nothing is written in that case.
    """

    print("Loading cell metadata...")
    # Load cell metadata (observations)
    obs = pd.read_csv(cells_csv_path, index_col=0)
    print(f"Loaded {obs.shape[0]} cells with {obs.shape[1]} metadata columns")

    print("Loading gene information...")
    # Load gene names/IDs (variables)
    with open(genes_txt_path, 'r') as f:
        gene_names = [line.strip() for line in f]
    # Blank lines at the end of the file are not genes; keeping them would
    # inflate the gene count and cause a spurious shape mismatch below.
    while gene_names and not gene_names[-1]:
        gene_names.pop()

    # Create gene dataframe
    var = pd.DataFrame({'gene_ids': gene_names}, index=gene_names)
    print(f"Loaded {len(gene_names)} genes")

    print("Loading expression matrix...")
    mtx_data = mmread(mtx_path)
    print(f"Expression matrix shape: {mtx_data.shape}")

    # AnnData expects cells x genes. Decide the orientation from the counts.
    # If the two counts are equal the orientation is ambiguous and the
    # genes x cells interpretation (first branch) wins.
    n_cells, n_genes = len(obs), len(gene_names)
    if mtx_data.shape == (n_genes, n_cells):
        print("Transposing matrix from genes x cells to cells x genes format")
        X = mtx_data.T
    elif mtx_data.shape == (n_cells, n_genes):
        print("Matrix is already in cells x genes format")
        X = mtx_data
    else:
        # Neither orientation fits, so no transposition can produce a valid
        # object; fail loudly instead of returning None.
        raise ValueError(
            f"Matrix dimensions {mtx_data.shape} don't match cells ({n_cells}) "
            f"and genes ({n_genes}) in either orientation"
        )

    # Convert once, after orientation is settled: transposing a CSR matrix
    # would hand back CSC, so converting earlier does not guarantee CSR.
    X = csr_matrix(X)
    print(f"Final expression matrix shape (cells x genes): {X.shape}")

    print("Creating AnnData object...")
    adata = ad.AnnData(
        X=X,
        obs=obs,
        var=var
    )

    # Add some basic provenance information
    adata.uns['source'] = source
    adata.uns['data_type'] = data_type

    print(f"Created AnnData object: {adata}")
    print(f"  - {adata.n_obs} cells")
    print(f"  - {adata.n_vars} genes")
    print(f"  - Observation columns: {list(adata.obs.columns)}")

    print(f"Saving to {output_path}...")
    # Save as H5AD
    adata.write_h5ad(output_path)
    print("Done!")

    return adata

In [5]:
# File paths - adjust these to match your file locations
cells_csv = "Cells.csv"
mtx_file = "Exp_data_TPM.mtx"
genes_txt = "Genes.txt"
output_file = "Karaayvas2018.h5ad"

# Reset first so a skipped or failed run cannot leave an object from an
# earlier execution of this cell lying around in the kernel.
adata = None

# Check if files exist
files_to_check = [cells_csv, mtx_file, genes_txt]
missing_files = [file_path for file_path in files_to_check if not os.path.exists(file_path)]

if missing_files:
    # Report every missing input, print the hint once, and do not attempt
    # the conversion with incomplete inputs.
    for file_path in missing_files:
        print(f"Error: File {file_path} not found!")
    print("Please make sure all input files are in the current directory or update the file paths.")
else:
    # Convert files to H5AD
    try:
        adata = create_h5ad_from_files(cells_csv, mtx_file, genes_txt, output_file)
        if adata is None:
            # Guard against a converter that signals failure by returning None.
            raise RuntimeError("create_h5ad_from_files did not return an AnnData object")
    except Exception as e:
        print(f"Error during conversion: {str(e)}")
        print("Please check your input files and try again.")
        # Re-raise so the traceback is visible and later cells do not run
        # against a missing or stale output file.
        raise
    else:
        print(f"\nSuccessfully created {output_file}")
        print("\nBasic statistics:")
        print(f"Total cells: {adata.n_obs}")
        print(f"Total genes: {adata.n_vars}")
        print(f"Data type: {adata.X.dtype}")
        print(f"Matrix format: {type(adata.X)}")

Loading cell metadata...
Loaded 1534 cells with 11 metadata columns
Loading gene information...
Loaded 21785 genes
Loading expression matrix...
Expression matrix shape: (21785, 1534)
Transposing matrix from genes x cells to cells x genes format
Final expression matrix shape (cells x genes): (1534, 21785)
Creating AnnData object...
Created AnnData object: AnnData object with n_obs × n_vars = 1534 × 21785
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment'
    var: 'gene_ids'
    uns: 'source', 'data_type'
  - 1534 cells
  - 21785 genes
  - Observation columns: ['sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment']
Saving to Karaayvas2018.h5ad...
Done!

Successfully created Karaayvas2018.h5ad

Basic statistics:
Total cells: 1534
Total genes: 21785
Data type: float64
Matrix format: <class 'scipy.sparse

In [6]:
import scanpy as sc

# Round-trip check: read the H5AD file written above back from disk
anndata = sc.read_h5ad(filename=output_file)