# Converting h5ad File to coloncancer.csv Format

This notebook converts `dataset_restricted.h5ad` (the file loaded below) to match the format of `coloncancer.csv`.

The target format has the following columns:
- dataset: The name of the dataset
- tissue: The tissue type
- marker: A comma-separated list of marker genes for each cell type
- manual_annotation: The cell type annotation
- manual_CLname: The cell ontology name
- manual_CLID: The cell ontology ID
- manual_broadtype: The broad cell type category

In [41]:
# Import required libraries
import scanpy as sc
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Configure the figure defaults used by scanpy and matplotlib
sc.settings.set_figure_params(dpi=100, frameon=False)
plt.rcParams.update({
    'figure.figsize': (8, 8),
    'figure.dpi': 100,
})

# Report the library versions so the run can be reproduced later
print(f"scanpy version: {sc.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")

scanpy version: 1.11.2
pandas version: 2.3.0
numpy version: 2.2.6


## Loading the h5ad File

First, let's load the `dataset_restricted.h5ad` file using scanpy and explore its structure.

In [42]:
import os

# Input file, plus the derived name of the annotated copy that is written
# at the end of the notebook (same extension, "_processed" suffix)
file_path = 'dataset_restricted.h5ad'
base, ext = os.path.splitext(file_path)
output_path = f"{base}_processed{ext}"
print(output_path)

# Read the AnnData object from disk
adata = sc.read_h5ad(file_path)

# Gather the annotation keys carried by the object
obs_keys = adata.obs.columns.tolist()
var_keys = adata.var.columns.tolist()
if hasattr(adata, 'uns'):
    uns_keys = list(adata.uns.keys())
else:
    uns_keys = "No .uns keys found"

# Summarise dimensions and annotation keys
print(f"Shape of AnnData: {adata.shape}")
print("\nKeys in .obs (per-cell annotations):")
print(obs_keys)
print("\nKeys in .var (per-gene annotations):")
print(var_keys)
print("\nKeys in .uns (unstructured annotations):")
print(uns_keys)

dataset_restricted_processed.h5ad
Shape of AnnData: (138727, 33541)

Keys in .obs (per-cell annotations):
[]

Keys in .var (per-gene annotations):
[]

Keys in .uns (unstructured annotations):
[]


In [43]:
# Preview the per-cell metadata table
print("First few rows of .obs:")
display(adata.obs.head())

# Collect every .obs column whose name hints at a cell type annotation or a
# clustering result (cell_type, celltype, leiden, louvain, cluster)
cell_type_cols = []
for column in adata.obs.columns:
    lowered = column.lower()
    if any(x in lowered for x in ['cell_type', 'celltype', 'leiden', 'louvain', 'cluster']):
        cell_type_cols.append(column)

if not cell_type_cols:
    print("\nNo obvious cell type annotation columns found.")
else:
    print(f"\nPotential cell type annotation columns: {cell_type_cols}")
    # Show how many cells fall into each category of every candidate column
    for col in cell_type_cols:
        print(f"\nUnique values in {col}:")
        print(adata.obs[col].value_counts())

First few rows of .obs:


Gao2021_AAACCTGCAGTGACAG
Gao2021_AAACCTGGTCGAGATG
Gao2021_AAACCTGTCACCGGGT
Gao2021_AAACGGGGTGCACTTA
Gao2021_AAACGGGTCACGGTTA



No obvious cell type annotation columns found.


## Identifying Marker Genes for Each Cell Type

Now we'll identify marker genes for each cell type/cluster by computing differential expression between clusters. We'll use these as the marker genes in our CSV output.

In [44]:
# Choose the .obs column that defines the groups for marker detection.
# An annotation column detected in the previous cell is preferred; when none
# exists the cells are clustered with Leiden and that result is used instead.
cluster_col = None

if cell_type_cols:
    cluster_col = cell_type_cols[0]  # Use the first identified cell type column
else:
    print("No cell type annotations found. Performing clustering...")
    # normalize_total / log1p overwrite adata.X in place, so running this cell
    # a second time would transform already-transformed values.
    # NOTE(review): relies on sc.pp.log1p recording a 'log1p' entry in
    # adata.uns — confirm for the installed scanpy version.
    if 'log1p' not in adata.uns:
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
    else:
        print("adata.X is already log-normalized; skipping normalization.")
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pp.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata)
    cluster_col = 'leiden'

print(f"Using '{cluster_col}' as cell type/cluster column")

# Rank genes per group (Wilcoxon rank-sum test); results land in
# adata.uns['rank_genes_groups'] and are read by the marker extraction below
sc.tl.rank_genes_groups(adata, groupby=cluster_col, method='wilcoxon')

# Function to get top marker genes for each cluster
def get_top_markers(adata, n_genes=10, groupby=None):
    """Collect the top-ranked marker genes of every group.

    Reads the ranking stored in ``adata.uns['rank_genes_groups']`` (written by
    ``sc.tl.rank_genes_groups``) and keeps the first ``n_genes`` names of each
    group found in ``adata.obs[groupby]``.

    Parameters
    ----------
    adata
        Object exposing ``.obs`` (per-cell table) and ``.uns`` (results dict).
    n_genes : int, default 10
        Number of leading gene names to keep per group.
    groupby : str, optional
        Name of the ``.obs`` column holding the group labels. When omitted,
        the column recorded under ``uns['rank_genes_groups']['params']['groupby']``
        is used, so the function does not depend on any notebook global.

    Returns
    -------
    dict
        Maps each group label (as it appears in ``.obs``) to a list of gene
        name strings. An empty dict is returned, after printing a message,
        when the ranking results or the grouping column are missing.
    """
    try:
        results = adata.uns['rank_genes_groups']
        if groupby is None:
            groupby = results['params']['groupby']
        ranked_names = results['names']

        markers_dict = {}
        for cluster in adata.obs[groupby].unique():
            # Look the group up by its string name: indexing the stored
            # ranking with a non-string label (e.g. an integer) would select
            # a row rather than the group's field.
            genes = ranked_names[str(cluster)][:n_genes]
            markers_dict[cluster] = [str(gene) for gene in genes]
        return markers_dict
    except KeyError:
        print("Error accessing rank_genes_groups results. Check if the computation completed successfully.")
        return {}

# Collect the ten best-ranked marker genes of every cluster
top_markers = get_top_markers(adata, n_genes=10)

# Print one comma-separated marker line per cluster
for cluster in top_markers:
    print(f"\nCluster {cluster} top markers:")
    print(", ".join(top_markers[cluster]))

No cell type annotations found. Performing clustering...


Using 'leiden' as cell type/cluster column


  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group


Cluster 40 top markers:
CCT2, GNB2L1, YEATS4, TPD52, CPNE3, ATP5B, GLTSCR2, SEPP1, EIF3H, C8orf59

Cluster 52 top markers:
ATP5L, TCEB2, GNB2L1, ATP5O, ATP5J, ATP5E, ATP5G3, ATP5C1, NGFRAP1, USMG5

Cluster 62 top markers:
PTRF, SEPW1, GNB2L1, GNG11, ECSCR.1, ATP5E, IGFBP7, GLTSCR2, C19orf43, RAMP2

Cluster 42 top markers:
ATP5E, GPX1, ATP5L, ATP5G2, GNB2L1, C14orf2, ATP5G3, TCEB2, ATP5J2, TYROBP

Cluster 39 top markers:
TAGLN, KRT17, MYLK, APOE, KRT14, MYL9, TPM2, DST, FBXO32, CALD1

Cluster 57 top markers:
SELM, NGFRAP1, IGFBP4, GNB2L1, ATP5E, WBP5, ATP5L, PTRF, PCOLCE, TCEB2

Cluster 8 top markers:
CD52, CD3D, CD2, TRAC, PTPRC, CORO1A, SRGN, IL32, B2M, HLA-A

Cluster 48 top markers:
IGFBP7, CALD1, MYL9, ACTA2, SPARCL1, TPM2, TAGLN, CAV1, COL18A1, SPARC

Cluster 64 top markers:
AC090498.1, ATP5G2, USMG5, ATP5I, ATP5E, C14orf2, ATP5J, ATP5L, GNB2L1, ATP5J2

Cluster 50 top markers:
TCEB1, C11orf31, HSP90AB1, TCEB2, SHFM1, ATP5J2, ATP5C1, ATP5L, ATP5E, YIPF3

Cluster 5 top markers:
LUM,

  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_

## Preparing the Output CSV Format

Now we'll prepare a DataFrame that matches the format of `coloncancer.csv`. We need to create:

1. `dataset`: Name of our dataset
2. `tissue`: Tissue type (inferred from metadata if available)
3. `marker`: Comma-separated list of marker genes
4. `manual_annotation`: Cell type annotation (use cluster names initially)
5. `manual_CLname`: Cell ontology name (we may need to leave this blank)
6. `manual_CLID`: Cell ontology ID (we may need to leave this blank) 
7. `manual_broadtype`: Broad cell type category (we may need to leave this blank)

In [45]:
# Derive the dataset name from the input file name. splitext is used (as for
# output_path) so that only the final extension is removed.
dataset_name = os.path.splitext(os.path.basename(file_path))[0]  # e.g. 'dataset_restricted'

# Column layout of the target CSV; passed explicitly so the table keeps its
# header even when no markers were found.
OUTPUT_COLUMNS = [
    'dataset',
    'tissue',
    'marker',
    'manual_annotation',
    'manual_CLname',
    'manual_CLID',
    'manual_broadtype',
]

# Try to infer tissue type from metadata if available
tissue = None

# Look for tissue information in adata.uns
if hasattr(adata, 'uns') and isinstance(adata.uns, dict):
    tissue_keys = [k for k in adata.uns.keys() if 'tissue' in k.lower()]
    if tissue_keys:
        tissue = str(adata.uns[tissue_keys[0]])

# Look for tissue information in adata.obs (value of the first cell)
if tissue is None:
    tissue_cols = [c for c in adata.obs.columns if 'tissue' in c.lower()]
    if tissue_cols and len(adata.obs) > 0:
        first_tissue = adata.obs[tissue_cols[0]].iloc[0]
        # A missing value is treated like "no tissue information"
        if pd.notna(first_tissue):
            tissue = str(first_tissue)

# If we still couldn't find tissue information, use a placeholder
if tissue is None:
    tissue = "unknown tissue"

# One row per cluster; the ontology columns stay blank until they are
# annotated outside this notebook
csv_rows = [
    {
        'dataset': dataset_name,
        'tissue': tissue,
        'marker': ",".join(markers),
        'manual_annotation': f"Cluster {cluster}",  # Initial annotation based on cluster name
        'manual_CLname': "",  # Would need cell ontology mapping
        'manual_CLID': "",    # Would need cell ontology mapping
        'manual_broadtype': "",  # Would need broad cell type mapping
    }
    for cluster, markers in top_markers.items()
]

# Create DataFrame
output_df = pd.DataFrame(csv_rows, columns=OUTPUT_COLUMNS)

# Display the DataFrame
display(output_df)

Unnamed: 0,dataset,tissue,marker,manual_annotation,manual_CLname,manual_CLID,manual_broadtype
0,dataset_restricted,unknown tissue,"CCT2,GNB2L1,YEATS4,TPD52,CPNE3,ATP5B,GLTSCR2,S...",Cluster 40,,,
1,dataset_restricted,unknown tissue,"ATP5L,TCEB2,GNB2L1,ATP5O,ATP5J,ATP5E,ATP5G3,AT...",Cluster 52,,,
2,dataset_restricted,unknown tissue,"PTRF,SEPW1,GNB2L1,GNG11,ECSCR.1,ATP5E,IGFBP7,G...",Cluster 62,,,
3,dataset_restricted,unknown tissue,"ATP5E,GPX1,ATP5L,ATP5G2,GNB2L1,C14orf2,ATP5G3,...",Cluster 42,,,
4,dataset_restricted,unknown tissue,"TAGLN,KRT17,MYLK,APOE,KRT14,MYL9,TPM2,DST,FBXO...",Cluster 39,,,
...,...,...,...,...,...,...,...
60,dataset_restricted,unknown tissue,"FXYD5,VIM,H3-3B,TPSAB1,RAC2,MS4A2,CPA3,GATA2,H...",Cluster 58,,,
61,dataset_restricted,unknown tissue,"H3-3A,MTCO2P12,H3-3B,H2AJ,ALDOA,RBIS,NME2,H2AZ...",Cluster 46,,,
62,dataset_restricted,unknown tissue,"LILRA4,IRF7,PLAC8,ITM2C,JCHAIN,GPX1,IRF8,CD74,...",Cluster 63,,,
63,dataset_restricted,unknown tissue,"H3-3A,MTCO2P12,RPS26,H3-3B,PPDPF,MTRNR2L12,GAB...",Cluster 11,,,


## Saving the Final CSV Output

Finally, let's save our DataFrame to a CSV file that matches the format of `coloncancer.csv`.

In [46]:
# Target file for the coloncancer.csv-style table
output_file = f"{dataset_name}_formatted.csv"

# Write the table without the pandas row index
output_df.to_csv(output_file, index=False)
print(f"Successfully saved the formatted data to {output_file}")

# Show the leading rows as a quick sanity check
print("\nPreview of the output CSV:")
preview_rows = output_df.head()
display(preview_rows)

Successfully saved the formatted data to dataset_restricted_formatted.csv

Preview of the output CSV:


Unnamed: 0,dataset,tissue,marker,manual_annotation,manual_CLname,manual_CLID,manual_broadtype
0,dataset_restricted,unknown tissue,"CCT2,GNB2L1,YEATS4,TPD52,CPNE3,ATP5B,GLTSCR2,S...",Cluster 40,,,
1,dataset_restricted,unknown tissue,"ATP5L,TCEB2,GNB2L1,ATP5O,ATP5J,ATP5E,ATP5G3,AT...",Cluster 52,,,
2,dataset_restricted,unknown tissue,"PTRF,SEPW1,GNB2L1,GNG11,ECSCR.1,ATP5E,IGFBP7,G...",Cluster 62,,,
3,dataset_restricted,unknown tissue,"ATP5E,GPX1,ATP5L,ATP5G2,GNB2L1,C14orf2,ATP5G3,...",Cluster 42,,,
4,dataset_restricted,unknown tissue,"TAGLN,KRT17,MYLK,APOE,KRT14,MYL9,TPM2,DST,FBXO...",Cluster 39,,,


## Reverse-Engineering the Labeled CSV Back into the h5ad File

In [47]:
import pandas as pd
import scanpy as sc
import numpy as np
import os

# Attach the cluster-level labels from the annotated CSV to every cell.
# This cell reuses `adata`, `file_path` and `output_path` from the cells above
# instead of reloading the h5ad file, so the clustering cell must have run.
import re


def _cluster_id_from_annotation(annotation):
    """Turn a `manual_annotation` value into a cluster ID ("Cluster 7" -> "7")."""
    text = str(annotation)
    if 'Cluster' in text:
        return text.replace('Cluster ', '').strip()
    # Annotations that do not follow the "Cluster X" format are matched as-is
    return text


def _blank_if_missing(value):
    """Return '' for a missing value and the string form of anything else."""
    return '' if pd.isna(value) else str(value)


# Load the labeled CSV file
# NOTE(review): this file name refers to "dataset_debug_restricted" while the
# loading cell reads 'dataset_restricted.h5ad' — confirm the labels were
# produced from the clustering currently held in `adata`.
csv_path = 'dataset_debug_restricted_labelled.csv'
labeled_df = pd.read_csv(csv_path)

print(f"Original h5ad shape: {adata.shape}")
print(f"Labeled CSV shape: {labeled_df.shape}")

# Display the labeled data
print("\nLabeled data:")
print(labeled_df.head())

# Cluster IDs are only meaningful for the clustering they came from, so warn
# when the CSV was generated for a differently named dataset.
expected_dataset = os.path.splitext(os.path.basename(file_path))[0]
if 'dataset' in labeled_df.columns:
    labeled_datasets = sorted(labeled_df['dataset'].dropna().astype(str).unique())
    if labeled_datasets != [expected_dataset]:
        print(
            f"Warning: the labeled CSV refers to dataset(s) {labeled_datasets} "
            f"but the loaded file is '{expected_dataset}'. "
            "Cluster IDs may not correspond to the current clustering."
        )

# Identify the cluster column used in the original script
# (This should match what was used in your original clustering)
cell_type_cols = [col for col in adata.obs.columns if any(x in col.lower() for x in ['cell_type', 'celltype', 'leiden', 'louvain', 'cluster'])]

if cell_type_cols:
    cluster_col = cell_type_cols[0]
else:
    print("No existing cluster column found. You may need to re-run the clustering part of your original script first.")
    cluster_col = 'leiden'  # Assuming leiden clustering was used

print(f"Using cluster column: {cluster_col}")

# The keyword search above already covers 'leiden' and 'cluster', so a missing
# column here means no cluster-like column exists at all.
if cluster_col not in adata.obs.columns:
    print(f"Warning: {cluster_col} not found in adata.obs. Available columns:")
    print(adata.obs.columns.tolist())
    raise ValueError("No suitable cluster column found. Please run clustering first.")

# One row per cluster ID; when an ID occurs several times the last row wins
cluster_labels = (
    labeled_df
    .assign(cluster_id=labeled_df['manual_annotation'].map(_cluster_id_from_annotation))
    .drop_duplicates('cluster_id', keep='last')
    .set_index('cluster_id')
)

# Clean the predictions before mapping so that every printout shows the final
# label, e.g. "[['malignant']]" -> "malignant". A missing prediction is
# treated like an unmapped cluster ('Unknown').
pat = re.compile(r"[\[\]']")     # match [, ], or '
cluster_to_celltype = (
    cluster_labels['cell_type_pred']
    .map(lambda value: 'Unknown' if pd.isna(value) else str(value))
    .str.replace(pat, '', regex=True)
    .str.strip()
    .to_dict()
)

print(f"\nCluster to cell type mapping:")
for cluster, cell_type in cluster_to_celltype.items():
    print(f"  {cluster} -> {cell_type}")

# Cluster label of every cell, as strings for consistent dictionary lookups
cell_clusters = adata.obs[cluster_col].astype(str)

# Report unmapped clusters once, instead of once per cell
unmapped_clusters = sorted(set(cell_clusters.unique()) - set(cluster_to_celltype))
if unmapped_clusters:
    n_unmapped_cells = int(cell_clusters.isin(unmapped_clusters).sum())
    print(
        f"Warning: No cell type prediction found for {len(unmapped_clusters)} cluster(s) "
        f"covering {n_unmapped_cells} cell(s): {unmapped_clusters}"
    )

# Map cell type predictions to individual cells
cell_type_predictions = cell_clusters.map(cluster_to_celltype).fillna('Unknown').tolist()
adata.obs['cell_type_pred'] = cell_type_predictions

# Also carry over the ontology annotations; missing CSV values become ''
# so each column holds strings only
cluster_to_clname = cluster_labels['manual_CLname'].map(_blank_if_missing).to_dict()
cluster_to_clid = cluster_labels['manual_CLID'].map(_blank_if_missing).to_dict()
cluster_to_broadtype = cluster_labels['manual_broadtype'].map(_blank_if_missing).to_dict()

manual_clname = cell_clusters.map(cluster_to_clname).fillna('').tolist()
manual_clid = cell_clusters.map(cluster_to_clid).fillna('').tolist()
manual_broadtype = cell_clusters.map(cluster_to_broadtype).fillna('').tolist()

# Add all annotations to adata.obs
adata.obs['manual_CLname'] = manual_clname
adata.obs['manual_CLID'] = manual_clid
adata.obs['manual_broadtype'] = manual_broadtype

# Display summary of the annotations
print(f"\nCell type prediction summary:")
print(adata.obs['cell_type_pred'].value_counts())

print(f"\nAdded columns to adata.obs:")
new_columns = ['cell_type_pred', 'manual_CLname', 'manual_CLID', 'manual_broadtype']
for col in new_columns:
    if col in adata.obs.columns:
        print(f"  {col}: {adata.obs[col].nunique()} unique values")

# Save the updated h5ad file
adata.write(output_path)

print(f"\nUpdated h5ad file saved as: {output_path}")

# Verify the mapping worked correctly
print(f"\nVerification - Sample of cluster to cell type mapping:")
sample_df = adata.obs[[cluster_col, 'cell_type_pred']].drop_duplicates().sort_values(cluster_col)
print(sample_df)

# Optional: Display first few rows of updated obs
print(f"\nFirst few rows of updated adata.obs:")
display_cols = [cluster_col, 'cell_type_pred', 'manual_CLname', 'manual_CLID', 'manual_broadtype']
available_cols = [col for col in display_cols if col in adata.obs.columns]
adata.obs[available_cols].head()

Original h5ad shape: (138727, 33541)
Labeled CSV shape: (23, 9)

Labeled data:
                    dataset         tissue  \
0  dataset_debug_restricted  breast cancer   
1  dataset_debug_restricted  breast cancer   
2  dataset_debug_restricted  breast cancer   
3  dataset_debug_restricted  breast cancer   
4  dataset_debug_restricted  breast cancer   

                                              marker manual_annotation  \
0  SCGB2A2,XBP1,SCGB1D2,KRT18,TRPS1,TFF3,KRT8,MGP...         Cluster 0   
1  IGFBP7,PCAT19,CD93,SPARCL1,CD59,GNG11,SPRY1,TM...        Cluster 11   
2  AIF1,TYROBP,FCER1G,HLA-DRA,FTL,HLA-DPB1,HLA-DP...         Cluster 2   
3  CRYAB,KRT7,KRT17,SFRP1,NFIB,FBXO32,KRT14,CALD1...         Cluster 5   
4  COL3A1,COL1A2,COL1A1,CALD1,SPARC,COL6A2,COL6A1...         Cluster 4   

   manual_CLname  manual_CLID  manual_broadtype       cell_type_pred  \
0            NaN          NaN               NaN      [['malignant']]   
1            NaN          NaN               NaN  [['non

Unnamed: 0,leiden,cell_type_pred,manual_CLname,manual_CLID,manual_broadtype
Gao2021_AAACCTGCAGTGACAG,40,Unknown,,,
Gao2021_AAACCTGGTCGAGATG,52,Unknown,,,
Gao2021_AAACCTGTCACCGGGT,62,Unknown,,,
Gao2021_AAACGGGGTGCACTTA,40,Unknown,,,
Gao2021_AAACGGGTCACGGTTA,40,Unknown,,,
