In [None]:
import anndata as ad

# File holding the AnnData object inspected in the rest of this notebook.
H5AD_PATH = "dataset_debug_restricted.h5ad"

# Read the file; every later cell works on this `adata` object.
adata = ad.read_h5ad(H5AD_PATH)


In [11]:
import pandas as pd

# Lift pandas' display limits (number of columns, total width, per-cell width)
# so wide metadata tables are shown without truncation. Setting an option to
# None removes the corresponding limit for the rest of the session.
for display_option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(display_option, None)

# Preview the first rows of the per-cell (observation) metadata table.
adata.obs.head()

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,type,patient,annotation,percent.mt,Phase,CC.Difference,nCount_SCT,nFeature_SCT,seurat_clusters,integrated_snn_res.2,celltype,malignancy,cellclass,QuiescenceScore,QuiescenceStatus,QuiescenceType,disease,UMAP_1,UMAP_2,ident
Gao2021_ACTGCTCAGAAGAAGC,Gao2021,5577.0,1790,DCIS,Patient.1,T_cell,2.187556,G1,0.233609,14711.0,2213,21,21,CD4+ T cell,,IMM,,,,cancer,11.391339,7.731046,CD4+ T cell
Gao2021_AGGCCACTCAACTCTT,Gao2021,21665.0,4484,DCIS,Patient.1,Malignant,8.691438,G1,0.787248,15922.0,4465,37,37,PIP+ mammary luminal cell,malignant,EPI,-0.754145,Slow-cycling,,cancer,15.917251,-3.773292,PIP+ mammary luminal cell
Gao2021_CAAGGCCAGTGTCCCG,Gao2021,19553.0,4472,DCIS,Patient.1,Malignant,5.758707,G1,0.606015,15964.0,4471,12,12,PIP+ mammary luminal cell,malignant,EPI,-0.142115,Slow-cycling,,cancer,12.767733,-2.807837,PIP+ mammary luminal cell
Gao2021_CGGACTGTCTACTTAC,Gao2021,7217.0,2655,DCIS,Patient.1,Endothelial,2.868228,G1,0.233335,14594.0,2776,16,16,Endothelial,,EC,,,,cancer,6.548129,15.68747,Endothelial
Gao2021_GATCGATAGTATCGAA,Gao2021,10486.0,2525,DCIS,Patient.1,Macrophage,3.852756,G1,0.074222,14557.0,2524,4,4,Tumour-associated macrophage,,IMM,,,,cancer,0.078982,10.269024,Macrophage


# Checking for Gene Symbols

Let's examine the variable (gene) names in the dataset to check whether they include the gene symbols of interest, such as KCNJ8 and PDGFRA.

In [12]:
# Purpose: report where gene symbols are stored in `adata.var` and whether the
# genes of interest are present (exact match first, then case-insensitive).

# Examine what's available in the variable (gene) annotations
print("Variable annotation keys:", list(adata.var.keys()))

# Get the names of the first few genes
print("\nFirst 5 gene names:")
print(list(adata.var_names[:5]))

# Locate the gene symbols: prefer a dedicated annotation column when one
# exists; otherwise assume var_names (the index) holds the symbols.
# The for/else `else` branch runs only when no candidate column was found.
for candidate_col in ("gene_symbols", "gene_symbol", "symbol"):
    if candidate_col in adata.var.columns:
        gene_symbols = adata.var[candidate_col]
        gene_id_col = candidate_col
        break
else:
    gene_symbols = adata.var_names
    gene_id_col = "var_names (index)"

print(f"\nGene identifiers are in: {gene_id_col}")

# `x in some_series` tests the Series *index labels*, not its values, so a
# symbol column must never be queried with a bare `in`. Iterating a Series or
# an Index yields the values in both cases, so build one value set and use it
# for every membership test below.
symbol_values = set(gene_symbols)

# Check for specific genes of interest
genes_to_check = ["KCNJ8", "PDGFRA"]
missing_genes = []
for gene in genes_to_check:
    if gene in symbol_values:
        print(f"✅ Gene {gene} found in the dataset")
    else:
        missing_genes.append(gene)
        print(f"❌ Gene {gene} not found in the dataset")

# Look at the structure of var dataframe
print("\nVariable (gene) annotation dataframe structure:")
print(adata.var.head())

# Fall back to a case-insensitive search only if an exact match failed
if missing_genes:
    print("\nTrying case-insensitive search...")
    # Upper-case every non-missing symbol once instead of once per gene.
    upper_symbols = {
        str(symbol).upper() for symbol in symbol_values if not pd.isna(symbol)
    }
    for gene in genes_to_check:
        if gene.upper() in upper_symbols:
            print(f"✅ Gene {gene} found when searching case-insensitively")
        else:
            print(f"❌ Gene {gene} not found even with case-insensitive search")

Variable annotation keys: []

First 5 gene names:
['RP11-34P13.7', 'RP11-34P13.8', 'FO538757.3', 'FO538757.2', 'AP006222.2']

Gene identifiers are in: var_names (index)
✅ Gene KCNJ8 found in the dataset
✅ Gene PDGFRA found in the dataset

Variable (gene) annotation dataframe structure:
Empty DataFrame
Columns: []
Index: [RP11-34P13.7, RP11-34P13.8, FO538757.3, FO538757.2, AP006222.2]


# Checking for Clustering and Marker Genes

Let's examine if clustering has been performed on the dataset and whether marker genes have been identified. Run the cell below to check for:

1. Cluster assignments in observation metadata
2. Dimensionality reduction results 
3. Marker gene information in unstructured annotations

In [19]:
# Purpose: look for evidence of prior clustering / marker-gene analysis in
# three places: obs column names, obsm keys, and uns keys.

# 1. Check observation metadata for cluster assignments
print("Observation metadata columns:", list(adata.obs.columns))

# Substrings that commonly appear in the names of cluster / cell-type columns.
# Matching is by substring on the lower-cased column name, so a generic term
# such as "type" also matches columns that are not cluster labels.
cluster_terms = ("cluster", "louvain", "leiden", "group", "cell_type", "celltype", "type", "label")
cluster_columns = [
    col for col in adata.obs.columns
    if any(term in col.lower() for term in cluster_terms)
]

if cluster_columns:
    print(f"\nPotential clustering columns found: {cluster_columns}")

    # Summarise only the first match; it is a candidate, not necessarily a
    # real clustering, so read the "clusters" count below with that in mind.
    first_cluster_col = cluster_columns[0]
    print(f"\nValues in '{first_cluster_col}' column:")
    value_counts = adata.obs[first_cluster_col].value_counts()
    print(value_counts)
    print(f"Total number of clusters: {len(value_counts)}")
else:
    print("\nNo obvious clustering columns found in observation metadata.")

# 2. Check for dimensionality reduction results
print("\nDimensionality reduction results in obsm:")
print(list(adata.obsm.keys()))

# 3. Check for marker genes (commonly stored in uns)
print("\nUnstructured annotations (uns) keys:")
print(list(adata.uns.keys()))

# Look for common marker gene keys (substring match on the lower-cased key)
marker_terms = ("marker", "gene", "deg", "diff", "rank", "score")
marker_keys = [
    key for key in adata.uns.keys()
    if any(term in key.lower() for term in marker_terms)
]

if marker_keys:
    print(f"\nPotential marker gene keys found: {marker_keys}")

    # Try to show a preview of the first marker gene result
    first_marker_key = marker_keys[0]
    print(f"\nPreview of '{first_marker_key}':")
    try:
        marker_content = adata.uns[first_marker_key]
        if isinstance(marker_content, dict):
            print(f"Keys in {first_marker_key}: {list(marker_content.keys())}")
            if marker_content:
                sample_group = next(iter(marker_content))
                print(f"\nMarker genes for group '{sample_group}' (showing top 5 if available):")
                if isinstance(marker_content[sample_group], pd.DataFrame):
                    print(marker_content[sample_group].head())
                else:
                    print(f"Content type: {type(marker_content[sample_group])}")
        else:
            print(f"Content type: {type(marker_content)}")
    except Exception as e:
        # Best-effort preview: report the problem and keep going.
        print(f"Could not display marker content: {e}")
else:
    print("\nNo obvious marker gene information found in unstructured annotations.")

# Additional check: Look for rank_genes_groups_* keys that might contain Scanpy marker gene results
scanpy_markers = [key for key in adata.uns.keys() if "rank_genes_groups" in key]
if scanpy_markers:
    print(f"\nScanpy marker gene results found: {scanpy_markers}")

    # Preview every matching key, not only the exact "rank_genes_groups" one,
    # so results stored under a suffixed key are shown as well.
    for scanpy_key in scanpy_markers:
        try:
            print(f"\nPreview of Scanpy {scanpy_key}:")
            scanpy_result = adata.uns[scanpy_key]
            if "names" in scanpy_result:
                marker_names = scanpy_result["names"]
                print("Top marker genes by group (first 5 for each group):")
                # assumes "names" is a structured array with one field per
                # group; anything else raises and is reported below
                for group in marker_names.dtype.names:
                    print(f"Group {group}: {marker_names[group][:5]}")
        except Exception as e:
            # Best-effort preview: one malformed entry must not hide the others.
            print(f"Could not display Scanpy marker content: {e}")

Observation metadata columns: ['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'type', 'patient', 'annotation', 'percent.mt', 'Phase', 'CC.Difference', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'integrated_snn_res.2', 'celltype', 'malignancy', 'cellclass', 'QuiescenceScore', 'QuiescenceStatus', 'QuiescenceType', 'disease', 'UMAP_1', 'UMAP_2', 'ident']

Potential clustering columns found: ['type', 'seurat_clusters', 'celltype', 'QuiescenceType']

Values in 'type' column:
type
ER          389
TNBC        257
HER         120
neoplasm    116
normal       96
PR           14
DCIS          8
Name: count, dtype: int64
Total number of clusters: 7

Dimensionality reduction results in obsm:
[]

Unstructured annotations (uns) keys:
['X_name', 'tiny_dataset_info']

No obvious marker gene information found in unstructured annotations.
