In [6]:
import scanpy as sc
import matplotlib.pyplot as plt

# Load the AnnData object from disk.
adata = sc.read_h5ad("dataset_debug.h5ad")

# Display basic information (number of observations × number of variables).
print("AnnData object with n_obs × n_vars = {} × {}".format(adata.n_obs, adata.n_vars))

# Show the annotation columns attached to observations and to variables.
print("\nObservation annotations:")
print(adata.obs.columns.tolist())
print("\nVariable annotations:")
print(adata.var.columns.tolist())

# List the embeddings stored in .obsm. This is printed BEFORE the fallback
# below so the listing reflects what is actually stored in the file.
print("\nAvailable embeddings:")
for key in adata.obsm.keys():
    print(f"- {key}")

# Display the first few observations (rich DataFrame rendering in Jupyter).
print("\nSample of observation data:")
display(adata.obs.head())

# Basic UMAP plot.
# sc.pl.umap reads coordinates from .obsm['X_umap']. For this file .obsm is
# empty, but the coordinates are present as the 'UMAP_1' / 'UMAP_2' columns of
# .obs, so the original guard silently skipped the plot. When the embedding is
# missing and both columns exist, build the in-memory embedding from them
# (the file on disk is not modified).
umap_obs_columns = ['UMAP_1', 'UMAP_2']
if 'X_umap' not in adata.obsm and all(col in adata.obs.columns for col in umap_obs_columns):
    adata.obsm['X_umap'] = adata.obs[umap_obs_columns].to_numpy(dtype=float)

if 'X_umap' in adata.obsm:
    sc.pl.umap(adata, show=False)
    plt.title("UMAP visualization")
    plt.show()
else:
    # Be explicit instead of silently producing no figure.
    print("\nNo UMAP coordinates found in .obsm['X_umap'] or in .obs; skipping UMAP plot.")

AnnData object with n_obs × n_vars = 1000 × 33541

Observation annotations:
['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'type', 'patient', 'annotation', 'percent.mt', 'Phase', 'CC.Difference', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'integrated_snn_res.2', 'celltype', 'malignancy', 'cellclass', 'QuiescenceScore', 'QuiescenceStatus', 'QuiescenceType', 'disease', 'UMAP_1', 'UMAP_2', 'ident']

Variable annotations:
[]

Available embeddings:

Sample of observation data:


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,type,patient,annotation,percent.mt,Phase,CC.Difference,nCount_SCT,nFeature_SCT,seurat_clusters,integrated_snn_res.2,celltype,malignancy,cellclass,QuiescenceScore,QuiescenceStatus,QuiescenceType,disease,UMAP_1,UMAP_2,ident
Gao2021_ACTGCTCAGAAGAAGC,Gao2021,5577.0,1790,DCIS,Patient.1,T_cell,2.187556,G1,0.233609,14711.0,2213,21,21,CD4+ T cell,,IMM,,,,cancer,11.391339,7.731046,CD4+ T cell
Gao2021_AGGCCACTCAACTCTT,Gao2021,21665.0,4484,DCIS,Patient.1,Malignant,8.691438,G1,0.787248,15922.0,4465,37,37,PIP+ mammary luminal cell,malignant,EPI,-0.754145,Slow-cycling,,cancer,15.917251,-3.773292,PIP+ mammary luminal cell
Gao2021_CAAGGCCAGTGTCCCG,Gao2021,19553.0,4472,DCIS,Patient.1,Malignant,5.758707,G1,0.606015,15964.0,4471,12,12,PIP+ mammary luminal cell,malignant,EPI,-0.142115,Slow-cycling,,cancer,12.767733,-2.807837,PIP+ mammary luminal cell
Gao2021_CGGACTGTCTACTTAC,Gao2021,7217.0,2655,DCIS,Patient.1,Endothelial,2.868228,G1,0.233335,14594.0,2776,16,16,Endothelial,,EC,,,,cancer,6.548129,15.68747,Endothelial
Gao2021_GATCGATAGTATCGAA,Gao2021,10486.0,2525,DCIS,Patient.1,Macrophage,3.852756,G1,0.074222,14557.0,2524,4,4,Tumour-associated macrophage,,IMM,,,,cancer,0.078982,10.269024,Macrophage


In [5]:
import pandas as pd

# Relax pandas display limits so wide frames are rendered in full.
display_settings = {
    'display.max_columns': None,   # show all columns
    'display.max_rows': 100,       # show up to 100 rows
    'display.width': None,         # auto-detect display width
    'display.max_colwidth': None,  # show full content of each cell
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [14]:
import scanpy as sc
import pandas as pd
import anndata as ad

# Read the full dataset from disk.
adata = sc.read_h5ad("dataset_debug.h5ad")

# Prepare the three pieces of the restricted object:
#   - a copy of the expression matrix,
#   - an annotation-free .obs that carries only the cell barcodes as index,
#   - an annotation-free .var that carries only the gene names as index.
expression_matrix = adata.X.copy()
barcodes_only_obs = pd.DataFrame(index=adata.obs_names)
genes_only_var = pd.DataFrame(index=adata.var_names)

# Assemble the restricted AnnData object from those pieces.
adata_restricted = ad.AnnData(
    X=expression_matrix,
    obs=barcodes_only_obs,
    var=genes_only_var,
)

# Persist the restricted dataset and report where it went.
adata_restricted.write("dataset_debug_restricted.h5ad")
print("Restricted dataset saved as 'dataset_debug_restricted.h5ad'")


Restricted dataset saved as 'dataset_debug_restricted.h5ad'


In [15]:
import scanpy as sc
import pandas as pd
import anndata as ad

# Read the full dataset from disk.
adata = sc.read_h5ad("dataset_debug.h5ad")

# Keep only the listed .obs annotation columns; everything else is dropped.
columns_to_keep = ['malignancy', 'celltype']
obs_filtered = adata.obs.loc[:, columns_to_keep].copy()
obs_filtered.index = adata.obs_names  # re-assign the cell barcodes as index

# Remaining pieces: a copy of the expression matrix and an annotation-free
# .var that carries only the gene names as index.
expression_matrix = adata.X.copy()
genes_only_var = pd.DataFrame(index=adata.var_names)

# Assemble the restricted AnnData object with the selected annotations.
adata_restricted = ad.AnnData(
    X=expression_matrix,
    obs=obs_filtered,
    var=genes_only_var,
)

# Persist the restricted dataset and report where it went.
adata_restricted.write("dataset_debug_restricted_with_labels.h5ad")
print("Restricted dataset with 'malignancy' and 'celltype' saved as 'dataset_debug_restricted_with_labels.h5ad'")

Restricted dataset with 'malignancy' and 'celltype' saved as 'dataset_debug_restricted_with_labels.h5ad'
