In [1]:
import scanpy as sc
import matplotlib.pyplot as plt

# Load the AnnData object
adata = sc.read_h5ad("/cs/student/projects2/aisd/2024/shekchu/projects/cell_agents/subset_dataset_creation/integrated_with_quiescence.h5ad")

# Display basic information
print("AnnData object with n_obs × n_vars = {} × {}".format(adata.n_obs, adata.n_vars))

# Show available annotations
print("\nObservation annotations:")
print(adata.obs.columns.tolist())
print("\nVariable annotations:")
print(adata.var.columns.tolist())

# Show available embeddings
print("\nAvailable embeddings:")
for key in adata.obsm.keys():
    print(f"- {key}")

# Display the first few observations
print("\nSample of observation data:")
display(adata.obs.head())

# Basic plot if UMAP is available
if 'X_umap' in adata.obsm:
    sc.pl.umap(adata, show=False)
    plt.title("UMAP visualization")
    plt.show()

AnnData object with n_obs × n_vars = 138727 × 33541

Observation annotations:
['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'type', 'patient', 'annotation', 'percent.mt', 'Phase', 'CC.Difference', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'integrated_snn_res.2', 'celltype', 'malignancy', 'cellclass', 'QuiescenceScore', 'QuiescenceStatus', 'QuiescenceType', 'disease', 'UMAP_1', 'UMAP_2', 'ident']

Variable annotations:
[]

Available embeddings:

Sample of observation data:


Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,type,patient,annotation,percent.mt,Phase,CC.Difference,nCount_SCT,...,celltype,malignancy,cellclass,QuiescenceScore,QuiescenceStatus,QuiescenceType,disease,UMAP_1,UMAP_2,ident
Gao2021_AAACCTGCAGTGACAG,Gao2021,8264.0,2739,DCIS,Patient.1,Malignant,4.440949,G1,0.109523,14842.0,...,PIP+ mammary luminal cell,malignant,EPI,0.42407,Slow-cycling,,cancer,12.810999,-1.927213,PIP+ mammary luminal cell
Gao2021_AAACCTGGTCGAGATG,Gao2021,8114.0,3079,DCIS,Patient.1,Epithelial,2.896229,G1,0.057076,14755.0,...,PIP+ mammary luminal cell,malignant,EPI,0.11708,Slow-cycling,,cancer,12.729243,-2.078249,PIP+ mammary luminal cell
Gao2021_AAACCTGTCACCGGGT,Gao2021,10870.0,3241,DCIS,Patient.1,Endothelial,7.102116,G1,0.192513,14435.0,...,Endothelial,,EC,,,,cancer,6.316977,15.984601,Endothelial
Gao2021_AAACGGGGTGCACTTA,Gao2021,11894.0,3373,DCIS,Patient.1,Malignant,3.57323,S,0.545934,14577.0,...,PIP+ mammary luminal cell,malignant,EPI,-1.149291,Slow-cycling,,cancer,13.69435,-1.104929,PIP+ mammary luminal cell
Gao2021_AAACGGGTCACGGTTA,Gao2021,20491.0,4161,DCIS,Patient.1,Malignant,4.611781,G1,-0.3062,15826.0,...,PIP+ mammary luminal cell,malignant,EPI,-4.777074,Proliferating,,cancer,14.134395,-0.761374,PIP+ mammary luminal cell


In [None]:
import scanpy as sc
import pandas as pd
import anndata as ad

# Load the original dataset
adata = sc.read_h5ad("/cs/student/projects2/aisd/2024/shekchu/projects/cell_agents/sarcoma/Data_Jerby-Arnon2021_Sarcoma/10X/sarcoma_combined.h5ad")

# Create a restricted AnnData object
adata_restricted = ad.AnnData(
    X=adata.X.copy(),                      # gene expression matrix
    obs=pd.DataFrame(index=adata.obs_names),  # only cell barcodes
    var=pd.DataFrame(index=adata.var_names)   # only gene names
)

# Save the restricted dataset
adata_restricted.write("sarcoma_dataset_restricted.h5ad")

print("Restricted dataset saved as 'sarcoma_dataset_restricted.h5ad'")


Restricted dataset saved as 'sarcoma_dataset_restricted.h5ad'


In [17]:
import scanpy as sc
import pandas as pd
import anndata as ad

# Load the original dataset
adata = sc.read_h5ad("/cs/student/projects2/aisd/2024/shekchu/projects/cell_agents/subset_dataset_creation/integrated_with_quiescence.h5ad")

# Select the columns to retain from .obs
columns_to_keep = ['malignancy']
obs_filtered = adata.obs[columns_to_keep].copy()
obs_filtered.index = adata.obs_names  # ensure index is preserved

# Create a restricted AnnData object
adata_restricted = ad.AnnData(
    X=adata.X.copy(),                      # gene expression matrix
    obs=obs_filtered,                     # selected annotations
    var=pd.DataFrame(index=adata.var_names)  # only gene names
)

# create a mapping for the 'malignancy' column, anything that is not 'malignant' will be 'non-malignant'
mapping = {'malignant': 'malignant'}
# Apply the mapping to the 'malignancy' column
adata_restricted.obs['malignancy'] = adata.obs['malignancy'].map(mapping)

# fill all nan with 'non-malignant'
adata_restricted.obs['malignancy'].fillna('non-malignant', inplace=True )

# Save the restricted dataset
adata_restricted.write("dataset_restricted_with_labels.h5ad")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata_restricted.obs['malignancy'].fillna('non-malignant', inplace=True )


# with expert clustering

In [6]:
import scanpy as sc
import pandas as pd
import anndata as ad

# Load the original dataset
adata = sc.read_h5ad("/cs/student/projects2/aisd/2024/shekchu/projects/cell_agents/subset_dataset_creation/integrated_with_quiescence.h5ad")

# Create a restricted AnnData object
adata_restricted = ad.AnnData(
    X=adata.X.copy(),                      # gene expression matrix
    # only cell barcodes and seurat_clusters
    obs=adata.obs[["seurat_clusters"]].copy(), 
    # only gene names
    var=pd.DataFrame(index=adata.var_names)   # only gene names
)

# Save the restricted dataset
adata_restricted.write("dataset_restricted_with_cluster.h5ad")

print("Restricted dataset saved as 'dataset_restricted_with_cluster.h5ad'")


Restricted dataset saved as 'dataset_restricted_with_cluster.h5ad'
