In [1]:
import anndata as ad
import numpy as np
import scipy
import pandas as pd
import torch
import scanpy as sc
# import cellrank as cr

In [2]:
sc.settings.verbosity = 3  # scanpy logging level 3: errors, warnings, info and hints

In [3]:
import warnings

warnings.simplefilter("ignore", category=UserWarning)

In [4]:
# Load the full dataset from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file is placed at the same location.
full_data_path = "/Users/apple/Desktop/KB/data/BiddyData/biddy_fulldata.h5ad"
adata = ad.read_h5ad(full_data_path)

# Show the dimensions of the expression matrix as a quick sanity check.
adata.X.shape

(104679, 22630)

In [5]:
# save the full data into h5ad file
# adata.write("biddy_fulldata.h5ad")

### Filter cells

In [6]:
# Keep only cells that have both a CellTagD0_48k barcode and a cell-type label.
# Each subsetting step ends with .copy() so that adata_filter is an independent
# AnnData object rather than a view onto `adata`.

# Boolean mask over all cells: True where the CellTagD0_48k barcode is missing.
nan_rows = adata.obs["CellTagD0_48k"].isna()
# Series.sum() counts the True entries in a single vectorized call; the builtin
# sum() would walk the Series element by element in Python.
print("number of cells without CellTagD0_48k barcode: ", nan_rows.sum())

adata_filter = adata[~nan_rows].copy()
print("adata shape after filtering out clone id nans: ", adata_filter.shape)

# Second pass: drop the remaining cells whose cell_type is missing.
has_cell_type = adata_filter.obs["cell_type"].notna()
adata_filter = adata_filter[has_cell_type].copy()
print("adata shape after filtering out cell type nans: ", adata_filter.shape)

number of cells without CellTagD0_48k barcode:  93792
adata shape after filtering out clone id nans:  (10887, 22630)
adata shape after filtering out cell type nans:  (7251, 22630)


In [7]:
# Drop lineages (CellTagD0_48k barcodes) that are represented by fewer than
# 5 cells, and report the lineage and cell counts before and after.
lineage_of_cell = adata_filter.obs["CellTagD0_48k"]
print("number of CellTagD0_48k lineages: ", len(lineage_of_cell.unique()))

# Cells per lineage, then the barcodes whose count reaches the threshold of 5.
value_counts = lineage_of_cell.value_counts()
valid_tags = value_counts.loc[value_counts.ge(5)].index

# Keep only the cells whose barcode is one of the retained lineages; .copy()
# makes the result an independent object instead of a view.
in_valid_lineage = lineage_of_cell.isin(valid_tags)
adata_filter = adata_filter[in_valid_lineage].copy()

print("number of CellTagD0_48k lineages that has at least 5 cells: ", len(valid_tags))
print(f"Number of observations after filtering: {adata_filter.n_obs}")

number of CellTagD0_48k lineages:  505
number of CellTagD0_48k lineages that has at least 5 cells:  169
Number of observations after filtering: 6534


In [8]:
# List the distinct cell-type labels present among the retained cells.
adata_filter.obs["cell_type"].unique()

['Ambiguous', 'Fibroblast', 'iEP']
Categories (3, object): ['Ambiguous', 'Fibroblast', 'iEP']

In [9]:
# Duplicate the CellTagD0_48k barcode into a `clone_id` column; the later cells
# in this notebook refer to lineages through `clone_id`.
adata_filter.obs["clone_id"] = adata_filter.obs["CellTagD0_48k"]

In [10]:
# Display the per-cell metadata table, including the new `clone_id` column.
adata_filter.obs

Unnamed: 0,timecourse,reprogramming_day,reprogramming,cell_type,cell_cycle,cluster,monocle_state,pseudotime,CellTagD0_85k,CellTagD3_85k,CellTagD13_85k,CellTagD0_48k,CellTagD3_48k,CellTagD13_48k,clone_id
HF1_AAAGCAATCCAGATCA_5,1,12,,Ambiguous,G1,5,2,0.426868,424.0,476.0,,424.0,476.0,,424.0
HF1_AACACGTAGAACAACT_5,1,12,,Ambiguous,S,5,2,0.549377,195.0,,,195.0,,,195.0
HF1_AAACCTGAGAGCAATT_5,1,12,,Fibroblast,G2M,2,2,0.664971,647.0,309.0,,647.0,309.0,,647.0
HF1_AACACGTTCGAGGTAG_5,1,12,False,iEP,G1,2,2,0.730338,487.0,240.0,,487.0,240.0,,487.0
HF1_AACCATGGTCTAGAGG_5,1,12,,iEP,G1,2,2,0.568279,652.0,203.0,,652.0,203.0,,652.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HF2_TTCTTAGAGCGACGTA_4,2,9,,iEP,G2M,5,2,0.509046,2657.0,,,2657.0,,,2657.0
HF2_TTCCCAGGTGATGATA_4,2,9,,iEP,G2M,5,2,0.530828,2367.0,2514.0,,2367.0,2514.0,,2367.0
HF2_TTCTCCTCATGAACCT_4,2,9,,iEP,G2M,5,2,0.403314,2490.0,2042.0,,2490.0,2042.0,,2490.0
HF2_TTGGAACCAAGAAAGG_4,2,9,,Ambiguous,G1,2,2,1.029464,2764.0,2525.0,,2764.0,2525.0,,2764.0


In [11]:
# Number of cells per clone, largest clones first (value_counts sorts descending).
adata_filter.obs["clone_id"].value_counts()

clone_id
493.0     1309
2352.0     657
487.0      366
666.0      329
2721.0     293
          ... 
2630.0       5
2350.0       5
2915.0       5
2920.0       5
2367.0       5
Name: count, Length: 169, dtype: int64

In [12]:
# Basic quality-control filters; both calls modify adata_filter in place.
# Require at least 200 detected genes per cell.
sc.pp.filter_cells(adata_filter, min_genes=200)
# Require each gene to be detected in at least 3 cells.
# NOTE(review): in the recorded run only the gene filter reported removals
# (8764 genes); the cell count stays at 6534 afterwards.
sc.pp.filter_genes(adata_filter, min_cells=3)

filtered out 8764 genes that are detected in less than 3 cells


In [13]:
# Normalization pipeline; every call below operates on adata_filter in place,
# so this cell is NOT idempotent: re-running it would normalize and
# log-transform the already transformed matrix a second time.
# Scale each cell to a total count of 1e4.
sc.pp.normalize_total(adata_filter, target_sum=1e4)
# Apply log(1 + x) to the normalized values.
sc.pp.log1p(adata_filter)
# Flag the 2000 most variable genes in adata_filter.var['highly_variable'];
# when n_top_genes is given, the other cutoffs are ignored (see the log output).
sc.pp.highly_variable_genes(adata_filter,n_top_genes=2000)


normalizing counts per cell
    finished (0:00:01)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:02)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


  disp_grouped = df.groupby('mean_bin')['dispersions']


In [14]:
# Restrict the data to the genes flagged as highly variable in the previous cell.
# The explicit .copy() materializes the subset as an independent AnnData object,
# matching the earlier filtering cells. Without it adata_filter would be a view
# that keeps the larger pre-selection object alive and triggers implicit copies
# on later modification.
adata_filter = adata_filter[:, adata_filter.var["highly_variable"]].copy()

In [15]:
# Sanity check: the matrix should now have exactly 2000 gene columns.
adata_filter.X.shape

(6534, 2000)

In [16]:
# Re-check the clone sizes after QC and gene selection; compare with the
# earlier value_counts output to confirm that no cells were lost.
adata_filter.obs["clone_id"].value_counts()

clone_id
493.0     1309
2352.0     657
487.0      366
666.0      329
2721.0     293
          ... 
2630.0       5
2350.0       5
2915.0       5
2920.0       5
2367.0       5
Name: count, Length: 169, dtype: int64

In [17]:
# Save the processed object. The path is relative, so the file is written to
# the current working directory. The file name records the shape
# (6534 cells x 2000 genes) and the processing state (normalized, log1p).
adata_filter.write("biddy_6534_2000_norm_log.h5ad")