In [None]:
import scanpy
import numpy as np
import pandas as pd

days_df = pd.read_csv("raw/cell_days.txt", index_col='id', sep='\t')
adata = scanpy.read_h5ad("raw/ExprMatrix.h5ad") # cell x gene
adata.obs = adata.obs.join(days_df)

adata = adata[~pd.isna(adata.obs['day']), :]
meta_df = adata.obs
unique_days = adata.obs['day'].unique()
unique_days = unique_days[np.isnan(unique_days) == False]
print("Data shape: ", adata.shape)
print("Num of unique days = {}".format(len(unique_days)))

Data shape:  (236285, 19089)
Num of unique days = 39


In [3]:
print("-" * 70)
print("Merge timepoints...")
cell_tps = np.floor(adata.obs['day'])
adata.obs['day'] = cell_tps
unique_days = adata.obs['day'].unique()
print("Data shape: ", adata.shape)
print("Num of unique days (merged) = {}".format(len(unique_days)))
print("Num of cell:")
cell_idx_per_tp = [np.where(adata.obs["day"] == t)[0] for t in unique_days]
cell_num_per_tp = [len(x) for x in cell_idx_per_tp]
print(cell_num_per_tp)

print("-" * 70)
ratio = 0.1
print("Subsampling (ratio={})...".format(ratio))
sample_cell_idx_per_tp = [np.random.choice(x, int(len(x)*ratio), replace=False) for x in cell_idx_per_tp]
adata = adata[np.concatenate(sample_cell_idx_per_tp), :]
unique_days = adata.obs['day'].unique()
print("Data shape: ", adata.shape)
print("Num of unique days (sampled) = {}".format(len(unique_days)))
cell_idx_per_tp = [np.where(adata.obs["day"] == t)[0] for t in unique_days]
cell_num_per_tp = [len(x) for x in cell_idx_per_tp]
print("Num of cell:")
print(cell_num_per_tp)


----------------------------------------------------------------------
Merge timepoints...


  adata.obs['day'] = cell_tps


Data shape:  (236285, 19089)
Num of unique days (merged) = 19
Num of cell:
[8005, 5604, 13715, 14132, 16089, 13777, 11533, 11568, 24200, 8865, 7327, 7122, 7478, 7080, 14443, 20615, 21068, 16228, 7436]
----------------------------------------------------------------------
Subsampling (ratio=0.1)...
Data shape:  (23619, 19089)
Num of unique days (sampled) = 19
Num of cell:
[800, 560, 1371, 1413, 1608, 1377, 1153, 1156, 2420, 886, 732, 712, 747, 708, 1444, 2061, 2106, 1622, 743]


In [3]:
print("-" * 70)
split_type = "all_times" 
print("Data shape: ", adata.shape)
print("Num of tps: ", len(unique_days))
print("Split type: {}".format(split_type))
if split_type == "three_forecasting": 
    train_tps = unique_days[:16].tolist()
    test_tps = unique_days[16:].tolist()
elif split_type == "three_interpolation": 
    train_tps = unique_days.tolist()
    test_tps = [train_tps[5], train_tps[10], train_tps[15]]
    train_tps.remove(unique_days[5])
    train_tps.remove(unique_days[10])
    train_tps.remove(unique_days[15])
elif split_type == "remove_recovery": 
    train_tps = unique_days.tolist()
    test_idx = [5, 7, 9, 11, 15, 16, 17, 18]
    test_tps = [train_tps[t] for t in test_idx]
    for t in test_idx:
        train_tps.remove(unique_days[t])
elif split_type == "all_times": 
    train_tps = unique_days.tolist()
    test_tps = []
print("Train tps: ", train_tps)
print("Test tps: ", test_tps)

train_adata = adata[adata.obs['day'].apply(lambda x: x in train_tps)]
print("Train data shape: ", train_adata.shape)
hvgs_summary = scanpy.pp.highly_variable_genes(train_adata, n_top_genes=2000, inplace=False)
hvgs = train_adata.var.index.values[hvgs_summary.highly_variable]
adata = adata[:, hvgs]
meta_df = adata.obs
print("HVG data shape: ", adata.shape)
print("HVG meta shape: ", meta_df.shape)
adata.to_df().to_csv("reduce_processed/{}-norm_data-hvg.csv".format(split_type)) # cell x genes
pd.DataFrame(hvgs).to_csv("reduce_processed/{}-var_genes_list.csv".format(split_type))
meta_df.to_csv("reduce_processed/{}-meta_data.csv".format(split_type))

----------------------------------------------------------------------
Data shape:  (23619, 2000)
Num of tps:  19
Split type: all_times
Train tps:  [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0]
Test tps:  []
Train data shape:  (23619, 2000)
HVG data shape:  (23619, 2000)
HVG meta shape:  (23619, 1)
