In [1]:
import numpy as np
import anndata as ad
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
import scanpy as sc
import scipy

In [2]:
# Folder that holds every .h5ad input read by this notebook
input_dir = "/Users/apple/Desktop/KB/data/LarryData"

# Raw file (with clone ids) and the "norm_log" file
adata_raw = ad.read_h5ad(f"{input_dir}/larry_raw_w_clone_id.h5ad")
adata_normlog = ad.read_h5ad(f"{input_dir}/Larry_41093_2000_norm_log.h5ad")

# Train / test files
adata_train = ad.read_h5ad(f"{input_dir}/train_test/Larry_train.h5ad")
adata_test = ad.read_h5ad(f"{input_dir}/train_test/Larry_test.h5ad")

# "200" train / test files
adata_train_200 = ad.read_h5ad(f"{input_dir}/train_test/Larry_200_train.h5ad")
adata_test_200 = ad.read_h5ad(f"{input_dir}/train_test/Larry_200_test.h5ad")

# "500" train / test files
adata_train_500 = ad.read_h5ad(f"{input_dir}/train_test/Larry_500_train.h5ad")
adata_test_500 = ad.read_h5ad(f"{input_dir}/train_test/Larry_500_test.h5ad")


In [4]:
# Shapes of every dataset loaded above, in load order
tuple(
    loaded.shape
    for loaded in (
        adata_raw,
        adata_normlog,
        adata_train,
        adata_test,
        adata_train_200,
        adata_test_200,
        adata_train_500,
        adata_test_500,
    )
)


((49302, 23420),
 (41093, 2000),
 (37207, 2000),
 (3886, 2000),
 (10148, 2000),
 (1225, 2000),
 (17054, 2000),
 (2177, 2000))

In [3]:
def scvi_data(adata_ds, adata_scCL):
    """Combine the matrix of ``adata_ds`` with the annotations of ``adata_scCL``.

    ``adata_ds`` is indexed by the observation index and the variable names of
    ``adata_scCL``; the ``X`` of that selection becomes the ``X`` of a new
    AnnData whose ``obs`` and ``var`` are taken from ``adata_scCL``.

    Parameters
    ----------
    adata_ds
        AnnData supplying the values; it is indexed by the names taken from
        ``adata_scCL``.
    adata_scCL
        AnnData supplying the selection (``obs.index``, ``var_names``) and the
        ``obs`` / ``var`` tables of the result.

    Returns
    -------
    AnnData
        New object built from the selected ``X`` and ``adata_scCL``'s
        ``obs`` / ``var``.

    The shapes of both inputs are printed as a side effect.
    """
    # Report the input sizes before subsetting
    for label, dataset in (
        ("adata_ds.shape: ", adata_ds),
        ("adata_scCL.shape: ", adata_scCL),
    ):
        print(label, dataset.shape)

    # Select the rows/columns of adata_ds named by adata_scCL
    matched = adata_ds[adata_scCL.obs.index, adata_scCL.var_names]

    # obs and var are handed to the constructor as-is (no explicit copy here)
    return ad.AnnData(X=matched.X, obs=adata_scCL.obs, var=adata_scCL.var)

### full data

In [6]:
# Values from adata_raw, restricted to the cells/genes of adata_normlog
adata_scvi_full = scvi_data(adata_ds=adata_raw, adata_scCL=adata_normlog)
adata_scvi_full.shape

adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (41093, 2000)


(41093, 2000)

### train test data

In [7]:
# Same construction for the train and test files (train first, then test)
adata_scvi_train, adata_scvi_test = (
    scvi_data(adata_raw, split) for split in (adata_train, adata_test)
)
adata_scvi_train.shape, adata_scvi_test.shape

adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (37207, 2000)
adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (3886, 2000)


((37207, 2000), (3886, 2000))

### train test top 200 data

In [8]:
# Same construction for the "200" train and test files (train first, then test)
adata_scvi_train_200, adata_scvi_test_200 = (
    scvi_data(adata_raw, split) for split in (adata_train_200, adata_test_200)
)
adata_scvi_train_200.shape, adata_scvi_test_200.shape

adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (10148, 2000)
adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (1225, 2000)


((10148, 2000), (1225, 2000))

### train test top 500 data

In [9]:
# Same construction for the "500" train and test files (train first, then test)
adata_scvi_train_500, adata_scvi_test_500 = (
    scvi_data(adata_raw, split) for split in (adata_train_500, adata_test_500)
)
adata_scvi_train_500.shape, adata_scvi_test_500.shape

adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (17054, 2000)
adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (2177, 2000)


((17054, 2000), (2177, 2000))

In [10]:
# Output file name -> dataset; written to the current working directory in this order
scvi_outputs = {
    'Larry_scvi_full.h5ad': adata_scvi_full,
    'Larry_scvi_train.h5ad': adata_scvi_train,
    'Larry_scvi_test.h5ad': adata_scvi_test,
    'Larry_scvi_train_200.h5ad': adata_scvi_train_200,
    'Larry_scvi_test_200.h5ad': adata_scvi_test_200,
    'Larry_scvi_train_500.h5ad': adata_scvi_train_500,
    'Larry_scvi_test_500.h5ad': adata_scvi_test_500,
}
for out_name, dataset in scvi_outputs.items():
    dataset.write_h5ad(out_name)

# Shapes of everything that was just written
tuple(dataset.shape for dataset in scvi_outputs.values())

((41093, 2000),
 (37207, 2000),
 (3886, 2000),
 (10148, 2000),
 (1225, 2000),
 (17054, 2000),
 (2177, 2000))

### Train Test lineage split

In [2]:
# Folder that holds every .h5ad input read by this notebook
input_dir = "/Users/apple/Desktop/KB/data/LarryData"

# Raw file (with clone ids) and the "norm_log" file
adata_raw = ad.read_h5ad(f"{input_dir}/larry_raw_w_clone_id.h5ad")
adata_normlog = ad.read_h5ad(f"{input_dir}/Larry_41093_2000_norm_log.h5ad")

# Lineage train / test files
adata_train_lin = ad.read_h5ad(f"{input_dir}/train_test/Larry_train_lineage.h5ad")
adata_test_lin = ad.read_h5ad(f"{input_dir}/train_test/Larry_test_lineage.h5ad")

# "200" lineage train / test files
adata_train_200_lin = ad.read_h5ad(f"{input_dir}/train_test/Larry_200_train_lineage.h5ad")
adata_test_200_lin = ad.read_h5ad(f"{input_dir}/train_test/Larry_200_test_lineage.h5ad")

# "500" lineage train / test files
adata_train_500_lin = ad.read_h5ad(f"{input_dir}/train_test/Larry_500_train_lineage.h5ad")
adata_test_500_lin = ad.read_h5ad(f"{input_dir}/train_test/Larry_500_test_lineage.h5ad")


In [4]:
# Build the lineage train and test objects (train first, then test)
adata_scvi_train_lin, adata_scvi_test_lin = (
    scvi_data(adata_raw, split) for split in (adata_train_lin, adata_test_lin)
)
adata_scvi_train_lin.shape, adata_scvi_test_lin.shape

adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (37070, 2000)
adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (4023, 2000)


((37070, 2000), (4023, 2000))

In [5]:
# Build the "200" lineage train and test objects (train first, then test)
adata_scvi_train_200_lin, adata_scvi_test_200_lin = (
    scvi_data(adata_raw, split) for split in (adata_train_200_lin, adata_test_200_lin)
)
adata_scvi_train_200_lin.shape, adata_scvi_test_200_lin.shape

adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (10252, 2000)
adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (1121, 2000)


((10252, 2000), (1121, 2000))

In [6]:
# Build the "500" lineage train and test objects (train first, then test)
adata_scvi_train_500_lin, adata_scvi_test_500_lin = (
    scvi_data(adata_raw, split) for split in (adata_train_500_lin, adata_test_500_lin)
)
adata_scvi_train_500_lin.shape, adata_scvi_test_500_lin.shape

adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (17422, 2000)
adata_ds.shape:  (49302, 23420)
adata_scCL.shape:  (1809, 2000)


((17422, 2000), (1809, 2000))

In [11]:
# Number of distinct values in the "clone_id" column of adata_normlog.obs
len(adata_normlog.obs["clone_id"].unique())

2813

In [7]:
# Output file name -> dataset; written to the current working directory in this order
scvi_lineage_outputs = {
    'Larry_scvi_train_lineage.h5ad': adata_scvi_train_lin,
    'Larry_scvi_test_lineage.h5ad': adata_scvi_test_lin,
    'Larry_scvi_train_200_lineage.h5ad': adata_scvi_train_200_lin,
    'Larry_scvi_test_200_lineage.h5ad': adata_scvi_test_200_lin,
    'Larry_scvi_train_500_lineage.h5ad': adata_scvi_train_500_lin,
    'Larry_scvi_test_500_lineage.h5ad': adata_scvi_test_500_lin,
}
for out_name, dataset in scvi_lineage_outputs.items():
    dataset.write_h5ad(out_name)
