In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import torch
import random

def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU,
    the current CUDA device and all CUDA devices), then sets the cuDNN
    ``deterministic`` flag on and the ``benchmark`` flag off.

    Args:
        seed: Seed value handed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # cuDNN flags: deterministic on, benchmark off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the raw .h5ad input -- change this to wherever your copy of the file lives.
raw_h5ad_path = r'/public/home/syj/scpDeconv/dataset/raw_data/Final_Primary_All_Lin.h5ad'
data = sc.read_h5ad(raw_h5ad_path)
data

AnnData object with n_obs × n_vars = 751970 × 34
    obs: 'sample', 'patient', 'genotype', 'surgery', 'age', 'pregnancies', 'births', 'race', 'menopause', 'batch', 'AR', 'H3K27ME3', 'Ir_191', 'Ir_193', 'KI67', 'Pt_195', 'RANK', 'c-PARP', 'p53', 'leiden', 'Clusters', 'Lineage', 'Merge_Lin'
    uns: 'Clusters_colors', 'Lineage_colors', 'batch_colors', 'leiden', 'leiden_colors', 'leiden_sizes', 'neighbors', 'paga', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [3]:
# Inspect the 'Merge_Lin' obs column; later cells use it as the cell-type label.
data.obs['Merge_Lin']

171926-0-0     VA
31600-0-0      VA
58101-0-0      FI
119072-0-0     VA
115236-0-0     VA
               ..
110849-10-2    AV
172758-10-2    BA
198359-10-2    VA
140442-10-2    VA
34032-10-2     AV
Name: Merge_Lin, Length: 751970, dtype: category
Categories (6, object): ['AV', 'BA', 'FI', 'HS', 'IM', 'VA']

In [4]:
# Keep only the observations whose 'batch' value is '0'.
batch0_mask = data.obs['batch'] == '0'
data0 = data[batch0_mask]

# Expression matrix and per-cell labels for that batch.
x0 = data0.X
y0 = data0.obs['Merge_Lin']

# Sorted distinct labels that actually occur in this batch.
celltypes = np.unique(y0)

In [5]:
# Simulate mixed samples from the batch-0 single cells.
# Each simulated sample draws random cell-type fractions, samples the
# corresponding number of cells (with replacement) from every cell type and
# averages their profiles. The result is:
#   x: (n_vars, n_mixtures) mean profiles, one column per simulated sample
#   y: (n_celltypes, n_mixtures) realised cell-type fractions
n_mixtures = 20000       # number of simulated samples
cells_per_mixture = 200  # nominal number of cells pooled into one sample

celltype_num = len(celltypes)

# Split the expression matrix by cell type once, up front.
cells_sub = [x0[np.array(y0 == celltype), :] for celltype in celltypes]

# Collect per-sample columns in lists and concatenate once at the end.
# Growing x / y with np.concatenate inside the loop re-copies the whole
# array on every iteration (quadratic in the number of samples).
profile_cols, fraction_cols = [], []

for i in range(n_mixtures):

    # Random fractions over the available cell types, normalised to sum to 1.
    fracs = np.random.rand(celltype_num)
    fracs = fracs / fracs.sum()

    # Integer cell counts per type (round-half-to-even, like built-in round).
    # After rounding they need not add up to exactly cells_per_mixture, which
    # is why the labels below are re-normalised by the realised total.
    samp_fracs = np.rint(fracs * cells_per_mixture).astype(int)

    # Draw row indices with replacement from each cell-type subset.
    picked = []
    for j in range(celltype_num):
        cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
        picked.append(cells_sub[j][cells_fraction, :])
    artificial_samples = np.concatenate(picked)

    # Column vectors: mean profile of the pooled cells and realised fractions.
    profile_cols.append(artificial_samples.mean(0).reshape(-1, 1))
    fraction_cols.append((samp_fracs / samp_fracs.sum()).reshape(-1, 1))
    if i % 1000 == 0: print(i)

x = np.concatenate(profile_cols, axis=1)
y = np.concatenate(fraction_cols, axis=1)

print(x.shape, y.shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
(34, 20000) (6, 20000)


In [6]:
# One row per simulated sample; columns are named after data0.var_names.
df = pd.DataFrame(data=x.transpose(), columns=data0.var_names)
df

Unnamed: 0,ANPEP,ANXA8,BRCA1,CD10,CD133,CD140B,CD24,CD31,CD44,CD45,...,HLA,HSP27,K14,K17,K8K18,LAM5,MUC1,PR,SMA,VIM
0,1.503097,0.702095,0.992496,2.442733,0.757391,0.495362,1.860158,0.192899,1.185303,0.551473,...,2.143342,1.654785,1.683041,1.716538,1.288452,1.211377,0.766505,1.044744,1.576929,1.257672
1,1.210385,0.793191,0.939999,1.780228,0.897260,0.320725,1.787678,0.443410,1.228333,1.099307,...,2.761168,1.370445,1.148291,1.245319,0.852744,1.273413,0.489326,0.576969,1.656260,1.375532
2,1.486300,1.007947,1.065636,1.400030,1.101158,0.313578,2.095252,0.685059,1.019253,0.397283,...,2.942965,1.657349,0.965683,1.185653,0.960665,1.202355,0.470336,0.474958,1.394147,1.486900
3,1.609294,0.962465,1.093672,1.445395,1.126769,0.358265,2.374508,0.441726,1.110802,0.793684,...,2.952665,1.579991,0.629316,0.690230,1.772840,0.875428,1.072339,0.974050,0.588928,1.257742
4,0.866329,0.676443,0.859032,2.013791,0.694930,0.145512,1.789441,0.257218,1.353417,1.648408,...,2.456021,1.094927,1.237691,1.335929,1.175183,1.148087,0.681680,0.924333,1.273818,1.114129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1.209994,0.772749,1.098716,1.491849,0.969644,0.274471,2.157771,0.459761,1.319533,1.248922,...,2.850404,1.627026,0.813007,0.844858,1.341571,1.030711,0.839273,0.915646,1.263443,1.436735
19996,1.121095,0.880397,1.112376,1.265849,1.099913,0.148031,2.163548,0.413927,1.123280,1.045100,...,2.826598,1.279799,0.777379,0.923461,1.373537,1.035593,0.822131,0.737628,1.169610,1.085897
19997,1.447390,0.901358,1.064027,2.198018,0.955045,0.400347,2.001171,0.142926,1.229833,0.771941,...,2.346528,1.608739,1.370065,1.362969,1.355291,1.094615,0.802803,0.949919,1.369572,1.280209
19998,1.796081,0.954518,1.058682,1.831782,1.206769,0.410898,2.176163,0.261998,1.081485,0.583177,...,2.654024,1.241671,1.200802,1.306036,1.280382,0.942508,0.590893,0.462331,1.236578,1.349902


In [7]:
# Wrap the simulated batch-0 profiles in an AnnData object and attach the
# simulated cell-type fractions as one obs column per cell type.
adata_0 = ad.AnnData(df)
for row_idx, celltype in enumerate(celltypes):
    # Row row_idx of y holds the fraction of this cell type in every sample.
    adata_0.obs[celltype] = y[row_idx]

# Tag every simulated sample with its source batch and record the label order.
n_obs_0 = adata_0.shape[0]
adata_0.obs['batch'] = np.array(['0'] * n_obs_0)
adata_0.uns['cell_types'] = celltypes

adata_0



AnnData object with n_obs × n_vars = 20000 × 34
    obs: 'AV', 'BA', 'FI', 'HS', 'IM', 'VA', 'batch'
    uns: 'cell_types'

In [8]:
# Keep only the observations whose 'batch' value is '1'.
batch1_mask = data.obs['batch'] == '1'
data1 = data[batch1_mask]

# Expression matrix and per-cell labels for that batch.
x1 = data1.X
y1 = data1.obs['Merge_Lin']

# Sorted distinct labels that actually occur in this batch
# (this rebinds the `celltypes` name used by the following cells).
celltypes = np.unique(y1)

In [9]:
# Simulate mixed samples from the batch-1 single cells.
# Each simulated sample draws random cell-type fractions, samples the
# corresponding number of cells (with replacement) from every cell type and
# averages their profiles. The result is:
#   x: (n_vars, n_mixtures) mean profiles, one column per simulated sample
#   y: (n_celltypes, n_mixtures) realised cell-type fractions
n_mixtures = 4000        # number of simulated samples
cells_per_mixture = 200  # nominal number of cells pooled into one sample

celltype_num = len(celltypes)

# Split the expression matrix by cell type once, up front.
cells_sub = [x1[np.array(y1 == celltype), :] for celltype in celltypes]

# Collect per-sample columns in lists and concatenate once at the end.
# Growing x / y with np.concatenate inside the loop re-copies the whole
# array on every iteration (quadratic in the number of samples).
profile_cols, fraction_cols = [], []

for i in range(n_mixtures):

    # Random fractions over the available cell types, normalised to sum to 1.
    fracs = np.random.rand(celltype_num)
    fracs = fracs / fracs.sum()

    # Integer cell counts per type (round-half-to-even, like built-in round).
    # After rounding they need not add up to exactly cells_per_mixture, which
    # is why the labels below are re-normalised by the realised total.
    samp_fracs = np.rint(fracs * cells_per_mixture).astype(int)

    # Draw row indices with replacement from each cell-type subset.
    picked = []
    for j in range(celltype_num):
        cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
        picked.append(cells_sub[j][cells_fraction, :])
    artificial_samples = np.concatenate(picked)

    # Column vectors: mean profile of the pooled cells and realised fractions.
    profile_cols.append(artificial_samples.mean(0).reshape(-1, 1))
    fraction_cols.append((samp_fracs / samp_fracs.sum()).reshape(-1, 1))
    if i % 1000 == 0: print(i)

x = np.concatenate(profile_cols, axis=1)
y = np.concatenate(fraction_cols, axis=1)

print(x.shape, y.shape)

0
1000
2000
3000
(34, 4000) (6, 4000)


In [10]:
# One row per simulated sample; columns are named after data1.var_names.
df2 = pd.DataFrame(data=x.transpose(), columns=data1.var_names)
df2

Unnamed: 0,ANPEP,ANXA8,BRCA1,CD10,CD133,CD140B,CD24,CD31,CD44,CD45,...,HLA,HSP27,K14,K17,K8K18,LAM5,MUC1,PR,SMA,VIM
0,1.329367,0.810188,1.058029,1.629237,0.915398,0.369752,1.771176,0.331820,1.930062,1.539151,...,3.009558,1.603965,0.764837,0.776531,1.077182,0.875344,0.687910,0.784003,1.024617,1.777057
1,1.792075,0.567681,1.075637,2.228312,0.598405,1.028899,1.544127,0.245575,2.214256,1.286661,...,2.809299,1.986557,0.707250,0.672378,0.795753,0.824281,0.826184,0.875351,1.138544,2.247991
2,1.621611,0.617394,1.062254,1.676584,0.702540,0.811030,1.787931,0.610211,1.836371,0.994257,...,3.097498,2.320201,0.438980,0.398850,1.144378,0.781514,0.856683,0.961007,0.707811,2.174379
3,2.140461,0.608104,1.003314,2.008582,0.689650,1.185308,1.460775,0.187900,2.299494,1.581792,...,2.883656,1.508841,0.601957,0.486060,0.521194,0.640097,0.452697,0.461495,0.813504,2.409139
4,1.608195,0.569648,1.014962,2.300561,0.655581,0.911602,1.426666,0.420188,1.917235,1.077909,...,2.933736,1.904139,0.936829,0.904304,0.632084,1.006549,0.519693,0.679109,1.215647,2.331979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1.363051,0.506531,0.965524,2.438309,0.549926,0.885957,1.207392,0.541940,1.885117,0.880888,...,2.810590,1.934763,1.105710,1.023127,0.297249,1.241775,0.365533,0.606271,1.736615,2.232057
3996,1.492856,0.646507,0.917238,2.410953,0.828464,0.444246,1.787343,0.111448,1.860983,1.473616,...,2.554623,1.253394,1.115988,1.161379,0.934505,1.019090,0.619637,0.752631,1.394257,1.483797
3997,1.941790,0.722569,1.169067,1.817071,0.749324,1.111946,1.660971,0.419107,1.776971,0.630646,...,2.888368,2.410063,0.714628,0.733237,1.129829,0.883272,0.911323,0.985898,1.053915,2.343006
3998,1.603757,0.810625,1.212947,1.877371,0.940221,0.612985,1.968925,0.384839,1.531845,0.958608,...,2.803972,1.821415,0.698071,0.823476,1.148129,1.007774,0.820188,0.790141,1.139609,1.782104


In [11]:
# Wrap the simulated batch-1 profiles in an AnnData object and attach the
# simulated cell-type fractions as one obs column per cell type.
adata_1 = ad.AnnData(df2)
for row_idx, celltype in enumerate(celltypes):
    # Row row_idx of y holds the fraction of this cell type in every sample.
    adata_1.obs[celltype] = y[row_idx]

# Tag every simulated sample with its source batch and record the label order.
n_obs_1 = adata_1.shape[0]
adata_1.obs['batch'] = np.array(['1'] * n_obs_1)
adata_1.uns['cell_types'] = celltypes

adata_1



AnnData object with n_obs × n_vars = 4000 × 34
    obs: 'AV', 'BA', 'FI', 'HS', 'IM', 'VA', 'batch'
    uns: 'cell_types'

In [12]:
# Merge the two simulated batches into a single AnnData object.
# Both inputs were built from DataFrames with default integer indices, so
# their obs names overlap ('0', '1', ...). Passing index_unique makes
# ad.concat suffix every obs name with the delimiter plus the position of its
# source object ("-0" / "-1"), so the merged index is unique and the
# duplicate-obs-names warning is no longer raised.
adata = ad.concat((adata_0, adata_1), index_unique='-')

# ad.concat does not merge .uns unless uns_merge is given, so set the
# cell-type list on the merged object explicitly.
adata.uns['cell_types'] = celltypes
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 24000 × 34
    obs: 'AV', 'BA', 'FI', 'HS', 'IM', 'VA', 'batch'
    uns: 'cell_types'

In [13]:
# Destination for the simulated mixtures -- change this to your own output location.
mixup_h5ad_path = '/public/home/syj/scpDeconv/dataset/mixup_data/CyTOF_200.h5ad'
adata.write_h5ad(mixup_h5ad_path)