In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import torch
import random

def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (default generator plus the CUDA generators), and switches cuDNN to
    deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed handed unchanged to every generator.
    """
    # cuDNN flags are independent of the seeding calls below.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)


set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the first dataset (file name: GSM2685243 ... PBMCs_filtered_ADT) into `data1`.
data1 = sc.read_h5ad('/public/home/syj/scpDeconv/dataset/raw_data/GSM2685243_protein_2_PBMCs_filtered_ADT.h5ad') # Machine-specific absolute path: change it to where the file lives on your system.
# Duplicate variable names get a numeric suffix (e.g. 'CD4-1', 'CD8-1' in the output below).
data1.var_names_make_unique()
# Display the (now unique) variable names; later cells use them as the reference column set.
data1.var_names

  utils.warn_names_duplicates("var")


Index(['CD11b', 'CD8', 'CD152', 'CD127', 'CD25', 'TIGIT', 'CD45', 'CD66b',
       'CD56', 'CD27', 'CD4', 'CD45RO', 'CD8a', 'CD68', 'CD223', 'CD73',
       'CD69', 'CD279', 'CD9', 'CD19', 'CD273', 'CD4-1', 'CD154', 'CD20',
       'CD274', 'CD137', 'CD357', 'CD45-1', 'CD45RA', 'CD155', 'CD272',
       'CD278', 'CD14', 'CD134', 'CD8-1'],
      dtype='object')

In [3]:
# Load the second dataset (file name: GSM2685244 ... PBMCs_filtered_ADT) into `data2`.
data2 = sc.read_h5ad('/public/home/syj/scpDeconv/dataset/raw_data/GSM2685244_protein_3_PBMCs_filtered_ADT.h5ad') # Machine-specific absolute path: change it to where the file lives on your system.
# De-duplicate variable names first so that the lookup by data1.var_names below is unambiguous.
data2.var_names_make_unique()
# Keep exactly data1's variables, in data1's order, so both datasets share one column layout.
# The result is a view of the loaded object (see "View of AnnData" in the repr).
data2 = data2[:, data1.var_names]
data2

  utils.warn_names_duplicates("var")


View of AnnData object with n_obs × n_vars = 4021 × 35
    obs: 'scores.Astrocyte', 'scores.B_cell', 'scores.BM', 'scores.BM...Prog.', 'scores.Chondrocytes', 'scores.CMP', 'scores.DC', 'scores.Embryonic_stem_cells', 'scores.Endothelial_cells', 'scores.Epithelial_cells', 'scores.Erythroblast', 'scores.Fibroblasts', 'scores.Gametocytes', 'scores.GMP', 'scores.Hepatocytes', 'scores.HSC_.G.CSF', 'scores.HSC_CD34.', 'scores.iPS_cells', 'scores.Keratinocytes', 'scores.Macrophage', 'scores.MEP', 'scores.Monocyte', 'scores.MSC', 'scores.Myelocyte', 'scores.Neuroepithelial_cell', 'scores.Neurons', 'scores.Neutrophils', 'scores.NK_cell', 'scores.Osteoblasts', 'scores.Platelets', 'scores.Pre.B_cell_CD34.', 'scores.Pro.B_cell_CD34.', 'scores.Pro.Myelocyte', 'scores.Smooth_muscle_cells', 'scores.T_cells', 'scores.Tissue_stem_cells', 'labels', 'delta.next', 'pruned.labels'

In [4]:
# Inspect the per-cell 'labels' annotation of data1 (used below as the cell-type label).
data1.obs['labels']

GATTTGCTAGCGTT.s4            Monocyte
TATAAGTGTAAAGG.s4            Monocyte
CAAGACTGAGTGCT.s4    Pre-B_cell_CD34-
CACCCATGAGTAGA.s4            Monocyte
CTGGCACTGTCTAG.s4              B_cell
                           ...       
ATGCGATGACGGAG.s4             T_cells
GAATGCTGGTTTCT.s4             T_cells
CGCAAATGAGGTCT.s4             T_cells
TGCCACTGTCTATC.s4             T_cells
AGTGACTGAAGAGT.s4             T_cells
Name: labels, Length: 3064, dtype: category
Categories (12, object): ['B_cell', 'CMP', 'DC', 'GMP', ..., 'Platelets', 'Pre-B_cell_CD34-', 'Pro-B_cell_CD34+', 'T_cells']

In [5]:
# Inspect the per-cell 'labels' annotation of data2 (used below as the cell-type label).
data2.obs['labels']

CAGAAGCTGGTACT.s5      NK_cell
AGAGCTACGCTACA.s5    Platelets
TTGGAGACAGCATC.s5     Monocyte
TACGAGTGGGACTT.s5    Platelets
GGAATCTGGTGCTA.s5      NK_cell
                       ...    
CACCGTACCAGAGG.s5      T_cells
GCAGGGCTTCGTGA.s5      T_cells
CCATAGGAGTTGCA.s5      T_cells
TTCAACACTGCGTA.s5      T_cells
GGACAACTCCCGTT.s5      NK_cell
Name: labels, Length: 4021, dtype: category
Categories (12, object): ['B_cell', 'CMP', 'DC', 'GMP', ..., 'Platelets', 'Pre-B_cell_CD34-', 'Pro-B_cell_CD34+', 'T_cells']

In [6]:
# Per-cell labels and the expression matrix of the first dataset.
y1 = data1.obs['labels']
x1 = data1.X

# Sorted, de-duplicated label values that actually occur in data1.
celltypes1 = np.unique(y1)
print(celltypes1)

['B_cell' 'CMP' 'DC' 'GMP' 'HSC_-G-CSF' 'MEP' 'Monocyte' 'NK_cell'
 'Platelets' 'Pre-B_cell_CD34-' 'Pro-B_cell_CD34+' 'T_cells']


In [7]:
# Simulate mixed samples from data1.
# Each simulated sample: draw random cell-type fractions, turn them into integer
# cell counts (about `cells_per_mixture` in total), sample that many cells of each
# type with replacement, and average the sampled rows into one profile.
n_mixtures = 20000
cells_per_mixture = 50
celltype_num = len(celltypes1)

# Rows of x1 belonging to each cell type, in celltypes1 order.
cells_sub = [x1[np.array(y1 == celltypes1[i]), :] for i in range(celltype_num)]

# Collect one column per mixture and stack once at the end. Growing x / y with
# np.concatenate inside the loop re-copies the whole array on every iteration.
profiles, proportions = [], []

for i in range(n_mixtures):

    # Create fractions for available celltypes
    fracs = np.random.rand(celltype_num)
    fracs = fracs / fracs.sum()

    # Integer number of cells to draw per type; because of rounding the total is
    # only approximately cells_per_mixture.
    samp_fracs = np.rint(fracs * cells_per_mixture).astype(int)

    picked = []
    for j in range(celltype_num):
        # Row indices sampled with replacement from the cells of type j.
        cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
        picked.append(cells_sub[j][cells_fraction, :])
    artificial_samples = np.concatenate(picked)

    # Mean over the sampled cells is the mixture profile; the realised counts
    # (not the raw fractions) are normalised to give the proportions.
    profiles.append(np.asarray(artificial_samples.mean(0)).reshape(-1))
    proportions.append(samp_fracs / samp_fracs.sum())
    if i % 1000 == 0: print(i)

# Same layout as before: features x mixtures and cell types x mixtures.
x = np.stack(profiles, axis=1)
y = np.stack(proportions, axis=1)

print(x.shape, y.shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
(35, 20000) (12, 20000)


In [8]:
# One row per simulated mixture, one column per variable of data1.
df = pd.DataFrame(data=np.transpose(x), columns=data1.var_names)
df

Unnamed: 0,CD11b,CD8,CD152,CD127,CD25,TIGIT,CD45,CD66b,CD56,CD27,...,CD137,CD357,CD45-1,CD45RA,CD155,CD272,CD278,CD14,CD134,CD8-1
0,4.320000,7.520000,0.120000,0.420000,0.200000,2.360000,0.000000,0.780000,1.220000,3.700000,...,0.040000,0.360000,0.000000,45.900000,6.640000,0.160000,0.640000,6.000000,0.160000,7.520000
1,11.840000,4.120000,0.220000,0.580000,0.280000,3.100000,0.000000,1.240000,1.120000,3.720000,...,0.180000,0.280000,0.000000,45.960000,10.480000,0.060000,0.320000,12.640000,0.080000,4.120000
2,4.060000,3.040000,0.200000,0.280000,0.440000,3.440000,0.000000,0.340000,0.820000,3.820000,...,0.000000,0.220000,0.000000,35.260000,6.420000,0.140000,0.380000,4.360000,0.180000,3.040000
3,7.860000,3.460000,0.060000,0.440000,0.100000,2.620000,0.000000,0.760000,1.340000,2.840000,...,0.120000,0.220000,0.000000,40.440000,8.380000,0.080000,0.740000,7.000000,0.140000,3.460000
4,7.937500,2.604167,0.062500,0.729167,0.291667,2.770833,0.000000,1.187500,1.020833,4.187500,...,0.083333,0.270833,0.000000,67.854167,7.500000,0.291667,0.687500,7.270833,0.187500,2.604167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,7.729167,2.270833,0.083333,0.541667,0.250000,2.208333,0.000000,4.145833,1.354167,2.937500,...,0.104167,0.083333,0.000000,53.520833,8.062500,0.208333,0.854167,8.333333,0.291667,2.270833
19996,10.568627,7.333333,0.294118,0.921569,0.117647,3.117647,0.215686,1.254902,1.411765,4.176471,...,0.176471,0.176471,0.215686,56.921569,13.333333,0.098039,1.176471,13.490196,0.274510,7.333333
19997,4.600000,2.480000,0.180000,0.660000,0.120000,2.240000,0.000000,0.720000,0.640000,3.200000,...,0.200000,0.280000,0.000000,30.000000,7.140000,0.080000,0.880000,8.640000,0.320000,2.480000
19998,6.117647,2.647059,0.215686,0.156863,0.901961,2.862745,0.019608,0.745098,0.588235,4.705882,...,0.039216,0.352941,0.019608,52.313725,8.411765,0.098039,0.941176,8.627451,0.372549,2.647059


In [9]:
# Wrap the simulated profiles from data1 in an AnnData object.
adata_0 = ad.AnnData(df)

# One obs column per cell type, holding that type's proportion in every mixture
# (row `row_idx` of y corresponds to celltypes1[row_idx]).
for row_idx, type_name in enumerate(celltypes1):
    adata_0.obs[type_name] = y[row_idx].T

# Tag every mixture of this set with batch id '0' and record the label order.
adata_0.obs['batch'] = np.array(['0'] * adata_0.n_obs)
adata_0.uns['cell_types'] = celltypes1

adata_0



AnnData object with n_obs × n_vars = 20000 × 35
    obs: 'B_cell', 'CMP', 'DC', 'GMP', 'HSC_-G-CSF', 'MEP', 'Monocyte', 'NK_cell', 'Platelets', 'Pre-B_cell_CD34-', 'Pro-B_cell_CD34+', 'T_cells', 'batch'
    uns: 'cell_types'

In [10]:
# Expression matrix and per-cell labels of the second dataset.
x2 = data2.X
y2 = data2.obs['labels']

# NOTE(review): the label set for data2 is taken from y1, not y2. That keeps the
# proportion columns of both simulated sets identical, which looks intentional,
# but confirm it is not a copy-paste slip.
celltypes2 = np.unique(y1)
# Drops 'Macrophage' if present; a no-op for the label set printed below.
celltypes2 = np.delete(celltypes2, np.where(celltypes2 == 'Macrophage'))
print(celltypes1)
print(celltypes2)
print(celltypes1 == celltypes2)

# The comparison above checks y1-derived labels against themselves, so it cannot
# reveal a mismatch with data2. Compare against the labels data2 actually contains.
celltypes2_observed = np.unique(y2)
print('in celltypes2 but absent from data2:', np.setdiff1d(celltypes2, celltypes2_observed))
print('in data2 but not in celltypes2:', np.setdiff1d(celltypes2_observed, celltypes2))

['B_cell' 'CMP' 'DC' 'GMP' 'HSC_-G-CSF' 'MEP' 'Monocyte' 'NK_cell'
 'Platelets' 'Pre-B_cell_CD34-' 'Pro-B_cell_CD34+' 'T_cells']
['B_cell' 'CMP' 'DC' 'GMP' 'HSC_-G-CSF' 'MEP' 'Monocyte' 'NK_cell'
 'Platelets' 'Pre-B_cell_CD34-' 'Pro-B_cell_CD34+' 'T_cells']
[ True  True  True  True  True  True  True  True  True  True  True  True]


In [11]:
# Simulate mixed samples from data2, labelled with the celltypes2 label set.
# Each simulated sample: draw random cell-type fractions, turn them into integer
# cell counts (about `cells_per_mixture` in total), sample that many cells of each
# type with replacement, and average the sampled rows into one profile.
n_mixtures = 4000
cells_per_mixture = 50
celltype_num = len(celltypes2)

# Rows of x2 belonging to each cell type, in celltypes2 order.
cells_sub = [x2[np.array(y2 == celltypes2[i]), :] for i in range(celltype_num)]

# celltypes2 may name types that have no cells in data2; those cannot be sampled.
has_cells = np.array([sub.shape[0] > 0 for sub in cells_sub], dtype=bool)
if not has_cells.any():
    raise ValueError("None of the cell types in celltypes2 occur in data2.obs['labels'].")

# Collect one column per mixture and stack once at the end. Growing x / y with
# np.concatenate inside the loop re-copies the whole array on every iteration.
profiles, proportions = [], []

for i in range(n_mixtures):

    while True:
        # Create fractions for available celltypes
        fracs = np.random.rand(celltype_num)
        fracs = fracs / fracs.sum()

        # Integer number of cells to draw per type; because of rounding the
        # total is only approximately cells_per_mixture.
        samp_fracs = np.rint(fracs * cells_per_mixture).astype(int)

        # Types without cells contribute nothing to the averaged profile, so their
        # count must be zero in the label too. Previously the drawn count was kept
        # and the stored proportions did not match the cells actually averaged.
        samp_fracs[~has_cells] = 0

        # Redraw in the rare case that every remaining count rounded to zero.
        if samp_fracs.sum() > 0:
            break

    picked = []
    for j in range(celltype_num):
        if has_cells[j]:
            # Row indices sampled with replacement from the cells of type j.
            cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
            picked.append(cells_sub[j][cells_fraction, :])
    artificial_samples = np.concatenate(picked)

    # Mean over the sampled cells is the mixture profile; the realised counts
    # are normalised to give the proportions.
    profiles.append(np.asarray(artificial_samples.mean(0)).reshape(-1))
    proportions.append(samp_fracs / samp_fracs.sum())
    if i % 1000 == 0: print(i)

# Same layout as before: features x mixtures and cell types x mixtures.
x = np.stack(profiles, axis=1)
y = np.stack(proportions, axis=1)

print(x.shape, y.shape)

0
1000
2000
3000
(35, 4000) (12, 4000)


In [12]:
# One row per simulated mixture, one column per variable of data2
# (data2 was restricted to data1's variables when it was loaded).
df2 = pd.DataFrame(data=np.transpose(x), columns=data2.var_names)
df2

Unnamed: 0,CD11b,CD8,CD152,CD127,CD25,TIGIT,CD45,CD66b,CD56,CD27,...,CD137,CD357,CD45-1,CD45RA,CD155,CD272,CD278,CD14,CD134,CD8-1
0,4.319149,1.382979,0.000000,0.319149,0.106383,1.872340,0.000000,0.702128,0.468085,1.297872,...,0.000000,0.148936,0.000000,31.659574,6.276596,0.127660,0.148936,4.042553,0.042553,1.382979
1,2.500000,6.142857,0.000000,0.404762,0.047619,2.595238,0.000000,0.214286,0.238095,1.619048,...,0.023810,0.142857,0.000000,28.190476,4.476190,0.023810,0.166667,4.738095,0.071429,6.142857
2,3.980000,1.560000,0.220000,0.420000,0.060000,2.100000,0.000000,0.760000,0.200000,2.580000,...,0.040000,0.140000,0.000000,43.480000,4.780000,0.100000,0.200000,4.840000,0.060000,1.560000
3,6.851064,0.957447,0.042553,0.212766,0.085106,2.255319,0.042553,1.212766,0.468085,0.893617,...,0.000000,0.255319,0.042553,36.829787,7.787234,0.276596,0.127660,8.787234,0.148936,0.957447
4,4.562500,2.312500,0.125000,0.208333,0.145833,2.708333,0.041667,0.479167,0.875000,2.208333,...,0.000000,0.020833,0.041667,35.041667,6.291667,0.104167,0.166667,7.354167,0.125000,2.312500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3.319149,2.468085,0.085106,0.468085,0.106383,2.085106,0.000000,0.765957,0.340426,1.829787,...,0.042553,0.255319,0.000000,34.872340,5.446809,0.148936,0.276596,5.446809,0.042553,2.468085
3996,3.580000,2.800000,0.120000,0.240000,0.100000,3.400000,0.000000,0.580000,0.420000,1.760000,...,0.000000,0.080000,0.000000,33.300000,5.640000,0.180000,0.200000,6.000000,0.140000,2.800000
3997,3.382979,2.000000,0.042553,0.148936,0.085106,2.787234,0.000000,0.468085,0.553191,0.638298,...,0.000000,0.106383,0.000000,34.659574,5.000000,0.106383,0.106383,4.531915,0.063830,2.000000
3998,7.531915,4.319149,0.127660,0.382979,0.170213,4.021277,0.000000,0.297872,0.574468,1.702128,...,0.063830,0.212766,0.000000,35.000000,9.914894,0.063830,0.234043,10.680851,0.085106,4.319149


In [13]:
# Wrap the simulated profiles from data2 in an AnnData object.
adata_1 = ad.AnnData(df2)

# One obs column per cell type, holding that type's proportion in every mixture
# (row `row_idx` of y corresponds to celltypes2[row_idx]).
for row_idx, type_name in enumerate(celltypes2):
    adata_1.obs[type_name] = y[row_idx].T

# Tag every mixture of this set with batch id '1' and record the label order.
adata_1.obs['batch'] = np.array(['1'] * adata_1.n_obs)
adata_1.uns['cell_types'] = celltypes2

adata_1



AnnData object with n_obs × n_vars = 4000 × 35
    obs: 'B_cell', 'CMP', 'DC', 'GMP', 'HSC_-G-CSF', 'MEP', 'Monocyte', 'NK_cell', 'Platelets', 'Pre-B_cell_CD34-', 'Pro-B_cell_CD34+', 'T_cells', 'batch'
    uns: 'cell_types'

In [14]:
# Stack the two simulated sets along the observation axis.
# Both inputs were built from DataFrames with default row labels, so their obs
# names collide. index_unique='-' appends the position of the source object to
# each obs name, which keeps obs names unique and avoids the duplicate-name warning.
adata = ad.concat((adata_0, adata_1), index_unique='-')
# The concatenated object has no 'cell_types' entry in .uns (uns_merge is left at
# its default), so attach the label order again.
adata.uns['cell_types'] = celltypes2
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 24000 × 35
    obs: 'B_cell', 'CMP', 'DC', 'GMP', 'HSC_-G-CSF', 'MEP', 'Monocyte', 'NK_cell', 'Platelets', 'Pre-B_cell_CD34-', 'Pro-B_cell_CD34+', 'T_cells', 'batch'
    uns: 'cell_types'

In [15]:
# Save the combined simulated dataset (both batches) as an .h5ad file.
adata.write_h5ad('/public/home/syj/scpDeconv/dataset/mixup_data/reap_50.h5ad') # Machine-specific absolute path: change it to your own output location.