In [1]:
import scanpy as sc
import numpy as np
import pandas as pd

data1 = sc.read_h5ad('/public/home/syj/scpDeconv/dataset/raw_data/GSE164378_batch1.h5ad') # Please modify this to your address.
data1

  from .autonotebook import tqdm as notebook_tqdm


AnnData object with n_obs × n_vars = 67090 × 228
    obs: 'Unnamed: 0', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'cluster_s', 'celltype.l2', 'celltype.l3', 'Phase', 'Batch'
    var: 'prot_id'

In [2]:
data1.obs['cluster_s']

0         Mono
1        CD4 T
2        CD8 T
3           NK
4        CD8 T
         ...  
67085    CD8 T
67086    CD8 T
67087     Mono
67088    CD4 T
67089    CD4 T
Name: cluster_s, Length: 67090, dtype: category
Categories (8, object): ['B', 'CD4 T', 'CD8 T', 'DC', 'Mono', 'NK', 'other', 'other T']

In [3]:
data1.var_names

Index(['CD39', 'Rat-IgG1-1', 'CD107a', 'CD62P', 'TCR-2', 'CD30', 'CD31',
       'CD34', 'CD35', 'CD36',
       ...
       'CD169', 'CD28', 'CD161', 'CD163', 'CD138-1', 'CD164', 'CD138-2',
       'CD144', 'CD202b', 'CD11c'],
      dtype='object', name='prot_id', length=228)

In [4]:
target_cell_types = ['B', 'CD4 T', 'CD8 T', 'other T']
adata1 = data1[data1.obs['cluster_s'].isin(target_cell_types)].copy()
print(adata1)

AnnData object with n_obs × n_vars = 40739 × 228
    obs: 'Unnamed: 0', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'cluster_s', 'celltype.l2', 'celltype.l3', 'Phase', 'Batch'
    var: 'prot_id'


In [5]:
data2 = sc.read_h5ad('/public/home/syj/scpDeconv/dataset/raw_data/GSE164378_batch2.h5ad') # Please modify this to your address.
data2

AnnData object with n_obs × n_vars = 94674 × 228
    obs: 'Unnamed: 0', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'cluster_s', 'celltype.l2', 'celltype.l3', 'Phase', 'Batch'
    var: 'prot_id'

In [6]:
adata2 = data2[data2.obs['cluster_s'].isin(target_cell_types)].copy()
print(adata2)

AnnData object with n_obs × n_vars = 46320 × 228
    obs: 'Unnamed: 0', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'cluster_s', 'celltype.l2', 'celltype.l3', 'Phase', 'Batch'
    var: 'prot_id'


In [7]:
var_names_union = adata1.var_names.intersection(adata2.var_names)

print(var_names_union)
print(var_names_union.shape)

Index(['CD39', 'Rat-IgG1-1', 'CD107a', 'CD62P', 'TCR-2', 'CD30', 'CD31',
       'CD34', 'CD35', 'CD36',
       ...
       'CD169', 'CD28', 'CD161', 'CD163', 'CD138-1', 'CD164', 'CD138-2',
       'CD144', 'CD202b', 'CD11c'],
      dtype='object', name='prot_id', length=228)
(228,)


In [9]:
celltype = np.intersect1d(adata1.obs['cluster_s'].values, adata2.obs['cluster_s'].values)
celltype

array(['B', 'CD4 T', 'CD8 T', 'other T'], dtype=object)

In [10]:
x1 = adata1.X
y1 = adata1.obs['cluster_s']

x, y = None, None
celltype_num = len(celltype)
cells_sub = []

for i in range(celltype_num):
    cells_sub.append(x1[np.array(y1 == celltype[i]), :])

for i in range(20000):

    # Create fractions for available celltypes
    fracs = np.random.rand(celltype_num)
    fracs_sum = np.sum(fracs)
    fracs = np.divide(fracs, fracs_sum)

    samp_fracs = np.multiply(fracs, 50)
    samp_fracs = np.array(list(map(round, samp_fracs)))

    artificial_samples = None

    for j in range(celltype_num):
        cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
        cells_choice = cells_sub[j][cells_fraction, :]
        if cells_choice.shape[0] > 0:
            artificial_samples = np.concatenate((artificial_samples, cells_choice)) if artificial_samples is not None else cells_choice

    cell = artificial_samples.mean(0).reshape(-1, 1)
    cell_y = (samp_fracs/samp_fracs.sum()).reshape(-1, 1)
    
    x = np.concatenate((x, cell), axis=1) if x is not None else cell
    y = np.concatenate((y, cell_y), axis=1) if y is not None else cell_y
    if i % 1000 == 0: print(i)

print(x.shape, y.shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
(228, 20000) (4, 20000)


In [11]:
df = pd.DataFrame(x.T, columns=adata1.var_names)
df

prot_id,CD39,Rat-IgG1-1,CD107a,CD62P,TCR-2,CD30,CD31,CD34,CD35,CD36,...,CD169,CD28,CD161,CD163,CD138-1,CD164,CD138-2,CD144,CD202b,CD11c
0,11.760000,9.860000,15.260000,78.459999,18.760000,3.900000,71.260002,6.240000,188.600006,1.280000,...,1.540000,36.119999,6.200000,4.100000,8.840000,2.760000,9.260000,8.940000,11.100000,12.640000
1,17.784313,10.490196,18.431372,76.058823,14.549020,5.254902,64.666664,7.745098,194.098038,1.274510,...,2.000000,38.176472,6.784314,3.333333,10.549020,3.215686,12.607843,8.901960,15.411765,12.529411
2,12.040000,9.920000,17.100000,79.839996,18.340000,3.600000,67.820000,7.180000,130.320007,1.280000,...,2.240000,42.020000,6.760000,3.240000,9.820000,2.440000,10.560000,9.080000,11.200000,13.000000
3,25.240000,11.000000,20.180000,72.080002,17.600000,3.260000,64.300003,6.920000,282.660004,0.920000,...,2.040000,30.700001,3.100000,3.960000,9.500000,6.100000,11.080000,9.420000,11.900000,13.020000
4,22.219999,10.920000,23.459999,68.559998,15.880000,4.300000,59.599998,7.880000,275.940002,1.020000,...,2.040000,35.700001,9.280000,3.760000,9.740000,4.800000,15.700000,8.440000,18.540001,13.680000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,16.529411,11.607843,20.254902,81.372551,18.117647,4.862745,71.647057,7.607843,218.666672,1.470588,...,1.784314,43.705883,9.372549,4.156863,9.235294,3.823529,10.176471,9.705882,12.176471,14.392157
19996,20.379999,11.160000,17.879999,72.199997,18.320000,3.280000,62.639999,6.800000,183.600006,1.260000,...,2.460000,33.060001,4.260000,4.120000,9.620000,3.640000,9.540000,9.520000,12.000000,11.580000
19997,16.920000,10.340000,15.600000,78.459999,16.100000,3.760000,65.680000,6.920000,178.740005,1.460000,...,1.760000,38.480000,7.800000,3.960000,8.160000,3.620000,8.840000,7.980000,10.960000,11.400000
19998,18.200001,10.800000,17.740000,85.800003,17.100000,3.580000,65.839996,7.240000,209.440002,1.720000,...,1.760000,28.320000,3.380000,3.600000,8.580000,4.260000,9.280000,8.620000,10.900000,13.040000


In [12]:
adata_0 = sc.AnnData(df)
for i in range(len(celltype)):
    adata_0.obs[celltype[i]] = y[i].T
adata_0.obs['batch'] = np.array(['0'] * adata_0.shape[0])
adata_0.uns['cell_types'] = celltype

adata_0



AnnData object with n_obs × n_vars = 20000 × 228
    obs: 'B', 'CD4 T', 'CD8 T', 'other T', 'batch'
    uns: 'cell_types'

In [13]:
x2 = adata2.X
y2 = adata2.obs['cluster_s']

x, y = None, None
celltype_num = len(celltype)
cells_sub = []

for i in range(celltype_num):
    cells_sub.append(x2[np.array(y2 == celltype[i]), :])

for i in range(4000):

    # Create fractions for available celltypes
    fracs = np.random.rand(celltype_num)
    fracs_sum = np.sum(fracs)
    fracs = np.divide(fracs, fracs_sum)

    samp_fracs = np.multiply(fracs, 50)
    samp_fracs = np.array(list(map(round, samp_fracs)))

    artificial_samples = None

    for j in range(celltype_num):
        cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
        cells_choice = cells_sub[j][cells_fraction, :]
        if cells_choice.shape[0] > 0:
            artificial_samples = np.concatenate((artificial_samples, cells_choice)) if artificial_samples is not None else cells_choice

    cell = artificial_samples.mean(0).reshape(-1, 1)
    cell_y = (samp_fracs/samp_fracs.sum()).reshape(-1, 1)
    
    x = np.concatenate((x, cell), axis=1) if x is not None else cell
    y = np.concatenate((y, cell_y), axis=1) if y is not None else cell_y
    if i % 1000 == 0: print(i)

print(x.shape, y.shape)

0
1000
2000
3000
(228, 4000) (4, 4000)


In [14]:
df2 = pd.DataFrame(x.T, columns=adata2.var_names)
df2

prot_id,CD39,Rat-IgG1-1,CD107a,CD62P,TCR-2,CD30,CD31,CD34,CD35,CD36,...,CD169,CD28,CD161,CD163,CD138-1,CD164,CD138-2,CD144,CD202b,CD11c
0,11.122449,4.714286,10.877551,74.204079,11.530612,0.979592,33.142857,3.612245,173.285721,0.918367,...,1.000000,30.489796,6.693878,2.816327,2.755102,5.142857,4.816327,3.897959,4.387755,8.326530
1,4.240000,4.500000,7.840000,78.379997,15.860000,1.620000,35.040001,4.040000,67.379997,0.760000,...,1.140000,28.620001,3.720000,2.380000,3.060000,3.300000,4.260000,4.000000,4.840000,6.560000
2,7.000000,5.244898,8.244898,79.142860,13.367347,1.489796,37.061226,3.653061,59.061226,0.897959,...,1.306122,27.591837,3.795918,2.938776,3.040816,2.428571,4.714286,4.000000,5.204082,6.959184
3,9.540000,4.340000,9.620000,73.080002,10.680000,1.520000,37.180000,2.940000,145.339996,0.820000,...,1.300000,23.320000,3.800000,2.380000,2.920000,4.240000,5.100000,4.140000,4.740000,8.280000
4,4.480000,4.460000,9.140000,69.940002,11.980000,2.040000,31.920000,4.100000,83.139999,0.880000,...,1.460000,22.760000,6.880000,2.560000,3.740000,2.260000,6.640000,3.640000,6.500000,7.080000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,14.140000,4.340000,9.360000,71.099998,9.780000,0.960000,33.200001,3.380000,153.199997,0.840000,...,1.520000,19.139999,5.720000,2.460000,2.740000,5.260000,4.720000,4.320000,4.400000,6.260000
3996,4.734694,4.673470,8.653061,76.204079,15.653061,1.448980,34.346939,4.122449,48.673470,1.000000,...,1.102041,31.081633,6.591837,2.408163,3.020408,3.612245,3.653061,4.448979,4.530612,7.142857
3997,4.254902,4.470588,9.274509,69.941177,15.784314,1.235294,32.352940,3.627451,51.019608,0.921569,...,1.352941,34.823528,5.294117,2.215686,3.254902,2.529412,5.098039,4.176471,4.411765,6.647059
3998,9.568627,5.882353,11.921569,72.509804,12.470589,1.686275,34.568626,3.882353,98.372551,0.764706,...,0.960784,28.176470,7.156863,2.607843,3.098039,4.980392,6.333333,4.764706,5.862745,10.686275


In [15]:
adata_1 = sc.AnnData(df2)
for i in range(len(celltype)):
    adata_1.obs[celltype[i]] = y[i].T
adata_1.obs['batch'] = np.array(['1'] * adata_1.shape[0])
adata_1.uns['cell_types'] = celltype

adata_1



AnnData object with n_obs × n_vars = 4000 × 228
    obs: 'B', 'CD4 T', 'CD8 T', 'other T', 'batch'
    uns: 'cell_types'

In [16]:
adata_0.var_names_make_unique()
adata_1.var_names_make_unique()
adata = sc.concat((adata_0, adata_1))
adata.uns['cell_types'] = celltype
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 24000 × 228
    obs: 'B', 'CD4 T', 'CD8 T', 'other T', 'batch'
    uns: 'cell_types'

In [17]:
adata.write_h5ad('/public/home/syj/scpDeconv/dataset/mixup_data/GSE164378_50.h5ad') # Please modify this to your address.