In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import torch
import random

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, the current CUDA device and all CUDA devices), then puts cuDNN in
    deterministic mode with benchmark autotuning switched off.

    Args:
        seed: Integer seed passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # Trade cuDNN speed for repeatable kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input .h5ad file -- change this to the path on your own machine.
raw_h5ad_path = '/public/home/syj/scpDeconv/dataset/raw_data/GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad'
data = sc.read_h5ad(raw_h5ad_path)
data

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 90261 × 14087
    obs: 'GEX_n_genes_by_counts', 'GEX_pct_counts_mt', 'GEX_size_factors', 'GEX_phase', 'ADT_n_antibodies_by_counts', 'ADT_total_counts', 'ADT_iso_count', 'cell_type', 'batch', 'ADT_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', 'is_train'
    var: 'feature_types', 'gene_id'
    uns: 'dataset_id', 'genome', 'organism'
    obsm: 'ADT_X_pca', 'ADT_X_umap', 'ADT_isotype_controls', 'GEX_X_pca', 'GEX_X_umap'
    layers: 'counts'

In [3]:
# Show the batch label of every cell; the cells below select on this column.
data.obs['batch']

GCATTAGCATAAGCGG-1-s1d1    s1d1
TACAGGTGTTAGAGTA-1-s1d1    s1d1
AGGATCTAGGTCTACT-1-s1d1    s1d1
GTAGAAAGTGACACAG-1-s1d1    s1d1
TCCGAAAAGGATCATA-1-s1d1    s1d1
                           ... 
GAATCACCACGGAAGT-1-s4d9    s4d9
GCTGGGTGTACGGATG-1-s4d9    s4d9
TCGAAGTGTGACAGGT-1-s4d9    s4d9
GCAGGCTGTTGCATAC-1-s4d9    s4d9
ACGTAACAGGTCTACT-1-s4d9    s4d9
Name: batch, Length: 90261, dtype: category
Categories (12, object): ['s1d1', 's1d2', 's1d3', 's2d1', ..., 's3d7', 's4d1', 's4d8', 's4d9']

In [4]:
# Keep only the cells whose batch label is 's1d1'.
in_s1d1 = data.obs['batch'] == 's1d1'
data1 = data[in_s1d1]
data1

View of AnnData object with n_obs × n_vars = 5227 × 14087
    obs: 'GEX_n_genes_by_counts', 'GEX_pct_counts_mt', 'GEX_size_factors', 'GEX_phase', 'ADT_n_antibodies_by_counts', 'ADT_total_counts', 'ADT_iso_count', 'cell_type', 'batch', 'ADT_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', 'is_train'
    var: 'feature_types', 'gene_id'
    uns: 'dataset_id', 'genome', 'organism'
    obsm: 'ADT_X_pca', 'ADT_X_umap', 'ADT_isotype_controls', 'GEX_X_pca', 'GEX_X_umap'
    layers: 'counts'

In [5]:
# Show the cell-type annotation of the s1d1 cells.
data1.obs['cell_type']

GCATTAGCATAAGCGG-1-s1d1             Naive CD20+ B IGKC+
TACAGGTGTTAGAGTA-1-s1d1                      CD14+ Mono
AGGATCTAGGTCTACT-1-s1d1             Naive CD20+ B IGKC+
GTAGAAAGTGACACAG-1-s1d1                             HSC
TCCGAAAAGGATCATA-1-s1d1                    Reticulocyte
                                       ...             
GTCGAATAGTTTCGGT-1-s1d1             Naive CD20+ B IGKC+
AGTAGTCTCTGGGCGT-1-s1d1    CD4+ T activated integrinB7+
GCCCGAAGTATGGAGC-1-s1d1                    CD4+ T naive
CTACATTAGCGCGTTC-1-s1d1                      CD14+ Mono
GATTCTTTCACCCATC-1-s1d1                      Lymph prog
Name: cell_type, Length: 5227, dtype: category
Categories (37, object): ['B1 B IGKC+', 'B1 B IGKC-', 'CD4+ T activated', 'CD4+ T activated integrinB7+', ..., 'Transitional B', 'cDC2', 'gdT TCRVD2+', 'pDC']

In [6]:
# Keep only the cells whose batch label is 's1d2'.
in_s1d2 = data.obs['batch'] == 's1d2'
data2 = data[in_s1d2]
data2

View of AnnData object with n_obs × n_vars = 4978 × 14087
    obs: 'GEX_n_genes_by_counts', 'GEX_pct_counts_mt', 'GEX_size_factors', 'GEX_phase', 'ADT_n_antibodies_by_counts', 'ADT_total_counts', 'ADT_iso_count', 'cell_type', 'batch', 'ADT_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', 'is_train'
    var: 'feature_types', 'gene_id'
    uns: 'dataset_id', 'genome', 'organism'
    obsm: 'ADT_X_pca', 'ADT_X_umap', 'ADT_isotype_controls', 'GEX_X_pca', 'GEX_X_umap'
    layers: 'counts'

In [7]:
# Show the cell-type annotation of the s1d2 cells.
data2.obs['cell_type']

ACGGGTCGTGGAACAC-1-s1d2           Transitional B
CTCCCTCAGCCGATTT-1-s1d2                 G/M prog
AAAGAACCAAGTCATC-1-s1d2                       NK
TTTGACTCAGAAATTG-1-s1d2              NK CD158e1+
TCTTGCGGTCATTGCA-1-s1d2    CD4+ T CD314+ CD45RA+
                                   ...          
GACAGCCCACCAGCTG-1-s1d2              gdT CD158b+
GTTGCTCCAGTTGAAA-1-s1d2             Reticulocyte
CTGAGGCAGTAGTGCG-1-s1d2      Naive CD20+ B IGKC+
GTATTGGTCTCGTCAC-1-s1d2         CD4+ T activated
GGGACCTAGGTCTGGA-1-s1d2             CD4+ T naive
Name: cell_type, Length: 4978, dtype: category
Categories (41, object): ['B1 B IGKC+', 'B1 B IGKC-', 'CD4+ T CD314+ CD45RA+', 'CD4+ T activated', ..., 'cDC2', 'gdT CD158b+', 'gdT TCRVD2+', 'pDC']

In [8]:
# Cell types annotated in both batches; np.intersect1d returns the shared
# labels sorted and without repeats.
labels_s1d1 = data1.obs['cell_type'].values
labels_s1d2 = data2.obs['cell_type'].values
celltype = np.intersect1d(labels_s1d1, labels_s1d2)
celltype

array(['B1 B IGKC+', 'B1 B IGKC-', 'CD14+ Mono', 'CD16+ Mono',
       'CD4+ T activated', 'CD4+ T activated integrinB7+', 'CD4+ T naive',
       'CD8+ T CD49f+', 'CD8+ T CD57+ CD45RA+', 'CD8+ T CD69+ CD45RA+',
       'CD8+ T CD69+ CD45RO+', 'CD8+ T TIGIT+ CD45RO+', 'CD8+ T naive',
       'Erythroblast', 'G/M prog', 'HSC', 'ILC1', 'Lymph prog', 'MAIT',
       'MK/E prog', 'NK', 'NK CD158e1+', 'Naive CD20+ B IGKC+',
       'Naive CD20+ B IGKC-', 'Normoblast', 'Plasma cell IGKC+',
       'Plasma cell IGKC-', 'Plasmablast IGKC+', 'Plasmablast IGKC-',
       'Proerythroblast', 'Reticulocyte', 'T reg', 'Transitional B',
       'cDC2', 'gdT TCRVD2+', 'pDC'], dtype=object)

In [9]:
# Simulate mixtures from batch s1d1: each mixture is the mean expression of a
# small random set of cells, labelled with the cell-type proportions of that set.
n_mixtures = 4000        # number of mixtures to simulate
cells_per_mixture = 15   # target cell count per mixture (rounding may shift it slightly)

# `.A` is no longer available on SciPy sparse matrices in recent releases;
# `.toarray()` is the supported spelling. A dense `X` is used as-is.
expr1 = data1.X
x1 = expr1.toarray() if hasattr(expr1, 'toarray') else np.asarray(expr1)
y1 = data1.obs['cell_type']

celltype_num = len(celltype)
# cells_sub[j] holds the expression rows of all cells labelled celltype[j].
cells_sub = [x1[np.array(y1 == celltype[j]), :] for j in range(celltype_num)]

# Collect one vector per mixture and stack once at the end. Growing `x`/`y`
# with np.concatenate inside the loop re-copies the whole matrix on every
# iteration, which is quadratic in the number of mixtures.
profiles, proportions = [], []

for i in range(n_mixtures):

    # Draw random cell-type fractions and turn them into integer cell counts.
    # Rounding can push every count to zero, leaving nothing to average and a
    # 0/0 label, so redraw until at least one cell is requested.
    while True:
        fracs = np.random.rand(celltype_num)
        fracs = fracs / fracs.sum()
        samp_fracs = np.rint(fracs * cells_per_mixture).astype(int)
        if samp_fracs.sum() > 0:
            break

    # Pick the requested number of cells of each type, with replacement.
    sampled = []
    for j in range(celltype_num):
        cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
        sampled.append(cells_sub[j][cells_fraction, :])
    artificial_samples = np.concatenate(sampled)

    # Mixture profile = mean over the picked cells; label = realised proportions.
    profiles.append(artificial_samples.mean(0))
    proportions.append(samp_fracs / samp_fracs.sum())
    if i % 1000 == 0: print(i)

# x is (n_features, n_mixtures); y is (n_cell_types, n_mixtures).
x = np.stack(profiles, axis=1)
y = np.stack(proportions, axis=1)
print(x.shape, y.shape)

0
1000
2000
3000
(14087, 4000) (36, 4000)


In [10]:
# One row per simulated mixture, one column per variable of data1.
mixtures_s1d1 = x.T
df = pd.DataFrame(data=mixtures_s1d1, columns=data1.var_names)
df

Unnamed: 0,AL627309.5,LINC01409,LINC01128,LINC00115,FAM41C,NOC2L,KLHL17,HES4,ISG15,AGRN,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
0,0.000000,0.000000,0.000000,0.028276,0.000000,0.199423,0.00000,0.260119,0.456606,0.0,...,1.112274,2.562375,0.578273,0.815825,0.684355,1.145364,1.246636,1.419166,1.067369,0.969632
1,0.000000,0.063965,0.197815,0.000000,0.000000,0.233371,0.00000,0.000000,0.167598,0.0,...,0.394961,2.425701,0.619305,0.788107,0.383720,1.104296,1.408290,1.413583,1.302600,0.913066
2,0.000000,0.110482,0.048460,0.000000,0.013435,0.000000,0.00000,0.096033,0.582193,0.0,...,0.957746,2.432650,0.645892,0.815544,0.612726,0.970830,1.555991,0.842874,0.759295,0.900124
3,0.000000,0.066222,0.044178,0.066222,0.000000,0.373645,0.08324,0.031597,0.751123,0.0,...,0.956393,2.068966,0.626467,0.856173,0.340308,1.255244,1.699653,1.144266,1.022269,1.260993
4,0.021277,0.010362,0.164669,0.000000,0.021277,0.101313,0.00000,0.054500,0.607456,0.0,...,0.887823,2.225312,0.560356,0.837079,0.693025,1.089690,1.463853,1.275689,0.950996,1.135841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.000000,0.000000,0.000000,0.000000,0.000000,0.643665,0.00000,0.000000,0.294352,0.0,...,0.726899,2.361908,0.536177,0.698085,0.153395,1.011927,1.736264,1.356360,0.763586,0.968117
3996,0.000000,0.086463,0.113684,0.000000,0.000000,0.438891,0.00000,0.000000,0.412167,0.0,...,0.812845,2.615007,0.664933,0.883896,0.257706,1.014723,1.824333,1.110388,1.020314,1.157354
3997,0.000000,0.133969,0.000000,0.061367,0.000000,0.000000,0.00000,0.000000,0.802420,0.0,...,0.469872,2.238073,0.442276,0.858622,0.267086,0.999188,2.134152,1.159829,0.807007,1.106666
3998,0.000000,0.124951,0.102127,0.000000,0.000000,0.268578,0.00000,0.000000,0.951914,0.0,...,0.870654,2.686141,0.506390,0.687242,0.794910,1.014359,1.637234,1.402279,0.873082,1.220391


In [11]:
# Wrap the s1d1 mixtures in an AnnData object and add one obs column per
# cell type holding that type's proportion in every mixture.
adata_0 = ad.AnnData(df)
for row, type_name in enumerate(celltype):
    adata_0.obs[type_name] = y[row]

# Tag every mixture of this object with batch label '0'.
n_obs_0 = adata_0.shape[0]
adata_0.obs['batch'] = np.array(['0'] * n_obs_0)
adata_0.uns['cell_types'] = celltype

adata_0

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 4000 × 14087
    obs: 'B1 B IGKC+', 'B1 B IGKC-', 'CD14+ Mono', 'CD16+ Mono', 'CD4+ T activated', 'CD4+ T activated integrinB7+', 'CD4+ T naive', 'CD8+ T CD49f+', 'CD8+ T CD57+ CD45RA+', 'CD8+ T CD69+ CD45RA+', 'CD8+ T CD69+ CD45RO+', 'CD8+ T TIGIT+ CD45RO+', 'CD8+ T naive', 'Erythroblast', 'G/M prog', 'HSC', 'ILC1', 'Lymph prog', 'MAIT', 'MK/E prog', 'NK', 'NK CD158e1+', 'Naive CD20+ B IGKC+', 'Naive CD20+ B IGKC-', 'Normoblast', 'Plasma cell IGKC+', 'Plasma cell IGKC-', 'Plasmablast IGKC+', 'Plasmablast IGKC-', 'Proerythroblast', 'Reticulocyte', 'T reg', 'Transitional B', 'cDC2', 'gdT TCRVD2+', 'pDC', 'batch'
    uns: 'cell_types'

In [12]:
# Simulate mixtures from batch s1d2: each mixture is the mean expression of a
# small random set of cells, labelled with the cell-type proportions of that set.
n_mixtures = 1000        # number of mixtures to simulate
cells_per_mixture = 15   # target cell count per mixture (rounding may shift it slightly)

# `.A` is no longer available on SciPy sparse matrices in recent releases;
# `.toarray()` is the supported spelling. A dense `X` is used as-is.
expr2 = data2.X
x2 = expr2.toarray() if hasattr(expr2, 'toarray') else np.asarray(expr2)
y2 = data2.obs['cell_type']

celltype_num = len(celltype)
# cells_sub[j] holds the expression rows of all cells labelled celltype[j].
cells_sub = [x2[np.array(y2 == celltype[j]), :] for j in range(celltype_num)]

# Collect one vector per mixture and stack once at the end. Growing `x`/`y`
# with np.concatenate inside the loop re-copies the whole matrix on every
# iteration, which is quadratic in the number of mixtures.
profiles, proportions = [], []

for i in range(n_mixtures):

    # Draw random cell-type fractions and turn them into integer cell counts.
    # Rounding can push every count to zero, leaving nothing to average and a
    # 0/0 label, so redraw until at least one cell is requested.
    while True:
        fracs = np.random.rand(celltype_num)
        fracs = fracs / fracs.sum()
        samp_fracs = np.rint(fracs * cells_per_mixture).astype(int)
        if samp_fracs.sum() > 0:
            break

    # Pick the requested number of cells of each type, with replacement.
    sampled = []
    for j in range(celltype_num):
        cells_fraction = np.random.randint(0, cells_sub[j].shape[0], samp_fracs[j])
        sampled.append(cells_sub[j][cells_fraction, :])
    artificial_samples = np.concatenate(sampled)

    # Mixture profile = mean over the picked cells; label = realised proportions.
    profiles.append(artificial_samples.mean(0))
    proportions.append(samp_fracs / samp_fracs.sum())
    if i % 1000 == 0: print(i)

# x is (n_features, n_mixtures); y is (n_cell_types, n_mixtures).
x = np.stack(profiles, axis=1)
y = np.stack(proportions, axis=1)
print(x.shape, y.shape)

0
(14087, 1000) (36, 1000)


In [13]:
# One row per simulated mixture, one column per variable of data2.
mixtures_s1d2 = x.T
df2 = pd.DataFrame(data=mixtures_s1d2, columns=data2.var_names)
df2

Unnamed: 0,AL627309.5,LINC01409,LINC01128,LINC00115,FAM41C,NOC2L,KLHL17,HES4,ISG15,AGRN,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.112949,0.000000,0.118966,0.925491,0.000000,...,0.752156,2.557775,0.552098,0.851598,0.489620,1.066651,1.489090,0.909969,0.946335,0.788567
1,0.000000,0.029642,0.053235,0.171032,0.000000,0.298209,0.000000,0.000000,1.142150,0.000000,...,0.834465,1.826288,0.449097,0.792054,0.165760,1.021252,1.679255,1.810852,1.007045,0.768815
2,0.065122,0.271313,0.169513,0.000000,0.000000,0.222827,0.000000,0.113436,0.574298,0.000000,...,0.714067,1.877265,0.546162,1.005934,0.464673,1.149711,1.341493,1.143325,0.997399,0.889144
3,0.000000,0.208800,0.094500,0.096692,0.000000,0.173763,0.000000,0.000000,0.672000,0.000000,...,0.905265,2.238064,0.507192,0.758214,0.598417,0.975988,1.333926,1.211950,1.249540,0.788440
4,0.000000,0.134400,0.254918,0.093840,0.095916,0.247457,0.083289,0.218406,1.539295,0.000000,...,0.700795,2.198978,0.571810,0.889542,0.582306,0.942843,1.659939,1.371242,1.040304,1.012867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000000,0.053256,0.158355,0.195465,0.000000,0.085378,0.000000,0.093514,1.258701,0.000000,...,0.754516,2.233727,0.505910,0.922487,0.617505,0.900051,1.192152,1.542065,0.779375,0.835823
996,0.000000,0.000000,0.157212,0.000000,0.000000,0.252850,0.000000,0.000000,0.556839,0.089577,...,0.846812,2.056534,0.559715,0.894217,0.649519,1.160656,1.023405,1.010387,1.280038,0.763795
997,0.000000,0.000000,0.217500,0.171032,0.029953,0.225526,0.000000,0.231982,0.449692,0.000000,...,0.887836,2.076933,0.612214,0.827250,0.638429,1.050176,1.486302,1.177567,1.341808,0.750980
998,0.000000,0.047242,0.067501,0.000000,0.000000,0.095188,0.000000,0.232324,0.738549,0.000000,...,0.575715,2.296344,0.702791,0.879557,0.348651,1.006734,1.678826,0.959292,1.313423,0.933436


In [14]:
# Wrap the s1d2 mixtures in an AnnData object and add one obs column per
# cell type holding that type's proportion in every mixture.
adata_1 = ad.AnnData(df2)
for row, type_name in enumerate(celltype):
    adata_1.obs[type_name] = y[row]

# Tag every mixture of this object with batch label '1'.
n_obs_1 = adata_1.shape[0]
adata_1.obs['batch'] = np.array(['1'] * n_obs_1)
adata_1.uns['cell_types'] = celltype

adata_1

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 1000 × 14087
    obs: 'B1 B IGKC+', 'B1 B IGKC-', 'CD14+ Mono', 'CD16+ Mono', 'CD4+ T activated', 'CD4+ T activated integrinB7+', 'CD4+ T naive', 'CD8+ T CD49f+', 'CD8+ T CD57+ CD45RA+', 'CD8+ T CD69+ CD45RA+', 'CD8+ T CD69+ CD45RO+', 'CD8+ T TIGIT+ CD45RO+', 'CD8+ T naive', 'Erythroblast', 'G/M prog', 'HSC', 'ILC1', 'Lymph prog', 'MAIT', 'MK/E prog', 'NK', 'NK CD158e1+', 'Naive CD20+ B IGKC+', 'Naive CD20+ B IGKC-', 'Normoblast', 'Plasma cell IGKC+', 'Plasma cell IGKC-', 'Plasmablast IGKC+', 'Plasmablast IGKC-', 'Proerythroblast', 'Reticulocyte', 'T reg', 'Transitional B', 'cDC2', 'gdT TCRVD2+', 'pDC', 'batch'
    uns: 'cell_types'

In [15]:
# The variable names contain duplicates (see the warnings printed when the
# objects were created); de-duplicate them on both objects before combining.
adata_0.var_names_make_unique()
adata_1.var_names_make_unique()

# Both objects were built from DataFrames with default row labels, so a plain
# concat produces repeated obs_names. `index_unique` appends the key of the
# source object ('0' / '1', the same values as the `batch` column) to every
# name, e.g. '17-0' and '17-1', which keeps observation names unique.
adata = ad.concat((adata_0, adata_1), keys=['0', '1'], index_unique='-')

# Store the ordered list of cell types on the combined object.
adata.uns['cell_types'] = celltype
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 5000 × 14087
    obs: 'B1 B IGKC+', 'B1 B IGKC-', 'CD14+ Mono', 'CD16+ Mono', 'CD4+ T activated', 'CD4+ T activated integrinB7+', 'CD4+ T naive', 'CD8+ T CD49f+', 'CD8+ T CD57+ CD45RA+', 'CD8+ T CD69+ CD45RA+', 'CD8+ T CD69+ CD45RO+', 'CD8+ T TIGIT+ CD45RO+', 'CD8+ T naive', 'Erythroblast', 'G/M prog', 'HSC', 'ILC1', 'Lymph prog', 'MAIT', 'MK/E prog', 'NK', 'NK CD158e1+', 'Naive CD20+ B IGKC+', 'Naive CD20+ B IGKC-', 'Normoblast', 'Plasma cell IGKC+', 'Plasma cell IGKC-', 'Plasmablast IGKC+', 'Plasmablast IGKC-', 'Proerythroblast', 'Reticulocyte', 'T reg', 'Transitional B', 'cDC2', 'gdT TCRVD2+', 'pDC', 'batch'
    uns: 'cell_types'

In [16]:
# Output .h5ad file -- change this to the path on your own machine.
mixup_h5ad_path = '/public/home/syj/scpDeconv/dataset/mixup_data/bmmc_15.h5ad'
adata.write_h5ad(mixup_h5ad_path)