In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import scanpy as sc
import session_info
import matplotlib.pyplot as plt
import seaborn as sns

import os

In [2]:
# Show the session report from session_info (record of the environment this notebook ran in).
session_info.show()

In [3]:
# Set scanpy's default figure resolution to 120 dpi for any plots drawn later in the session.
sc.settings.set_figure_params(dpi=120)

# Multiplexed sample IDs

In [4]:
# IDs of the four multiplexed samples, one per line for easier diffing.
multiplexed_samples = [
    '7089STDY13216920_BHF_F_Hea13242527',
    '7089STDY13216921_BHF_F_Hea13242528',
    '7089STDY13216922_BHF_F_Hea13242529',
    '7089STDY13216923_BHF_F_Hea13242530',
]

# Read in SOC result

In [6]:
# Load the SOC cluster/donor table; the first CSV column (obsnames) becomes the index.
soc_csv = '/nfs/team205/heart/mapped/cellranger-arc200/SOC_out/SOC-clusters_multiome-foetal_20221221.csv'
soc = pd.read_csv(soc_csv, index_col=0)
print(soc.shape)
soc.head()

(74984, 10)


Unnamed: 0_level_0,sample,barcode,status,assignment,log_prob_singleton,log_prob_doublet,cluster0,cluster1,cluster2,donor_assignment
obsnames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7089STDY13216920_BHF_F_Hea13242527_GCCCTCATCCTAGTAA-1,7089STDY13216920_BHF_F_Hea13242527,GCCCTCATCCTAGTAA-1,singlet,1,-1804.326,-2214.313429,-3702.970109,-1804.326,,Hst40
7089STDY13216920_BHF_F_Hea13242527_ACTCGCTTCCTAGTAA-1,7089STDY13216920_BHF_F_Hea13242527,ACTCGCTTCCTAGTAA-1,singlet,0,-2300.029401,-2917.765214,-2300.029401,-4878.822776,,Hst39
7089STDY13216920_BHF_F_Hea13242527_TGGTCCTTCCGTGACA-1,7089STDY13216920_BHF_F_Hea13242527,TGGTCCTTCCGTGACA-1,singlet,0,-7138.100603,-8752.227197,-7138.100603,-15143.684314,,Hst39
7089STDY13216920_BHF_F_Hea13242527_TTCAGTTTCCGCTAGA-1,7089STDY13216920_BHF_F_Hea13242527,TTCAGTTTCCGCTAGA-1,singlet,1,-16501.683493,-20232.559001,-36208.064251,-16501.683493,,Hst40
7089STDY13216920_BHF_F_Hea13242527_GTCAATATCATCCACC-1,7089STDY13216920_BHF_F_Hea13242527,GTCAATATCATCCACC-1,singlet,0,-15506.974212,-16020.863109,-15506.974212,-25471.286877,,Hst39


In [7]:
# Tally how many rows carry each SOC donor label (including 'donor_unassigned').
soc['donor_assignment'].value_counts()

Hst39               36323
Hst40               20246
donor_unassigned    13322
Hst36                5093
Name: donor_assignment, dtype: int64

In [8]:
# Split the SOC obsnames by whether SOC produced a donor call.
# Anything not labelled exactly 'donor_unassigned' counts as assigned.
soc_is_unassigned = soc['donor_assignment'] == 'donor_unassigned'
soc_assigned_obsnames = list(soc.index[~soc_is_unassigned])
soc_unassigned_obsnames = list(soc.index[soc_is_unassigned])
print(f'SOC assigned: {len(soc_assigned_obsnames)}')
print(f'SOC unassigned: {len(soc_unassigned_obsnames)}')

SOC assigned: 61662
SOC unassigned: 13322


# Assign donors with CMO result (for SOC unassigned nuclei)

In [10]:
# Load the CMO summary table and keep only the rows for SOC-unassigned obsnames,
# in the order of soc_unassigned_obsnames (label-based lookup on the index).
cmo_csv = '/nfs/team205/heart/mapped/CMO/result_summary/CMO-result_multiome-foetal_20230207.csv'
cmo = pd.read_csv(cmo_csv, index_col=0).loc[soc_unassigned_obsnames]

print(cmo.shape)
cmo.head()

(13322, 8)


Unnamed: 0,combinedID,ratio,top_count,donor_assignment,n_donor_assigned,Hst40,Hst39,Hst36
7089STDY13216920_BHF_F_Hea13242527_CTCTATGTCAAATCGC-1,7089STDY13216920_BHF_F_Hea13242527,1.294996,47775.0,Hst40,1,47775.0,36892.0,
7089STDY13216920_BHF_F_Hea13242527_ATAAGTCGTAGGTTTG-1,7089STDY13216920_BHF_F_Hea13242527,1.774988,83215.0,Hst40,1,83215.0,46882.0,
7089STDY13216920_BHF_F_Hea13242527_CTGTTCATCTACCTAT-1,7089STDY13216920_BHF_F_Hea13242527,1.058529,32952.0,Hst40,1,32952.0,31130.0,
7089STDY13216920_BHF_F_Hea13242527_GACTTACAGAGGAGTC-1,7089STDY13216920_BHF_F_Hea13242527,2.02688,26769.0,Hst39,1,13207.0,26769.0,
7089STDY13216920_BHF_F_Hea13242527_CGATGTCCACAAGCCT-1,7089STDY13216920_BHF_F_Hea13242527,1.316336,42361.0,Hst40,1,42361.0,32181.0,


In [11]:
# Keep a CMO donor call only when the nucleus' 'ratio' exceeds the threshold chosen
# in notebook No3 (the older comments called this value "prop_delta"; the column
# actually used here is 'ratio').
# NOTE(review): in the rows displayed above, 'ratio' equals top_count divided by the
# other donor's count — confirm the exact definition against notebook No3.
CMO_RATIO_THRESHOLD = 2.4282365

# One boolean mask drives both the "assigned" list and the reset below, instead of
# testing every index label for membership in a Python list (quadratic).
# Rows with a missing ratio compare as False and therefore end up unassigned.
cmo_passes_threshold = cmo['ratio'] > CMO_RATIO_THRESHOLD
cmo_assigned_obsnames = list(cmo.index[cmo_passes_threshold])
print(f'CMO assigned: {len(cmo_assigned_obsnames)}')

# Overwrite the donor call with 'donor_unassigned' for every nucleus at or below the threshold.
cmo.loc[~cmo_passes_threshold, 'donor_assignment'] = 'donor_unassigned'
cmo['donor_assignment'].value_counts()

CMO assigned: 1885


donor_unassigned    11437
Hst40                1690
Hst36                 155
Hst39                  40
Name: donor_assignment, dtype: int64

# Make assignment dataframe and save

In [12]:
# Start the summary table from the SOC columns worth keeping, tagging each
# column name with a 'SOC | ' prefix so its origin stays visible.
col_to_add = ['status', 'log_prob_singleton', 'log_prob_doublet', 'donor_assignment']
summary = soc.loc[:, col_to_add].copy()
summary.columns = summary.columns.map(lambda name: f'SOC | {name}')
summary.head()

Unnamed: 0_level_0,SOC | status,SOC | log_prob_singleton,SOC | log_prob_doublet,SOC | donor_assignment
obsnames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7089STDY13216920_BHF_F_Hea13242527_GCCCTCATCCTAGTAA-1,singlet,-1804.326,-2214.313429,Hst40
7089STDY13216920_BHF_F_Hea13242527_ACTCGCTTCCTAGTAA-1,singlet,-2300.029401,-2917.765214,Hst39
7089STDY13216920_BHF_F_Hea13242527_TGGTCCTTCCGTGACA-1,singlet,-7138.100603,-8752.227197,Hst39
7089STDY13216920_BHF_F_Hea13242527_TTCAGTTTCCGCTAGA-1,singlet,-16501.683493,-20232.559001,Hst40
7089STDY13216920_BHF_F_Hea13242527_GTCAATATCATCCACC-1,singlet,-15506.974212,-16020.863109,Hst39


In [13]:
# Turn the SOC donor column into the final assignment column, then overwrite the
# rows present in the CMO table with the (threshold-filtered) CMO donor labels.
final_col = 'SOC-CMO_assignment'
summary = summary.rename(columns={'SOC | donor_assignment': final_col})
cmo_labels = cmo['donor_assignment'].astype('str')
summary.loc[cmo_labels.index, final_col] = cmo_labels
print(summary[final_col].value_counts())
summary.head()

Hst39               36363
Hst40               21936
donor_unassigned    11437
Hst36                5248
Name: SOC-CMO_assignment, dtype: int64


Unnamed: 0_level_0,SOC | status,SOC | log_prob_singleton,SOC | log_prob_doublet,SOC-CMO_assignment
obsnames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7089STDY13216920_BHF_F_Hea13242527_GCCCTCATCCTAGTAA-1,singlet,-1804.326,-2214.313429,Hst40
7089STDY13216920_BHF_F_Hea13242527_ACTCGCTTCCTAGTAA-1,singlet,-2300.029401,-2917.765214,Hst39
7089STDY13216920_BHF_F_Hea13242527_TGGTCCTTCCGTGACA-1,singlet,-7138.100603,-8752.227197,Hst39
7089STDY13216920_BHF_F_Hea13242527_TTCAGTTTCCGCTAGA-1,singlet,-16501.683493,-20232.559001,Hst40
7089STDY13216920_BHF_F_Hea13242527_GTCAATATCATCCACC-1,singlet,-15506.974212,-16020.863109,Hst39


In [14]:
# Save the combined SOC/CMO assignment table as CSV; the obsnames index is written
# as the first column (to_csv default), so it can be read back with index_col=0.
summary.to_csv('/nfs/team205/heart/mapped/cellranger-arc200/SOC_out/SOC-CMO-assignment_multiome-foetal_20230207.csv')