In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display


import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

from tqdm.auto import tqdm
# Specific modules
import scanpy as sc
import anndata as ad
import scienceplots

# ---- Session-wide settings ----
# Blanket-silence warnings for the whole notebook.
# NOTE(review): this also hides deprecation/numerical warnings from later cells.
warnings.filterwarnings("ignore")

# Make the project's `bin` folder importable; this has to happen before the
# local import below.
sys.path.insert(1, str(here("bin")))

# Project-local helper
from customPythonFunctions import balanced_sample

# ---- Figure defaults ----
dpi_fig_save = 300
plt.style.use("nature")
sc.set_figure_params(
    dpi=100,
    dpi_save=dpi_fig_save,
    vector_friendly=True,
)

# ---- Output switches ----
overwriteFigures = True
overwriteData = True

# ---- pandas display limits ----
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 1000)

  from .autonotebook import tqdm as notebook_tqdm


**Load data**

In [2]:
# Read the core INFLAMMATION AnnData object (.h5ad) from the project results folder
infile_path = here("00_data_processing/results/01_INFLAMMATION_core.h5ad")
adata_CORE = sc.read_h5ad(filename=infile_path)
adata_CORE

AnnData object with n_obs × n_vars = 6016705 × 37169
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [3]:
# Per-cell extended metadata for study SCGT00, stored as a pickled DataFrame.
# NOTE(review): unpickling executes arbitrary code, so only load files produced by this project.
# NOTE(review): the rendered table shows donor-level clinical fields; review the output before sharing.
extended_obs_SCGT00 = pd.read_pickle(
    filepath_or_buffer=here("00_data_processing/results/extended_obs/SCGT00_obs.pkl")
)
extended_obs_SCGT00

Unnamed: 0_level_0,index,orig.ident,nCount_RNA,nFeature_RNA,gem_id,scrublet_doublet_scores,scrublet_predicted_doublet,library_name,INFL_chemistry,batches,disease,therapy,demux_donor,sample.ID.basalWeek.scRNASeq,sample.ID.ResponseWeek.scRNASeq,donor.ID.GWAS,sample.ID.GWAS,donor.ID.RNASeq,sample.ID.basalWeek.RNASeq,sample.ID.ResponseWeek.RNASeq,donor.ID.Proteomics,sample.ID.basalWeek.Proteomics,sample.ID.ResponseWeek.Proteomics,Disease,Date.basalWeek,PersonalInformation.Sex,Birth.Donor.PresentAgeDonor,PersonalInformation.Height,PersonalInformation.Weight,TobaccoFrequency.CigarettesFrequency,TypeOfTobacco.DailyCigaretteQuantity,Response,Week.basalWeek,Week.ResponseWeek,Nominal.ResponseWeek,Date.ResponseWeek,Drug,Target,DerivedBMI,TobaccoFrequency.CigarsFrequency,TobaccoFrequency.SmokingPipeFrequency,Collection.YearsOfEvolution,concomitant.treatments.GC,TranscurredDays,PersonalInformation.DateOfBirth,studyID,cell_barcode,INFL_technology,INFL_lID,INFL_pID,timepoint_replicate,INFL_lID_pID_TP,INFL_sID
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
SCGT00_L051_I57.3P_T0_AAACCCAAGGTGAGAA,zsggoqfp_x3zbi5sd_AAACCCAAGGTGAGAA-1,zsggoqfp_x3zbi5sd,706.0,406,zsggoqfp_x3zbi5sd,0.092,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00007,I005B00010,I005B00015,I009-00008,I009D00008,I016-00085,I016P00166,I016P00167,I023-00100,I023E00088,I023E00108,RA,2014-01-09,Female,47.0,165.0,70.0,Never,,NR,S0,S12,11.0,2014-04-02,Tocilizumab,Anti IL6R,25.7116620752984,Never,Never,47.9,False,83.0,,SCGT00,AAACCCAAGGTGAGAA,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I57,0,SCGT00_L051_I57_T0,SCGT00_I57.3P_T0
SCGT00_L051_I56.3P_T0_AAACCCAAGTCCGTCG,zsggoqfp_x3zbi5sd_AAACCCAAGTCCGTCG-1,zsggoqfp_x3zbi5sd,3807.0,1094,zsggoqfp_x3zbi5sd,0.104,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00006,I005B00009,I005B00012,I009-00003,I009D00003,I016-00089,I016P00174,I016P00175,I023-00093,I023E00090,I023E00068,RA,2013-12-13,Female,56.0,163.0,80.0,Never,,R,S0,SRM,8.0,2014-02-07,Tocilizumab,Anti IL6R,30.1102788964583,Never,Never,50.8,False,56.0,,SCGT00,AAACCCAAGTCCGTCG,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I56,0,SCGT00_L051_I56_T0,SCGT00_I56.3P_T0
SCGT00_L051_I53.3P_T0_AAACCCAAGTGCACTT,zsggoqfp_x3zbi5sd_AAACCCAAGTGCACTT-1,zsggoqfp_x3zbi5sd,9847.0,2508,zsggoqfp_x3zbi5sd,0.09,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00003,I005B00004,I005B00006,I009-00002,I009D00002,I016-00077,I016P00151,I016P00152,I023-00090,I023E00076,I023E00105,RA,2013-06-28,Female,69.0,160.0,64.0,Never,,R,S0,SRM,15.0,2013-10-17,Tocilizumab,Anti IL6R,25.0,Never,Never,69.5,False,111.0,,SCGT00,AAACCCAAGTGCACTT,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I53,0,SCGT00_L051_I53_T0,SCGT00_I53.3P_T0
SCGT00_L051_I52.3P_T0_AAACCCACAACTGTGT,zsggoqfp_x3zbi5sd_AAACCCACAACTGTGT-1,zsggoqfp_x3zbi5sd,1100.0,605,zsggoqfp_x3zbi5sd,0.01,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00002,I005B00002,I005B00005,I009-00004,I009D00004,I016-00102,I016P00198,I016P00199,I023-00096,I023E00110,I023E00114,RA,2013-04-12,Female,33.0,156.0,52.0,Never,,NR,S0,S12,14.0,2013-07-23,Tocilizumab,Anti IL6R,21.3675213675214,Never,Never,33.3,False,102.0,,SCGT00,AAACCCACAACTGTGT,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I52,0,SCGT00_L051_I52_T0,SCGT00_I52.3P_T0
SCGT00_L051_I56.3P_T0_AAACCCACAAGAATGT,zsggoqfp_x3zbi5sd_AAACCCACAAGAATGT-1,zsggoqfp_x3zbi5sd,4115.0,989,zsggoqfp_x3zbi5sd,0.086,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00006,I005B00009,I005B00012,I009-00003,I009D00003,I016-00089,I016P00174,I016P00175,I023-00093,I023E00090,I023E00068,RA,2013-12-13,Female,56.0,163.0,80.0,Never,,R,S0,SRM,8.0,2014-02-07,Tocilizumab,Anti IL6R,30.1102788964583,Never,Never,50.8,False,56.0,,SCGT00,AAACCCACAAGAATGT,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I56,0,SCGT00_L051_I56_T0,SCGT00_I56.3P_T0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SCGT00_L014_I5031.5P_T0_TTTGTCATCGCCTGTT,in39atkl_aolhexcm_TTTGTCATCGCCTGTT-1,in39atkl_aolhexcm,1861.0,776,in39atkl_aolhexcm,0.122,False,Pool6_JAKi_wk0_v5,5_GEX_V2,RA_Pool6Pool8,RA,JAKi,I005-00031,I005B00060,I005B00062,I009-00031,I009D00031,I016-00007,I016P00013,I016P00014,I023-00083,I023E00309,I023E00288,RA,2020-11-11,Female,30.0,160.0,58.0,Never,,R,S0,S12,15.0,2021-02-24,Tofacitinib,JAK inhibitor,22.65625,Never,Never,30.7,False,105.0,,SCGT00,TTTGTCATCGCCTGTT,5_GEX_V2_GenoHashed,SCGT00_L014,SCGT00_I5031,0,SCGT00_L014_I5031_T0,SCGT00_I5031.5P_T0
SCGT00_L014_I5026.5P_T0_TTTGTCATCGGCTTGG,in39atkl_aolhexcm_TTTGTCATCGGCTTGG-1,in39atkl_aolhexcm,3514.0,1115,in39atkl_aolhexcm,0.104,False,Pool6_JAKi_wk0_v5,5_GEX_V2,RA_Pool6Pool8,RA,JAKi,I005-00026,I005B00050,I005B00052,I009-00028,I009D00028,I016-00009,I016P00017,I016P00018,I023-00079,I023E00310,I023E00325,RA,2019-02-26,Male,53.0,182.0,88.0,Daily,11 to 20,R,S0,SRM,12.0,2019-05-22,Tofacitinib,JAK inhibitor,26.5668397536529,Never,Never,53.4,False,85.0,,SCGT00,TTTGTCATCGGCTTGG,5_GEX_V2_GenoHashed,SCGT00_L014,SCGT00_I5026,0,SCGT00_L014_I5026_T0,SCGT00_I5026.5P_T0
SCGT00_L014_I5018.5P_T0_TTTGTCATCTAACTTC,in39atkl_aolhexcm_TTTGTCATCTAACTTC-1,in39atkl_aolhexcm,1704.0,915,in39atkl_aolhexcm,0.116,False,Pool6_JAKi_wk0_v5,5_GEX_V2,RA_Pool6Pool8,RA,JAKi,I005-00018,I005B00030,I005B00039,I009-00021,I009D00021,I016-00129,I016P00251,I016P00252,I023-00069,I023E00291,I023E00302,RA,2018-04-20,Female,52.0,144.0,43.0,Never,,NR,S0,S12,12.0,2018-07-13,Baricitinib,JAK inhibitor,20.7368827160494,Never,Never,52.5,False,84.0,,SCGT00,TTTGTCATCTAACTTC,5_GEX_V2_GenoHashed,SCGT00_L014,SCGT00_I5018,0,SCGT00_L014_I5018_T0,SCGT00_I5018.5P_T0
SCGT00_L014_I5028.5P_T0_TTTGTCATCTATCGCC,in39atkl_aolhexcm_TTTGTCATCTATCGCC-1,in39atkl_aolhexcm,1839.0,725,in39atkl_aolhexcm,0.382,True,Pool6_JAKi_wk0_v5,5_GEX_V2,RA_Pool6Pool8,RA,JAKi,I005-00028,I005B00054,I005B00057,I009-00011,I009D00011,I016-00012,I016P00023,I016P00024,I023-00104,I023E00268,I023E00304,RA,2020-05-28,Female,51.0,155.0,64.0,Never,,R,S0,SRM,15.0,2020-09-15,Tofacitinib,JAK inhibitor,26.6389177939646,Never,Never,51.4,False,110.0,,SCGT00,TTTGTCATCTATCGCC,5_GEX_V2_GenoHashed,SCGT00_L014,SCGT00_I5028,0,SCGT00_L014_I5028_T0,SCGT00_I5028.5P_T0


In [4]:
# Read the external INFLAMMATION AnnData object (.h5ad) from the project results folder
infile_path = here("00_data_processing/results/01_INFLAMMATION_external.h5ad")
adata_EXTERNAL = sc.read_h5ad(filename=infile_path)
adata_EXTERNAL

AnnData object with n_obs × n_vars = 600032 × 37169
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [5]:
# Per-cell extended metadata for study SCGT00val, stored as a pickled DataFrame.
# NOTE(review): unpickling executes arbitrary code, so only load files produced by this project.
# NOTE(review): the rendered table shows donor-level clinical fields; review the output before sharing.
extended_obs_SCGT00val = pd.read_pickle(
    filepath_or_buffer=here("00_data_processing/results/extended_obs/SCGT00val_obs.pkl")
)
extended_obs_SCGT00val

Unnamed: 0_level_0,index,orig.ident,nCount_RNA,nFeature_RNA,gem_id,scrublet_doublet_scores,scrublet_predicted_doublet,library_name,chemistry,batches,disease,therapy,demux_donor,sample.ID.basalWeek.scRNASeq,sample.ID.ResponseWeek.scRNASeq,donor.ID.GWAS,sample.ID.GWAS,Response,Disease,Week.basalWeek,Date.basalWeek,Week.ResponseWeek,Nominal.ResponseWeek,Date.ResponseWeek,DAS28.basalWeek,DAS28.ResponseWeek,Drug,Target,Sex,Age,PersonalInformation.Height,PersonalInformation.Weight,DerivedBMI,TobaccoFrequency.CigarettesFrequency,TobaccoFrequency.CigarsFrequency,TobaccoFrequency.SmokingPipeFrequency,TypeOfTobacco.DailyCigaretteQuantity,AgeAtOnset,DMARD,Collection.YearsOfEvolution,Markers.ACPA.ACPAPos.Neg,Markers.RheumatoidFactor.Determination1Pos.Neg,AsssociatedDiseases.T2DM,OtherAssociatedDisease.T1DM,AsssociatedDiseases.Stroke,AsssociatedDiseases.Neoplasia,AsssociatedDiseases.Neoplasia.AgeAtDiagnosis,AsssociatedDiseases.Neoplasia.TypeOfNeoplasia,OtherAssociatedDisease.Other,NumberOfBiologicalTherapies.BiologicalTherapyRituximab,AsssociatedDiseases.Angor,AsssociatedDiseases.HBP,AsssociatedDiseases.AMI,Treatments.OralCorticosteroids,concomitant.treatments.GC,biologicalTtm.Failures,TranscurredDays,studyID,INFL_chemistry,INFL_technology,cell_barcode,INFL_lID,INFL_pID,timepoint_replicate,INFL_lID_pID_TP,INFL_sID
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1
SCGT00val_L006_I036016_T0_AAACCCAAGACAACTA,qvwqx1qy_3njw1qje_AAACCCAAGACAACTA-1,qvwqx1qy_3njw1qje,1097.0,500,qvwqx1qy_3njw1qje,0.195,False,RA_Pool5_TNF_wk0_1,3_GEX_V3,RA_Pool13Pool14,RA_new2,TNF,I036-00016,I036B00014,I036B00021,I039-00010,I039D00010,NR,RA,S0,2013-11-13,SRTB,12,2014-02-05,5.36,5.02,Certolizumab,Anti TNF,Female,33.0,155,69.0,28.7200832466181,Never,Never,Never,,31.0,Yes,33.7,Positive,Positive,No,No,No,No,,,,No,No,No,No,Yes,True,0,84,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,AAACCCAAGACAACTA,SCGT00val_L006,SCGT00val_I036016,0,SCGT00val_L006_I036016_T0,SCGT00val_I036016_T0
SCGT00val_L006_I036023_T0_AAACCCAAGACGGAAA,qvwqx1qy_3njw1qje_AAACCCAAGACGGAAA-1,qvwqx1qy_3njw1qje,6427.0,1438,qvwqx1qy_3njw1qje,0.591,False,RA_Pool5_TNF_wk0_1,3_GEX_V3,RA_Pool13Pool14,RA_new2,TNF,I036-00023,I036B00005,I036B00007,I039-00003,I039D00003,NR,RA,S0,2013-04-16,S12,9,2013-06-19,4.02,3.81,Golimumab,Anti TNF,Female,52.0,168,64.0,22.6757369614512,Never,Never,Never,,24.0,Yes,52.3,Positive,Positive,No,No,No,No,,,,No,No,No,No,Yes,True,0,64,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,AAACCCAAGACGGAAA,SCGT00val_L006,SCGT00val_I036023,0,SCGT00val_L006_I036023_T0,SCGT00val_I036023_T0
SCGT00val_L006_I036016_T0_AAACCCAAGAGCATTA,qvwqx1qy_3njw1qje_AAACCCAAGAGCATTA-1,qvwqx1qy_3njw1qje,664.0,382,qvwqx1qy_3njw1qje,0.161,False,RA_Pool5_TNF_wk0_1,3_GEX_V3,RA_Pool13Pool14,RA_new2,TNF,I036-00016,I036B00014,I036B00021,I039-00010,I039D00010,NR,RA,S0,2013-11-13,SRTB,12,2014-02-05,5.36,5.02,Certolizumab,Anti TNF,Female,33.0,155,69.0,28.7200832466181,Never,Never,Never,,31.0,Yes,33.7,Positive,Positive,No,No,No,No,,,,No,No,No,No,Yes,True,0,84,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,AAACCCAAGAGCATTA,SCGT00val_L006,SCGT00val_I036016,0,SCGT00val_L006_I036016_T0,SCGT00val_I036016_T0
SCGT00val_L006_I036021_T0_AAACCCAAGATGGTCG,qvwqx1qy_3njw1qje_AAACCCAAGATGGTCG-1,qvwqx1qy_3njw1qje,2534.0,844,qvwqx1qy_3njw1qje,0.535,False,RA_Pool5_TNF_wk0_1,3_GEX_V3,RA_Pool13Pool14,RA_new2,TNF,I036-00021,I036B00010,I036B00015,I039-00005,I039D00005,R,RA,S0,2013-08-09,S12,14,2013-11-19,6.27,3.02,Etanercept,Anti TNF,Female,61.0,160,63.0,24.609375,Never,Never,Never,,51.8,Yes,61.4,Negative,Positive,No,No,No,No,,,,No,No,No,No,Yes,True,0,102,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,AAACCCAAGATGGTCG,SCGT00val_L006,SCGT00val_I036021,0,SCGT00val_L006_I036021_T0,SCGT00val_I036021_T0
SCGT00val_L006_I036021_T0_AAACCCAAGCAACAAT,qvwqx1qy_3njw1qje_AAACCCAAGCAACAAT-1,qvwqx1qy_3njw1qje,8926.0,2330,qvwqx1qy_3njw1qje,0.422,False,RA_Pool5_TNF_wk0_1,3_GEX_V3,RA_Pool13Pool14,RA_new2,TNF,I036-00021,I036B00010,I036B00015,I039-00005,I039D00005,R,RA,S0,2013-08-09,S12,14,2013-11-19,6.27,3.02,Etanercept,Anti TNF,Female,61.0,160,63.0,24.609375,Never,Never,Never,,51.8,Yes,61.4,Negative,Positive,No,No,No,No,,,,No,No,No,No,Yes,True,0,102,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,AAACCCAAGCAACAAT,SCGT00val_L006,SCGT00val_I036021,0,SCGT00val_L006_I036021_T0,SCGT00val_I036021_T0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SCGT00val_L003_I0362_T0_TTTGTTGTCCGTCAAA,i5j1ep9y_6lyayzxo_TTTGTTGTCCGTCAAA-1,i5j1ep9y_6lyayzxo,2524.0,759,i5j1ep9y_6lyayzxo,0.445,True,RA_Pool10_IL6R_wk0_2,3_GEX_V3,RA_Pool19Pool20,RA_new2,antiIL6R,I036-00002,I036B00046,I036B00048,I039-00024,I039D00024,R,RA,S0,2017-05-09,SRM,13,2017-08-08,5.09,1.53,Tocilizumab,Anti IL6R,Female,60.1423682409309,160,,,,,,11 to 20,32.2,Yes,60.1,Positive,Positive,No,No,No,No,,,,No,No,No,No,Yes,True,0,91,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,TTTGTTGTCCGTCAAA,SCGT00val_L003,SCGT00val_I0362,0,SCGT00val_L003_I0362_T0,SCGT00val_I0362_T0
SCGT00val_L003_I036018_T0_TTTGTTGTCGGTTGTA,i5j1ep9y_6lyayzxo_TTTGTTGTCGGTTGTA-1,i5j1ep9y_6lyayzxo,4214.0,1095,i5j1ep9y_6lyayzxo,0.262,False,RA_Pool10_IL6R_wk0_2,3_GEX_V3,RA_Pool19Pool20,RA_new2,antiIL6R,I036-00018,I036B00037,I036B00041,I039-00020,I039D00020,R,RA,S0,2016-06-29,SRM,13,2016-10-04,4.45,0.63,Tocilizumab,Anti IL6R,Female,62.0,154,55.0,23.191094619666,Never,Never,Never,,49.0,No,62.2,,Positive,No,No,No,No,,,,No,No,No,No,,False,0,97,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,TTTGTTGTCGGTTGTA,SCGT00val_L003,SCGT00val_I036018,0,SCGT00val_L003_I036018_T0,SCGT00val_I036018_T0
SCGT00val_L003_I0361_T0_TTTGTTGTCTACGCAA,i5j1ep9y_6lyayzxo_TTTGTTGTCTACGCAA-1,i5j1ep9y_6lyayzxo,5238.0,1796,i5j1ep9y_6lyayzxo,0.065,False,RA_Pool10_IL6R_wk0_2,3_GEX_V3,RA_Pool19Pool20,RA_new2,antiIL6R,I036-00001,I036B00039,I036B00043,I039-00021,I039D00021,R,RA,S0,2016-09-06,S12,11,2016-11-22,5.78,2.76,Tocilizumab,Anti IL6R,Female,37.0,159,64.8,25.6318974724101,Never,Never,Never,,27.3,Yes,37.6,Positive,Positive,No,No,No,No,,,,No,No,No,No,Yes,True,0,77,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,TTTGTTGTCTACGCAA,SCGT00val_L003,SCGT00val_I0361,0,SCGT00val_L003_I0361_T0,SCGT00val_I0361_T0
SCGT00val_L003_I036018_T0_TTTGTTGTCTCTGACC,i5j1ep9y_6lyayzxo_TTTGTTGTCTCTGACC-1,i5j1ep9y_6lyayzxo,2631.0,1069,i5j1ep9y_6lyayzxo,0.089,False,RA_Pool10_IL6R_wk0_2,3_GEX_V3,RA_Pool19Pool20,RA_new2,antiIL6R,I036-00018,I036B00037,I036B00041,I039-00020,I039D00020,R,RA,S0,2016-06-29,SRM,13,2016-10-04,4.45,0.63,Tocilizumab,Anti IL6R,Female,62.0,154,55.0,23.191094619666,Never,Never,Never,,49.0,No,62.2,,Positive,No,No,No,No,,,,No,No,No,No,,False,0,97,SCGT00val,3_GEX_V3,3_GEX_V3_GenoHashed,TTTGTTGTCTCTGACC,SCGT00val_L003,SCGT00val_I036018,0,SCGT00val_L003_I036018_T0,SCGT00val_I036018_T0


# Data wrangling

## Subset SCGT00 data

In [6]:
# Keep only the cells whose studyID is 'SCGT00'; .copy() materialises the
# selection as its own object, so adata_CORE can be deleted afterwards.
adata_CORE_SCGT00 = adata_CORE[adata_CORE.obs["studyID"].eq("SCGT00")].copy()
adata_CORE_SCGT00

AnnData object with n_obs × n_vars = 1096960 × 37169
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [7]:
# One row per patientID, each remaining column collapsed to the list of its
# distinct values; only the resulting (n_patients, n_columns) shape is shown.
(
    adata_CORE_SCGT00.obs
    .groupby(by="patientID")
    .aggregate(lambda column: column.unique().tolist())
    .shape
)

(184, 18)

In [8]:
# Keep only the cells whose studyID is 'SCGT00val'; .copy() materialises the
# selection as its own object, so adata_EXTERNAL can be deleted afterwards.
adata_EXTERNAL_SCGT00 = adata_EXTERNAL[adata_EXTERNAL.obs["studyID"].eq("SCGT00val")].copy()
adata_EXTERNAL_SCGT00

AnnData object with n_obs × n_vars = 229086 × 37169
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [9]:
# One row per patientID, each remaining column collapsed to the list of its
# distinct values; only the resulting (n_patients, n_columns) shape is shown.
(
    adata_EXTERNAL_SCGT00.obs
    .groupby(by="patientID")
    .aggregate(lambda column: column.unique().tolist())
    .shape
)

(32, 18)

In [10]:
# The full objects are no longer needed once the SCGT00 / SCGT00val copies exist;
# drop the names so their memory can be reclaimed.
del adata_CORE, adata_EXTERNAL

In [11]:
# Restrict the SCGT00 extended metadata to cells whose INFL_chemistry is '3_GEX_V3'.
# The name is rebound to the filtered frame, so the unfiltered table is gone after this cell.
extended_obs_SCGT00 = extended_obs_SCGT00.loc[
    lambda frame: frame["INFL_chemistry"] == "3_GEX_V3"
]
extended_obs_SCGT00

Unnamed: 0_level_0,index,orig.ident,nCount_RNA,nFeature_RNA,gem_id,scrublet_doublet_scores,scrublet_predicted_doublet,library_name,INFL_chemistry,batches,disease,therapy,demux_donor,sample.ID.basalWeek.scRNASeq,sample.ID.ResponseWeek.scRNASeq,donor.ID.GWAS,sample.ID.GWAS,donor.ID.RNASeq,sample.ID.basalWeek.RNASeq,sample.ID.ResponseWeek.RNASeq,donor.ID.Proteomics,sample.ID.basalWeek.Proteomics,sample.ID.ResponseWeek.Proteomics,Disease,Date.basalWeek,PersonalInformation.Sex,Birth.Donor.PresentAgeDonor,PersonalInformation.Height,PersonalInformation.Weight,TobaccoFrequency.CigarettesFrequency,TypeOfTobacco.DailyCigaretteQuantity,Response,Week.basalWeek,Week.ResponseWeek,Nominal.ResponseWeek,Date.ResponseWeek,Drug,Target,DerivedBMI,TobaccoFrequency.CigarsFrequency,TobaccoFrequency.SmokingPipeFrequency,Collection.YearsOfEvolution,concomitant.treatments.GC,TranscurredDays,PersonalInformation.DateOfBirth,studyID,cell_barcode,INFL_technology,INFL_lID,INFL_pID,timepoint_replicate,INFL_lID_pID_TP,INFL_sID
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
SCGT00_L051_I57.3P_T0_AAACCCAAGGTGAGAA,zsggoqfp_x3zbi5sd_AAACCCAAGGTGAGAA-1,zsggoqfp_x3zbi5sd,706.0,406,zsggoqfp_x3zbi5sd,0.092,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00007,I005B00010,I005B00015,I009-00008,I009D00008,I016-00085,I016P00166,I016P00167,I023-00100,I023E00088,I023E00108,RA,2014-01-09,Female,47.0,165.0,70.0,Never,,NR,S0,S12,11.0,2014-04-02,Tocilizumab,Anti IL6R,25.7116620752984,Never,Never,47.9,False,83.0,,SCGT00,AAACCCAAGGTGAGAA,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I57,0,SCGT00_L051_I57_T0,SCGT00_I57.3P_T0
SCGT00_L051_I56.3P_T0_AAACCCAAGTCCGTCG,zsggoqfp_x3zbi5sd_AAACCCAAGTCCGTCG-1,zsggoqfp_x3zbi5sd,3807.0,1094,zsggoqfp_x3zbi5sd,0.104,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00006,I005B00009,I005B00012,I009-00003,I009D00003,I016-00089,I016P00174,I016P00175,I023-00093,I023E00090,I023E00068,RA,2013-12-13,Female,56.0,163.0,80.0,Never,,R,S0,SRM,8.0,2014-02-07,Tocilizumab,Anti IL6R,30.1102788964583,Never,Never,50.8,False,56.0,,SCGT00,AAACCCAAGTCCGTCG,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I56,0,SCGT00_L051_I56_T0,SCGT00_I56.3P_T0
SCGT00_L051_I53.3P_T0_AAACCCAAGTGCACTT,zsggoqfp_x3zbi5sd_AAACCCAAGTGCACTT-1,zsggoqfp_x3zbi5sd,9847.0,2508,zsggoqfp_x3zbi5sd,0.09,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00003,I005B00004,I005B00006,I009-00002,I009D00002,I016-00077,I016P00151,I016P00152,I023-00090,I023E00076,I023E00105,RA,2013-06-28,Female,69.0,160.0,64.0,Never,,R,S0,SRM,15.0,2013-10-17,Tocilizumab,Anti IL6R,25.0,Never,Never,69.5,False,111.0,,SCGT00,AAACCCAAGTGCACTT,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I53,0,SCGT00_L051_I53_T0,SCGT00_I53.3P_T0
SCGT00_L051_I52.3P_T0_AAACCCACAACTGTGT,zsggoqfp_x3zbi5sd_AAACCCACAACTGTGT-1,zsggoqfp_x3zbi5sd,1100.0,605,zsggoqfp_x3zbi5sd,0.01,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00002,I005B00002,I005B00005,I009-00004,I009D00004,I016-00102,I016P00198,I016P00199,I023-00096,I023E00110,I023E00114,RA,2013-04-12,Female,33.0,156.0,52.0,Never,,NR,S0,S12,14.0,2013-07-23,Tocilizumab,Anti IL6R,21.3675213675214,Never,Never,33.3,False,102.0,,SCGT00,AAACCCACAACTGTGT,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I52,0,SCGT00_L051_I52_T0,SCGT00_I52.3P_T0
SCGT00_L051_I56.3P_T0_AAACCCACAAGAATGT,zsggoqfp_x3zbi5sd_AAACCCACAAGAATGT-1,zsggoqfp_x3zbi5sd,4115.0,989,zsggoqfp_x3zbi5sd,0.086,False,Pool1_IL6_wk0_v3_1,3_GEX_V3,RA_Pool1Pool3,RA,antiIL6,I005-00006,I005B00009,I005B00012,I009-00003,I009D00003,I016-00089,I016P00174,I016P00175,I023-00093,I023E00090,I023E00068,RA,2013-12-13,Female,56.0,163.0,80.0,Never,,R,S0,SRM,8.0,2014-02-07,Tocilizumab,Anti IL6R,30.1102788964583,Never,Never,50.8,False,56.0,,SCGT00,AAACCCACAAGAATGT,3_GEX_V3_GenoHashed,SCGT00_L051,SCGT00_I56,0,SCGT00_L051_I56_T0,SCGT00_I56.3P_T0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SCGT00_L024_I018012.3P_T0_TTTGTTGGTTGTGCAT,osrwkixq_pqoe2xt8_TTTGTTGGTTGTGCAT-1,osrwkixq_pqoe2xt8,2765.0,671,osrwkixq_pqoe2xt8,0.151,False,PS_Pool3_IL17_wk0_2,3_GEX_V3,PS_Pool8Pool9,PS_new,antiIL17A,I018-00012,I018B00024,I018B00023,I021-00010,I021D00010,I022-00007,I022P00023,I022P00030,I023-00012,I023E00264,I023E00283,PS,2021-05-18,Female,57.0,162.0,75.0,Never,,R,S0,S12,12.0,2021-08-11,Ixekizumab,Anti IL17A,28.5779606767261,Never,Never,57.9,False,85.0,,SCGT00,TTTGTTGGTTGTGCAT,3_GEX_V3_GenoHashed,SCGT00_L024,SCGT00_I018012,0,SCGT00_L024_I018012_T0,SCGT00_I018012.3P_T0
SCGT00_L024_I0185.3P_T0_TTTGTTGTCAATCCAG,osrwkixq_pqoe2xt8_TTTGTTGTCAATCCAG-1,osrwkixq_pqoe2xt8,3807.0,1000,osrwkixq_pqoe2xt8,0.183,False,PS_Pool3_IL17_wk0_2,3_GEX_V3,PS_Pool8Pool9,PS_new,antiIL17A,I018-00005,I018B00009,I018B00010,I021-00033,I021D00033,I022-00004,I022P00009,I022P00029,I023-00009,I023E00286,I023E00295,PS,2020-07-30,Male,53.0,173.0,97.0,Never,,R,S0,SRM,11.0,2020-10-20,Brodalumab,Anti IL17RA,32.4100370877744,Never,Never,53.6,False,82.0,,SCGT00,TTTGTTGTCAATCCAG,3_GEX_V3_GenoHashed,SCGT00_L024,SCGT00_I0185,0,SCGT00_L024_I0185_T0,SCGT00_I0185.3P_T0
SCGT00_L024_I018016.3P_T0_TTTGTTGTCAGACATC,osrwkixq_pqoe2xt8_TTTGTTGTCAGACATC-1,osrwkixq_pqoe2xt8,2324.0,746,osrwkixq_pqoe2xt8,0.336,False,PS_Pool3_IL17_wk0_2,3_GEX_V3,PS_Pool8Pool9,PS_new,antiIL17A,I018-00016,I018B00031,I018B00032,I021-00008,I021D00008,I022-00009,I022P00008,I022P00001,I023-00014,I023E00296,I023E00276,PS,2021-11-10,Male,23.0,180.0,85.0,Occasionally,1 to 10,R,S0,SRM,13.0,2022-02-09,Secukinumab,Anti IL17A,26.2345679012346,,,23.2,False,91.0,,SCGT00,TTTGTTGTCAGACATC,3_GEX_V3_GenoHashed,SCGT00_L024,SCGT00_I018016,0,SCGT00_L024_I018016_T0,SCGT00_I018016.3P_T0
SCGT00_L024_I018016.3P_T0_TTTGTTGTCATCTCTA,osrwkixq_pqoe2xt8_TTTGTTGTCATCTCTA-1,osrwkixq_pqoe2xt8,2257.0,846,osrwkixq_pqoe2xt8,0.092,False,PS_Pool3_IL17_wk0_2,3_GEX_V3,PS_Pool8Pool9,PS_new,antiIL17A,I018-00016,I018B00031,I018B00032,I021-00008,I021D00008,I022-00009,I022P00008,I022P00001,I023-00014,I023E00296,I023E00276,PS,2021-11-10,Male,23.0,180.0,85.0,Occasionally,1 to 10,R,S0,SRM,13.0,2022-02-09,Secukinumab,Anti IL17A,26.2345679012346,,,23.2,False,91.0,,SCGT00,TTTGTTGTCATCTCTA,3_GEX_V3_GenoHashed,SCGT00_L024,SCGT00_I018016,0,SCGT00_L024_I018016_T0,SCGT00_I018016.3P_T0


## Merging datasets

In [18]:
# Stack the core (SCGT00) and external (SCGT00val) cells along the obs axis,
# keeping the union of genes. As the repr below shows, the result carries no
# .var columns; they are restored in the next cell.
adata_SCGT00 = ad.concat(
    [adata_CORE_SCGT00, adata_EXTERNAL_SCGT00],
    axis=0,
    join="outer",
)
adata_SCGT00

AnnData object with n_obs × n_vars = 1326046 × 37169
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup'

In [19]:
# Recover the gene annotations (.var columns) that the outer concat left out.
# The CORE table is aligned on the merged object's var_names instead of being
# assigned as-is: a plain `.var = other.var` is not matched on gene IDs, so any
# difference in gene order would silently attach annotations to the wrong genes
# (anndata behaviour - see the AnnData.var documentation).
# reindex() also returns a new DataFrame, so adata_SCGT00 and adata_CORE_SCGT00
# do not end up sharing a single .var object. Genes absent from the CORE table
# (none expected here, both objects have 37169 vars) would get NaN annotations.
adata_SCGT00.var = adata_CORE_SCGT00.var.reindex(adata_SCGT00.var_names)
adata_SCGT00

AnnData object with n_obs × n_vars = 1326046 × 37169
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [20]:
# Build one cellID -> batches lookup covering both studies.
# groupby(level=0).first() collapses repeated cellIDs to their first non-null
# value and returns the index sorted.
# The name extended_obs_SCGT00 is rebound here to this single-column frame.
extended_obs_SCGT00 = (
    pd.concat(
        [
            extended_obs_SCGT00.loc[:, ["batches"]],
            extended_obs_SCGT00val.loc[:, ["batches"]],
        ],
        axis=0,
    )
    .groupby(level=0)
    .first()
)
extended_obs_SCGT00

Unnamed: 0_level_0,batches
cellID,Unnamed: 1_level_1
SCGT00_L000_I019011.3P_T0_AAACCCAGTTACGATC,RA_Pool10Pool12
SCGT00_L000_I019011.3P_T0_AAACCCATCCATTTCA,RA_Pool10Pool12
SCGT00_L000_I019011.3P_T0_AAACGAAAGCCGTAAG,RA_Pool10Pool12
SCGT00_L000_I019011.3P_T0_AAACGAATCAGCAGAG,RA_Pool10Pool12
SCGT00_L000_I019011.3P_T0_AAACGAATCTGAGATC,RA_Pool10Pool12
...,...
SCGT00val_L007_I0369_T0_TTTGGTTCAGTAGTGG,RA_Pool15Pool16
SCGT00val_L007_I0369_T0_TTTGGTTGTTGCGGAA,RA_Pool15Pool16
SCGT00val_L007_I0369_T0_TTTGGTTTCCAGCAAT,RA_Pool15Pool16
SCGT00val_L007_I0369_T0_TTTGTTGGTTGCTCAA,RA_Pool15Pool16


## Include patient multiplexing information

In [21]:
# Attach each cell's pool/batch label to .obs, matching on the cellID index.
# Same result as the former left join: every cell in adata_SCGT00.obs is kept,
# and cells with no entry in extended_obs_SCGT00 get NaN in 'batches'.
# Unlike DataFrame.join, assigning the column overwrites an existing 'batches'
# column, so re-running this cell no longer raises
# "columns overlap but no suffix specified".
adata_SCGT00.obs['batches'] = extended_obs_SCGT00['batches'].reindex(adata_SCGT00.obs_names)
adata_SCGT00.obs

Unnamed: 0_level_0,studyID,libraryID,sampleID,chemistry,technology,patientID,disease,timepoint_replicate,treatmentStatus,therapyResponse,sex,age,BMI,binned_age,diseaseStatus,smokingStatus,ethnicity,institute,diseaseGroup,batches
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SCGT00_L051_I57.3P_T0_AAACCCAAGGTGAGAA,SCGT00,SCGT00_L051,SCGT00_I57.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I57,RA,0.0,ongoing,NR,female,47.000000,25.71,41-50,na,never-smoker,na,na,IMIDs,RA_Pool1Pool3
SCGT00_L051_I56.3P_T0_AAACCCAAGTCCGTCG,SCGT00,SCGT00_L051,SCGT00_I56.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I56,RA,0.0,ongoing,R,female,56.000000,30.11,51-60,na,never-smoker,na,na,IMIDs,RA_Pool1Pool3
SCGT00_L051_I53.3P_T0_AAACCCAAGTGCACTT,SCGT00,SCGT00_L051,SCGT00_I53.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I53,RA,0.0,ongoing,R,female,69.000000,25.00,61-70,na,never-smoker,na,na,IMIDs,RA_Pool1Pool3
SCGT00_L051_I52.3P_T0_AAACCCACAACTGTGT,SCGT00,SCGT00_L051,SCGT00_I52.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I52,RA,0.0,ongoing,NR,female,33.000000,21.37,31-40,na,never-smoker,na,na,IMIDs,RA_Pool1Pool3
SCGT00_L051_I56.3P_T0_AAACCCACAAGAATGT,SCGT00,SCGT00_L051,SCGT00_I56.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I56,RA,0.0,ongoing,R,female,56.000000,30.11,51-60,na,never-smoker,na,na,IMIDs,RA_Pool1Pool3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SCGT00val_L003_I0362_T0_TTTGTTGTCCGTCAAA,SCGT00val,SCGT00val_L003,SCGT00val_I0362_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I0362,RA,0.0,antiIL6R,R,female,60.142368,,61-70,na,na,na,na,IMIDs,RA_Pool19Pool20
SCGT00val_L003_I036018_T0_TTTGTTGTCGGTTGTA,SCGT00val,SCGT00val_L003,SCGT00val_I036018_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I036018,RA,0.0,antiIL6R,R,female,62.000000,23.19,61-70,na,never-smoker,na,na,IMIDs,RA_Pool19Pool20
SCGT00val_L003_I0361_T0_TTTGTTGTCTACGCAA,SCGT00val,SCGT00val_L003,SCGT00val_I0361_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I0361,RA,0.0,antiIL6R,R,female,37.000000,25.63,31-40,na,never-smoker,na,na,IMIDs,RA_Pool19Pool20
SCGT00val_L003_I036018_T0_TTTGTTGTCTCTGACC,SCGT00val,SCGT00val_L003,SCGT00val_I036018_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I036018,RA,0.0,antiIL6R,R,female,62.000000,23.19,61-70,na,never-smoker,na,na,IMIDs,RA_Pool19Pool20


In [22]:
# One row per pool/batch, each remaining column collapsed to the list of its
# distinct values, to inspect which libraries, patients and diseases share a pool.
# NOTE(review): groupby drops rows whose key is NaN by default, so cells without
# a 'batches' value would not appear here - confirm this is intended.
(
    adata_SCGT00.obs
    .groupby(by="batches")
    .aggregate(lambda column: column.unique().tolist())
)

Unnamed: 0_level_0,studyID,libraryID,sampleID,chemistry,technology,patientID,disease,timepoint_replicate,treatmentStatus,therapyResponse,sex,age,BMI,binned_age,diseaseStatus,smokingStatus,ethnicity,institute,diseaseGroup
batches,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Chron_Pool1Pool3,[SCGT00],"[SCGT00_L030, SCGT00_L003]","[SCGT00_I0124.3P_T0, SCGT00_I012015.3P_T0, SCG...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I0124, SCGT00_I012015, SCGT00_I012013,...",[CD],[0.0],[ongoing],"[R, NR]","[female, male]","[29.0, 37.0, 55.0, 56.0, 45.0, 70.0, 49.0, 47.0]","[24.52, 25.85, 24.09, 29.14, 40.44, 23.46, 23....","[18-30, 31-40, 51-60, 41-50, 61-70]",[na],"[smoker, never-smoker]",[na],[na],[IMIDs]
Chron_Pool2Pool4,[SCGT00],"[SCGT00_L025, SCGT00_L027]","[SCGT00_I012032.3P_T0, SCGT00_I012014.3P_T0, S...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I012032, SCGT00_I012014, SCGT00_I01202...",[CD],[0.0],[ongoing],"[R, NR]","[female, male]","[56.0, 50.0, 21.0, 47.0, 39.0, 54.0, 28.0, 36.0]","[18.75, 21.1, 30.12, 23.14, 17.97, 22.14, 23.7...","[51-60, 41-50, 18-30, 31-40]",[na],"[smoker, never-smoker]",[na],[na],[IMIDs]
HC_Pool1,[SCGT00],"[SCGT00_L032, SCGT00_L021]","[SCGT00_I012036.3P_T0, SCGT00_I013017.3P_T0, S...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I012036, SCGT00_I013017, SCGT00_I01203...",[healthy],[0.0],[naive],[na],"[male, female]","[43.0, 32.0, 45.0, 31.0, 58.0, 86.0, 65.0, 18.0]",[nan],"[41-50, 31-40, 51-60, >80, 61-70, 18-30]",[healthy],"[never-smoker, smoker]",[na],[na],[healthy]
IBD_Pool1Pool2,[SCGT00],[SCGT00_L020],"[SCGT00_I0208.3P_T0, SCGT00_I0201.3P_T0, SCGT0...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I0208, SCGT00_I0201, SCGT00_I0205, SCG...",[CD],[0.0],[ongoing],"[R, NR]","[female, male]","[47.233401779603, 28.0, 52.0, 32.0, 62.0, 42.0...","[27.69, 29.41, 27.34, 19.05, 21.74, 37.04, 21....","[41-50, 18-30, 51-60, 31-40, 61-70]",[na],"[never-smoker, smoker]",[na],[na],[IMIDs]
IBD_Pool3Pool4,[SCGT00],"[SCGT00_L013, SCGT00_L037]","[SCGT00_I020011.3P_T0, SCGT00_I020013.3P_T0, S...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I020011, SCGT00_I020013, SCGT00_I02001...","[UC, CD]",[0.0],[ongoing],"[R, NR]","[male, female]","[55.0, 71.0, 30.0, 54.1683778234086, 77.0, 45....","[24.51, 24.06, 29.98, 22.84, 25.24, 23.88, 25....","[51-60, 71-80, 18-30, 41-50]",[na],[never-smoker],[na],[na],[IMIDs]
PS_Pool2Pool4,[SCGT00],"[SCGT00_L004, SCGT00_L031]","[SCGT00_I0133.3P_T0, SCGT00_I0131.3P_T0, SCGT0...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I0133, SCGT00_I0131, SCGT00_I0136, SCG...",[PS],[0.0],[ongoing],"[NR, R]","[male, female]","[42.0, 26.0, 59.0, 41.0, 44.0, 43.0, 34.0, 51.0]","[29.54, 23.81, 32.47, 23.23, 26.15, 21.36, 29....","[41-50, 18-30, 51-60, 31-40]",[na],"[never-smoker, smoker]",[na],[na],[IMIDs]
PS_Pool3Pool5,[SCGT00],"[SCGT00_L009, SCGT00_L039]","[SCGT00_I013011.3P_T0, SCGT00_I013010.3P_T0, S...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I013011, SCGT00_I013010, SCGT00_I0138,...",[PS],[0.0],[ongoing],"[R, NR]","[female, male]","[57.0, 58.0, 49.0, 40.0, 30.0, 53.0, 54.0]","[28.72, 28.98, 25.04, 28.29, 21.19, 25.53, 33....","[51-60, 41-50, 31-40, 18-30]",[na],"[never-smoker, smoker]",[na],[na],[IMIDs]
PS_Pool6Pool7,[SCGT00],"[SCGT00_L015, SCGT00_L040]","[SCGT00_I018011.3P_T0, SCGT00_I0184.3P_T0, SCG...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I018011, SCGT00_I0184, SCGT00_I0182, S...",[PS],[0.0],[ongoing],"[NR, R]","[male, female]","[53.0, 59.0, 61.0, 49.0, 51.0, 78.0, 77.360711...","[23.59, 16.94, 25.39, 34.29, 30.12, 30.3, 36.8...","[51-60, 61-70, 41-50, 71-80]",[na],"[smoker, never-smoker]",[na],[na],[IMIDs]
PS_Pool8Pool9,[SCGT00],"[SCGT00_L010, SCGT00_L024]","[SCGT00_I0189.3P_T0, SCGT00_I0185.3P_T0, SCGT0...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I0189, SCGT00_I0185, SCGT00_I018016, S...",[PS],[0.0],[ongoing],"[R, NR]","[male, female]","[36.0, 53.0, 23.0, 55.0, 59.0, 62.0, 57.0, 49.0]","[28.73, 32.41, 26.23, 29.4, 31.89, 35.29, 28.5...","[31-40, 51-60, 18-30, 61-70, 41-50]",[na],"[smoker, never-smoker]",[na],[na],[IMIDs]
PsA_Pool1Pool3,[SCGT00],"[SCGT00_L008, SCGT00_L049, SCGT00_L036]","[SCGT00_I113.3P_T0, SCGT00_I112.3P_T0, SCGT00_...",[3_GEX_V3],[3_GEX_V3_GenoHashed],"[SCGT00_I113, SCGT00_I112, SCGT00_I119, SCGT00...",[PSA],[0.0],[ongoing],"[NR, R]","[female, male]","[36.0, 66.0, 56.0, 40.0, 41.0, 64.0, 30.0, 46.0]","[30.45, 23.15, 25.24, 34.7, 26.3, 26.96, 19.28...","[31-40, 61-70, 51-60, 41-50, 18-30]",[na],[never-smoker],[na],[na],[IMIDs]


# Split data for downstream analysis

As anticipated, we aim to split the **CORE DATASET** into two major datasets (named MAIN and VALIDATION) in a balanced manner, considering relevant covariates such as *studyID*, *chemistry* and *disease*, to ensure homogeneous sampling.

* The **main** dataset will be employed to characterize the inflammation landscape of circulating immune cells and test the classifier with a 5-fold cross-validation strategy. 

* The **validation** dataset will be held out to assess the performance of the patient classifier.

In [23]:
# Collapse the per-cell annotations to the distinct combinations of the
# sample-level columns, then replace the per-cell index (cellID) with sampleID.
sample_level_columns = ['sampleID', 'batches', 'chemistry', 'disease', 'sex', 'binned_age']
patient_metadata = (
    adata_SCGT00.obs[sample_level_columns]
    .drop_duplicates()
    .reset_index()
    .drop(columns='cellID')
    .set_index('sampleID')
)
patient_metadata

Unnamed: 0_level_0,batches,chemistry,disease,sex,binned_age
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SCGT00_I57.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,41-50
SCGT00_I56.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,51-60
SCGT00_I53.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,61-70
SCGT00_I52.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,31-40
SCGT00_I51.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,41-50
SCGT00_I58.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,61-70
SCGT00_I55.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,51-60
SCGT00_I54.3P_T0,RA_Pool1Pool3,3_GEX_V3,RA,female,61-70
SCGT00_I5014.3P_T0,RA_Pool2Pool4,3_GEX_V3,RA,female,61-70
SCGT00_I5032.3P_T0,RA_Pool2Pool4,3_GEX_V3,RA,female,51-60


In [24]:
# How many distinct batches does each disease span?
batches_by_disease = patient_metadata.groupby(['disease'], observed=True)['batches']
batches_by_disease.nunique()

disease
CD          4
PS          4
PSA         4
RA         10
SLE         2
UC          3
healthy     1
Name: batches, dtype: int64

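**We have to remove the healthy patients from the dataset because they all come from a single batch: since the split is grouped by batch, they could not be represented in both the training and the validation set.**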

In [25]:
# Drop the healthy samples (single batch, see above). Take an explicit copy:
# the next cell adds a column to this frame, and writing into the result of a
# filter is ambiguous in pandas (SettingWithCopyWarning, silenced here by the
# global warnings filter).
patient_metadata_filt = patient_metadata.query("disease != 'healthy'").copy()

In [26]:
# Hold out one StratifiedGroupKFold fold as the VALIDATION set. Folds are
# stratified by disease and grouped by batch, so that a batch is never shared
# between the training and the validation side.
n_splits = 4

# Build the flag on a fresh frame instead of assigning into the filtered one;
# this also resets the flag when the cell is re-run.
patient_metadata_filt = patient_metadata_filt.assign(VALIDATION=False)

splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=False)
splits = list(
    splitter.split(
        X=patient_metadata_filt,
        y=patient_metadata_filt.disease,
        groups=patient_metadata_filt.batches,
    )
)

# We search the first split whose validation fold includes every disease
all_diseases = set(patient_metadata_filt.disease.unique())
best_i = next(
    (
        i
        for i, (_, val_idx) in enumerate(splits)
        if set(patient_metadata_filt.iloc[val_idx].disease.unique()) == all_diseases
    ),
    None,
)

# Fail loudly rather than letting later cells use an arbitrary fold.
if best_i is None:
    raise ValueError(
        f"None of the {n_splits} validation folds contains every disease; "
        "adjust n_splits or the grouping before applying the split."
    )

# Positional row indices of the selected fold (used by the following cells).
TRAINsplit, VALsplit = splits[best_i]

# VALsplit is positional, so set the flag positionally as well.
patient_metadata_filt.iloc[VALsplit, patient_metadata_filt.columns.get_loc('VALIDATION')] = True

print(f"number of split = {n_splits}")
print(f"best split = {best_i}")
print(f"Tot sample in TRAIN = {patient_metadata_filt.shape[0] - VALsplit.shape[0]}")
print(f"Tot sample in VALIDATION = {VALsplit.shape[0]}")

# Samples per disease in the validation fold, then the validation fraction per disease.
print(patient_metadata_filt.iloc[VALsplit].groupby('disease', observed=True)['VALIDATION'].count())
print(patient_metadata_filt.groupby('disease', observed=True)['VALIDATION'].mean().round(2))

print()
        


number of split = 4
best split = 3
Tot sample in TRAIN = 152
Tot sample in VALIDATION = 56
disease
CD     10
PS      8
PSA     8
RA     16
SLE     8
UC      6
Name: VALIDATION, dtype: int64
disease
CD     0.38
PS     0.25
PSA    0.25
RA     0.20
SLE    0.50
UC     0.27
Name: VALIDATION, dtype: float64



#### Applying split

In [27]:
# Materialise the selected fold: positional row indices -> sample-level tables.
TRAIN_SAMPLE_DF, VALIDATION_SAMPLE_DF = (
    patient_metadata_filt.iloc[row_positions]
    for row_positions in (TRAINsplit, VALsplit)
)

In [28]:
# Sanity check: no batch may contribute samples to both sides of the split.
assert set(VALIDATION_SAMPLE_DF.batches).isdisjoint(TRAIN_SAMPLE_DF.batches)

In [29]:
# Select the cells whose sample belongs to each side of the split.
cell_sample_ids = adata_SCGT00.obs.sampleID
cells_in_main = cell_sample_ids.isin(TRAIN_SAMPLE_DF.index)
cells_in_external = cell_sample_ids.isin(VALIDATION_SAMPLE_DF.index)

adata_SCGT00_MAIN = adata_SCGT00[cells_in_main]
adata_SCGT00_EXTERNAL = adata_SCGT00[cells_in_external]

adata_SCGT00_MAIN, adata_SCGT00_EXTERNAL

(View of AnnData object with n_obs × n_vars = 887497 × 37169
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup', 'batches'
     var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status',
 View of AnnData object with n_obs × n_vars = 392847 × 37169
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup', 'batches'
     var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status')

In [30]:
# Same check at cell level: the two AnnData subsets must not share any batch.
assert set(adata_SCGT00_EXTERNAL.obs.batches).isdisjoint(adata_SCGT00_MAIN.obs.batches)

In [31]:
# Batches that ended up in the MAIN subset.
adata_SCGT00_MAIN.obs["batches"].unique()

array(['RA_Pool1Pool3', 'RA_Pool2Pool4', 'RA_Pool6Pool8',
       'PsA_Pool1Pool3', 'PsA_Pool2Pool4', 'PsA_Pool5Pool7',
       'Chron_Pool2Pool4', 'UC_Pool5Pool7', 'UC_Pool6Pool8',
       'PS_Pool2Pool4', 'PS_Pool3Pool5', 'SLE_Pool2Pool4',
       'IBD_Pool1Pool2', 'PS_Pool6Pool7', 'RA_Pool9Pool11',
       'RA_Pool10Pool12', 'RA_Pool13Pool14', 'RA_Pool15Pool16',
       'RA_Pool19Pool20'], dtype=object)

In [32]:
# Batches that ended up in the EXTERNAL (held-out) subset.
adata_SCGT00_EXTERNAL.obs["batches"].unique()

array(['RA_Pool5Pool7', 'PsA_Pool6Pool8', 'Chron_Pool1Pool3',
       'SLE_Pool1Pool3', 'IBD_Pool3Pool4', 'PS_Pool8Pool9',
       'RA_Pool17Pool18'], dtype=object)

# Save results

In [33]:
# Write the complete SCGT00 AnnData object to a gzip-compressed .h5ad file
full_h5ad_path = here("01_data_processing/SCGT00_CentralizedDataset/results/SCGT00_FULLdata.h5ad")
adata_SCGT00.write(full_h5ad_path, compression="gzip")

In [34]:
# Write the MAIN subset to a gzip-compressed .h5ad file
main_h5ad_path = here("01_data_processing/SCGT00_CentralizedDataset/results/SCGT00_MAIN.h5ad")
adata_SCGT00_MAIN.write(main_h5ad_path, compression="gzip")

In [35]:
# Write the EXTERNAL (held-out) subset to a gzip-compressed .h5ad file
external_h5ad_path = here("01_data_processing/SCGT00_CentralizedDataset/results/SCGT00_EXTERNAL.h5ad")
adata_SCGT00_EXTERNAL.write(external_h5ad_path, compression="gzip")

In [36]:
# Show the session report from session_info (presumably the loaded package
# versions, kept for reproducibility — confirm against the rendered output).
session_info.show()