In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import plotly.graph_objects as go
import re
import io

# Specific modules
import scanpy as sc
import anndata as ad
import scienceplots
from matplotlib.backends.backend_pdf import PdfPages
import plotly.io as pio
import pickle

# Setting some parameters
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation / dtype warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

# Import functions
# Make the project's `bin` directory (resolved relative to the project root by
# `here`) importable. It is inserted at position 1 of sys.path, i.e. directly
# after the first entry.
sys.path.insert(1, str(here('bin')))
# Star import: every public name of `customPalette` is pulled into the notebook
# namespace. NOTE(review): presumably colour palettes for plotting -- confirm
# which names are actually needed and import them explicitly.
from customPalette import *

# Plot styling.
# NOTE(review): the 'nature' style sheet is presumably registered by the
# `scienceplots` import at the top of the notebook -- confirm.
plt.style.use(['nature'])
# Resolution handed to scanpy as `dpi_save` on the next line.
dpi_fig_save = 300
sc.set_figure_params(dpi=100, dpi_save=dpi_fig_save, vector_friendly=True)

# Notebook-level switches.
# NOTE(review): presumably meant to control whether existing figure / data
# outputs are overwritten -- verify where they are consumed.
overwriteFigures = True
overwriteData = True

# Raise the pandas display limits to 300 columns and 1000 rows.
pd.options.display.max_columns = 300
pd.options.display.max_rows = 1000

**Defining functions**

In [2]:
# Show the project root that `here()` resolves to; every path below is built from it.
project_root = here()
print("directory path: {}".format(project_root))

directory path: /scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas-R1


**Load data**

In [3]:
# Read the pickled external sample metadata and label each row with its split.
external_pkl = here("00_data_processing/results/01_INFLAMMATION_external_sampleMetadata.pkl")
with open(external_pkl, mode='rb') as pkl_handle:
    external_sampleMetadata = pickle.load(pkl_handle).assign(datasetSplitting="external")

# Preview the first rows.
external_sampleMetadata.head()

Unnamed: 0_level_0,studyID,sampleID,chemistry,technology,patientID,disease,timepoint_replicate,treatmentStatus,therapyResponse,sex,age,BMI,binned_age,diseaseStatus,smokingStatus,ethnicity,institute,diseaseGroup,datasetSplitting
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SCGT00val_L006_I036016_T0_AAACCCAAGACAACTA,SCGT00val,SCGT00val_I036016_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I036016,RA,0.0,TNF,NR,female,33.0,28.72,31-40,na,never-smoker,na,na,IMIDs,external
SCGT00val_L006_I036023_T0_AAACCCAAGACGGAAA,SCGT00val,SCGT00val_I036023_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I036023,RA,0.0,TNF,NR,female,52.0,22.68,51-60,na,never-smoker,na,na,IMIDs,external
SCGT00val_L006_I036021_T0_AAACCCAAGATGGTCG,SCGT00val,SCGT00val_I036021_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I036021,RA,0.0,TNF,R,female,61.0,24.61,61-70,na,never-smoker,na,na,IMIDs,external
SCGT00val_L006_I036015_T0_AAACCCAAGCATTGTC,SCGT00val,SCGT00val_I036015_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I036015,RA,0.0,TNF,NR,female,33.0,24.61,31-40,na,never-smoker,na,na,IMIDs,external
SCGT00val_L006_I0364_T0_AAACCCAAGCGATGGT,SCGT00val,SCGT00val_I0364_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00val_I0364,RA,0.0,TNF,R,female,25.0,21.01,18-30,na,never-smoker,na,na,IMIDs,external


In [4]:
# Read the pickled main sample metadata and label each row with its split.
main_pkl = here("00_data_processing/results/01_INFLAMMATION_main_sampleMetadata.pkl")
with open(main_pkl, mode='rb') as pkl_handle:
    main_sampleMetadata = pickle.load(pkl_handle).assign(datasetSplitting="CORE_main")

# Preview the first rows.
main_sampleMetadata.head()

Unnamed: 0_level_0,studyID,sampleID,chemistry,technology,patientID,disease,timepoint_replicate,treatmentStatus,therapyResponse,sex,age,BMI,binned_age,diseaseStatus,smokingStatus,ethnicity,institute,diseaseGroup,datasetSplitting
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SCGT00_L051_I57.3P_T0_AAACCCAAGGTGAGAA,SCGT00,SCGT00_I57.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I57,RA,0.0,ongoing,NR,female,47.0,25.71,41-50,na,never-smoker,na,na,IMIDs,CORE_main
SCGT00_L051_I56.3P_T0_AAACCCAAGTCCGTCG,SCGT00,SCGT00_I56.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I56,RA,0.0,ongoing,R,female,56.0,30.11,51-60,na,never-smoker,na,na,IMIDs,CORE_main
SCGT00_L051_I53.3P_T0_AAACCCAAGTGCACTT,SCGT00,SCGT00_I53.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I53,RA,0.0,ongoing,R,female,69.0,25.0,61-70,na,never-smoker,na,na,IMIDs,CORE_main
SCGT00_L051_I52.3P_T0_AAACCCACAACTGTGT,SCGT00,SCGT00_I52.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I52,RA,0.0,ongoing,NR,female,33.0,21.37,31-40,na,never-smoker,na,na,IMIDs,CORE_main
SCGT00_L051_I58.3P_T0_AAACCCATCACGAGGA,SCGT00,SCGT00_I58.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I58,RA,0.0,ongoing,NR,female,61.0,36.1,61-70,na,never-smoker,na,na,IMIDs,CORE_main


In [5]:
# Read the pickled validation sample metadata and label each row with its split.
validation_pkl = here("00_data_processing/results/01_INFLAMMATION_validation_sampleMetadata.pkl")
with open(validation_pkl, mode='rb') as pkl_handle:
    validation_sampleMetadata = pickle.load(pkl_handle).assign(datasetSplitting="CORE_validation")

# Preview the first rows.
validation_sampleMetadata.head()

Unnamed: 0_level_0,studyID,sampleID,chemistry,technology,patientID,disease,timepoint_replicate,treatmentStatus,therapyResponse,sex,age,BMI,binned_age,diseaseStatus,smokingStatus,ethnicity,institute,diseaseGroup,datasetSplitting
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SCGT00_L051_I51.3P_T0_AAACCCACAATCAGCT,SCGT00,SCGT00_I51.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I51,RA,0.0,ongoing,R,female,42.0,24.34,41-50,na,never-smoker,na,na,IMIDs,CORE_validation
SCGT00_L023_I5010.3P_T0_AAACCCACAGTACTAC,SCGT00,SCGT00_I5010.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I5010,RA,0.0,ongoing,NR,male,51.0,35.92,51-60,na,smoker,na,na,IMIDs,CORE_validation
SCGT00_L028_I5020.3P_T0_AAACCCACATCGGCCA,SCGT00,SCGT00_I5020.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I5020,RA,0.0,ongoing,R,female,60.0,20.28,51-60,na,never-smoker,na,na,IMIDs,CORE_validation
SCGT00_L050_I5030.3P_T0_AAACCCAAGAATCGTA,SCGT00,SCGT00_I5030.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I5030,RA,0.0,ongoing,R,female,55.0,18.72,51-60,na,never-smoker,na,na,IMIDs,CORE_validation
SCGT00_L050_I5018.3P_T0_AAACCCAAGCCGTCGT,SCGT00,SCGT00_I5018.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,SCGT00_I5018,RA,0.0,ongoing,NR,female,52.0,20.74,51-60,na,never-smoker,na,na,IMIDs,CORE_validation


In [15]:
# Column layout of the merged table (sampleID is used as the index, so it is
# not listed here).
new_order = [
    'datasetSplitting', 'studyID', 'patientID',
    'diseaseGroup', 'disease', 'diseaseStatus',
    'chemistry', 'technology', 'institute',
    'sex', 'age', 'binned_age', 'BMI', 'smokingStatus', 'ethnicity',
    'timepoint_replicate', 'treatmentStatus', 'therapyResponse',
]

# Stack the three splits, index the result by sampleID, unify the different
# spellings of "missing" and finally put the columns in the layout above.
sampleMetadata_merged = (
    pd.concat(
        [main_sampleMetadata, validation_sampleMetadata, external_sampleMetadata],
        ignore_index=True,  # the original row labels are discarded
    )
    .set_index("sampleID")
    # Unify "na", NaN and empty strings to "NA"
    .replace('na', 'NA')
    .fillna('NA')
    .replace('', 'NA')
    .loc[:, new_order]
)

In [16]:
# Report the dimensions of the merged table, then display its first rows.
n_rows, n_cols = sampleMetadata_merged.shape
print((n_rows, n_cols))
sampleMetadata_merged.head(n=5)

(1047, 18)


Unnamed: 0_level_0,datasetSplitting,studyID,patientID,diseaseGroup,disease,diseaseStatus,chemistry,technology,institute,sex,age,binned_age,BMI,smokingStatus,ethnicity,timepoint_replicate,treatmentStatus,therapyResponse
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
SCGT00_I57.3P_T0,CORE_main,SCGT00,SCGT00_I57,IMIDs,RA,,3_GEX_V3,3_GEX_V3_GenoHashed,,female,47.0,41-50,25.71,never-smoker,,0.0,ongoing,NR
SCGT00_I56.3P_T0,CORE_main,SCGT00,SCGT00_I56,IMIDs,RA,,3_GEX_V3,3_GEX_V3_GenoHashed,,female,56.0,51-60,30.11,never-smoker,,0.0,ongoing,R
SCGT00_I53.3P_T0,CORE_main,SCGT00,SCGT00_I53,IMIDs,RA,,3_GEX_V3,3_GEX_V3_GenoHashed,,female,69.0,61-70,25.0,never-smoker,,0.0,ongoing,R
SCGT00_I52.3P_T0,CORE_main,SCGT00,SCGT00_I52,IMIDs,RA,,3_GEX_V3,3_GEX_V3_GenoHashed,,female,33.0,31-40,21.37,never-smoker,,0.0,ongoing,NR
SCGT00_I58.3P_T0,CORE_main,SCGT00,SCGT00_I58,IMIDs,RA,,3_GEX_V3,3_GEX_V3_GenoHashed,,female,61.0,61-70,36.1,never-smoker,,0.0,ongoing,NR


In [17]:
# Export the merged sample metadata (sampleID index included) as CSV.
# The write honours the notebook-level `overwriteData` switch: an existing file
# is only replaced when the flag is True, while a missing file is always written.
table1_sheet3_csv = here("00_data_processing/results/01_INFLAMMATION_DatasetDescription_Table1_Sheet3.csv")
if overwriteData or not os.path.exists(table1_sheet3_csv):
    sampleMetadata_merged.to_csv(table1_sheet3_csv, index=True)
else:
    # Make the skipped write visible instead of silently keeping the old file.
    print("overwriteData is False and the file already exists, not writing: {}".format(table1_sheet3_csv))