In [1]:
#import numpy as np
import pandas as pd
import scanpy as sc
import harmonypy as hm
import anndata
import os
import scrublet
#sc.settings.set_figure_params(dpi=500, dpi_save=1000, figsize=(5,5), facecolor='white')

In [2]:
# set up the data directory
counts_dir = "/home/groups/singlecell/smorabito/shared/SERPENTINE/data/"


In [3]:
os.chdir('/home/groups/singlecell/smorabito/analysis/SERPENTINE/')
data_dir = 'data/'
fig_dir = 'figures/'


In [16]:
# loading the metadata table
meta_table = pd.read_csv(
    "/home/groups/singlecell/smorabito/analysis/SERPENTINE/data/SERPENTINE_metadata.txt", 
    sep='\t',
    encoding = 'utf-16'
)

meta_table.tail(10)


Unnamed: 0,Subproject_CNAG,ID_CNAG,Tissue,Libraries,Sample,Timepoint,Info,Fractions loaded,biopsy_id,localizacion_SNOMED,Timepoint_1
52,SERPENTINE_58_59,SPE_1_19_C2_A_FRESH_1,Lung met,"GEX, TCR",19,2,,50%CD45+/50%CD45-,,,C02
53,SERPENTINE_58_59,SPE_1_19_C2_A_FRESH_2,Lung met,"GEX, TCR",19,2,,50%CD45+/50%CD45-,,,C02
54,SERPENTINE_60_61,SPE_1_20_SCR_A_FRESH_CD45pos,Lung met,"GEX, TCR",20,1,,100%CD45+,,,SCREENING
55,SERPENTINE_60_61,SPE_1_20_SCR_A_FRESH_CD45neg,Lung met,GEX,20,1,,100%CD45-,,,SCREENING
56,SERPENTINE_62_63,SPE_1_26_SCR_A_FRESH_CD45pos,Lung met,"GEX, TCR",26,1,,100%CD45+,,,SCREENING
57,SERPENTINE_62_63,SPE_1_26_SCR_A_FRESH_CD45neg,Lung met,GEX,26,1,,100%CD45-,,,SCREENING
58,SERPENTINE_64_65,SPE_1_20_C2_A_FRESH_CD45pos,Lung met,"GEX, TCR",20,2,,100%CD45+,,,C02
59,SERPENTINE_64_65,SPE_1_20_C2_A_FRESH_CD45neg,Lung met,GEX,20,2,,100%CD45-,,,C02
60,SERPENTINE_66_67,SPE_1_29_SCR_A_FRESH_CD45pos,Lung met,"GEX, TCR",29,1,,100%CD45+,,,SCREENING
61,SERPENTINE_66_67,SPE_1_29_SCR_A_FRESH_CD45neg,Lung met,GEX,29,1,,100%CD45-,,,SCREENING


In [17]:
meta_table = meta_table[~meta_table.Subproject_CNAG.isin(['SERPENTINE_62_63', 'SERPENTINE_64_65', 'SERPENTINE_66_67'])]

meta_table.tail(10)


Unnamed: 0,Subproject_CNAG,ID_CNAG,Tissue,Libraries,Sample,Timepoint,Info,Fractions loaded,biopsy_id,localizacion_SNOMED,Timepoint_1
46,SERPENTINE_50_51,SPE_1_18_SCR_A_FRESH,Lymphnode (hepatic iliar),"GEX, TCR",18,1,,ND,,,SCREENING
47,SERPENTINE_52_53,SPE_1_16_C2_A_FRESH_CD45pos,Liver met,"GEX, TCR",16,2,,100%CD45+,,,C02
48,SERPENTINE_52_53,SPE_1_16_C2_A_FRESH_CD45neg,Liver met,GEX,16,2,,100%CD45-,,,C02
49,SERPENTINE_54_55,SPE_1_18_C2_A_FRESH,Lymphnode (hepatic iliar),"GEX, TCR",18,2,,ND,,,C02
50,SERPENTINE_56_57,SPE_1_17_C2_A_FRESH_1,Lung met,"GEX, TCR",17,2,,50%CD45+/50%CD45-,,,C02
51,SERPENTINE_56_57,SPE_1_17_C2_A_FRESH_2,Lung met,"GEX, TCR",17,2,,50%CD45+/50%CD45-,,,C02
52,SERPENTINE_58_59,SPE_1_19_C2_A_FRESH_1,Lung met,"GEX, TCR",19,2,,50%CD45+/50%CD45-,,,C02
53,SERPENTINE_58_59,SPE_1_19_C2_A_FRESH_2,Lung met,"GEX, TCR",19,2,,50%CD45+/50%CD45-,,,C02
54,SERPENTINE_60_61,SPE_1_20_SCR_A_FRESH_CD45pos,Lung met,"GEX, TCR",20,1,,100%CD45+,,,SCREENING
55,SERPENTINE_60_61,SPE_1_20_SCR_A_FRESH_CD45neg,Lung met,GEX,20,1,,100%CD45-,,,SCREENING


In [18]:

i = 0 

adata_list = []


for cur_subproject in meta_table['Subproject_CNAG'].unique():
    print("")
    print(cur_subproject)
    
    # for this subproject, get the different reps that we sequenced (ie CD45+/- or total)
    if os.path.exists('{}{}/jobs/'.format(counts_dir, cur_subproject)):
        cur_reps = os.listdir('/home/groups/singlecell/smorabito/shared/SERPENTINE/data/{}/jobs/'.format(cur_subproject))
    else:
        cur_reps = os.listdir('/home/groups/singlecell/smorabito/shared/SERPENTINE/data/{}/jobs_human/'.format(cur_subproject))

    # loop over each replicate for this subproject, and load the gene expression counts matrices
    cur_adata_list = []
    for x in cur_reps:
        if os.path.exists('{}{}/jobs/'.format(counts_dir, cur_subproject)):
            cur_cellranger_dir = "{}{}/jobs/{}/{}/outs/per_sample_outs/{}/count/sample_filtered_feature_bc_matrix/".format(counts_dir, cur_subproject, x, x, x)
        else:
            cur_cellranger_dir = "{}{}/jobs_human/{}/{}/outs/per_sample_outs/{}/count/sample_filtered_feature_bc_matrix/".format(counts_dir, cur_subproject, x, x, x)
        try:
            cur_adata = sc.read_10x_mtx(cur_cellranger_dir)
        except:
            print('Issue loading {}, skipping for now.'.format(x))
            continue
        cur_adata.obs['Subproject_CNAG'] = cur_subproject 
        cur_adata.obs['Replicate'] = x

        # add patient label:
        tmp = x.split('_')
        if len(tmp) > 2:
            cur_patient = tmp[2]
            if len(cur_patient) == 1:
                cur_patient = '0' + cur_patient
        else:
            cur_patient = tmp[0]
        cur_patient = 'Patient ' + cur_patient
        print(cur_patient)
        cur_adata.obs['Patient'] = cur_patient
        cur_adata.obs['bc'] = [bc.split('-')[0] for bc in cur_adata.obs.index.to_list()]
        cur_adata_list.append(cur_adata)
        print(cur_adata.shape)

    adata_list = adata_list + cur_adata_list

        




SERPENTINE_04_05
Patient 01
(694, 36601)
Patient 01
(4682, 36601)

SERPENTINE_06_07
Patient 02
(8351, 36601)
Patient 02
(6858, 36601)

SERPENTINE_08_09
Patient 01
(4470, 36601)
Patient 01
(4011, 36601)

SERPENTINE_10_11
Patient 02
(6693, 36601)
Patient 03
(1110, 36601)
Patient 02
(5798, 36601)

SERPENTINE_12_13
Patient 04
(11125, 36601)
Patient 04
(11551, 36601)

SERPENTINE_14_15
Patient 03
(3815, 36601)

SERPENTINE_16_17
Patient 04
(6516, 36601)
Patient 04
(5440, 36601)

SERPENTINE_18_19
Patient 03
(1451, 36601)
Patient 03
(8306, 36601)

SERPENTINE_20_21
Patient 05
(5481, 36601)
Patient 06
(1042, 36601)
Patient 06
(3623, 36601)

SERPENTINE_22_23
Patient 08
(15757, 36601)

SERPENTINE_24_25
Patient 09
(7434, 36601)

SERPENTINE_26_27
Patient 07
(3333, 36601)
Patient 07
(160, 36601)

SERPENTINE_28_29
Patient 10
(6935, 36601)
Patient 10
(15640, 36601)

SERPENTINE_30_31
Patient 08
(6484, 36601)
Patient 08
(6995, 36601)

SERPENTINE_32_33
Patient 09
(8119, 36601)
Patient 09
(7793, 36601)

SE

In [19]:
# concatenate into a single adata:
adata = adata_list[0].concatenate(adata_list[1:])

  adata = adata_list[0].concatenate(adata_list[1:])


In [20]:
rep = adata.obs.Replicate.replace({'01_total':'SPE_1_01_SCR_A_FRESH', '01_CD45':'SPE_1_01_SCR_A_FRESH_CD45'})
adata.obs.Replicate = rep

In [21]:
adata.shape

(340764, 36601)