## Serpentine: Matching tumor clonotypes with GEX profiles

### Set Up Environment

In [61]:
# Load project configuration
setwd("/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR")
source("code/helper/Config.R", echo = FALSE)
options(repr.matrix.max.rows=100, repr.matrix.max.cols=100)

Project configured successfully. Root directory set to: /scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR 


### Load serpentine tumor TCR data

In [62]:
# Read wide formatted tumor TCR data data (clonotype-level) containing the clone size and classification information
serpentine_TCR <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_with_sizes_df_09-2025_v2.qs"))

In [64]:
# Adjust TCR cell barcodes so they match with the GEX cell barcodes
serpentine_TCR <- serpentine_TCR %>% 
        mutate(old_barcode = barcode,
               barcode_nt = str_extract(barcode, "[^_]+$") %>% gsub("-\\d+$", "", .),
               barcode = paste(sample, barcode_nt, sep="_")
    )
head(serpentine_TCR, 3)

patient,sample_id,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id,cloneSize,norm_cloneSize,cloneClass,ICI,presence_status,expansion_status,old_barcode,barcode_nt
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACCTGAGTTAAGTG,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV10.TRAJ22.TRAC,CVVSLSGSARQLTF,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,TRBV4-1.NA.TRBJ2-5.TRBC2,CASSYGGFPETQYF,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,TRAV10.TRAJ22.TRAC_TRBV4-1.NA.TRBJ2-5.TRBC2,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,CVVSLSGSARQLTF_CASSYGGFPETQYF,TRAV10.TRAJ22.TRAC;TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TRBV4-1.NA.TRBJ2-5.TRBC2;TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,CASSYGGFPETQYF,CVVSLSGSARQLTF,TRBV4-1.NA.TRBJ2-5.TRBC2,TRAV10.TRAJ22.TRAC,TRBV4-1_TRBJ2-5,TRBV4-1_TRBJ2-5_GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,3,7.037298e-05,Rare,Pre-ICI,Pre-existing,Shared_pre,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGAGTTAAGTG-1,AAACCTGAGTTAAGTG
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACCTGTCAAACCAC,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV12-3.TRAJ26.TRAC,CAMSLNNYGQNFVF,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,TRBV28.NA.TRBJ2-5.TRBC2,CASTGTGKLQETQYF,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,TRAV12-3.TRAJ26.TRAC_TRBV28.NA.TRBJ2-5.TRBC2,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,CAMSLNNYGQNFVF_CASTGTGKLQETQYF,TRAV12-3.TRAJ26.TRAC;TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TRBV28.NA.TRBJ2-5.TRBC2;TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,CASTGTGKLQETQYF,CAMSLNNYGQNFVF,TRBV28.NA.TRBJ2-5.TRBC2,TRAV12-3.TRAJ26.TRAC,TRBV28_TRBJ2-5,TRBV28_TRBJ2-5_GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,2,4.691532e-05,Rare,Pre-ICI,Lost,Lost,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCAAACCAC-1,AAACCTGTCAAACCAC
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACCTGTCCGCAGTG,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV3.TRAJ5.TRAC,CAVREDTGRRALTF,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,TRBV29-1.NA.TRBJ1-1.TRBC1,CSVPLGAGEAFF,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,TRAV3.TRAJ5.TRAC_TRBV29-1.NA.TRBJ1-1.TRBC1,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,CAVREDTGRRALTF_CSVPLGAGEAFF,TRAV3.TRAJ5.TRAC;TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TRBV29-1.NA.TRBJ1-1.TRBC1;TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,CSVPLGAGEAFF,CAVREDTGRRALTF,TRBV29-1.NA.TRBJ1-1.TRBC1,TRAV3.TRAJ5.TRAC,TRBV29-1_TRBJ1-1,TRBV29-1_TRBJ1-1_AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,18,0.0004222379,Medium,Pre-ICI,Pre-existing,Shared_pre,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCCGCAGTG-1,AAACCTGTCCGCAGTG


In [65]:
# Check for duplicated barcodes
serpentine_TCR %>%
    group_by(ID, barcode_nt) %>%
    filter(n() > 1) %>%   
    ungroup() %>%
    nrow()

patient,sample_id,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id,cloneSize,norm_cloneSize,cloneClass,ICI,presence_status,expansion_status,old_barcode,barcode_nt
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>


### Load serpentine tumor preprocessed and annotated GEX data

In [66]:
# Read the GEX annotations
serpentine_GEX <- read.table(file = file.path(root_dir, "out", "data", "SERP_T_Annotations_11-2025_v2.csv"), sep = ",", header = T)

In [70]:
# Rename GEX data colums so they match the TCR ones
serpentine_GEX <- serpentine_GEX %>%
    rename(sample = Replicate, barcode_nt = bc)
names(serpentine_GEX)

In [71]:
# Create barcode columns for further matching
serpentine_GEX <- serpentine_GEX %>%
    mutate(barcode = paste(sample, barcode_nt, sep = "_"))
head(serpentine_GEX)

Unnamed: 0_level_0,barcode_nt,sample,dataset,cell_type,lv1,CD4_CD8_assignment,annotation,lv2,barcode
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,CATTCGCGTGAACCTT,ESP_1_03_EOT_A_FRESH_CD45neg,SERPENTINE,NK cells,NC,,Tgd-17,Tgd-17 (35),ESP_1_03_EOT_A_FRESH_CD45neg_CATTCGCGTGAACCTT
2,AGTTGGTAGGAGCGAG,ESP_1_03_EOT_A_FRESH_CD45neg,SERPENTINE,T cells,CD8,CD8,CD8 resident,CD8 resident (10),ESP_1_03_EOT_A_FRESH_CD45neg_AGTTGGTAGGAGCGAG
3,TGAGCCGAGTCAAGGC,ESP_1_03_EOT_A_FRESH_CD45neg,SERPENTINE,T cells,CD8,CD8,CD8 metabolic,CD8 metabolic (17),ESP_1_03_EOT_A_FRESH_CD45neg_TGAGCCGAGTCAAGGC
4,TTCTTAGTCATGGTCA,ESP_1_03_EOT_A_FRESH_CD45neg,SERPENTINE,T cells,T Naive/CM,CD4,CD4 central memory,CD4 central memory (8),ESP_1_03_EOT_A_FRESH_CD45neg_TTCTTAGTCATGGTCA
5,GGTGTTATCTGGCGTG,ESP_1_03_EOT_A_FRESH_CD45neg,SERPENTINE,T cells,T Naive/CM,CD4,CD4 central memory pre-Tfh,CD4 central memory pre-Tfh (19),ESP_1_03_EOT_A_FRESH_CD45neg_GGTGTTATCTGGCGTG
6,CTACCCACAAGAAGAG,ESP_1_03_EOT_A_FRESH_CD45neg,SERPENTINE,T cells,CD8,CD8,T proliferating,T proliferating (28),ESP_1_03_EOT_A_FRESH_CD45neg_CTACCCACAAGAAGAG


In [72]:
# Check for duplicated barcodes in TCR data
any(duplicated(serpentine_TCR$barcode))

In [73]:
# Check for duplicated barcodes in GEX data
any(duplicated(serpentine_GEX$barcode))

### Combine TCR and GEX data


In [74]:
# Check that the GEX and TCR barcodes match
(serpentine_TCR$barcode %in% serpentine_GEX$barcode) %>% table()

.
 FALSE   TRUE 
100496 108034 

In [75]:
# Check that sample names match between TCR and GEX cells
print(unique(serpentine_TCR$sample))
print("")
print(unique(serpentine_GEX$sample))

 [1] "SPE_1_01_SCR_A_FRESH_CD45"        "SPE_1_01_SCR_A_FRESH"            
 [3] "SPE_1_01_C2D1_A_FRESH_1"          "SPE_1_01_C2D1_A_FRESH_2"         
 [5] "SPE_1_02_SCR_A_FRESH_1"           "SPE_1_02_SCR_A_FRESH_2"          
 [7] "SPE_1_02_C02_A_FRESH_1"           "SPE_1_02_C02_A_FRESH_2"          
 [9] "SPE_1_03_SCR_A_FRESH"             "SPE_1_03_C02_A_FRESH"            
[11] "ESP_1_03_EOT_A_FRESH_CD45pos"     "SPE_1_04_SCR_A_FRESH_1"          
[13] "SPE_1_04_SCR_A_FRESH_2"           "SPE_1_04_C02_A_FRESH_1"          
[15] "SPE_1_04_C02_A_FRESH_2"           "SPE_1_05_SCR_A_FRESH"            
[17] "SPE_1_06_SCR_A_FRESH_CD45pos"     "ESP_1_07_SCR_A_FRESH_CD45pos"    
[19] "SPE_1_07_C2_A_FRESH_1"            "SPE_1_07_C2_A_FRESH_2"           
[21] "SPE_1_07_EOT_A_FRESH_1"           "SPE_1_07_EOT_A_FRESH_2"          
[23] "SPE_1_08_SCR_A_FRESH"             "SPE_1_8_C2_A_FRESH_1"            
[25] "SPE_1_8_C2_A_FRESH_2"             "SPE_1_09_SCR_A_FRESH"            
[27] "SPE_1_09_C2_A_FRESH

In [76]:
# Remove "GEX" part of GEX cell barcodes
serpentine_TCR$sample <- gsub("_GEX1", "_1", serpentine_TCR$sample)
serpentine_TCR$sample <- gsub("_GEX2", "_2", serpentine_TCR$sample)
serpentine_TCR$sample <- gsub("_GEX", "", serpentine_TCR$sample)

In [77]:
intersect(serpentine_TCR$sample, serpentine_GEX$sample)

In [78]:
# Now, sample names match!
setdiff(serpentine_TCR$sample, serpentine_GEX$sample)

In [79]:
# CD45 negative samples are unique to GEX 
setdiff(serpentine_GEX$sample, serpentine_TCR$sample)

In [80]:
# Remove "GEX" part of GEX cell barcodes
serpentine_TCR$barcode <- gsub("_GEX1", "_1", serpentine_TCR$barcode)
serpentine_TCR$barcode <- gsub("_GEX2", "_2", serpentine_TCR$barcode)
serpentine_TCR$barcode <- gsub("_GEX", "", serpentine_TCR$barcode)

In [81]:
# Check that the GEX and TCR barcodes match
(serpentine_TCR$barcode %in% serpentine_GEX$barcode) %>% table()

.
 FALSE   TRUE 
 26634 181896 

In [82]:
# Check TCR barcodes that do not match the GEX data
is_in_GEX <- serpentine_TCR$barcode %in% serpentine_GEX$barcode
non_matching_rows <- serpentine_TCR[!is_in_GEX, ]
head(non_matching_rows %>% filter(sample == "SPE_1_14_SCR_A_FRESH_CD45pos"))

patient,sample_id,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id,cloneSize,norm_cloneSize,cloneClass,ICI,presence_status,expansion_status,old_barcode,barcode_nt
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>
P14,P14_T0_CD45pos,SPE_1_14_SCR_A_FRESH_CD45pos_AAACCTGCACAGATTC,SPE_1_14_SCR_A_FRESH_CD45pos,P14_T0_CD45pos,TRAV19.TRAJ29.TRAC,CALSETPSGNTPLVF,TGTGCTCTGAGTGAGACCCCTTCAGGAAACACACCTCTTGTCTTT,TRBV9.NA.TRBJ2-1.TRBC2,CASSGPLLAGAMNEQFF,TGTGCCAGCAGCGGCCCACTCCTAGCGGGGGCCATGAATGAGCAGTTCTTC,TRAV19.TRAJ29.TRAC_TRBV9.NA.TRBJ2-1.TRBC2,TGTGCTCTGAGTGAGACCCCTTCAGGAAACACACCTCTTGTCTTT_TGTGCCAGCAGCGGCCCACTCCTAGCGGGGGCCATGAATGAGCAGTTCTTC,CALSETPSGNTPLVF_CASSGPLLAGAMNEQFF,TRAV19.TRAJ29.TRAC;TGTGCTCTGAGTGAGACCCCTTCAGGAAACACACCTCTTGTCTTT_TRBV9.NA.TRBJ2-1.TRBC2;TGTGCCAGCAGCGGCCCACTCCTAGCGGGGGCCATGAATGAGCAGTTCTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G12V,True,False,True,True,68,63.24,FEMALE,37,IIIb,5,4,CAPOX,TGTGCCAGCAGCGGCCCACTCCTAGCGGGGGCCATGAATGAGCAGTTCTTC,GCCAGCAGCGGCCCACTCCTAGCGGGGGCCATGAATGAGCAGTTC,TGTGCTCTGAGTGAGACCCCTTCAGGAAACACACCTCTTGTCTTT,CASSGPLLAGAMNEQFF,CALSETPSGNTPLVF,TRBV9.NA.TRBJ2-1.TRBC2,TRAV19.TRAJ29.TRAC,TRBV9_TRBJ2-1,TRBV9_TRBJ2-1_GCCAGCAGCGGCCCACTCCTAGCGGGGGCCATGAATGAGCAGTTC,2,1.275291e-05,Rare,Pre-ICI,Pre-existing,Shared_pre,SPE_1_14_SCR_A_FRESH_CD45pos_P14_T0_CD45pos_AAACCTGCACAGATTC-1,AAACCTGCACAGATTC
P14,P14_T0_CD45pos,SPE_1_14_SCR_A_FRESH_CD45pos_AAACCTGGTAGCTAAA,SPE_1_14_SCR_A_FRESH_CD45pos,P14_T0_CD45pos,TRAV30.TRAJ44.TRAC,CGTPTGTASKLTF,TGCGGCACCCCCACCGGCACTGCCAGTAAACTCACCTTT,TRBV6-6.NA.TRBJ2-7.TRBC2,CASHRDYYEQYF,TGTGCCTCCCACAGGGATTACTACGAGCAGTACTTC,TRAV30.TRAJ44.TRAC_TRBV6-6.NA.TRBJ2-7.TRBC2,TGCGGCACCCCCACCGGCACTGCCAGTAAACTCACCTTT_TGTGCCTCCCACAGGGATTACTACGAGCAGTACTTC,CGTPTGTASKLTF_CASHRDYYEQYF,TRAV30.TRAJ44.TRAC;TGCGGCACCCCCACCGGCACTGCCAGTAAACTCACCTTT_TRBV6-6.NA.TRBJ2-7.TRBC2;TGTGCCTCCCACAGGGATTACTACGAGCAGTACTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G12V,True,False,True,True,68,63.24,FEMALE,37,IIIb,5,4,CAPOX,TGTGCCTCCCACAGGGATTACTACGAGCAGTACTTC,GCCTCCCACAGGGATTACTACGAGCAGTAC,TGCGGCACCCCCACCGGCACTGCCAGTAAACTCACCTTT,CASHRDYYEQYF,CGTPTGTASKLTF,TRBV6-6.NA.TRBJ2-7.TRBC2,TRAV30.TRAJ44.TRAC,TRBV6-6_TRBJ2-7,TRBV6-6_TRBJ2-7_GCCTCCCACAGGGATTACTACGAGCAGTAC,1,6.376453e-06,Singlet,Pre-ICI,Pre-existing,Shared_pre,SPE_1_14_SCR_A_FRESH_CD45pos_P14_T0_CD45pos_AAACCTGGTAGCTAAA-1,AAACCTGGTAGCTAAA
P14,P14_T0_CD45pos,SPE_1_14_SCR_A_FRESH_CD45pos_AAACCTGTCAGGCCCA,SPE_1_14_SCR_A_FRESH_CD45pos,P14_T0_CD45pos,,,,TRBV24-1.TRBD2.TRBJ2-3.TRBC2,CATSDTKGTSGSTDTQYF,TGTGCCACCAGTGACACCAAAGGGACTAGCGGGAGCACAGATACGCAGTATTTT,NA_TRBV24-1.TRBD2.TRBJ2-3.TRBC2,NA_TGTGCCACCAGTGACACCAAAGGGACTAGCGGGAGCACAGATACGCAGTATTTT,NA_CATSDTKGTSGSTDTQYF,NA;NA_TRBV24-1.TRBD2.TRBJ2-3.TRBC2;TGTGCCACCAGTGACACCAAAGGGACTAGCGGGAGCACAGATACGCAGTATTTT,T0,SCR,MSS,CRC,Liver,PD,KRAS G12V,True,False,True,True,68,63.24,FEMALE,37,IIIb,5,4,CAPOX,TGTGCCACCAGTGACACCAAAGGGACTAGCGGGAGCACAGATACGCAGTATTTT,GCCACCAGTGACACCAAAGGGACTAGCGGGAGCACAGATACGCAGTAT,,CATSDTKGTSGSTDTQYF,,TRBV24-1.TRBD2.TRBJ2-3.TRBC2,,TRBV24-1_TRBJ2-3,TRBV24-1_TRBJ2-3_GCCACCAGTGACACCAAAGGGACTAGCGGGAGCACAGATACGCAGTAT,54,0.0003443285,Hyperexpanded,Pre-ICI,Lost,Lost,SPE_1_14_SCR_A_FRESH_CD45pos_P14_T0_CD45pos_AAACCTGTCAGGCCCA-1,AAACCTGTCAGGCCCA
P14,P14_T0_CD45pos,SPE_1_14_SCR_A_FRESH_CD45pos_AAACCTGTCAGTTGAC,SPE_1_14_SCR_A_FRESH_CD45pos,P14_T0_CD45pos,,,,TRBV18.NA.TRBJ2-1.TRBC2,CASSPPGSYNEQFF,TGTGCCAGCTCACCACCGGGCAGTTACAATGAGCAGTTCTTC,NA_TRBV18.NA.TRBJ2-1.TRBC2,NA_TGTGCCAGCTCACCACCGGGCAGTTACAATGAGCAGTTCTTC,NA_CASSPPGSYNEQFF,NA;NA_TRBV18.NA.TRBJ2-1.TRBC2;TGTGCCAGCTCACCACCGGGCAGTTACAATGAGCAGTTCTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G12V,True,False,True,True,68,63.24,FEMALE,37,IIIb,5,4,CAPOX,TGTGCCAGCTCACCACCGGGCAGTTACAATGAGCAGTTCTTC,GCCAGCTCACCACCGGGCAGTTACAATGAGCAGTTC,,CASSPPGSYNEQFF,,TRBV18.NA.TRBJ2-1.TRBC2,,TRBV18_TRBJ2-1,TRBV18_TRBJ2-1_GCCAGCTCACCACCGGGCAGTTACAATGAGCAGTTC,1,6.376453e-06,Singlet,Pre-ICI,Lost,Lost,SPE_1_14_SCR_A_FRESH_CD45pos_P14_T0_CD45pos_AAACCTGTCAGTTGAC-1,AAACCTGTCAGTTGAC
P14,P14_T0_CD45pos,SPE_1_14_SCR_A_FRESH_CD45pos_AAACCTGTCTGCCCTA,SPE_1_14_SCR_A_FRESH_CD45pos,P14_T0_CD45pos,TRAV9-2.TRAJ27.TRAC,CALRSTNAGKSTF,TGTGCTCTGAGGAGCACCAATGCAGGCAAATCAACCTTT,TRBV19.TRBD1.TRBJ2-7.TRBC2,CASSIGTGSYEQYF,TGTGCCAGTAGTATAGGGACAGGCAGCTACGAGCAGTACTTC,TRAV9-2.TRAJ27.TRAC_TRBV19.TRBD1.TRBJ2-7.TRBC2,TGTGCTCTGAGGAGCACCAATGCAGGCAAATCAACCTTT_TGTGCCAGTAGTATAGGGACAGGCAGCTACGAGCAGTACTTC,CALRSTNAGKSTF_CASSIGTGSYEQYF,TRAV9-2.TRAJ27.TRAC;TGTGCTCTGAGGAGCACCAATGCAGGCAAATCAACCTTT_TRBV19.TRBD1.TRBJ2-7.TRBC2;TGTGCCAGTAGTATAGGGACAGGCAGCTACGAGCAGTACTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G12V,True,False,True,True,68,63.24,FEMALE,37,IIIb,5,4,CAPOX,TGTGCCAGTAGTATAGGGACAGGCAGCTACGAGCAGTACTTC,GCCAGTAGTATAGGGACAGGCAGCTACGAGCAGTAC,TGTGCTCTGAGGAGCACCAATGCAGGCAAATCAACCTTT,CASSIGTGSYEQYF,CALRSTNAGKSTF,TRBV19.TRBD1.TRBJ2-7.TRBC2,TRAV9-2.TRAJ27.TRAC,TRBV19_TRBJ2-7,TRBV19_TRBJ2-7_GCCAGTAGTATAGGGACAGGCAGCTACGAGCAGTAC,1,6.376453e-06,Singlet,Pre-ICI,Lost,Lost,SPE_1_14_SCR_A_FRESH_CD45pos_P14_T0_CD45pos_AAACCTGTCTGCCCTA-1,AAACCTGTCTGCCCTA
P14,P14_T0_CD45pos,SPE_1_14_SCR_A_FRESH_CD45pos_AAACGGGCAAACTGCT,SPE_1_14_SCR_A_FRESH_CD45pos,P14_T0_CD45pos,,,,TRBV7-2.NA.TRBJ1-5.TRBC1,CASSLGGEAGQPQHF,TGTGCCAGCAGCTTAGGAGGCGAGGCAGGGCAGCCCCAGCATTTT,NA_TRBV7-2.NA.TRBJ1-5.TRBC1,NA_TGTGCCAGCAGCTTAGGAGGCGAGGCAGGGCAGCCCCAGCATTTT,NA_CASSLGGEAGQPQHF,NA;NA_TRBV7-2.NA.TRBJ1-5.TRBC1;TGTGCCAGCAGCTTAGGAGGCGAGGCAGGGCAGCCCCAGCATTTT,T0,SCR,MSS,CRC,Liver,PD,KRAS G12V,True,False,True,True,68,63.24,FEMALE,37,IIIb,5,4,CAPOX,TGTGCCAGCAGCTTAGGAGGCGAGGCAGGGCAGCCCCAGCATTTT,GCCAGCAGCTTAGGAGGCGAGGCAGGGCAGCCCCAGCAT,,CASSLGGEAGQPQHF,,TRBV7-2.NA.TRBJ1-5.TRBC1,,TRBV7-2_TRBJ1-5,TRBV7-2_TRBJ1-5_GCCAGCAGCTTAGGAGGCGAGGCAGGGCAGCCCCAGCAT,1,6.376453e-06,Singlet,Pre-ICI,Pre-existing,Shared_pre,SPE_1_14_SCR_A_FRESH_CD45pos_P14_T0_CD45pos_AAACGGGCAAACTGCT-1,AAACGGGCAAACTGCT


In [87]:
# Match the Serpentine TCR data with the integrated Serpentine GEX Phenotype data by cell barcode
serpentine_TCR_GEX <- serpentine_TCR %>% 
    left_join(
        serpentine_GEX %>% select('barcode', 'cell_type', 'lv1', 'CD4_CD8_assignment', 'annotation', 'lv2'),
        by = "barcode" 
    )
print(nrow(serpentine_TCR_GEX))
serpentine_TCR_GEX %>% head(5)

[1] 208530


patient,sample_id,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id,cloneSize,norm_cloneSize,cloneClass,ICI,presence_status,expansion_status,old_barcode,barcode_nt,cell_type,lv1,CD4_CD8_assignment,annotation,lv2
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACCTGAGTTAAGTG,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV10.TRAJ22.TRAC,CVVSLSGSARQLTF,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,TRBV4-1.NA.TRBJ2-5.TRBC2,CASSYGGFPETQYF,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,TRAV10.TRAJ22.TRAC_TRBV4-1.NA.TRBJ2-5.TRBC2,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,CVVSLSGSARQLTF_CASSYGGFPETQYF,TRAV10.TRAJ22.TRAC;TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TRBV4-1.NA.TRBJ2-5.TRBC2;TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,CASSYGGFPETQYF,CVVSLSGSARQLTF,TRBV4-1.NA.TRBJ2-5.TRBC2,TRAV10.TRAJ22.TRAC,TRBV4-1_TRBJ2-5,TRBV4-1_TRBJ2-5_GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,3,7.037298e-05,Rare,Pre-ICI,Pre-existing,Shared_pre,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGAGTTAAGTG-1,AAACCTGAGTTAAGTG,T cells,CD8,CD8,CD8 resident,CD8 resident (10)
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACCTGTCAAACCAC,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV12-3.TRAJ26.TRAC,CAMSLNNYGQNFVF,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,TRBV28.NA.TRBJ2-5.TRBC2,CASTGTGKLQETQYF,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,TRAV12-3.TRAJ26.TRAC_TRBV28.NA.TRBJ2-5.TRBC2,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,CAMSLNNYGQNFVF_CASTGTGKLQETQYF,TRAV12-3.TRAJ26.TRAC;TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TRBV28.NA.TRBJ2-5.TRBC2;TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,CASTGTGKLQETQYF,CAMSLNNYGQNFVF,TRBV28.NA.TRBJ2-5.TRBC2,TRAV12-3.TRAJ26.TRAC,TRBV28_TRBJ2-5,TRBV28_TRBJ2-5_GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,2,4.691532e-05,Rare,Pre-ICI,Lost,Lost,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCAAACCAC-1,AAACCTGTCAAACCAC,,,,,
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACCTGTCCGCAGTG,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV3.TRAJ5.TRAC,CAVREDTGRRALTF,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,TRBV29-1.NA.TRBJ1-1.TRBC1,CSVPLGAGEAFF,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,TRAV3.TRAJ5.TRAC_TRBV29-1.NA.TRBJ1-1.TRBC1,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,CAVREDTGRRALTF_CSVPLGAGEAFF,TRAV3.TRAJ5.TRAC;TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TRBV29-1.NA.TRBJ1-1.TRBC1;TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,CSVPLGAGEAFF,CAVREDTGRRALTF,TRBV29-1.NA.TRBJ1-1.TRBC1,TRAV3.TRAJ5.TRAC,TRBV29-1_TRBJ1-1,TRBV29-1_TRBJ1-1_AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,18,0.0004222379,Medium,Pre-ICI,Pre-existing,Shared_pre,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCCGCAGTG-1,AAACCTGTCCGCAGTG,T cells,CD8,CD8,T proliferating,T proliferating (28)
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACGGGAGCGTTTAC,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV23/DV6.TRAJ57.TRAC,CAVHQGGSEKLVF,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,TRBV30.NA.TRBJ1-4.TRBC1,CAWSVGGVDEKLFF,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,TRAV23/DV6.TRAJ57.TRAC_TRBV30.NA.TRBJ1-4.TRBC1,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,CAVHQGGSEKLVF_CAWSVGGVDEKLFF,TRAV23/DV6.TRAJ57.TRAC;TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TRBV30.NA.TRBJ1-4.TRBC1;TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,CAWSVGGVDEKLFF,CAVHQGGSEKLVF,TRBV30.NA.TRBJ1-4.TRBC1,TRAV23/DV6.TRAJ57.TRAC,TRBV30_TRBJ1-4,TRBV30_TRBJ1-4_GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT,1,2.345766e-05,Singlet,Pre-ICI,Lost,Lost,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGCGTTTAC-1,AAACGGGAGCGTTTAC,T cells,T Naive/CM,CD4,CD4 central memory,CD4 central memory (8)
P01,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_AAACGGGAGTATTGGA,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV24.TRAJ15.TRAC,CALKTALIF,TGTGCTCTCAAAACTGCTCTGATCTTT,TRBV9.NA.TRBJ2-5.TRBC2,CASSVGGGSQETQYF,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,TRAV24.TRAJ15.TRAC_TRBV9.NA.TRBJ2-5.TRBC2,TGTGCTCTCAAAACTGCTCTGATCTTT_TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,CALKTALIF_CASSVGGGSQETQYF,TRAV24.TRAJ15.TRAC;TGTGCTCTCAAAACTGCTCTGATCTTT_TRBV9.NA.TRBJ2-5.TRBC2;TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,T0,SCR,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC,TGTGCTCTCAAAACTGCTCTGATCTTT,CASSVGGGSQETQYF,CALKTALIF,TRBV9.NA.TRBJ2-5.TRBC2,TRAV24.TRAJ15.TRAC,TRBV9_TRBJ2-5,TRBV9_TRBJ2-5_GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC,2,4.691532e-05,Rare,Pre-ICI,Pre-existing,Shared_pre,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGTATTGGA-1,AAACGGGAGTATTGGA,T cells,CD8,CD8,CD8 activated,CD8 activated (24)


### Save the integrated Serpentine GEX data matched with the Serpentine TCR data

In [98]:
# Save the GEX data updated with TCR data
qsave(serpentine_TCR_GEX, file = file.path(root_dir, "out", "data", "SERP_TCR-GEX_11-2025_v2.qs"))

### Convert the integrated TCR-GEX Serpentine data to wide format

The same clonotype might present diverse phenotypes, therefore for clonotype-level data we assign the annotation to that clonotype that is most abundant (mode-based approach). 

In [99]:
# Read matched data
serpentine_TCR_GEX <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR-GEX_11-2025_v2.qs"))

In [103]:
# Mode function to select the most abundant cell type from each clonotype 
get_mode <- function(x) {
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
}

In [105]:
# Convert the df to wide format
serpentine_TCR_GEX_wide <- serpentine_TCR_GEX %>%
    
    # have a single row per unique clonotype at the sample-level
    distinct(clonotype_id, patient, timepoint, sample_id, ID, cloneSize,
            MS_status,tumor_type,met_loc,response,tumor_growth, presence_status,
            cell_type, CD4_CD8_assignment, lv1, annotation, lv2) %>%
    group_by(clonotype_id, patient, ID, timepoint, met_loc, response, tumor_growth, presence_status) %>% 
    summarise(
        cloneSize = unique(cloneSize),
        cell_type = list(cell_type), CD4_CD8_assignment = list(CD4_CD8_assignment), lv1 = list(lv1), annotation = list(annotation), lv2 = list(lv2),
    ) %>%

    # Get annotation mode
    group_by(clonotype_id, patient, met_loc, response, tumor_growth, num_met_sites, presence_status, timepoint) %>% 
    summarise(
        cloneSize = sum(cloneSize),
        cell_type = get_mode(flatten_chr(cell_type)), CD4_CD8_assignment = get_mode(flatten_chr(CD4_CD8_assignment)), lv1 = get_mode(flatten_chr(lv1)), 
        annotation = get_mode(flatten_chr(annotation)), lv2 = get_mode(flatten_chr(lv2))
    ) %>%

    # Convert to wide format
    pivot_wider(
        names_from = timepoint,
        values_from = c(cloneSize, cell_type, CD4_CD8_assignment, lv1, annotation, lv2),
        values_fn = list(
            cloneSize = sum, 
            cell_type = ~ unique(.x), 
            CD4_CD8_assignment = ~ unique(.x),
            lv1 = ~ unique(.x), 
            annotation = ~ unique(.x), 
            lv2 = ~ unique(.x)
        ))%>%
    mutate(
        cloneSize_T0 = replace_na(cloneSize_T0, 0),
        cloneSize_T1 = replace_na(cloneSize_T1, 0),
        cloneSize_EOT = replace_na(cloneSize_EOT, 0) 
    ) %>%

    # Normalize counts per timepoint-patient (same as library for most cases but technical replicates)
    group_by(patient) %>%
    mutate(
        norm_cloneSize_T0 = cloneSize_T0 / sum(cloneSize_T0, na.rm = TRUE),
        norm_cloneSize_T1 = cloneSize_T1 / sum(cloneSize_T1, na.rm = TRUE),
        norm_cloneSize_EOT = cloneSize_EOT / sum(cloneSize_EOT, na.rm = TRUE)
    ) %>%
    ungroup() %>%

    # Re-classify clonotypes into clonal classes
    mutate(
        # T0 (SCR)
        cloneClass_T0 = case_when(
        cloneSize_T0 == 1 ~ "Singlet", 
        cloneSize_T0 > 1 ~ cut(
            log10(norm_cloneSize_T0),
            breaks = quantile(log10(norm_cloneSize_T0[cloneSize_T0 > 1]), 
                              probs = c(0,0.5,1), na.rm = TRUE),
            labels = c("Lowly Expanded (< Median)", "Highly Expanded (> Median)")     
        )),
        # T1 (C02)
        cloneClass_T1 = case_when(
        cloneSize_T1 == 1 ~ "Singlet", 
        cloneSize_T1 > 1 ~ cut(
            log10(norm_cloneSize_T1),
            breaks = quantile(log10(norm_cloneSize_T1[cloneSize_T1 > 1]), 
                              probs = c(0,0.5,1), na.rm = TRUE),
            labels = c("Lowly Expanded (< Median)", "Highly Expanded (> Median)")     
        )),
        # EOT
        cloneClass_EOT = case_when(
        cloneSize_EOT == 1 ~ "Singlet", 
        cloneSize_EOT > 1 ~ cut(
            log10(norm_cloneSize_EOT),
            breaks = quantile(log10(norm_cloneSize_EOT[cloneSize_EOT > 1]), 
                              probs = c(0,0.5,1), na.rm = TRUE),
            labels = c("Lowly Expanded (< Median)", "Highly Expanded (> Median)")     
        ))
    )

nrow(serpentine_TCR_GEX)
nrow(serpentine_TCR_GEX_wide)
head(serpentine_TCR_GEX_wide)

[1m[22m`summarise()` has grouped output by 'clonotype_id', 'patient', 'ID', 'timepoint', 'met_loc', 'response', 'tumor_growth',
'num_met_sites'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'clonotype_id', 'patient', 'met_loc', 'response', 'tumor_growth', 'num_met_sites',
'presence_status'. You can override using the `.groups` argument.


clonotype_id,patient,met_loc,response,tumor_growth,num_met_sites,presence_status,cloneSize_T0,cloneSize_T1,cloneSize_EOT,cell_type_T0,cell_type_T1,cell_type_EOT,CD4_CD8_assignment_T0,CD4_CD8_assignment_T1,CD4_CD8_assignment_EOT,lv1_T0,lv1_T1,lv1_EOT,annotation_T0,annotation_T1,annotation_EOT,lv2_T0,lv2_T1,lv2_EOT,norm_cloneSize_T0,norm_cloneSize_T1,norm_cloneSize_EOT,cloneClass_T0,cloneClass_T1,cloneClass_EOT
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>
NA_,P01,Liver,PD,15.0,7.0,Pre-existing,54,95,0,T cells,T cells,,CD4,CD4,,T Naive/CM,CD8,,CD4 central memory,CD4 follicular helper,,,,,0.01920341,0.017301038,,Hyperexpanded,Hyperexpanded,
NA_,P02,Liver,PD,88.0,2.0,Pre-existing,216,49,0,T cells,T cells,,CD4,CD4,,CD8,CD8,,CD4 central memory,CD4 central memory,,T Naive (3),MAIT-17 (11),,0.02997086,0.009660883,,Hyperexpanded,Large,
NA_,P03,Liver,PD,22.0,3.0,Pre-existing,9,39,108,T cells,T cells,T cells,CD4,CD4,CD4,CD4,CD8,CD8,MAIT-17,CD4 central memory,CD4 follicular helper,MAIT-17 (11),Tregs (1),CD8 activated (24),0.02158273,0.014722537,0.01775148,Medium,Large,Hyperexpanded
NA_,P04,Lung,PD,45.0,,Pre-existing,268,73,0,T cells,T cells,,CD8,CD8,,CD8,CD8,,CD4 follicular helper,Tregs,,NK-Tgd (4),T Naive (3),,0.02239679,0.011263694,,Hyperexpanded,Hyperexpanded,
NA_,P05,Nodul,PD,,,Lost,21,0,0,T cells,,,CD4,,,CD8,,,CD8 resident activated,,,CD8 resident activated (25),,,0.02692308,,,Large,,
NA_,P06,Nodul,PD,,,Lost,74,0,0,T cells,,,CD4,,,CD8,,,CD4 central memory,,,,,,0.04695431,,,Hyperexpanded,,


In [106]:
# CHECK THAT THE SUM OF CLONAL SIZES EQUALS THE TOTAL NUMBER OF CELLS
( sum(serpentine_TCR_GEX_wide$cloneSize_T0) + sum(serpentine_TCR_GEX_wide$cloneSize_T1) + sum(serpentine_TCR_GEX_wide$cloneSize_EOT) )

In [107]:
( sum(serpentine_TCR_GEX_wide$norm_cloneSize_T0, na.rm = TRUE) + sum(serpentine_TCR_GEX_wide$norm_cloneSize_T1, na.rm = TRUE) + sum(serpentine_TCR_GEX_wide$norm_cloneSize_EOT, na.rm = TRUE) ) / 46

In [108]:
# Check there are no duplicates
as.data.table(serpentine_TCR_GEX_wide)[, .N, by = .(clonotype_id, patient)][N > 1] 

clonotype_id,patient,N
<chr>,<chr>,<int>


### Remove clonotypes without a beta chain

In [109]:
serpentine_TCR_GEX_wide %>%
    filter(clonotype_id == "NA_") %>%
    head() # these correspond to clonotypes without beta chain, we can filter them out (25)

clonotype_id,patient,met_loc,response,tumor_growth,num_met_sites,presence_status,cloneSize_T0,cloneSize_T1,cloneSize_EOT,cell_type_T0,cell_type_T1,cell_type_EOT,CD4_CD8_assignment_T0,CD4_CD8_assignment_T1,CD4_CD8_assignment_EOT,lv1_T0,lv1_T1,lv1_EOT,annotation_T0,annotation_T1,annotation_EOT,lv2_T0,lv2_T1,lv2_EOT,norm_cloneSize_T0,norm_cloneSize_T1,norm_cloneSize_EOT,cloneClass_T0,cloneClass_T1,cloneClass_EOT
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>
NA_,P01,Liver,PD,15.0,7.0,Pre-existing,54,95,0,T cells,T cells,,CD4,CD4,,T Naive/CM,CD8,,CD4 central memory,CD4 follicular helper,,,,,0.01920341,0.017301038,,Hyperexpanded,Hyperexpanded,
NA_,P02,Liver,PD,88.0,2.0,Pre-existing,216,49,0,T cells,T cells,,CD4,CD4,,CD8,CD8,,CD4 central memory,CD4 central memory,,T Naive (3),MAIT-17 (11),,0.02997086,0.009660883,,Hyperexpanded,Large,
NA_,P03,Liver,PD,22.0,3.0,Pre-existing,9,39,108,T cells,T cells,T cells,CD4,CD4,CD4,CD4,CD8,CD8,MAIT-17,CD4 central memory,CD4 follicular helper,MAIT-17 (11),Tregs (1),CD8 activated (24),0.02158273,0.014722537,0.01775148,Medium,Large,Hyperexpanded
NA_,P04,Lung,PD,45.0,,Pre-existing,268,73,0,T cells,T cells,,CD8,CD8,,CD8,CD8,,CD4 follicular helper,Tregs,,NK-Tgd (4),T Naive (3),,0.02239679,0.011263694,,Hyperexpanded,Hyperexpanded,
NA_,P05,Nodul,PD,,,Lost,21,0,0,T cells,,,CD4,,,CD8,,,CD8 resident activated,,,CD8 resident activated (25),,,0.02692308,,,Large,,
NA_,P06,Nodul,PD,,,Lost,74,0,0,T cells,,,CD4,,,CD8,,,CD4 central memory,,,,,,0.04695431,,,Hyperexpanded,,


In [110]:
# Remove NA_clonotype_ids 
serpentine_TCR_GEX_wide <- serpentine_TCR_GEX_wide %>%
    filter(clonotype_id != "NA_")
nrow(serpentine_TCR_GEX_wide)

### Renormalize again after removing clonotypes without beta chain

In [111]:
serpentine_TCR_GEX_wide <- serpentine_TCR_GEX_wide %>%
    group_by(patient) %>%
    mutate(
        norm_cloneSize_T0 = cloneSize_T0 / sum(cloneSize_T0, na.rm = TRUE),
        norm_cloneSize_T1 = cloneSize_T1 / sum(cloneSize_T1, na.rm = TRUE),
        norm_cloneSize_EOT = cloneSize_EOT / sum(cloneSize_EOT, na.rm = TRUE)
    )

In [112]:
# Check sum of normalized values is 1
serpentine_TCR_GEX_wide %>%
    group_by(patient) %>%
    summarise(sum_T0 = sum(norm_cloneSize_T0), sum_T1 = sum(norm_cloneSize_T1), sum_EOT = sum(norm_cloneSize_EOT))

patient,sum_T0,sum_T1,sum_EOT
<chr>,<dbl>,<dbl>,<dbl>
P01,1.0,1.0,
P02,1.0,1.0,
P03,1.0,1.0,1.0
P04,1.0,1.0,
P05,1.0,,
P06,1.0,,
P07,1.0,1.0,1.0
P08,1.0,1.0,
P09,1.0,1.0,
P10,1.0,1.0,


In [113]:
table(serpentine_TCR_GEX_wide$CD4_CD8_assignment_T1)


  CD4   CD8 
30330 12140 

In [114]:
table(serpentine_TCR_GEX_wide$CD4_CD8_assignment_T0)


  CD4   CD8 
30751 10397 

In [116]:
# Check there are no duplicates
as.data.table(serpentine_TCR_GEX_wide)[, .N, by = .(clonotype_id, patient)][N > 1] 

clonotype_id,patient,N
<chr>,<chr>,<int>


In [117]:
# Save the TCR-GEX matched data in wide format
qsave(serpentine_TCR_GEX_wide, file = file.path(root_dir, "out", "data", "SERP_TCR-GEX_wide_11-2025_v2.qs"))

In [118]:
# Read the data
serpentine_TCR_GEX_wide <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR-GEX_wide_11-2025_v2.qs"))