# Serpentine: Tumor 10x scTCR-seq Data Preprocessing

### Set Up Environment

In [4]:
# Load project configuration
setwd("/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR")
options(repr.matrix.max.rows=100, repr.matrix.max.cols=100)
source("code/helper/Config.R", echo = FALSE)

Project configured successfully. Root directory set to: /scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR 


## Read 10x contig files for each VDJ-T library

Read the filtered annotated contig files from each sample.

In [5]:
# Define project ids to be read
project_ids <- list("04_05", "06_07", "08_09", "10_11", "12_13", "14_15", "16_17", "18_19", "20_21", 
               "22_23", "24_25", "26_27", "28_29", "30_31", "32_33", "34_35", "36_37", "38_39", "40_41", 
               "42_43", "44_45", "46_47", "48_49", "50_51", "52_53", "54_55", "56_57", "58_59", "60_61",
               "62_63", "64_65", "66_67", "68_69", "70_71", "72_73", "74_75", "76_77", "78_79", "80_81", "82_83", "84_85") # Second data freeze

projects <- paste("SERPENTINE", project_ids, sep = "_")

print(paste("Total number of projects:", length(projects)))

[1] "Total number of projects: 41"


In [3]:
# Read the contig files
contig_list <- list()
sample_id_list <- list()
n = 0

# Iterate through each project
for (project in projects) {
    message(paste("Processing project", project, "..."))
      
    # Get the jobs dir
    jobs_dir <- file.path(shared_dir, project, "jobs")
    
    # Check if the jobs directory exists
    if (!dir.exists(jobs_dir)) {
        warning(paste("Jobs directory not found for project:", project))
        return(NULL)
    }
    
    # List all subdirectories in the jobs directory
    spe_folders <- list.dirs(jobs_dir, full.names = TRUE, recursive = FALSE)
    
    # Filter for directories that start with "SPE", "ESP2 or "01"
    folder_prefix <- ifelse(project == "SERPENTINE_04_05", "^01", "^SPE")
    folder_prefix <- ifelse(project %in% c("SERPENTINE_18_19", "SERPENTINE_26_27"), "^ESP", folder_prefix)
    spe_folders <- spe_folders[grepl(folder_prefix, basename(spe_folders))]
    
    if (length(spe_folders) == 0) {
        warning(paste("No SPE folders found for project:", project))
        return(NULL)
    }
    
    # Process each SPE folder
    for (spe_folder in spe_folders){

        # Construct the path to the contigs csv file 
        if (basename(spe_folder) == "SPE_1_03_C02_A_FRESH"){
            contigs_file <- file.path(spe_folder, "SPE_1_03_C02_A_FRESH/outs/per_sample_outs/SPE_1_03_C02_A_FRESH/vdj_t", "filtered_contig_annotations.csv")
        }
        else {
            contigs_file <- file.path(spe_folder, basename(spe_folder), "outs", "per_sample_outs", basename(spe_folder), "vdj_t", "filtered_contig_annotations.csv")
        }
        
        # Check if the file exists
        if (file.exists(contigs_file)) {
        
            # Read the sample contigs file
            if (file.size(contigs_file) != 0) {
                sample <- fread(contigs_file)
            
                # Add sample name to the sample ids list
                sample_id_list <- append(sample_id_list, list(basename(spe_folder)))
            
                # Append the contigs to a vector of all the sample contigs
                contig_list <- append(contig_list, list(sample))
                n <- n+1

                message(paste("    Sample", basename(spe_folder), "processed."))
            }
        
        } else {
            warning(paste("File not found:", contigs_file))
        }
    }
    message(paste("Project", project, "processed!"))
}

Processing project SERPENTINE_04_05 ...

    Sample 01_CD45 processed.

    Sample 01_total processed.

Project SERPENTINE_04_05 processed!

Processing project SERPENTINE_06_07 ...

    Sample SPE_1_02_SCR_A_FRESH_1 processed.

    Sample SPE_1_02_SCR_A_FRESH_2 processed.

Project SERPENTINE_06_07 processed!

Processing project SERPENTINE_08_09 ...

    Sample SPE_1_01_C2D1_A_FRESH_1 processed.

    Sample SPE_1_01_C2D1_A_FRESH_2 processed.

Project SERPENTINE_08_09 processed!

Processing project SERPENTINE_10_11 ...

    Sample SPE_1_02_C02_A_FRESH_1 processed.

    Sample SPE_1_02_C02_A_FRESH_2 processed.

    Sample SPE_1_03_SCR_A_FRESH processed.

Project SERPENTINE_10_11 processed!

Processing project SERPENTINE_12_13 ...

    Sample SPE_1_04_SCR_A_FRESH_1 processed.

    Sample SPE_1_04_SCR_A_FRESH_2 processed.

Project SERPENTINE_12_13 processed!

Processing project SERPENTINE_14_15 ...

    Sample SPE_1_03_C02_A_FRESH processed.

Project SERPENTINE_14_15 processed!

Processing 

In [4]:
# Get total number of samples
print(paste("Total number of samples:", length(contig_list)))

[1] "Total number of samples: 59"


In [8]:
# Save the sample-combined contigs lists
qsave(contig_list, file = file.path(root_dir, "out", "data", "SERP_TCR_contigs_list_09-2025_v2.qs"), nthreads=getDTthreads())

In [9]:
# Read the sample-combined contigs list
contig_list <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR_contigs_list_09-2025_v2.qs"), nthreads=getDTthreads())

### Format sample ids

Format the sample names for simplicity: P[patient_num]_T[timepoint_num]_S[sample_num]. Ex: P01_T0_S1
\
\
Timepoint information:
- T0: Screening timepoint (SCR) (before ICI)
- T1: First timepoint after ICI (C02)
- T2: End of treatment (EOT) timepoint in non-responding patients

In [7]:
# Save original sample ids in a new column
sample_id_list[1] <-  "SPE_1_01_SCR_A_FRESH_CD45"
sample_id_list[2] <- "SPE_1_01_SCR_A_FRESH"
contig_list <- Map(function(contig_df, sample_id) {
    contig_df$replicate <- sample_id 
    return(contig_df)
}, contig_list, sample_id_list)
head(contig_list[[1]])

barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,fwr1,fwr1_nt,cdr1,cdr1_nt,fwr2,fwr2_nt,cdr2,cdr2_nt,fwr3,fwr3_nt,cdr3,cdr3_nt,fwr4,fwr4_nt,reads,umis,raw_clonotype_id,raw_consensus_id,exact_subclonotype_id,replicate
<chr>,<lgl>,<chr>,<lgl>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<chr>,<int>,<chr>
AAACCTGAGTTAAGTG-1,True,AAACCTGAGTTAAGTG-1_contig_1,True,481,TRA,TRAV10,,TRAJ22,TRAC,True,True,KNQVEQSPQSLIILEGKNCTLQCNYT,AAAAACCAAGTGGAGCAGAGTCCTCAGTCCCTGATCATCCTGGAGGGAAAGAACTGCACTCTTCAATGCAATTATACA,VSPFSN,GTGAGCCCCTTCAGCAAC,LRWYKQDTGRGPVSLTI,TTAAGGTGGTATAAGCAAGATACTGGGAGAGGTCCTGTTTCCCTGACAATC,MTFSENT,ATGACTTTCAGTGAGAACACA,KSNGRYTATLDADTKQSSLHITASQLSDSASYI,AAGTCGAACGGAAGATATACAGCAACTCTGGATGCAGACACAAAGCAAAGCTCTCTGCACATCACAGCCTCCCAGCTCAGCGATTCAGCCTCCTACATC,CVVSLSGSARQLTF,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,GSGTQLTVLP,GGATCTGGGACACAATTGACTGTTTTACCTG,2846,9,clonotype53,clonotype53_consensus_3,1,SPE_1_01_SCR_A_FRESH_CD45
AAACCTGAGTTAAGTG-1,True,AAACCTGAGTTAAGTG-1_contig_2,True,500,TRA,TRAV13-2,,TRAJ39,TRAC,True,True,GESVGLHLPTLSVQEGDNSIINCAYS,GGAGAGAGTGTGGGGCTGCATCTTCCTACCCTGAGTGTCCAGGAGGGTGACAACTCTATTATCAACTGTGCTTATTCA,NSASDY,AACAGCGCCTCAGACTAC,FIWYKQESGKGPQFIID,TTCATTTGGTACAAGCAAGAATCTGGAAAAGGTCCTCAGTTCATTATAGAC,IRSNMDK,ATTCGTTCAAATATGGACAAA,RQGQRVTVLLNKTVKHLSLQIAATQPGDSAVYF,AGGCAAGGCCAAAGAGTCACCGTTTTATTGAATAAGACAGTGAAACATCTCTCTCTGCAAATTGCAGCTACTCAACCTGGAGACTCAGCTGTCTACTTT,CAEMAYMLTF,TGTGCAGAGATGGCCTATATGCTCACCTTT,GGGTRLMVKP,GGAGGGGGAACAAGGTTAATGGTCAAACCCC,939,7,clonotype53,clonotype53_consensus_2,1,SPE_1_01_SCR_A_FRESH_CD45
AAACCTGAGTTAAGTG-1,True,AAACCTGAGTTAAGTG-1_contig_3,True,478,TRB,TRBV4-1,,TRBJ2-5,TRBC2,True,True,DTEVTQTPKHLVMGMTNKKSLKCEQH,GACACTGAAGTTACCCAGACACCAAAACACCTGGTCATGGGAATGACAAATAAGAAGTCTTTGAAATGTGAACAACAT,MGHRA,ATGGGGCACAGGGCT,MYWYKQKAKKPPELMFV,ATGTATTGGTACAAGCAGAAAGCTAAGAAGCCACCGGAGCTCATGTTTGTC,YSYEKL,TACAGCTATGAGAAACTC,SINESVPSRFSPECPNSSLLNLHLHALQPEDSALYL,TCTATAAATGAAAGTGTGCCAAGTCGCTTCTCACCTGAATGCCCCAACAGCTCTCTCTTAAACCTTCACCTACACGCCCTGCAGCCAGAAGACTCAGCCCTGTATCTC,CASSYGGFPETQYF,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,GPGTRLLVL,GGGCCAGGCACGCGGCTCCTGGTGCTCG,7664,23,clonotype53,clonotype53_consensus_1,1,SPE_1_01_SCR_A_FRESH_CD45
AAACCTGTCAAACCAC-1,True,AAACCTGTCAAACCAC-1_contig_1,True,472,TRB,TRBV28,,TRBJ2-5,TRBC2,True,True,DVKVTQSSRYLVKRTGEKVFLECVQD,GATGTGAAAGTAACCCAGAGCTCGAGATATCTAGTCAAAAGGACGGGAGAGAAAGTTTTTCTGGAATGTGTCCAGGAT,MDHEN,ATGGACCATGAAAAT,MFWYRQDPGLGLRLIYF,ATGTTCTGGTATCGACAAGACCCAGGTCTGGGGCTACGGCTGATCTATTTC,SYDVKM,TCATATGATGTTAAAATG,KEKGDIPEGYSVSREKKERFSLILESASTNQTSMYL,AAAGAAAAAGGAGATATTCCTGAGGGGTACAGTGTCTCTAGAGAGAAGAAGGAGCGCTTCTCCCTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTC,CASTGTGKLQETQYF,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,GPGTRLLVL,GGGCCAGGCACGCGGCTCCTGGTGCTCG,2994,8,clonotype147,clonotype147_consensus_1,1,SPE_1_01_SCR_A_FRESH_CD45
AAACCTGTCAAACCAC-1,True,AAACCTGTCAAACCAC-1_contig_2,True,501,TRA,TRAV12-3,,TRAJ26,TRAC,True,True,QKEVEQDPGPLSVPEGAIVSLNCTYS,CAGAAGGAGGTGGAGCAGGATCCTGGACCACTCAGTGTTCCAGAGGGAGCCATTGTTTCTCTCAACTGCACTTACAGC,NSAFQY,AACAGTGCTTTTCAATAC,FMWYRQYSRKGPELLMY,TTCATGTGGTACAGACAGTATTCCAGAAAAGGCCCTGAGTTGCTGATGTAC,TYSSGN,ACATACTCCAGTGGTAAC,KEDGRFTAQVDKSSKYISLFIRDSQPSDSATYL,AAAGAAGATGGAAGGTTTACAGCACAGGTCGATAAATCCAGCAAGTATATCTCCTTGTTCATCAGAGACTCACAGCCCAGTGATTCAGCCACCTACCTC,CAMSLNNYGQNFVF,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,GPGTRLSVLP,GGTCCCGGAACCAGATTGTCCGTGCTGCCCT,1185,5,clonotype147,clonotype147_consensus_2,1,SPE_1_01_SCR_A_FRESH_CD45
AAACCTGTCCGCAGTG-1,True,AAACCTGTCCGCAGTG-1_contig_1,True,618,TRA,TRAV3,,TRAJ5,TRAC,True,True,AQSVAQPEDQVNVAEGNPLTVKCTYS,GCTCAGTCAGTGGCTCAGCCGGAAGATCAGGTCAACGTTGCTGAAGGGAATCCTCTGACTGTGAAATGCACCTATTCA,VSGNPY,GTCTCTGGAAACCCTTAT,LFWYVQYPNRGLQFLLK,CTTTTTTGGTATGTTCAATACCCCAACCGAGGCCTCCAGTTCCTTCTGAAA,YITGDNLV,TACATCACAGGGGATAACCTGGTT,KGSYGFEAEFNKSQTSFHLKKPSALVSDSALYF,AAAGGCAGCTATGGCTTTGAAGCTGAATTTAACAAGAGCCAAACCTCCTTCCACCTGAAGAAACCATCTGCCCTTGTGAGCGACTCCGCTTTGTACTTC,CAVREDTGRRALTF,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,GSGTRLQVQP,GGGAGTGGAACAAGACTCCAAGTGCAACCAA,1896,10,clonotype9,clonotype9_consensus_2,1,SPE_1_01_SCR_A_FRESH_CD45


In [8]:
# Rename sample ids
mod_ids <- sapply(sample_id_list[c(3:length(sample_id_list))], function(x) {
    split_id <- strsplit(x, "_")[[1]]
    mod_id <- c(ifelse(split_id[3] == "8", "08", split_id[3]),                                 # change patient 8 name from P8 to P08
                ifelse(split_id[4] == "SCR", "T0",                                             # change 1st timepoint (SCR) to T0
                ifelse(split_id[4] %in% c("C2", "C02", "C2D1"), "T1",                          # change 2nd timepoint to T1
                ifelse(split_id[4] == "EOT", "EOT", split_id[4]))), tail(split_id, n = 1)) %>% paste(collapse = "_")
    mod_id <- paste0("P", mod_id) 
    mod_id <- gsub("_FRESH$", "_1", mod_id)  # Remove sample specific suffixes to sample count (1, 2)
    mod_id <- gsub("_GEX1$", "_1", mod_id)
    mod_id <- gsub("_GEX2$", "_2", mod_id)
    mod_id <- gsub("_GEX$", "_1", mod_id)
    mod_id <- gsub("_1$", "_S1", mod_id)
    mod_id <- gsub("_2$", "_S2", mod_id)
    return(mod_id)
})
new_ids <- c("P01_T0_CD45pos", "P01_T0_total", mod_ids) 
print(sort(new_ids))

 [1] "P01_T0_CD45pos"  "P01_T0_total"    "P01_T1_S1"       "P01_T1_S2"      
 [5] "P02_T0_S1"       "P02_T0_S2"       "P02_T1_S1"       "P02_T1_S2"      
 [9] "P03_EOT_CD45pos" "P03_T0_S1"       "P03_T1_S1"       "P04_T0_S1"      
[13] "P04_T0_S2"       "P04_T1_S1"       "P04_T1_S2"       "P05_T0_S1"      
[17] "P06_T0_CD45pos"  "P07_EOT_S1"      "P07_EOT_S2"      "P07_T0_CD45pos" 
[21] "P07_T1_S1"       "P07_T1_S2"       "P08_T0_S1"       "P08_T1_S1"      
[25] "P08_T1_S2"       "P09_T0_S1"       "P09_T1_CD45pos"  "P10_T0_CD45pos" 
[29] "P10_T1_CD45pos"  "P14_T0_CD45pos"  "P14_T1_CD45pos"  "P15_T0_S1"      
[33] "P15_T0_S2"       "P15_T1_CD45pos"  "P16_T0_S1"       "P16_T0_S2"      
[37] "P16_T1_CD45pos"  "P17_T0_S1"       "P17_T1_S1"       "P17_T1_S2"      
[41] "P18_T0_S1"       "P18_T1_S1"       "P19_T1_S1"       "P19_T1_S2"      
[45] "P20_T0_S1"       "P20_T1_S1"       "P26_T0_S1"       "P26_T1_S1"      
[49] "P29_T0_S1"       "P29_T1_S1"       "P31_T0_S1"       "P31_T1_S1"      

### Combine the list of contigs into individual clones

Here we join the alpha and beta chain contigs for each individual cell.

In [18]:
combined_TCR <- scRepertoire::combineTCR(contig_list,
                           samples = unlist(sample_id_list),
                           ID = new_ids,
                           removeNA = FALSE,
                           removeMulti = FALSE,
                           filterMulti = TRUE,
                           filterNonproductive = FALSE)
names(combined_TCR) <- new_ids

In [14]:
# Save combined TCR object
qsave(combined_TCR, file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_09-2025_v2.qs"))

In [16]:
# Read combined TCR object
combined_TCR <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_09-2025_v2.qs"))

In [19]:
# count number of cells per each sample and the total

# Count rows in each data frame
rows_per_df <- sapply(combined_TCR, nrow)

# Convert to a tidy data frame
rows_df <- data.frame(
    df_name = names(rows_per_df),
    n_rows = rows_per_df
)

rows_df

# Total number of rows across all data frames
total_rows <- sum(rows_per_df)
total_rows #208530

Unnamed: 0_level_0,df_name,n_rows
Unnamed: 0_level_1,<chr>,<int>
P01_T0_CD45pos,P01_T0_CD45pos,2545
P01_T0_total,P01_T0_total,267
P02_T0_S1,P02_T0_S1,4022
P02_T0_S2,P02_T0_S2,3185
P01_T1_S1,P01_T1_S1,2855
P01_T1_S2,P01_T1_S2,2636
P02_T1_S1,P02_T1_S1,2739
P02_T1_S2,P02_T1_S2,2333
P03_T0_S1,P03_T0_S1,417
P04_T0_S1,P04_T0_S1,6001


### Add metadata information

Add metadata information at the cell-level.

In [19]:
# Patient
patients <- sapply(new_ids, function(x) {
    p <- strsplit(x, "_")[[1]][1]
    return(p)
})
patients

In [20]:
# Assign metadata information

# Timepoint
timepoints <- sapply(new_ids, function(x) {
    tp <- strsplit(x, "_")[[1]][2]
    return(tp)
})

# Assessment point
timepoint_to_assess <- c(T0 = "SCR", T1 = "C02", EOT = "EOT")
assess_point <- sapply(timepoints, function(tp) timepoint_to_assess[tp])
names(assess_point) <- new_ids
                       
# Patient
patients <- sapply(new_ids, function(x) {
    p <- strsplit(x, "_")[[1]][1]
    return(p)
})

# MS Status
MS_statuses <- sapply(patients, function(p) {
    MS <- ifelse(p %in% c("P04", "P13", "P16", "P18", "P24", "P28"), "MSI", "MSS")
    return(MS)
})

# Tumor type
tumor_types <- sapply(patients, function(p) {
    tumors <- ifelse(p %in% c("P04", "P06", "P07", "P08", "P09", "P11", "P12"), "EC", "CRC") # No data for P06, 11, 12
    return(tumors)
})

# Response status
responses <- sapply(patients, function(p) {
    resp <- ifelse(p %in% c("P08", "P15", "P18","P27","P28", "P29", "P31", "P34", "P35", "P37", "P38"), "SD", "PD")
    resp <- ifelse(p %in% c("P13", "P16", "P19", "P40"), "PR", resp)
    resp <- ifelse(p %in% c("24"), "CR", resp)
    return(resp)
})

# Metastasic tissue
id_to_met <- c(
    P01 = "Liver", P02 = "Liver", P03 = "Liver", P08 = "Liver",
    P10 = "Liver", P07 = "Liver", P14 = "Liver", P16 = "Liver",
    P05 = "Nodul", P06 = "Nodul", 
    P09 = "Peritoneum",
    P07 = "Lymphnode", P18 = "Lymphnode",
    P33 = "Liver"
)
default_met <- "Lung"
mets <- sapply(patients, function(p) {
    if (is.na(id_to_met[p])) met <- default_met
    else met <- id_to_met[p]
    return(met)
})
names(mets) <- new_ids

# Add tumor growth information at BOR
tumor_growth <- c(
    P01 = 15, P02 = 88, P03 = 22, P04 = 45, P07 = 10, P08 = 4, P09 = 23, P10 = 149, P13 = -74.35, P14 = 37, P15 = 9.7, P16 = -100, P17 = NA, P18 = -23.2, P19 = -34.7, P20 = 12.8,
    P21 = 23.1, P24 = -100, P26 = 13.2, P27 = 4.2, P28 = 7.1, P29 = 2.1, P31 = 9.6, P32 = 24.2, P33 = 33.3, P34 = 24.8, P35 = 6.9, P36 = -26.2, P37 = 0, P38 = NA, P39 = -13.8, P40 = -31.49
)
tumor_growth_list <- sapply(patients, function(p) tumor_growth[p])
names(tumor_growth_list) <- new_ids

In [22]:
# Add metadata information to the combined object
combined_TCR <- lapply(names(combined_TCR), function(sample) {
    df <- combined_TCR[[sample]]
    
    df$timepoint        <- timepoints[sample]
    df$assessment_point <- assess_point[sample]
    df$patient          <- patients[sample]
    df$MS_status        <- MS_statuses[sample]
    df$tumor_type       <- tumor_types[sample]
    df$met_loc          <- mets[sample]
    df$response         <- responses[sample]
    df$tumor_growth     <- tumor_growth_list[sample]     
    return(df)
})

names(combined_TCR) <- new_ids

In [24]:
# Save combined TCR object
qsave(combined_TCR, file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_09-2025_v2.qs"))

## Clonotype Calling: Clonal Size Quantification

In [3]:
# Read combined TCR object
combined_TCR <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_09-2025_v2.qs"))

In [4]:
# Overview of the data
head(combined_TCR[[1]])

Unnamed: 0_level_0,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,patient,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>
1,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGAGTTAAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV10.TRAJ22.TRAC,CVVSLSGSARQLTF,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,TRBV4-1.NA.TRBJ2-5.TRBC2,CASSYGGFPETQYF,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,TRAV10.TRAJ22.TRAC_TRBV4-1.NA.TRBJ2-5.TRBC2,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,CVVSLSGSARQLTF_CASSYGGFPETQYF,TRAV10.TRAJ22.TRAC;TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TRBV4-1.NA.TRBJ2-5.TRBC2;TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,
3,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCAAACCAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV12-3.TRAJ26.TRAC,CAMSLNNYGQNFVF,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,TRBV28.NA.TRBJ2-5.TRBC2,CASTGTGKLQETQYF,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,TRAV12-3.TRAJ26.TRAC_TRBV28.NA.TRBJ2-5.TRBC2,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,CAMSLNNYGQNFVF_CASTGTGKLQETQYF,TRAV12-3.TRAJ26.TRAC;TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TRBV28.NA.TRBJ2-5.TRBC2;TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,
5,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCCGCAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV3.TRAJ5.TRAC,CAVREDTGRRALTF,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,TRBV29-1.NA.TRBJ1-1.TRBC1,CSVPLGAGEAFF,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,TRAV3.TRAJ5.TRAC_TRBV29-1.NA.TRBJ1-1.TRBC1,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,CAVREDTGRRALTF_CSVPLGAGEAFF,TRAV3.TRAJ5.TRAC;TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TRBV29-1.NA.TRBJ1-1.TRBC1;TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,
7,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGCGTTTAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV23/DV6.TRAJ57.TRAC,CAVHQGGSEKLVF,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,TRBV30.NA.TRBJ1-4.TRBC1,CAWSVGGVDEKLFF,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,TRAV23/DV6.TRAJ57.TRAC_TRBV30.NA.TRBJ1-4.TRBC1,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,CAVHQGGSEKLVF_CAWSVGGVDEKLFF,TRAV23/DV6.TRAJ57.TRAC;TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TRBV30.NA.TRBJ1-4.TRBC1;TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,
9,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGTATTGGA-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV24.TRAJ15.TRAC,CALKTALIF,TGTGCTCTCAAAACTGCTCTGATCTTT,TRBV9.NA.TRBJ2-5.TRBC2,CASSVGGGSQETQYF,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,TRAV24.TRAJ15.TRAC_TRBV9.NA.TRBJ2-5.TRBC2,TGTGCTCTCAAAACTGCTCTGATCTTT_TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,CALKTALIF_CASSVGGGSQETQYF,TRAV24.TRAJ15.TRAC;TGTGCTCTCAAAACTGCTCTGATCTTT_TRBV9.NA.TRBJ2-5.TRBC2;TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,
11,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGCAAGGTTCT-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV8-6.TRAJ20.TRAC,CAVSDQGDYKLSF,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT,TRBV20-1.NA.TRBJ2-5.TRBC2,CSAKAGLAGVETQYF,TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,TRAV8-6.TRAJ20.TRAC_TRBV20-1.NA.TRBJ2-5.TRBC2,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,CAVSDQGDYKLSF_CSAKAGLAGVETQYF,TRAV8-6.TRAJ20.TRAC;TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TRBV20-1.NA.TRBJ2-5.TRBC2;TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,


### Clonotypes definition

Clonotypes are defined by the TRBV-TRBJ segments plus the CDR3 region nucleotide sequence (CTnt) of the beta chain of the T cell receptor. 

In [6]:
# Create a function to extract the V and J gene segments TRBV and TRBJ for clonotype definition
extract_trbv_trbj <- function(x) {
    parts <- strsplit(x, "_")[[1]]
    if(length(parts) < 2) return(NA)
    post_underscore <- parts[2]
    subparts <- strsplit(post_underscore, "\\.")[[1]]
    trbv <- subparts[grep("^TRBV", subparts)]
    trbj <- subparts[grep("^TRBJ", subparts)]
    if(length(trbv) == 0 | length(trbj) == 0) return(NA)
    paste0(trbv, "_", trbj)
}

In [7]:
# Create clonotype id column as TRBV-TRBJ+CDR3 beta chain combination
combined_TCR <- lapply(combined_TCR, function(sample_df) {
    sample_df <-  sample_df %>%
        mutate(
            CTnt_beta       = sub(".*_", "", CTnt),
            CTnt_beta_shrt  = substr(CTnt_beta, 4, nchar(CTnt_beta) - 3),
            CTnt_alpha      = sub("_.*", "", CTnt),
            
            CTaa_beta       = sub(".*_", "", CTaa),
            CTaa_alpha      = sub("_.*", "", CTaa),
            
            CTgene_beta     = sub(".*_", "", CTgene),
            CTgene_alpha    = sub("_.*", "", CTgene),
            
            TRBV_TRBJ = as.character(sapply(CTgene, extract_trbv_trbj)),
            
            clonotype_id = paste(TRBV_TRBJ, CTnt_beta_shrt, sep = "_")
        )

    return(sample_df)
})

Unnamed: 0_level_0,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,patient,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGAGTTAAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV10.TRAJ22.TRAC,CVVSLSGSARQLTF,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,TRBV4-1.NA.TRBJ2-5.TRBC2,CASSYGGFPETQYF,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,TRAV10.TRAJ22.TRAC_TRBV4-1.NA.TRBJ2-5.TRBC2,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,CVVSLSGSARQLTF_CASSYGGFPETQYF,TRAV10.TRAJ22.TRAC;TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TRBV4-1.NA.TRBJ2-5.TRBC2;TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,CASSYGGFPETQYF,CVVSLSGSARQLTF,TRBV4-1.NA.TRBJ2-5.TRBC2,TRAV10.TRAJ22.TRAC,TRBV4-1_TRBJ2-5,TRBV4-1_TRBJ2-5_GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC
3,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCAAACCAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV12-3.TRAJ26.TRAC,CAMSLNNYGQNFVF,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,TRBV28.NA.TRBJ2-5.TRBC2,CASTGTGKLQETQYF,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,TRAV12-3.TRAJ26.TRAC_TRBV28.NA.TRBJ2-5.TRBC2,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,CAMSLNNYGQNFVF_CASTGTGKLQETQYF,TRAV12-3.TRAJ26.TRAC;TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TRBV28.NA.TRBJ2-5.TRBC2;TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,CASTGTGKLQETQYF,CAMSLNNYGQNFVF,TRBV28.NA.TRBJ2-5.TRBC2,TRAV12-3.TRAJ26.TRAC,TRBV28_TRBJ2-5,TRBV28_TRBJ2-5_GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC
5,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCCGCAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV3.TRAJ5.TRAC,CAVREDTGRRALTF,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,TRBV29-1.NA.TRBJ1-1.TRBC1,CSVPLGAGEAFF,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,TRAV3.TRAJ5.TRAC_TRBV29-1.NA.TRBJ1-1.TRBC1,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,CAVREDTGRRALTF_CSVPLGAGEAFF,TRAV3.TRAJ5.TRAC;TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TRBV29-1.NA.TRBJ1-1.TRBC1;TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,CSVPLGAGEAFF,CAVREDTGRRALTF,TRBV29-1.NA.TRBJ1-1.TRBC1,TRAV3.TRAJ5.TRAC,TRBV29-1_TRBJ1-1,TRBV29-1_TRBJ1-1_AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC
7,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGCGTTTAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV23/DV6.TRAJ57.TRAC,CAVHQGGSEKLVF,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,TRBV30.NA.TRBJ1-4.TRBC1,CAWSVGGVDEKLFF,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,TRAV23/DV6.TRAJ57.TRAC_TRBV30.NA.TRBJ1-4.TRBC1,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,CAVHQGGSEKLVF_CAWSVGGVDEKLFF,TRAV23/DV6.TRAJ57.TRAC;TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TRBV30.NA.TRBJ1-4.TRBC1;TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,CAWSVGGVDEKLFF,CAVHQGGSEKLVF,TRBV30.NA.TRBJ1-4.TRBC1,TRAV23/DV6.TRAJ57.TRAC,TRBV30_TRBJ1-4,TRBV30_TRBJ1-4_GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT
9,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGTATTGGA-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV24.TRAJ15.TRAC,CALKTALIF,TGTGCTCTCAAAACTGCTCTGATCTTT,TRBV9.NA.TRBJ2-5.TRBC2,CASSVGGGSQETQYF,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,TRAV24.TRAJ15.TRAC_TRBV9.NA.TRBJ2-5.TRBC2,TGTGCTCTCAAAACTGCTCTGATCTTT_TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,CALKTALIF_CASSVGGGSQETQYF,TRAV24.TRAJ15.TRAC;TGTGCTCTCAAAACTGCTCTGATCTTT_TRBV9.NA.TRBJ2-5.TRBC2;TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC,TGTGCTCTCAAAACTGCTCTGATCTTT,CASSVGGGSQETQYF,CALKTALIF,TRBV9.NA.TRBJ2-5.TRBC2,TRAV24.TRAJ15.TRAC,TRBV9_TRBJ2-5,TRBV9_TRBJ2-5_GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC
11,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGCAAGGTTCT-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV8-6.TRAJ20.TRAC,CAVSDQGDYKLSF,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT,TRBV20-1.NA.TRBJ2-5.TRBC2,CSAKAGLAGVETQYF,TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,TRAV8-6.TRAJ20.TRAC_TRBV20-1.NA.TRBJ2-5.TRBC2,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,CAVSDQGDYKLSF_CSAKAGLAGVETQYF,TRAV8-6.TRAJ20.TRAC;TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TRBV20-1.NA.TRBJ2-5.TRBC2;TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,AGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTAC,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT,CSAKAGLAGVETQYF,CAVSDQGDYKLSF,TRBV20-1.NA.TRBJ2-5.TRBC2,TRAV8-6.TRAJ20.TRAC,TRBV20-1_TRBJ2-5,TRBV20-1_TRBJ2-5_AGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTAC


At this point, we store the ungrouped sample-specific combined TCR data to further match them with the GEX data. Joining the TCR and GEX data will allow us to quantify the expansion level of each cell phenotype.

In [38]:
# Save the ungrouped combined TCR data updated with chain split and clone size info
qsave(combined_TCR, file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_with_sizes_09-2025_v2.qs"), nthreads = 1)

In [39]:
# Read the combined TCR data updated with chain split and clone size info
combined_TCR <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_with_sizes_09-2025_v2.qs"))

In [41]:
# Check for duplicates in each data frame in the list
duplicates_per_df <- lapply(combined_TCR, function(df) {
    duplicated_rows <- duplicated(df)  
    any_dup <- any(duplicated_rows)    
    list(has_duplicates = any_dup, duplicated_rows = duplicated_rows)
})
sapply(duplicates_per_df, function(x) x$has_duplicates) %>% table()

.
FALSE 
   59 

### Clonal sizes quantification

Clonal sizes are quantified by aggregating cell counts for each clonotype at the sample level. Clonal proportions are computed by normalizing clonal sizes by library size. 

In [42]:
# Create a single df from all sample dfs in combined_TCR
all_combined_TCR_df <- bind_rows(combined_TCR, .id = "sample_id")
head(all_combined_TCR_df)

Unnamed: 0_level_0,sample_id,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,patient,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGAGTTAAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV10.TRAJ22.TRAC,CVVSLSGSARQLTF,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,TRBV4-1.NA.TRBJ2-5.TRBC2,CASSYGGFPETQYF,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,TRAV10.TRAJ22.TRAC_TRBV4-1.NA.TRBJ2-5.TRBC2,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,CVVSLSGSARQLTF_CASSYGGFPETQYF,TRAV10.TRAJ22.TRAC;TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TRBV4-1.NA.TRBJ2-5.TRBC2;TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,CASSYGGFPETQYF,CVVSLSGSARQLTF,TRBV4-1.NA.TRBJ2-5.TRBC2,TRAV10.TRAJ22.TRAC,TRBV4-1_TRBJ2-5,TRBV4-1_TRBJ2-5_GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC
2,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCAAACCAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV12-3.TRAJ26.TRAC,CAMSLNNYGQNFVF,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,TRBV28.NA.TRBJ2-5.TRBC2,CASTGTGKLQETQYF,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,TRAV12-3.TRAJ26.TRAC_TRBV28.NA.TRBJ2-5.TRBC2,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,CAMSLNNYGQNFVF_CASTGTGKLQETQYF,TRAV12-3.TRAJ26.TRAC;TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TRBV28.NA.TRBJ2-5.TRBC2;TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,CASTGTGKLQETQYF,CAMSLNNYGQNFVF,TRBV28.NA.TRBJ2-5.TRBC2,TRAV12-3.TRAJ26.TRAC,TRBV28_TRBJ2-5,TRBV28_TRBJ2-5_GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC
3,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCCGCAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV3.TRAJ5.TRAC,CAVREDTGRRALTF,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,TRBV29-1.NA.TRBJ1-1.TRBC1,CSVPLGAGEAFF,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,TRAV3.TRAJ5.TRAC_TRBV29-1.NA.TRBJ1-1.TRBC1,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,CAVREDTGRRALTF_CSVPLGAGEAFF,TRAV3.TRAJ5.TRAC;TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TRBV29-1.NA.TRBJ1-1.TRBC1;TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,CSVPLGAGEAFF,CAVREDTGRRALTF,TRBV29-1.NA.TRBJ1-1.TRBC1,TRAV3.TRAJ5.TRAC,TRBV29-1_TRBJ1-1,TRBV29-1_TRBJ1-1_AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC
4,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGCGTTTAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV23/DV6.TRAJ57.TRAC,CAVHQGGSEKLVF,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,TRBV30.NA.TRBJ1-4.TRBC1,CAWSVGGVDEKLFF,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,TRAV23/DV6.TRAJ57.TRAC_TRBV30.NA.TRBJ1-4.TRBC1,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,CAVHQGGSEKLVF_CAWSVGGVDEKLFF,TRAV23/DV6.TRAJ57.TRAC;TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TRBV30.NA.TRBJ1-4.TRBC1;TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,CAWSVGGVDEKLFF,CAVHQGGSEKLVF,TRBV30.NA.TRBJ1-4.TRBC1,TRAV23/DV6.TRAJ57.TRAC,TRBV30_TRBJ1-4,TRBV30_TRBJ1-4_GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT
5,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGTATTGGA-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV24.TRAJ15.TRAC,CALKTALIF,TGTGCTCTCAAAACTGCTCTGATCTTT,TRBV9.NA.TRBJ2-5.TRBC2,CASSVGGGSQETQYF,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,TRAV24.TRAJ15.TRAC_TRBV9.NA.TRBJ2-5.TRBC2,TGTGCTCTCAAAACTGCTCTGATCTTT_TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,CALKTALIF_CASSVGGGSQETQYF,TRAV24.TRAJ15.TRAC;TGTGCTCTCAAAACTGCTCTGATCTTT_TRBV9.NA.TRBJ2-5.TRBC2;TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC,TGTGCTCTCAAAACTGCTCTGATCTTT,CASSVGGGSQETQYF,CALKTALIF,TRBV9.NA.TRBJ2-5.TRBC2,TRAV24.TRAJ15.TRAC,TRBV9_TRBJ2-5,TRBV9_TRBJ2-5_GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC
6,P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGCAAGGTTCT-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV8-6.TRAJ20.TRAC,CAVSDQGDYKLSF,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT,TRBV20-1.NA.TRBJ2-5.TRBC2,CSAKAGLAGVETQYF,TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,TRAV8-6.TRAJ20.TRAC_TRBV20-1.NA.TRBJ2-5.TRBC2,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,CAVSDQGDYKLSF_CSAKAGLAGVETQYF,TRAV8-6.TRAJ20.TRAC;TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TRBV20-1.NA.TRBJ2-5.TRBC2;TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,AGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTAC,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT,CSAKAGLAGVETQYF,CAVSDQGDYKLSF,TRBV20-1.NA.TRBJ2-5.TRBC2,TRAV8-6.TRAJ20.TRAC,TRBV20-1_TRBJ2-5,TRBV20-1_TRBJ2-5_AGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTAC


In [43]:
# Check if there are clonotype_id duplicates
nrow(all_combined_TCR_df)
nrow(distinct(all_combined_TCR_df %>% select(ID, clonotype_id)))

In [46]:
# Count number of total cells
nrow(all_combined_TCR_df)

In [47]:
# Check that all rows have been concatenated correctly
sum(sapply(combined_TCR, nrow)) == nrow(all_combined_TCR_df)

In [48]:
# Compute the Clonal size for each clonotype per time point
all_combined_TCR_df <- all_combined_TCR_df %>%
    
    # Group by id and calculate clone size (cloneSize computed at the library level) (compute at the time point level since replicates are technical)
    group_by(ID, clonotype_id) %>%  # ID, clonotype_id
    mutate(cloneSize = n()) %>% # counts
    ungroup() %>%
    group_by(patient, timepoint) %>% 
    mutate(norm_cloneSize = cloneSize / sum(cloneSize)) %>% # Proportions: normalized counts by library size
    ungroup() %>% 

    # Classify clones by their degree of expansion
    mutate(cloneClass = case_when(
       cloneSize == 1 ~ "Singlet", 
       cloneSize > 1 ~ cut(
           log10(norm_cloneSize),
           breaks = quantile(log10(norm_cloneSize[cloneSize > 1]), 
                             probs = c(0,0.5,1), na.rm = TRUE),
           labels = c("Lowly Expanded (< Median)", "Highly Expanded (> Median)")     
        )
    )

head(all_combined_TCR_df)

sample_id,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,patient,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id,cloneSize,norm_cloneSize,cloneClass
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<fct>
P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGAGTTAAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV10.TRAJ22.TRAC,CVVSLSGSARQLTF,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,TRBV4-1.NA.TRBJ2-5.TRBC2,CASSYGGFPETQYF,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,TRAV10.TRAJ22.TRAC_TRBV4-1.NA.TRBJ2-5.TRBC2,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,CVVSLSGSARQLTF_CASSYGGFPETQYF,TRAV10.TRAJ22.TRAC;TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT_TRBV4-1.NA.TRBJ2-5.TRBC2;TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCGCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTACTTC,GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,TGTGTGGTGAGCCTATCTGGTTCTGCAAGGCAACTGACCTTT,CASSYGGFPETQYF,CVVSLSGSARQLTF,TRBV4-1.NA.TRBJ2-5.TRBC2,TRAV10.TRAJ22.TRAC,TRBV4-1_TRBJ2-5,TRBV4-1_TRBJ2-5_GCCAGCAGCTATGGTGGATTCCCAGAGACCCAGTAC,3,7.037298e-05,Rare
P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCAAACCAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV12-3.TRAJ26.TRAC,CAMSLNNYGQNFVF,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,TRBV28.NA.TRBJ2-5.TRBC2,CASTGTGKLQETQYF,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,TRAV12-3.TRAJ26.TRAC_TRBV28.NA.TRBJ2-5.TRBC2,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,CAMSLNNYGQNFVF_CASTGTGKLQETQYF,TRAV12-3.TRAJ26.TRAC;TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT_TRBV28.NA.TRBJ2-5.TRBC2;TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTACTTC,GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,TGTGCAATGAGCCTCAATAACTATGGTCAGAATTTTGTCTTT,CASTGTGKLQETQYF,CAMSLNNYGQNFVF,TRBV28.NA.TRBJ2-5.TRBC2,TRAV12-3.TRAJ26.TRAC,TRBV28_TRBJ2-5,TRBV28_TRBJ2-5_GCCAGCACAGGGACAGGTAAACTTCAAGAGACCCAGTAC,2,4.691532e-05,Rare
P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACCTGTCCGCAGTG-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV3.TRAJ5.TRAC,CAVREDTGRRALTF,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,TRBV29-1.NA.TRBJ1-1.TRBC1,CSVPLGAGEAFF,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,TRAV3.TRAJ5.TRAC_TRBV29-1.NA.TRBJ1-1.TRBC1,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,CAVREDTGRRALTF_CSVPLGAGEAFF,TRAV3.TRAJ5.TRAC;TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT_TRBV29-1.NA.TRBJ1-1.TRBC1;TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGCGTCCCCCTGGGGGCTGGGGAAGCTTTCTTT,AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,TGTGCTGTGAGAGAGGACACGGGCAGGAGAGCACTTACTTTT,CSVPLGAGEAFF,CAVREDTGRRALTF,TRBV29-1.NA.TRBJ1-1.TRBC1,TRAV3.TRAJ5.TRAC,TRBV29-1_TRBJ1-1,TRBV29-1_TRBJ1-1_AGCGTCCCCCTGGGGGCTGGGGAAGCTTTC,18,0.0004222379,Medium
P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGCGTTTAC-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV23/DV6.TRAJ57.TRAC,CAVHQGGSEKLVF,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,TRBV30.NA.TRBJ1-4.TRBC1,CAWSVGGVDEKLFF,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,TRAV23/DV6.TRAJ57.TRAC_TRBV30.NA.TRBJ1-4.TRBC1,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,CAVHQGGSEKLVF_CAWSVGGVDEKLFF,TRAV23/DV6.TRAJ57.TRAC;TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT_TRBV30.NA.TRBJ1-4.TRBC1;TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTTTTT,GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT,TGTGCAGTCCATCAGGGCGGATCTGAAAAGCTGGTCTTT,CAWSVGGVDEKLFF,CAVHQGGSEKLVF,TRBV30.NA.TRBJ1-4.TRBC1,TRAV23/DV6.TRAJ57.TRAC,TRBV30_TRBJ1-4,TRBV30_TRBJ1-4_GCCTGGAGTGTTGGGGGCGTGGATGAAAAACTGTTT,1,2.345766e-05,Singlet
P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGAGTATTGGA-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV24.TRAJ15.TRAC,CALKTALIF,TGTGCTCTCAAAACTGCTCTGATCTTT,TRBV9.NA.TRBJ2-5.TRBC2,CASSVGGGSQETQYF,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,TRAV24.TRAJ15.TRAC_TRBV9.NA.TRBJ2-5.TRBC2,TGTGCTCTCAAAACTGCTCTGATCTTT_TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,CALKTALIF_CASSVGGGSQETQYF,TRAV24.TRAJ15.TRAC;TGTGCTCTCAAAACTGCTCTGATCTTT_TRBV9.NA.TRBJ2-5.TRBC2;TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGTGCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTACTTC,GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC,TGTGCTCTCAAAACTGCTCTGATCTTT,CASSVGGGSQETQYF,CALKTALIF,TRBV9.NA.TRBJ2-5.TRBC2,TRAV24.TRAJ15.TRAC,TRBV9_TRBJ2-5,TRBV9_TRBJ2-5_GCCAGCAGCGTAGGTGGTGGTAGCCAAGAGACCCAGTAC,2,4.691532e-05,Rare
P01_T0_CD45pos,SPE_1_01_SCR_A_FRESH_CD45_P01_T0_CD45pos_AAACGGGCAAGGTTCT-1,SPE_1_01_SCR_A_FRESH_CD45,P01_T0_CD45pos,TRAV8-6.TRAJ20.TRAC,CAVSDQGDYKLSF,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT,TRBV20-1.NA.TRBJ2-5.TRBC2,CSAKAGLAGVETQYF,TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,TRAV8-6.TRAJ20.TRAC_TRBV20-1.NA.TRBJ2-5.TRBC2,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,CAVSDQGDYKLSF_CSAKAGLAGVETQYF,TRAV8-6.TRAJ20.TRAC;TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT_TRBV20-1.NA.TRBJ2-5.TRBC2;TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,T0,SCR,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52,51.78,FEMALE,15,IVa,5,7,,TGCAGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTACTTC,AGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTAC,TGTGCTGTGAGTGATCAGGGCGACTACAAGCTCAGCTTT,CSAKAGLAGVETQYF,CAVSDQGDYKLSF,TRBV20-1.NA.TRBJ2-5.TRBC2,TRAV8-6.TRAJ20.TRAC,TRBV20-1_TRBJ2-5,TRBV20-1_TRBJ2-5_AGTGCTAAGGCCGGGCTAGCGGGGGTAGAGACCCAGTAC,1,2.345766e-05,Singlet


In [52]:
# Check the sum of clonal sizes of distinct clonotypes equals the total number of clones
all_combined_TCR_df %>%
    select(sample, clonotype_id, cloneSize) %>% 
    distinct() %>%
    group_by(sample, clonotype_id, cloneSize) %>% 
    summarise(n = sum(cloneSize)) %>%
    pull(n) %>% sum()

[1m[22m`summarise()` has grouped output by 'sample', 'clonotype_id'. You can override using the `.groups` argument.


## Classify clonotypes by tumor presence status

Classify clonotypes into lost (non-persistent), pre-existing (persistent), and de novo clonotypes based on their longitudinal presence status (SCR and C02).

In [53]:
# Create variable indicating whether the cells are present before of after treatment
all_combined_TCR_df$ICI <- ifelse(all_combined_TCR_df$timepoint == "T0", "Pre-ICI", 
                 ifelse(all_combined_TCR_df$timepoint %in% c("T1", "T2", "EOT"), "Post-ICI", NA))

In [54]:
# Add presence and expansion status
all_combined_TCR_df <- all_combined_TCR_df %>%
    group_by(patient) %>%
    group_modify(~ {
        
        df <- .
    
        # Get unique clonotypes per time point for each patient
        clonotypes_pre <- unique(df %>% filter(ICI == "Pre-ICI") %>% pull(clonotype_id))
        clonotypes_post <- unique(df %>% filter(ICI == "Post-ICI") %>% pull(clonotype_id))
        
        # Determine status
        df <- df %>%
            mutate(
                presence_status = case_when(
                    clonotype_id %in% setdiff(clonotypes_pre, clonotypes_post) ~ "Lost",
                    clonotype_id %in% setdiff(clonotypes_post, clonotypes_pre) ~ "De Novo",
                    clonotype_id %in% intersect(clonotypes_pre, clonotypes_post) ~ "Pre-existing",
                    TRUE ~ NA_character_
                ),
                expansion_status = case_when(
                    presence_status == "Lost" & ICI == "Pre-ICI" ~ "Lost",
                    presence_status == "De Novo" & ICI == "Post-ICI" ~ "De Novo",
                    presence_status == "Pre-existing" & ICI == "Pre-ICI" ~ "Shared_pre",
                    presence_status == "Pre-existing" & ICI == "Post-ICI" ~ "Shared_post",
                    TRUE ~ NA_character_
                )
            )
        return(df)
  }) %>%
  ungroup()

In [55]:
# Check there are no duplicated barcodes
any(duplicated(all_combined_TCR_df$clonotype_id))
any(duplicated(all_combined_TCR_df$barcode))
any(duplicated(distinct(all_combined_TCR_df)))
any(duplicated(distinct(all_combined_TCR_df %>% select(ID, clonotype_id))))

In [56]:
nrow(all_combined_TCR_df)
nrow(distinct(all_combined_TCR_df))
length(unique(all_combined_TCR_df$barcode))

In [57]:
all_combined_TCR_df %>%
    select(clonotype_id, timepoint) %>%
    distinct() %>%
    nrow()

In [58]:
# Find columns that differ among duplicated barcodes

# Get all duplicated barcode rows
dup_rows <- all_combined_TCR_df %>%
  group_by(ID, barcode) %>% 
  filter(n() > 1) %>%
  ungroup()

dup_rows

patient,sample_id,barcode,sample,ID,TCR1,cdr3_aa1,cdr3_nt1,TCR2,cdr3_aa2,cdr3_nt2,CTgene,CTnt,CTaa,CTstrict,timepoint,assessment_point,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,age,sex,tumor_growth,stage_diag,num_prev_lines,num_met_sites,adjuv_treat,CTnt_beta,CTnt_beta_shrt,CTnt_alpha,CTaa_beta,CTaa_alpha,CTgene_beta,CTgene_alpha,TRBV_TRBJ,clonotype_id,cloneSize,norm_cloneSize,cloneClass,ICI,presence_status,expansion_status
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<fct>,<chr>,<chr>,<chr>


In [59]:
nrow(all_combined_TCR_df)
nrow(distinct(all_combined_TCR_df))

### Convert long data to wide format: clonotype-level data

Convert the cell-level df to a clonotype-level df with time point-related variables as columns. Aggregate 

In [63]:
all_combined_TCR_df %>%
    # Remove barcode column, make each clonotype tp combination unique
    select(-barcode) %>%
    distinct() %>%
    nrow()

In [3]:
# Convert the df to wide format
all_combined_TCR_df_wide <- all_combined_TCR_df %>%
    # Reduce the data so there is a single row per clonotype
    select('clonotype_id', 'patient', 'timepoint', 'sample_id', 'ID', 'cloneSize',
            'MS_status','tumor_type','met_loc','response', 'tumor_growth', 'presence_status') %>%
    distinct() %>%

    # Convert to wide format
    pivot_wider(
        id_cols = -c(sample_id, ID),
        names_from = timepoint,
        values_from = c(cloneSize),
        values_fn = list(cloneSize = sum)) %>% # sum = unique clonesize since the data is at the clonotype-level
    
    # Replace NAs by 0
    mutate(
       cloneSize_T0 = replace_na(T0, 0),
       cloneSize_T1 = replace_na(T1, 0),
       cloneSize_EOT = replace_na(EOT, 0) 
    ) %>%
    
    # Normalize counts per timepoint-patient (same as library for most cases but technical replicates)
    group_by(patient) %>%
    mutate(
        norm_cloneSize_T0 = cloneSize_T0 / sum(cloneSize_T0, na.rm = TRUE),
        norm_cloneSize_T1 = cloneSize_T1 / sum(cloneSize_T1, na.rm = TRUE),
        norm_cloneSize_EOT = cloneSize_EOT / sum(cloneSize_EOT, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    
    # Re-classify clonotypes into clonal classes
    mutate(
        # T0 (SCR)
        cloneClass_T0 = case_when(
        cloneSize_T0 == 1 ~ "Singlet", 
        cloneSize_T0 > 1 ~ cut(
            log10(norm_cloneSize_T0),
            breaks = quantile(log10(norm_cloneSize_T0[cloneSize_T0 > 1]), 
                              probs = c(0,0.5,1), na.rm = TRUE),
            labels = c("Lowly Expanded (< Median)", "Highly Expanded (> Median)")     
        )),
        # T1 (C02)
        cloneClass_T1 = case_when(
        cloneSize_T1 == 1 ~ "Singlet", 
        cloneSize_T1 > 1 ~ cut(
            log10(norm_cloneSize_T1),
            breaks = quantile(log10(norm_cloneSize_T1[cloneSize_T1 > 1]), 
                              probs = c(0,0.5,1), na.rm = TRUE),
            labels = c("Lowly Expanded (< Median)", "Highly Expanded (> Median)")     
        )),
        # EOT
        cloneClass_EOT = case_when(
        cloneSize_EOT == 1 ~ "Singlet", 
        cloneSize_EOT > 1 ~ cut(
            log10(norm_cloneSize_EOT),
            breaks = quantile(log10(norm_cloneSize_EOT[cloneSize_EOT > 1]), 
                              probs = c(0,0.5,1), na.rm = TRUE),
            labels = c("Lowly Expanded (< Median)", "Highly Expanded (> Median)")     
        )),
            
     )

nrow(all_combined_TCR_df)
nrow(all_combined_TCR_df_wide)
head(all_combined_TCR_df_wide)

ERROR: Error: object 'all_combined_TCR_df' not found


In [65]:
# CHECK THAT THE SUM OF CLONAL SIZES EQUALS THE TOTAL NUMBER OF CELLS
( sum(all_combined_TCR_df_wide$cloneSize_T0, na.rm = TRUE) + sum(all_combined_TCR_df_wide$cloneSize_T1, na.rm = TRUE) + sum(all_combined_TCR_df_wide$cloneSize_EOT, na.rm = TRUE) )

In [66]:
# Check that sum of clonal proportions per library / libraries equals to 1
( sum(all_combined_TCR_df_wide$norm_cloneSize_T0, na.rm = TRUE) + sum(all_combined_TCR_df_wide$norm_cloneSize_T1, na.rm = TRUE) + sum(all_combined_TCR_df_wide$norm_cloneSize_EOT, na.rm = TRUE) ) / nrow(distinct(all_combined_TCR_df %>% select(patient, timepoint)))

In [67]:
# Check there are no duplicates
as.data.table(all_combined_TCR_df_wide)[, .N, by = .(clonotype_id, patient)][N > 1] 

clonotype_id,patient,N
<chr>,<chr>,<int>


### Remove clonotypes without beta chain

In [68]:
# Remove clonotypes without beta chain (NA_)
all_combined_TCR_df_wide %>%
    filter(clonotype_id == "NA_") %>%
    head() 

clonotype_id,patient,MS_status,tumor_type,met_loc,response,mol_profile,KRAS_mut,BRAF_mut,prev_chemo,liver_met,time_gap_days,tumor_growth,num_prev_lines,num_met_sites,adjuv_treat,stage_diag,presence_status,T0,T1,EOT,cloneSize_T0,cloneSize_T1,cloneSize_EOT,norm_cloneSize_T0,norm_cloneSize_T1,norm_cloneSize_EOT,cloneClass_T0,cloneClass_T1,cloneClass_EOT
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<fct>,<fct>,<fct>
NA_,P01,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,52.0,15.0,5.0,7.0,,IVa,Pre-existing,54,95.0,,54,95,0,0.01920341,0.017301038,,Hyperexpanded,Hyperexpanded,
NA_,P02,MSS,CRC,Liver,PD,KRAS G12S,True,False,True,True,58.0,88.0,2.0,2.0,,IVb,Pre-existing,216,49.0,,216,49,0,0.02997086,0.009660883,,Hyperexpanded,Large,
NA_,P03,MSS,CRC,Liver,PD,KRAS G13D,True,False,True,True,51.0,22.0,2.0,3.0,,IVb,Pre-existing,9,39.0,108.0,9,39,108,0.02158273,0.014722537,0.01775148,Medium,Large,Hyperexpanded
NA_,P04,MSI,EC,Lung,PD,"MLH1 hypermet, p53 wt, POLE<U+00A0>wt",False,False,True,False,52.0,45.0,,,,,Pre-existing,268,73.0,,268,73,0,0.02239679,0.011263694,,Hyperexpanded,Hyperexpanded,
NA_,P05,MSS,CRC,Nodul,PD,KRAS G12V,True,False,True,False,,,,,,,Lost,21,,,21,0,0,0.02692308,,,Large,,
NA_,P06,MSS,EC,Nodul,PD,"p53, POLE wt",False,False,True,False,,,,,,,Lost,74,,,74,0,0,0.04695431,,,Hyperexpanded,,


In [69]:
# Remove NA_clonotype_ids 
all_combined_TCR_df_wide <- all_combined_TCR_df_wide %>%
    filter(clonotype_id != "NA_")
nrow(all_combined_TCR_df_wide)

### Renormalize again after removing clonotypes without beta chain

In [70]:
all_combined_TCR_df_wide <- all_combined_TCR_df_wide %>%
    group_by(patient) %>%
    mutate(
        norm_cloneSize_T0 = cloneSize_T0 / sum(cloneSize_T0, na.rm = TRUE),
        norm_cloneSize_T1 = cloneSize_T1 / sum(cloneSize_T1, na.rm = TRUE),
        norm_cloneSize_EOT = cloneSize_EOT / sum(cloneSize_EOT, na.rm = TRUE)
    )

### Save data

In [72]:
# Save the long TCR data with clonesize info
qsave(all_combined_TCR_df,
       file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_with_sizes_df_09-2025_v2.qs"))

In [74]:
# Save the TCR data in wide format 
qsave(all_combined_TCR_df_wide, 
       file = file.path(root_dir, "out", "data", "SERP_TCR_combined_clones_with_sizes_wide_df_09-2025_v2.qs"))

### Numbers

In [79]:
# Entire Serpentine cohort
print(paste("Total number of clones (T cells):", nrow(distinct(all_combined_TCR_df))))
print(paste("Total number of clonotypes: ", nrow(all_combined_TCR_df_wide)))

[1] "Total number of clones (T cells): 208530"
[1] "Total number of clonotypes:  95697"


In [None]:
# MSS CRC cohort subset
patients_keep <- c("P01", "P02", "P03", "P10", "P14", "P17", "P20", "P26", "P29", "P31", "P33", "P34", "P35")
print(paste("Total number of VdJ cells at SCR:", nrow(distinct(all_combined_TCR_df %>% filter(patient %in% patients_keep, timepoint == "T0")))))
print(paste("Total number of VdJ cells at C02:", nrow(distinct(all_combined_TCR_df %>% filter(patient %in% patients_keep, timepoint == "T1")))))
print(paste("Total number of clonotypes: ", nrow(all_combined_TCR_df_wide %>% filter(patient %in% patients_keep, cloneSize_EOT == 0 & (cloneSize_T0 != 0 | cloneSize_T1 != 0)))))
print(paste("Total number of clonotypes at SCR: ", nrow(all_combined_TCR_df_wide %>% filter(patient %in% patients_keep, cloneSize_T0 != 0))))
print(paste("Total number of clonotypes at C02: ", nrow(all_combined_TCR_df_wide %>% filter(patient %in% patients_keep, cloneSize_T1 != 0))))