# Analysis of Tumor-Blood Overlapping Clonotypes

### Env Setup

In [None]:
# Project setup: move to the project root, set notebook display limits,
# silence warnings and load the shared configuration script
setwd("/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR")
options(
    repr.matrix.max.rows = 100,
    repr.matrix.max.cols = 100,
    warn = -1 # negative value: warnings are not reported
)
source(file = "code/helper/Config.R", echo = FALSE)

In [None]:
# Load the plotting helper functions
source(
    file = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/code/helper/Plotting_Functions.R",
    echo = FALSE
)

In [None]:
# Directory where the figures of this notebook are written
fig_dir <- "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/figs/TCR_Fig_Jan/tumor_blood_overlap"

### Load Tumor 10x Processed TCR Data with matched GEX Profiles and blood TCR data (wide)

In [None]:
# Load the wide tumor/blood TCR table from the project output folder
# (root_dir is not defined in this notebook; presumably set by Config.R)
data <- qread(
    file.path(root_dir, "out", "data", "SP_Tumor_GEX-Blood_Full_TCR_wide_12-2025_v2.qs"),
    nthreads = 32
)

In [None]:
# Number of rows per value of the `source` column
table(data[["source"]])

### Prepare data

In [None]:
# Patients with matching tumor and blood samples at the SCR and C02 time points
patients_keep <- c("P01", "P02", "P03", "P10", "P14", "P20")

data <- data %>%
    filter(
        Patient %in% patients_keep,
        # Drop clonotypes seen only in blood at C01: no SCR / C02 blood counts,
        # a non-zero C01 count and a clone size of 0 in both tumor samples
        !(is.na(SCR_counts) & !is.na(C01_counts) & C01_counts != 0 & is.na(C02_counts) &
            cloneSize_T0 == 0 & cloneSize_T1 == 0)
    ) %>%
    select(
        clonotype_id, Patient, met_loc, presence_status,
        cloneSize_T0, norm_cloneSize_T0, cloneSize_T1, norm_cloneSize_T1,
        SCR_counts, C01_counts, C02_counts, `SCR-C02_event`, source
    )
dim(data)
head(data)

In [None]:
# Rows per source and total number of rows with a non-missing source
table(data$source)
sum(table(data$source))

In [None]:
## Compute overlap (Jaccard) between tumor and blood time points

In [None]:
# Flag, per clonotype, whether it is detected (> 0) in each tumor / blood sample.
# A missing blood count is treated as "not detected".
data <- data %>%
    mutate(
        T0_present = cloneSize_T0 > 0,
        T1_present = cloneSize_T1 > 0,
        SCR_present = coalesce(SCR_counts > 0, FALSE),
        C01_present = coalesce(C01_counts > 0, FALSE),
        C02_present = coalesce(C02_counts > 0, FALSE)
    )
head(data, n = 2)
dim(data)

In [None]:
#' Jaccard index between two presence/absence vectors
#'
#' @param x Logical vector; TRUE where an element is present in the first set.
#' @param y Logical vector aligned with `x`; TRUE where it is present in the
#'   second set.
#' @return A single numeric value: the number of positions where both `x` and
#'   `y` are TRUE divided by the number of positions where at least one is
#'   TRUE. Positions whose combination evaluates to NA are ignored. Returns
#'   `NA_real_` when no position is TRUE in either vector.
compute_overlap <- function(x, y) {
    n_shared <- sum(x & y, na.rm = TRUE)
    n_either <- sum(x | y, na.rm = TRUE)
    # An empty union has no defined Jaccard index
    if (n_either == 0) {
        return(NA_real_)
    }
    n_shared / n_either
}

In [None]:
# Per-patient Jaccard index for every pair of samples
# (t = tumor, b = blood; T0 / T1 flags are reported as tSCR / tC02)
overlap <- data %>%
    group_by(Patient) %>%
    summarise(
        # tumor vs tumor
        jaccard_tSCR_tC02 = compute_overlap(x = T0_present, y = T1_present),
        # tumor T0 vs each blood time point
        jaccard_tSCR_bSCR = compute_overlap(x = T0_present, y = SCR_present),
        jaccard_tSCR_bC01 = compute_overlap(x = T0_present, y = C01_present),
        jaccard_tSCR_bC02 = compute_overlap(x = T0_present, y = C02_present),
        # tumor T1 vs each blood time point
        jaccard_tC02_bSCR = compute_overlap(x = T1_present, y = SCR_present),
        jaccard_tC02_bC01 = compute_overlap(x = T1_present, y = C01_present),
        jaccard_tC02_bC02 = compute_overlap(x = T1_present, y = C02_present),
        # blood vs blood
        jaccard_bSCR_bC01 = compute_overlap(x = SCR_present, y = C01_present),
        jaccard_bSCR_bC02 = compute_overlap(x = SCR_present, y = C02_present),
        jaccard_bC01_bC02 = compute_overlap(x = C01_present, y = C02_present),
        .groups = "drop"
    )
head(overlap)

In [None]:
# Reshape to one row per patient and sample pair
overlap_long <- overlap %>%
    pivot_longer(
        cols = -Patient,
        names_to = "pair",
        names_prefix = "jaccard_", # strip the jaccard_ prefix from the pair label
        values_to = "jaccard"
    ) %>%
    # Keep strictly positive overlaps only (1 sample is missing);
    # this also discards pairs whose Jaccard index is NA
    filter(jaccard > 0)
head(overlap_long)

In [None]:
# Summarise the Jaccard index across patients for each sample pair:
# median with quartiles, mean with standard error, and number of patients
overlap_plot <- overlap_long %>%
    group_by(pair) %>%
    summarise(
        median_jacc = median(jaccard, na.rm = TRUE),
        q1 = quantile(jaccard, probs = 0.25, na.rm = TRUE),
        q3 = quantile(jaccard, probs = 0.75, na.rm = TRUE),
        mean_jacc = mean(jaccard, na.rm = TRUE),
        se = sd(jaccard, na.rm = TRUE) / sqrt(n()),
        n = n()
    ) %>%
    ungroup()
head(overlap_plot)

In [None]:
# Bar plot of the median Jaccard index per tumor-blood pair, with error bars
# spanning the first to third quartile. Tumor-tumor, blood-blood and C01 pairs
# are excluded from the figure.
options(repr.plot.width = 5, repr.plot.height = 4)
overlap_plot$pair <- factor(
    overlap_plot$pair,
    levels = c(
        "tSCR_tC02", "tSCR_bSCR", "tSCR_bC01", "tSCR_bC02", "tC02_bSCR",
        "tC02_bC01", "tC02_bC02", "bSCR_bC01", "bSCR_bC02", "bC01_bC02"
    )
)
ggplot(
    overlap_plot %>%
        filter(!(pair %in% c("tSCR_tC02", "bSCR_bC01", "bSCR_bC02", "bC01_bC02", "tSCR_bC01", "tC02_bC01"))),
    aes(x = pair, y = median_jacc)
) +
    geom_col(position = "dodge") +
    geom_errorbar(
        # lower whisker = first quartile, upper whisker = third quartile
        aes(ymin = q1, ymax = q3),
        position = position_dodge(0.9),
        width = 0.05
    ) +
    labs(y = "Jaccard Index", x = "Tumor-Blood Time Point Pairs") +
    theme_linedraw(base_size = 15) +
    theme(
        panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        panel.grid = element_blank(),
        plot.title = element_text(hjust = 0.5)
    )
ggsave(filename = file.path(fig_dir, "Tumor-Blood_Jaccard_Across.pdf"), plot = last_plot(), dpi = 300, width = 5, height = 4)

## Quantification of Tumor Matching DE Blood Clonotypes

In [None]:
# Read tumor-blood wide data
data <- qread(file.path(root_dir, "out", "data", "SP_Tumor_GEX-Blood_Full_TCR_wide_12-2025_v2.qs"), nthreads = 32)

In [None]:
dim(data)

In [None]:
# Filter data for patients with matched tumor and blood SCR and C02
patients_keep <- c("P01", "P02", "P03", "P10", "P14", "P20")
data <- data %>%
    filter(
        Patient %in% patients_keep,
        # Keep clonotypes detected in tumor (T0 / T1) or in blood at SCR / C02.
        # This removes clonotypes found only at EOT, only at C01 or only at other
        # time points. No condition on cloneSize_EOT is applied: requiring
        # cloneSize_EOT != 0 would keep only clonotypes that ARE detected at EOT,
        # the opposite of removing EOT-only clonotypes.
        cloneSize_T0 > 0 | cloneSize_T1 > 0 |
            (!is.na(SCR_counts) & SCR_counts > 0) |
            (!is.na(C02_counts) & C02_counts > 0)
    ) %>%
    select(clonotype_id, Patient, met_loc, cloneSize_T0, cloneSize_T1, SCR_counts, C02_counts, `SCR-C02_event`)
dim(data)
head(data)

In [None]:
# Flag detection (> 0) of each clonotype in the two tumor samples and in blood
# at SCR and C02; a missing blood count counts as "not detected"
data <- data %>%
    mutate(
        T0_present = cloneSize_T0 > 0,
        T1_present = cloneSize_T1 > 0,
        SCR_present = coalesce(SCR_counts > 0, FALSE),
        C02_present = coalesce(C02_counts > 0, FALSE)
    )
head(data, n = 5)
dim(data)
# Number of clonotypes detected / not detected in each sample
table(data[["T0_present"]])
table(data[["T1_present"]])
table(data[["SCR_present"]])
table(data[["C02_present"]])

In [None]:
# Label each clonotype from its SCR-C02 blood event:
#   SCR-C02_diff: any expansion or contraction event vs none
#   SCR-C02_de:   expansion events only vs everything else
data <- data %>%
    mutate(
        `SCR-C02_diff` = if_else(
            `SCR-C02_event` %in% c("expanded", "de_novo_expanded", "contracted", "de_novo_contracted"),
            "Event Related",
            "Not Event Related"
        ),
        `SCR-C02_de` = if_else(
            `SCR-C02_event` %in% c("expanded", "de_novo_expanded"),
            "DE",
            "Not DE"
        )
    )

In [None]:
# Assign each clonotype to a compartment: detected in tumor (T0 or T1),
# in blood (SCR or C02), or in both
data <- data %>%
    mutate(
        is_tumor = T0_present | T1_present,
        is_blood = SCR_present | C02_present
    ) %>%
    mutate(
        # anything not detected in tumor is labelled "Blood"
        compartment = ifelse(!is_tumor, "Blood", ifelse(is_blood, "Both", "Tumor"))
    )
table(data[["is_tumor"]])
table(data[["is_blood"]])
table(data[["compartment"]])

## Quantification of De Novo and Pre-existing Tumor Clonotypes found in Blood at SCR and C02

In [None]:
# Reload the full wide tumor/blood TCR table (overwrites the filtered `data`)
data <- qread(
    file.path(root_dir, "out", "data", "SP_Tumor_GEX-Blood_Full_TCR_wide_12-2025_v2.qs"),
    nthreads = 32
)

In [None]:
# Number of clonotype rows
nrow(data)

In [None]:
# Preview of the first rows
head(data)

In [None]:
# Patients included in the tumor post-ICI clonotype analysis
patients_keep <- c("P01", "P02", "P03", "P10", "P14", "P17", "P20", "P26", "P29")
data <- data %>%
    filter(Patient %in% patients_keep) %>%
    select(
        clonotype_id, Patient, met_loc, presence_status,
        cloneSize_T0, norm_cloneSize_T0, cloneSize_T1, norm_cloneSize_T1,
        SCR_counts, C01_counts, C02_counts, C02_frequency, SCR_frequency,
        `SCR-C02_event`, source, CD4_CD8_assignment_T1, lv1_T1, annotation_T1
    ) %>%
    filter(presence_status %in% c("Pre-existing", "De Novo")) %>% # filter for tumor post-ICI clones
    mutate(
        T0_present = cloneSize_T0 > 0,
        T1_present = cloneSize_T1 > 0,
        SCR_present = !is.na(SCR_counts) & SCR_counts > 0,
        C01_present = !is.na(C01_counts) & C01_counts > 0,
        C02_present = !is.na(C02_counts) & C02_counts > 0
    ) %>%
    # Patient-specific median of the non-zero normalized T0 clone sizes; used as
    # the baseline for De Novo clones, whose own T0 size cannot be a denominator
    group_by(Patient) %>%
    mutate(
        median_norm_cloneSize_T0 = median(na_if(norm_cloneSize_T0, 0), na.rm = TRUE)
    ) %>%
    # Ungroup so that `data` is not left grouped by Patient for later cells
    ungroup() %>%
    mutate(
        # Missing normalized T0 sizes are treated as 0
        norm_cloneSize_T0 = replace_na(norm_cloneSize_T0, 0),
        # Log2 fold change T1 vs T0 (patient median T0 as baseline for De Novo)
        LogFC = case_when(
            presence_status == "Pre-existing" ~ log2(norm_cloneSize_T1 / norm_cloneSize_T0),
            presence_status == "De Novo" ~ log2(norm_cloneSize_T1 / median_norm_cloneSize_T0),
            TRUE ~ NA_real_
        ),
        # Delta in normalized clonal sizes
        Delta = norm_cloneSize_T1 - norm_cloneSize_T0,
        # Post-ICI enriched tumor clones: LogFC >= 2; De Novo clones must also
        # have more than one cell at T1
        enriched = LogFC >= 2 & (presence_status == "Pre-existing" | cloneSize_T1 > 1),
        # Blood time point(s) where the clone is detected; NA when in neither
        blood = case_when(
            SCR_present & C02_present ~ "Both",
            SCR_present & !C02_present ~ "SCR",
            !SCR_present & C02_present ~ "C02"
        )
    )
dim(data)
head(data)

In [None]:
# Per patient, metastatic site, tumor status and SCR-C02 blood event: number and
# proportion of post-ICI enriched tumor clonotypes detected in blood at SCR,
# at C02, and at both time points
overlap_data <- data %>%
    filter(enriched) %>% # filter for enriched clonotypes
    group_by(Patient, met_loc, presence_status, blood, `SCR-C02_event`) %>%
    summarise(n = n(), .groups = "drop") %>%
    # Denominator: all enriched clonotypes of the patient / site / tumor status,
    # including those absent from blood. Grouping explicitly here keeps `blood`
    # out of the denominator, so the proportions are fractions of the enriched
    # clonotypes (as in the medians computation) and cannot exceed 1.
    group_by(Patient, met_loc, presence_status) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup() %>%
    filter(!is.na(blood)) %>%
    group_by(Patient, met_loc, presence_status, `SCR-C02_event`) %>%
    summarise(
        # clones found at both time points count towards SCR and towards C02
        n_SCR = sum(n[blood %in% c("SCR", "Both")], na.rm = TRUE),
        n_C02 = sum(n[blood %in% c("C02", "Both")], na.rm = TRUE),
        n_Both = sum(n[blood == "Both"], na.rm = TRUE),
        prop_SCR = sum(prop[blood %in% c("SCR", "Both")], na.rm = TRUE),
        prop_C02 = sum(prop[blood %in% c("C02", "Both")], na.rm = TRUE),
        prop_Both = sum(prop[blood == "Both"], na.rm = TRUE),
        .groups = "drop"
    ) %>%
    pivot_longer(
        cols = starts_with("prop_"),
        names_to = "blood",
        names_prefix = "prop_", # prop_SCR -> "SCR", prop_C02 -> "C02", prop_Both -> "Both"
        values_to = "prop"
    ) %>%
    # A proportion of 0 is set to NA so that no point is drawn for it
    mutate(prop = na_if(prop, 0))

overlap_data

In [None]:
# Median, across patients / sites, of the proportion of enriched clonotypes
# detected in blood at SCR, at C02 and at both, per tumor status
data %>%
    filter(enriched) %>% # enriched clonotypes only
    group_by(Patient, met_loc, presence_status, blood) %>%
    # keep the Patient / met_loc / presence_status grouping for the proportions
    summarise(n = n(), .groups = "drop_last") %>%
    mutate(prop = n / sum(n)) %>%
    filter(!is.na(blood)) %>%
    summarise(
        n_SCR = sum(n[blood %in% c("SCR", "Both")], na.rm = TRUE),
        n_C02 = sum(n[blood %in% c("C02", "Both")], na.rm = TRUE),
        n_Both = sum(n[blood == "Both"], na.rm = TRUE),
        prop_SCR = sum(prop[blood %in% c("SCR", "Both")], na.rm = TRUE),
        prop_C02 = sum(prop[blood %in% c("C02", "Both")], na.rm = TRUE),
        prop_Both = sum(prop[blood == "Both"], na.rm = TRUE),
        .groups = "drop"
    ) %>%
    group_by(presence_status) %>%
    summarise(
        median_SCR = median(prop_SCR),
        median_C02 = median(prop_C02),
        median_Both = median(prop_Both)
    )

In [None]:
# Per-patient proportion of enriched tumor clonotypes detected in blood
# (SCR, C02, both), one panel per tumor status
options(repr.plot.width = 6, repr.plot.height = 4)
overlap_data$Patient <- factor(
    overlap_data$Patient,
    levels = c("P10", "P02", "P14", "P03", "P01", "P20", "P26", "P29", "P17")
)
overlap_data$blood <- factor(overlap_data$blood, levels = c("SCR", "C02", "Both"))
ggplot(overlap_data, aes(x = Patient, y = prop)) +
    # line thickness is set with `linewidth`; `size` is deprecated for lines
    geom_line(linewidth = 0.1) +
    geom_point(size = 3, aes(color = blood)) +
    labs(
        x = "Patient",
        y = "Proportion in Blood",
        color = "In Blood"
    ) +
    theme_bw(base_size = 15) +
    theme(
        axis.text.x = element_text(angle = 0, hjust = 0.5),
        plot.title = element_text(hjust = 0.5)
    ) +
    scale_color_manual(
        values = c("SCR" = "#122740", "C02" = "#568b87", "Both" = "#b5d1ae")
    ) +
    facet_wrap(~presence_status, ncol = 1)
ggsave(filename = file.path(fig_dir, "Lineplot_Enriched_Blood.pdf"), plot = last_plot(), dpi = 300, width = 6, height = 4)

## Phenotype of post-ICI enriched de novo and pre-existing clonotypes in Tumor at C02

In [None]:
# Annotation composition of enriched clonotypes that are also detected in blood
# at C02, within each tumor status and CD4/CD8 assignment
phen_data <- data %>%
    filter(
        presence_status %in% c("De Novo", "Pre-existing"),
        enriched,
        C02_present,
        !is.na(CD4_CD8_assignment_T1),
        !is.na(annotation_T1),
        !(annotation_T1 %in% c("Tgd-V1", "NK-Tgd"))
    ) %>%
    group_by(presence_status, CD4_CD8_assignment_T1, annotation_T1) %>%
    # result stays grouped by presence_status and CD4_CD8_assignment_T1
    summarise(n = n(), .groups = "drop_last") %>%
    mutate(prop = n / sum(n))

head(phen_data)

In [None]:
# Fill colour for each annotation label
pal <- c(
    "CD4 central memory" = "#023fa5",
    "CD4 central memory pre-Tfh" = "#7d87b9",
    "CD4 follicular helper" = "#bec1d4",
    "CD8 IFN" = "#d6bcc0",
    "CD8 NK-like" = "#bb7784",
    "CD8 activated" = "#8e063b",
    "CD8 effector" = "#4a6fe3",
    "CD8 metabolic" = "#8595e1",
    "CD8 pre-exhausted" = "#b5bbe3",
    "CD8 resident" = "#e6afb9",
    "CD8 resident activated" = "#e07b91",
    "CD8 resident exhausted" = "#d33f6a",
    "MAIT-17" = "#11c638",
    "NK" = "#8dd593",
    "NK-Tgd" = "#c6dec7",
    "T Naive" = "#ead3c6",
    "T proliferating" = "#f0b98d",
    "Tgd-17" = "#ef9708",
    "Tgd-V1" = "#0fcfc0",
    "Th-1" = "#9cded6",
    "Th-17" = "#d5eae7",
    "Tregs" = "#f3e1eb",
    "Tregs activated" = "#f6c4e1",
    "Tregs proliferating" = "#f79cd4"
)

In [None]:
# Stacked bar plot of the annotation composition of enriched clonotypes,
# per tumor status (x axis) and CD4/CD8 assignment (facets)
options(repr.plot.width = 6, repr.plot.height = 5)
phen_data$presence_status <- factor(phen_data$presence_status, levels = c("Pre-existing", "De Novo"))
ggplot(phen_data, aes(fill = annotation_T1, y = prop, x = presence_status)) +
    # geom_col() draws the supplied values; "fill" rescales every bar to 1
    geom_col(position = "fill") +
    theme_linedraw(base_size = 15) +
    theme(
        # border thickness is set with `linewidth`; `size` is deprecated here
        panel.border = element_rect(color = "black", fill = NA, linewidth = 1.5),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        strip.background = element_rect(fill = "lightgray", color = "black"),
        strip.text = element_text(color = "black", face = "bold"),
        panel.grid = element_blank()
    ) +
    labs(x = "", y = "Proportion", fill = "Annotation") +
    guides(fill = guide_legend(ncol = 1)) +
    scale_fill_manual(values = pal) +
    facet_grid(~CD4_CD8_assignment_T1)
ggsave(filename = file.path(fig_dir, "Barplot_Lv2-Enriched_C02.pdf"), plot = last_plot(), dpi = 300, width = 6, height = 5)

## Clonal scatter of enriched clonotypes in blood

In [None]:
# Post-ICI enriched tumor clonotypes
scatter_data <- data %>%
    filter(enriched == TRUE)
nrow(scatter_data)

In [None]:
# Scatter of blood clonal proportion at C02 (x) vs SCR (y) for the enriched
# tumor clonotypes, filled by their normalized tumor T1 clone size
options(repr.plot.width = 7, repr.plot.height = 5)
library(ggpubr)
geom_params = list(shape = 21, alpha = 0.8, stroke = 1, color = "white") # not used by the plot below
# Make the T1 clone size numeric on the table that is actually plotted, then sort
# so that the largest tumor clones are drawn last (on top).
# NOTE: `data` is overwritten with the enriched subset from here on.
data <- scatter_data %>%
    mutate(norm_cloneSize_T1 = as.numeric(as.character(norm_cloneSize_T1))) %>%
    arrange(norm_cloneSize_T1)
ggplot(
    data,
    aes(
        # 1e-4 pseudo-frequency keeps clones absent from blood on the log scale
        x = log10(C02_frequency + 1e-4),
        y = log10(SCR_frequency + 1e-4),
        shape = presence_status,
        fill = log10(norm_cloneSize_T1)
    )
) +
    geom_point(color = "black", alpha = 0.8, stroke = 0.5) +
    geom_smooth(method = "lm", se = TRUE, color = "black", aes(group = presence_status)) +
    stat_cor(
        method = "pearson",
        label.x.npc = "left",
        label.y.npc = "top",
        # after_stat() replaces the deprecated ..var.. notation
        aes(label = paste(after_stat(r.label), after_stat(p.label), sep = "~`,`~")),
        size = 4
    ) +
    scale_fill_viridis_c(option = "inferno") +
    scale_shape_manual(values = c("De Novo" = 21, "Pre-existing" = 24)) +
    labs(
        x = expression(log[10]("Clonal Proportion Blood C02")),
        y = expression(log[10]("Clonal Proportion Blood SCR")),
        fill = expression("Tumor Clonal\nProportion ("*log[10] * ")" ),
        title = "Tumor Post-ICI Enriched Clones",
        shape = "Tumor Status"
    ) +
    theme_linedraw(base_size = 15) +
    theme(
        legend.position = "right",
        panel.grid = element_blank(),
        panel.border = element_rect(color = "black", linewidth = 1.5),
        plot.title = element_text(hjust = 0.5),
        axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0))
    ) +
    xlim(log10(1e-4), -1.7) + ylim(log10(1e-4), -1.7) +
    # identity line plus guides at the pseudo-frequency (clone absent from blood)
    geom_abline(slope = 1, intercept = 0, color = "black") +
    geom_hline(yintercept = log10(1e-4), color = "lightgrey", linewidth = .5) +
    geom_vline(xintercept = log10(1e-4), color = "lightgrey", linewidth = .5)
ggsave(filename = file.path(fig_dir, "ClonalScatter_Enriched_Blood.pdf"), plot = last_plot(), dpi = 300, width = 7, height = 5)