# V-J Germline Gene Usage Analysis

### Env Setup

In [None]:
# Project setup: move to the project root, configure notebook display and
# warning behaviour, then load the shared configuration script.
setwd("/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR")
options(
    repr.matrix.max.rows = 100,
    repr.matrix.max.cols = 100,
    warn = -1  # NOTE(review): a negative value suppresses every warning from here on
)
# Relative path: resolved against the working directory set above.
source("code/helper/Config.R", echo = FALSE)

In [None]:
# Import plotting helper functions
# The script is referenced by absolute path, so it does not depend on the
# current working directory; echo = FALSE keeps the sourced code out of the output.
source("/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/code/helper/Plotting_Functions.R", echo = FALSE)

In [None]:
# Define figures path
fig_dir <- "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/figs/TCR_Fig_Jan/gene_usage"
# Create the output directory up front (including parents) so the ggsave()
# calls further down do not fail when it is missing. An already existing
# directory is left untouched; showWarnings = FALSE keeps that case silent.
dir.create(fig_dir, recursive = TRUE, showWarnings = FALSE)

### Load Tumor 10x Processed TCR Data with matched GEX Profiles (wide)

In [None]:
# Read data (wide)
# Loads the serialized .qs object stored under <root_dir>/out/data into `data`.
# NOTE(review): root_dir is not defined in the visible cells; presumably it is
# set by the sourced Config.R — confirm.
data <- qread(file = file.path(root_dir, "out", "data", "SERP_TCR-GEX_wide_11-2025_v2.qs"))

### Prepare data

In [None]:
# Keep only the patients listed in patients_keep.
# NOTE(review): no explicit time-point (SCR/C02) or metastatic-site filter is
# applied here; the patient list presumably already encodes "matched SCR and
# C02 data, liver and lung mets" — confirm.
patients_keep <- c(
    "P01", "P02", "P03", "P10", "P14", "P17", "P20",
    "P26", "P29", "P31", "P33", "P34", "P35"
)
data <- filter(data, patient %in% patients_keep)

# Quick look at the size and the first rows of the filtered table
dim(data)
head(data, 3)

In [None]:
# Derive the V and J gene segment columns from clonotype_id.
# The id is split on "_" into V, J and nt (assumed layout "<V>_<J>_<nt>" —
# confirm); the original column is kept, and VJ holds the combined "V_J" label.
data <- separate(
    data,
    col = clonotype_id,
    into = c("V", "J", "nt"),
    sep = "_",
    remove = FALSE
)
data <- mutate(data, VJ = paste(V, J, sep = "_"))
head(data, 3)

## V-J Gene usage correlations across time points and metastatic sites

In [None]:
# Consensus CD4/CD8 annotation per clonotype, driven by presence_status:
#   - "Pre-existing" with identical T0 and T1 labels -> the T1 label
#   - "De Novo"                                      -> the T1 label
#   - "Lost"                                         -> the T0 label
#   - everything else (e.g. Pre-existing with discordant labels) -> "not_matched"
# A condition that evaluates to NA yields NA for that row (ifelse semantics).
data <- mutate(
    data,
    CD4_CD8_assignment = ifelse(
        (presence_status == "Pre-existing") &
            (CD4_CD8_assignment_T0 == CD4_CD8_assignment_T1),
        CD4_CD8_assignment_T1,
        ifelse(
            presence_status == "De Novo",
            CD4_CD8_assignment_T1,
            ifelse(
                presence_status == "Lost",
                CD4_CD8_assignment_T0,
                "not_matched"
            )
        )
    )
)

In [None]:
# Compute proportions of V and J genes usage
library(ggpubr)

# Long table with one row per clonotype and time point in which the clonotype
# is present (cloneSize != 0), restricted to clonotypes whose consensus
# annotation is CD4 or CD8.
# `patient` must be kept in the selection: the per-patient V/J summaries
# computed from v_j_data group by it and fail if the column is missing.
v_j_data <- data %>%
    filter(CD4_CD8_assignment %in% c("CD4", "CD8")) %>%
    select(patient, clonotype_id, V, J, cloneSize_T0, cloneSize_T1, met_loc) %>%
    pivot_longer(
        cols = starts_with("cloneSize_"),
        names_to = "timepoint",
        values_to = "cloneSize",
        names_pattern = "cloneSize_(T\\d)"  # keep only the "T0"/"T1" suffix
    ) %>%
    mutate(met_tp = paste(met_loc, timepoint, sep = "_")) %>%
    filter(cloneSize != 0)

# Share of rows (clonotypes) using each V gene within every met_tp
v_props <- v_j_data %>%
    group_by(met_tp, V) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(met_tp) %>%
    mutate(v_prop = n / sum(n))
head(v_props,2)

# Share of rows (clonotypes) using each J gene within every met_tp
j_props <- v_j_data %>%
    group_by(met_tp, J) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(met_tp) %>%
    mutate(j_prop = n / sum(n))
head(j_props,2)

In [None]:
# Compute V and J proportions per patient
# Within each patient x met_tp, counts are turned into proportions and then
# centred-log-ratio (CLR) transformed over the genes observed in that sample.
v_props_patient <- v_j_data %>%
  group_by(patient, met_tp, V) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(patient, met_tp) %>%
  mutate(
      v_prop = n / sum(n),
      v_clr_prop = log(v_prop) - mean(log(v_prop)) # CLR transformation
  )
head(v_props_patient,2)

j_props_patient <- v_j_data %>%
  group_by(patient, met_tp, J) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(patient, met_tp) %>%
  mutate(
      j_prop = n / sum(n),
      j_clr_prop = log(j_prop) - mean(log(j_prop)) # CLR transformation
  )
head(j_props_patient,2)


# Convert to wide format for correlation (one row per patient x met_tp,
# one column per gene). Grouping is dropped first so that the id columns can
# be removed by name later on.
# NOTE(review): values_fill = 0 gives genes that are absent from a sample a
# CLR value of 0 (i.e. the sample's geometric-mean level), not a "zero usage"
# value — confirm this is intended.
v_wide <- v_props_patient %>%
  ungroup() %>%
  select(patient, met_tp, V, v_clr_prop) %>%
  pivot_wider(names_from = V, values_from = v_clr_prop, values_fill = 0)

j_wide <- j_props_patient %>%
  ungroup() %>%
  select(patient, met_tp, J, j_clr_prop) %>%
  pivot_wider(names_from = J, values_from = j_clr_prop, values_fill = 0)

# Compute correlations between all V and J per met_tp
# Result: long table with columns V, J, correlation, met_tp.
cor_list <- map_dfr(unique(v_wide$met_tp), function(tp) {
  v_tp <- v_wide %>% filter(met_tp == tp) %>% arrange(patient)
  j_tp <- j_wide %>% filter(met_tp == tp) %>% arrange(patient)

  # cor() pairs the rows of both matrices by position, so both tables must
  # list exactly the same patients in the same order.
  stopifnot(identical(as.character(v_tp$patient), as.character(j_tp$patient)))

  # Drop the id columns by name rather than by position
  v_mat <- v_tp %>% select(-patient, -met_tp) %>% as.matrix()
  j_mat <- j_tp %>% select(-patient, -met_tp) %>% as.matrix()

  # Genes with constant values across patients yield NA correlations
  cor_mat <- cor(v_mat, j_mat, method = "pearson")

  as.data.frame(as.table(cor_mat)) %>%
    rename(V = Var1, J = Var2, correlation = Freq) %>%
    mutate(met_tp = tp)
})

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)

# Heatmap of the V-J Pearson correlations, one panel per met_tp
ggplot(data = cor_list, mapping = aes(x = V, y = J, fill = correlation)) +
    geom_tile() +
    facet_wrap(~met_tp) +
    scale_fill_distiller(palette = "Purples") +
    labs(x = "V Gene", y = "J Gene", fill = "Pearson \ncorrelation") +
    theme_linedraw(base_size = 15) +
    theme(
        panel.grid = element_blank(),
        panel.border = element_rect(color = "black", fill = NA, size = 1.5),
        strip.background = element_rect(fill = "lightgray", color = "black"),
        strip.text = element_text(color = "black", face = "bold"),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 7),
        axis.text.y = element_text(size = 10),
        plot.title = element_text(hjust = 0.5)
    )

# Save the plot that was just built
ggsave(
    filename = file.path(fig_dir, "TRBV-TRBJ_Correlations_Met-TP.pdf"),
    plot = last_plot(),
    dpi = 300,
    width = 12,
    height = 6
)

## Randomness / Pattern of V-J correlations

In [None]:
# Function to compute pattern metrics
#
# Summarises how concentrated the singular-value spectrum of a matrix is.
#
# Arguments:
#   A  numeric matrix without missing or infinite values (svd() errors on those).
#   k  number of leading singular values used for frac_var_topk (default 3).
#
# Returns a list with two numbers:
#   eff_rank       exp() of the Shannon entropy of the normalised squared
#                  singular values (ranges from 1 to the number of singular values)
#   frac_var_topk  share of the summed squared singular values carried by the
#                  k largest ones
# Both are NA for an all-zero matrix, which has no spectrum to describe.
pattern_metrics <- function(A, k = 3) {
    singvals <- svd(A)$d
    power <- singvals^2
    total <- sum(power)

    if (total == 0) {
        return(list(eff_rank = NA_real_, frac_var_topk = NA_real_))
    }

    p <- power / total

    # Zero weights contribute nothing to the entropy, but 0 * log(0) evaluates
    # to NaN in R and would poison the sum (rank-deficient matrices hit this).
    p_pos <- p[p > 0]
    eff_rank <- exp(-sum(p_pos * log(p_pos)))

    # seq_len() keeps k = 0 well defined; 1:0 would count down and select
    # the first singular value instead of none.
    top_idx <- seq_len(min(k, length(power)))
    frac_var_topk <- sum(power[top_idx]) / total

    list(eff_rank = eff_rank, frac_var_topk = frac_var_topk)
}

In [None]:
# Compute pattern metrics on the combination and on random matrices to have a baseline
#
# For every met_tp the V x J correlation matrix is summarised with
# pattern_metrics() and compared against n_rand Gaussian random matrices of
# the same shape, mean and standard deviation.

set.seed(123)   # for reproducibility
n_rand <- 200   # number of random matrices per combination 
k_top <- 3      # leading singular values used for frac_var_topk; the observed
                # and the random matrices must share it to be comparable

results <- list()

for (comb in unique(cor_list$met_tp)) {

    # V x J correlation matrix of this met_tp, without the V genes that are
    # excluded from the analysis
    corr_data <- cor_list %>%
        filter(
            met_tp == comb,
            !(V %in% c("TRBV6-7", "TRBV6-8", "TRBV6-9", "TRBV7-4"))
        ) %>%
        mutate(
            V = as.character(V),
            J = as.character(J)
        ) %>%
        select(V, J, correlation) %>%
        tidyr::pivot_wider(
            names_from  = J,
            values_from = correlation
        )
    
    A <- as.matrix(corr_data[, -1, drop = FALSE])
    rownames(A) <- corr_data$V

    # svd() cannot handle missing values; fail with a message that names the
    # offending combination instead of a generic LAPACK error
    if (anyNA(A)) {
        stop(
            "Correlation matrix for '", comb, "' contains NA values; ",
            "remove genes without variation before computing pattern metrics.",
            call. = FALSE
        )
    }

    # observed metrics
    obs <- pattern_metrics(A, k = k_top)

    # parameters for random baseline
    mu <- mean(A, na.rm = TRUE)
    sdA <- stats::sd(as.numeric(A), na.rm = TRUE)

    # simulate random matrices and compute metrics
    m <- nrow(A)
    n <- ncol(A)

    rand_eff <- numeric(n_rand)
    rand_frac <- numeric(n_rand)

    for (i in seq_len(n_rand)) {
        R <- matrix(
            rnorm(m * n, mean = mu, sd = sdA),
            nrow = m, ncol = n
        )
        # same k as for the observed matrix
        met <- pattern_metrics(R, k = k_top)
        rand_eff[i]  <- met$eff_rank
        rand_frac[i] <- met$frac_var_topk
    }

    results[[comb]] <- list(
        comb       = comb,
        obs_eff    = obs$eff_rank,
        obs_frac   = obs$frac_var_topk,
        rand_eff   = rand_eff,
        rand_frac  = rand_frac
    )
}

In [None]:
# Collect observed metrics and the mean / sd of the random baseline into one
# table with a row per met_tp
summary_df <- results %>%
    map(function(res) {
        tibble(
            met_tp         = res$comb,
            obs_eff_rank   = res$obs_eff,
            obs_frac_var   = res$obs_frac,
            rand_eff_mean  = mean(res$rand_eff),
            rand_eff_sd    = sd(res$rand_eff),
            rand_frac_mean = mean(res$rand_frac),
            rand_frac_sd   = sd(res$rand_frac)
        )
    }) %>%
    bind_rows()

summary_df

In [None]:
# Compute empirical p-values
#
# One-sided Monte Carlo p-values of the observed metrics against the random
# baseline. The (1 + count) / (n + 1) estimator is used instead of the raw
# proportion so that a p-value can never be exactly 0 with a finite number
# of simulations.
pvals_df <- map_dfr(results, function(x) {
    n_eff  <- length(x$rand_eff)
    n_frac <- length(x$rand_frac)
    tibble(
        met_tp = x$comb,
        # smaller eff_rank = more pattern
        p_eff  = (1 + sum(x$rand_eff <= x$obs_eff)) / (n_eff + 1),
        # larger frac_var_topk = more pattern
        p_frac = (1 + sum(x$rand_frac >= x$obs_frac)) / (n_frac + 1)
    )
})

pvals_df

In [None]:
# Effective rank visualization

options(repr.plot.width = 6, repr.plot.height = 4)

# Long table: all simulated effective ranks first, followed by the single
# observed value of each met_tp
plot_eff <- bind_rows(
    map_dfr(results, function(res) {
        tibble(met_tp = res$comb, eff_rank = res$rand_eff, type = "random")
    }),
    map_dfr(results, function(res) {
        tibble(met_tp = res$comb, eff_rank = res$obs_eff, type = "observed")
    })
)

# Density of the random baseline per met_tp, with the observed value drawn as
# a vertical line in the same colour
ggplot(plot_eff, aes(x = eff_rank, color = met_tp)) +
    geom_density(
        data = plot_eff[plot_eff$type == "random", ],
        alpha = 0.2
    ) +
    geom_vline(
        data = plot_eff[plot_eff$type == "observed", ],
        aes(xintercept = eff_rank, shape = "observed", col = met_tp),
        size = 1
    ) +
    scale_color_manual(
        values = c("Liver_T0"="#eaa5a5", "Liver_T1"="#a00000", "Lung_T0"="#a7cde3", "Lung_T1"="#1a80bb")
    ) +
    labs(
        title = "Effective rank: observed \n vs random baseline ",
        x = "Effective rank",
        y = "Density",
        color = "",
        shape = ""
    ) +
    theme_linedraw(base_size = 15) +
    theme(
        plot.title = element_text(hjust = 0.5),
        panel.border = element_rect(color = "black", fill = NA, size = 1.5)
    )

ggsave(
    filename = file.path(fig_dir, "Effective_Rank.pdf"),
    plot = last_plot(),
    dpi = 300,
    width = 6,
    height = 4
)

In [None]:
# Fraction of variance explained by top k singular values visualization

options(repr.plot.width = 6, repr.plot.height = 4)

# Long table: all simulated fractions first, followed by the single observed
# value of each met_tp
plot_frac <- bind_rows(
    map_dfr(results, function(res) {
        tibble(met_tp = res$comb, frac_var = res$rand_frac, type = "random")
    }),
    map_dfr(results, function(res) {
        tibble(met_tp = res$comb, frac_var = res$obs_frac, type = "observed")
    })
)

# Density of the random baseline per met_tp, with the observed value drawn as
# a vertical line in the same colour
ggplot(plot_frac, aes(x = frac_var, color = met_tp)) +
    geom_density(
        data = plot_frac[plot_frac$type == "random", ],
        alpha = 0.2
    ) +
    geom_vline(
        data = plot_frac[plot_frac$type == "observed", ],
        aes(xintercept = frac_var, shape = "observed", col = met_tp),
        size = 1
    ) +
    scale_color_manual(
        values = c("Liver_T0"="#eaa5a5", "Liver_T1"="#a00000", "Lung_T0"="#a7cde3", "Lung_T1"="#1a80bb")
    ) +
    labs(
        title = "Variance explained by top k SVs:\n observed vs random baseline ",
        x = "Variance Fraction",
        y = "Density",
        color = "",
        shape = ""
    ) +
    theme_linedraw(base_size = 15) +
    theme(
        plot.title = element_text(hjust = 0.5),
        panel.border = element_rect(color = "black", fill = NA, size = 1.5)
    )

ggsave(
    filename = file.path(fig_dir, "Fraction_Variance.pdf"),
    plot = last_plot(),
    dpi = 300,
    width = 6,
    height = 4
)

## Correlation similarity (Pearson and cosine)

In [None]:
# Build matrices for all met_tp

# Named list of matrices A_tp, each TRBV (rows) x TRBJ (columns).
# split() names every element after its own met_tp value, so a label can never
# end up on the wrong matrix. (Naming the output of group_split() with
# unique(cor_list$met_tp) is only correct when the order of appearance in
# cor_list happens to equal the sorted group order.)
matrices <- cor_list %>%
    filter(!(V %in% c("TRBV6-7", "TRBV6-8", "TRBV6-9", "TRBV7-4"))) %>% # remove combinations not found in all conditions
    mutate(
        V = as.character(V),
        J = as.character(J)
    ) %>%
    split(.$met_tp) %>%
    lapply(function(df) {
        # one row per V gene, one column per J gene
        wide <- df %>%
            select(V, J, correlation) %>%
            pivot_wider(
                names_from  = J,
                values_from = correlation
            )
        M <- as.matrix(wide[, -1, drop = FALSE])
        rownames(M) <- wide$V
        M
    })

In [None]:
# Align matrices and compute pairwise similarities
#
# align_mats() expands two matrices with row and column names onto a common
# grid: the union of their row names by the union of their column names.
# Cells a matrix does not provide are NA.
# Returns list(A = <expanded A>, B = <expanded B>) with identical dimnames.
align_mats <- function(A, B) {
    row_ids <- union(rownames(A), rownames(B))
    col_ids <- union(colnames(A), colnames(B))

    # Place one matrix into an all-NA matrix spanning the common grid
    embed <- function(M) {
        out <- matrix(
            NA_real_,
            nrow = length(row_ids),
            ncol = length(col_ids),
            dimnames = list(row_ids, col_ids)
        )
        out[rownames(M), colnames(M)] <- M
        out
    }

    list(A = embed(A), B = embed(B))
}

met_levels <- names(matrices)
n <- length(met_levels)

# Pairwise similarity between the correlation matrices of all met_tp pairs
# (every ordered pair, including each matrix with itself). Rows are produced
# with met_tp_1 as the outer and met_tp_2 as the inner iteration; the empty
# typed tibble fixes the column types even when there is nothing to compare.
sim_df <- bind_rows(
    tibble(
        met_tp_1 = character(),
        met_tp_2 = character(),
        sim_pearson = numeric(),
        cosine_sim = numeric()
    ),
    map_dfr(met_levels, function(tp1) {
        map_dfr(met_levels, function(tp2) {
            # put both matrices on the same V x J grid
            aligned <- align_mats(matrices[[tp1]], matrices[[tp2]])
            a_vec <- as.vector(aligned$A)
            b_vec <- as.vector(aligned$B)

            # only cells available in both matrices enter the comparison
            shared <- !is.na(a_vec) & !is.na(b_vec)
            a_vec <- a_vec[shared]
            b_vec <- b_vec[shared]

            tibble(
                met_tp_1 = tp1,
                met_tp_2 = tp2,
                sim_pearson = cor(a_vec, b_vec),
                cosine_sim = sum(a_vec * b_vec) /
                    (sqrt(sum(a_vec^2)) * sqrt(sum(b_vec^2)))
            )
        })
    })
)



In [None]:
# Pearson similarity heatmap
# One tile per met_tp pair, labelled with the similarity rounded to two
# decimals; the colour scale diverges around 0.
ggplot(sim_df, aes(x = met_tp_1, y = met_tp_2, fill = sim_pearson)) +
    geom_tile(color = "white") +
    geom_text(aes(label = sprintf("%.2f", sim_pearson)), size = 3) +
    scale_fill_gradient2(
        low = "firebrick2",
        mid = "white",
        high = "dodgerblue4",
        midpoint = 0
    ) +
    labs(
        title = "Pearson similarity between \n V-J correlation matrices",
        x = "",
        y = "",
        fill = "Pearson\nsimilarity"
    ) +
    coord_fixed() +
    theme_linedraw(base_size = 15) +
    theme(
        panel.border = element_rect(color = "black", fill = NA, size = 1.5),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, hjust = 1)
    )

ggsave(
    filename = file.path(fig_dir, "Pearson_Similarity.pdf"),
    plot = last_plot(),
    dpi = 300,
    width = 6,
    height = 4
)

In [None]:
# Cosine similarity heatmap
# One tile per met_tp pair, labelled with the similarity rounded to two
# decimals; the colour scale diverges around 0.
ggplot(sim_df, aes(x = met_tp_1, y = met_tp_2, fill = cosine_sim)) +
    geom_tile(color = "white") +
    geom_text(aes(label = sprintf("%.2f", cosine_sim)), size = 3) +
    scale_fill_gradient2(
        low = "firebrick2",
        mid = "white",
        high = "dodgerblue4",
        midpoint = 0
    ) +
    labs(
        title = "Cosine similarity between \n V-J correlation matrices",
        x = "",
        y = "",
        fill = "Cosine\nsimilarity"
    ) +
    coord_fixed() +
    theme_linedraw(base_size = 15) +
    theme(
        panel.border = element_rect(color = "black", fill = NA, size = 1.5),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, hjust = 1)
    )

ggsave(
    filename = file.path(fig_dir, "Cosine_Similarity.pdf"),
    plot = last_plot(),
    dpi = 300,
    width = 6,
    height = 4
)