## Serpentine: Tumor and Blood Clonotypes Integration

In [None]:
# Load project configuration
# NOTE(review): setwd() changes the working directory for the whole session;
# the relative path passed to source() below is resolved against it.
setwd("/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/code")
# Run the project configuration script without echoing its code.
# NOTE(review): presumably this is where the packages and `root_dir` used
# further down come from -- confirm in helper/Config.R.
source("helper/Config.R", echo = FALSE)
# Set the repr matrix row/column limits to 100 for notebook output.
options(repr.matrix.max.rows=100, repr.matrix.max.cols=100)

# Define dirs
# Directory used below for reading the blood table and for writing the
# integrated outputs.
data_dir <- "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/data"

### Read Tumor TCR Clonotype Data with GEX information in Wide Format

In [None]:
# Read the processed tumor TCR + GEX table (wide format) from
# <root_dir>/out/data, using 32 threads.
# NOTE(review): `root_dir` is not defined in this part of the notebook;
# presumably it is set by helper/Config.R -- confirm.
tumor_data_wide <- qread(
  file.path(root_dir, "out", "data", "SERP_TCR-GEX_wide_11-2025_v2.qs"),
  nthreads = 32
)

### Read Blood TCR Clonotype Data

In [None]:
# Read the processed blood TCR table from data_dir, using 32 threads.
blood_data <- qread(
  file = file.path(data_dir, "SERP_Blood_OS_TCR_DiffExp_Wide_12-2025_v2.qs"),
  nthreads = 32
)

In [None]:
# Patient IDs to retain (per the original note: the MSS CRC patients)
patients_keep <- c(
  "P01", "P02", "P03", "P10", "P14", "P17", "P20",
  "P26", "P29", "P31", "P33", "P34", "P35"
)
# Keep only the blood clonotype rows that belong to those patients
blood_data <- filter(blood_data, Patient %in% patients_keep)
# Show the dimensions of the filtered table
dim(blood_data)

In [None]:
# Grand total of the SCR, C01 and C02 count columns, ignoring missing values
sum(
  blood_data$SCR_counts,
  blood_data$C01_counts,
  blood_data$C02_counts,
  na.rm = TRUE
)

### Integrate Tumor and Blood Clonotypes

In [None]:
# List the column names that appear in both the tumor and the blood table
intersect(colnames(tumor_data_wide), colnames(blood_data))

In [None]:
# Align the tumor table's patient column name with the blood table:
# drop any grouping, copy `patient` into `Patient`, and discard the original
# column (.keep = "unused" removes the column the new one was built from).
tumor_data_wide <- mutate(
  ungroup(tumor_data_wide),
  Patient = patient,
  .keep = "unused"
)

In [None]:
# Column names shared by the tumor and blood tables
intersect(colnames(tumor_data_wide), colnames(blood_data))

In [None]:
# Check duplicates in tumor and blood clonotypes

# Build the status message for one clonotype table.
#   clonotypes: a table with `clonotype_id` and `Patient` columns.
# Returns "No Duplicates!" when every (clonotype_id, Patient) pair occurs at
# most once, otherwise "THERE ARE DUPLICATES!".
duplicate_status <- function(clonotypes) {
  pair_counts <- as.data.table(clonotypes)[, .N, by = .(clonotype_id, Patient)]
  # The condition is a single TRUE/FALSE, so plain if/else is used instead of
  # the vectorised ifelse().
  if (any(pair_counts$N > 1)) {
    "THERE ARE DUPLICATES!"
  } else {
    "No Duplicates!"
  }
}

print("Tumor clonotypes:")
print(duplicate_status(tumor_data_wide))
print("Blood clonotypes:")
print(duplicate_status(blood_data))

In [None]:
# Count the distinct clonotype IDs present in both tables.
# The comparison uses clonotype_id alone; Patient is not taken into account.
intersect(tumor_data_wide$clonotype_id, blood_data$clonotype_id) %>%
  length()

In [None]:
# Integration (full join)
# Join tumor and blood rows on (clonotype_id, Patient). keep = TRUE retains the
# key columns of both inputs (suffixed _tumor / _blood), so a missing key on
# one side shows which table a row came from.
integrated_data <- tumor_data_wide %>%
  full_join(
    blood_data,
    by = c("clonotype_id", "Patient"),
    suffix = c("_tumor", "_blood"),
    keep = TRUE
  ) %>%
  mutate(
    # Label each row by origin: no tumor key -> blood_only,
    # no blood key -> tumor_only, otherwise present in both.
    source = if_else(
      is.na(clonotype_id_tumor),
      "blood_only",
      if_else(is.na(clonotype_id_blood), "tumor_only", "both")
    )
  )


In [None]:
# Collapse the two suffixed patient keys into a single Patient column,
# taking the tumor value first and falling back to the blood value.
# .keep = "unused" drops the two source columns afterwards.
integrated_data <- mutate(
  integrated_data,
  Patient = coalesce(Patient_tumor, Patient_blood),
  .keep = "unused"
)

In [None]:
# Collapse the two suffixed clonotype keys into a single clonotype_id column,
# taking the tumor value first and falling back to the blood value.
# .keep = "unused" drops the two source columns afterwards.
integrated_data <- mutate(
  integrated_data,
  clonotype_id = coalesce(clonotype_id_tumor, clonotype_id_blood),
  .keep = "unused"
)

In [None]:
# QC: list every (clonotype_id, Patient) pair that occurs more than once in
# the integrated table, together with its row count N
as.data.table(integrated_data)[
  , .N, by = c("clonotype_id", "Patient")
][N > 1]

In [None]:
# Count the rows whose clonotype_id is missing
summarise(integrated_data, NAs = sum(is.na(clonotype_id)))

In [None]:
# Rows carrying blood information ("both" + "blood_only"), shown next to the
# row count of the blood table for comparison
sum(integrated_data$source %in% c("both", "blood_only"))
nrow(blood_data)

# Rows carrying tumor information ("both" + "tumor_only"), shown next to the
# row count of the tumor table for comparison
sum(integrated_data$source %in% c("both", "tumor_only"))
nrow(tumor_data_wide)

In [None]:
# List blood rows whose (clonotype_id, Patient) pair has no match in the
# integrated table
blood_data %>%
  anti_join(integrated_data, by = c("clonotype_id", "Patient"))

In [None]:
# Tabulate how many rows fall into each `source` category
table(integrated_data[["source"]])

In [None]:
# Write the full integrated tumor + blood clonotype table to data_dir
qsave(
  x = integrated_data,
  file = file.path(data_dir, "SP_Tumor_GEX-Blood_Full_TCR_wide_12-2025_v2.qs")
)

In [None]:
# Save only tumor clonotypes with blood information
# Keeps the rows where cloneSize_T0, cloneSize_T1 and cloneSize_EOT are all
# non-missing, then writes that subset to data_dir.
# NOTE(review): which input table the cloneSize_* columns come from is not
# visible here -- confirm this filter matches the intent stated above.
# NOTE(review): this file name ends in "_2" while the other files in this
# notebook end in "_v2" -- confirm that is intentional.
qsave(
  filter(
    integrated_data,
    !(is.na(cloneSize_T0) | is.na(cloneSize_T1) | is.na(cloneSize_EOT))
  ),
  file = file.path(data_dir, "SP_Tumor_GEX-Blood_TCR_wide_12-2025_2.qs")
)