## Serpentine: Blood Omniscope TCR Clonotype Data (OS-TCR) Processing 

In [None]:
# Load project configuration
# The working directory is set to the project root first so that the relative
# path passed to source() below resolves against it.
setwd("/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR")
# NOTE(review): Config.R presumably attaches the packages used further down
# (fread, map, %>%, str_extract, qsave/qread are never loaded in this notebook) - confirm.
source("code/helper/Config.R", echo = FALSE)
# Notebook display limits for tables (up to 100 rows and 100 columns).
options(repr.matrix.max.rows=100, repr.matrix.max.cols=100)
# tidyr is loaded explicitly here; pivot_wider() is used below.
library(tidyr)

In [None]:
# Define vars
# OS_data_dir: root of the Omniscope input data; the cells below read from its
#              "clone_files" and "diff_exp" sub-directories.
# data_dir:    output directory where the processed .qs files are written.
OS_data_dir <- "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/data/Omniscope"
data_dir <- "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE_TCR/out/data"

In [None]:
# Read a single example clonotype file (patient 01, time point c01).
# The path is built from OS_data_dir, like the other cells, instead of repeating
# the absolute project path; the resulting file path is unchanged.
# NOTE(review): `data` is not used by any later cell visible here - it looks like
# an exploratory read; confirm whether it is still needed.
data <- fread(file.path(OS_data_dir, "clone_files", "spe_1_01_c01.csv.zip"))

### Read and process sample-specific Blood TCR Clonotype Files

In [None]:
# Process blood TCR clonotype data per patient and combine it into a single data frame.
#
# For every patient the loop:
#   1. reads all clonotype files whose name starts with "spe_1_<patient number>",
#   2. tags each row with Sample_ID, Patient and Timepoint (parsed from the file name),
#   3. reports clonotypes (contig_id) that occur more than once within a time point,
#   4. pivots to wide format: one row per clonotype and one
#      "<Timepoint>_counts" / "<Timepoint>_frequency" column pair per time point.
# The per-patient wide tables are finally stacked into `all_blood_data_wide`.

patients <- c("P01", "P02", "P03", "P04", "P05", "P06", "P07", "P08", "P09", "P10", "P11", "P12", "P13", "P14", "P15", "P16", 
             "P17", "P18", "P19", "P20", "P21", "P22", "P24", "P25", "P26", "P27", "P28", "P29")
df_list <- list()

# Loop invariants: input directory and the time point code -> label mappings
clone_dir <- file.path(OS_data_dir, "clone_files")
timepoint_codes <- c("scr", "c01", "c02", "c03", "c04", "eot", "lti", "disc", "c06", "c14")
timepoint_labels <- c("SCR", "C01", "C02", "C03", "C04", "EOT", "LTI", "DISC", "C06", "C14")
timepoint_short_labels <- c("T0", "T1", "T2", "T3", "T4", "EOT", "LTI", "DISC", "T6", "T14")

for (p in patients) {

    # ------------------------- #
    # Process Blood OS TCR data #
    # ------------------------- #

    # Read OS blood clonotype data (list the directory once; derive the short
    # names from the full paths so both vectors are guaranteed to line up)
    long_filenames <- list.files(clone_dir, pattern = paste0("^spe_1_", sub("P", "", p)), full.names = TRUE)
    if (length(long_filenames) == 0) {
        # Without this guard the mutate() calls below fail with an unrelated
        # "column not found" error on the empty table
        warning("No blood clonotype files found for patient ", p, "; skipping", call. = FALSE)
        next
    }
    filenames <- basename(long_filenames)
    blood_data <- map(long_filenames, fread)

    # Rename dfs: "spe_1_01_c01.csv.zip" -> "P01_c01"
    sample_names <- sub("\\.csv(\\.zip)?$", "", filenames)
    sample_names <- paste0("P", sub("^spe_1_", "", sample_names))
    names(blood_data) <- sample_names

    # Add Sample, Patient and Timepoint information
    full_blood_data <- lapply(sample_names, function(sample_name) {
        df <- blood_data[[sample_name]]
        sample_id <- strsplit(sample_name, "_")[[1]]
        df %>%
            mutate(
                Sample_ID = sample_name,
                Patient = sample_id[1],
                Timepoint = sample_id[2]
            )
    })

    # Stack the per-sample tables; "id" holds the position of the sample in the list
    full_blood_data <- bind_rows(full_blood_data, .id = "id")

    # Time point codes outside the known set turn into NA in factor() below and
    # would end up in "NA_counts" / "NA_frequency" columns, so make that visible
    unknown_timepoints <- setdiff(unique(full_blood_data$Timepoint), timepoint_codes)
    if (length(unknown_timepoints) > 0) {
        warning("Patient ", p, ": unrecognised time point code(s): ",
                paste(unknown_timepoints, collapse = ", "), call. = FALSE)
    }

    # Rename Timepoint names
    full_blood_data <- full_blood_data %>%
        mutate(Timepoint = factor(Timepoint, levels = timepoint_codes, labels = timepoint_labels))

    # Add a new formatted Timepoint column
    full_blood_data <- full_blood_data %>%
        mutate(Timepoint_2 = factor(Timepoint, levels = timepoint_labels, labels = timepoint_short_labels))

    # Check for duplicated clonotypes (same contig_id more than once within a time point).
    # The result is reported with message(): values are not auto-printed inside a
    # for loop, so a bare ifelse() here would never show anything.
    n_duplicated <- nrow(as.data.table(full_blood_data)[, .N, by = .(contig_id, Timepoint)][N > 1])
    if (n_duplicated == 0) {
        message(p, ": No Duplicates")
    } else {
        message(p, ": There are duplicates detected (shared contig_id) in ", n_duplicated,
                " clonotype/time point combination(s)")
    }
    # Duplicates of P01 are ~1k, which is insignificant considering the total amount of clonotypes in blood


    # ----------------------------------------------- #
    # Convert patient-specific TCR df to wide format  #
    # ----------------------------------------------- #

    # Convert patient clonotype data into wide format, so a clonotype present at
    # multiple time points gets additional columns instead of duplicated rows
    full_blood_data$counts <- as.integer(full_blood_data$counts)
    print(names(full_blood_data))
    full_blood_data_wide <- full_blood_data %>%
        pivot_wider(
            id_cols = c(contig_id, cdr3, cdr3_aa, Patient), # Check patient,
            names_from = Timepoint,
            values_from = c(counts, frequency),
            names_glue = "{Timepoint}_{.value}",
            # Duplicated clonotype/time point rows are collapsed with unique():
            # identical values become one value; they are NOT added up.
            # NOTE(review): earlier comments described this step as "adding their
            # counts and frequencies". If summing is what is intended, values_fn
            # should be `sum`; confirm before changing, as it alters the output.
            values_fn = list(counts = unique, frequency = unique)
        )

    # Add the patient-specific df to the df list
    df_list[[p]] <- full_blood_data_wide
}

# Concatenate patient-specific dfs into a single df
all_blood_data_wide <- bind_rows(df_list)

In [None]:
# Report whether the combined wide table contains any fully identical rows
# (anyDuplicated() returns 0 when there are none, otherwise the index of the first one)
cat(
    if (anyDuplicated(all_blood_data_wide) > 0) {
        "THERE ARE DUPLICATES!\n"
    } else {
        "No Duplicates!\n"
    }
)

In [None]:
# Write the combined wide blood clonotype table to the output directory (qs format)
all_blood_data_wide %>%
    qsave(file = file.path(data_dir, "SERP_Blood_OS_TCR_Wide_09-2025_v2.qs"))

In [None]:
# Table size: one row per clonotype
paste("Number of Clonotypes:", nrow(all_blood_data_wide)) %>%
    print()

# Total molecules: sum over every "<Timepoint>_counts" column, ignoring NAs
# (a clonotype absent at a time point has NA there)
all_blood_data_wide[, grep("_counts$", names(all_blood_data_wide))] %>%
    colSums(na.rm = TRUE) %>%
    sum() %>%
    paste("Number of T cells / Molecules:", .) %>%
    print()

In [None]:
# Reload the saved wide blood clonotype table from the output directory (32 reader threads)
all_blood_data_wide <- file.path(data_dir, "SERP_Blood_OS_TCR_Wide_09-2025_v2.qs") %>%
    qread(nthreads = 32)

In [None]:
# List the patients present in the wide table, in order of first appearance
unique(all_blood_data_wide[["Patient"]])

### Add Differential Expansion Information to the Blood TCR Clonotypes

In [None]:
# Load the wide blood clonotype table that the differential-expansion columns
# are joined onto in this section (32 reader threads)
all_blood_data_wide <- file.path(data_dir, "SERP_Blood_OS_TCR_Wide_09-2025_v2.qs") %>%
    qread(nthreads = 32)

In [None]:
# Process blood TCR data by adding differential expansion information.
#
# For every patient, each differential-expansion file found under
# <OS_data_dir>/diff_exp/<patient>/ is read and its pre_count / post_count /
# event columns are left-joined onto that patient's wide clonotype table,
# prefixed with the comparison label taken from the file name
# (e.g. "SCR-C02_pre_count"). The result is one table per patient in `df_list`.
patients <- c("P01", "P02", "P03", "P04", "P05", "P06", "P07", "P08", "P09", "P10", "P11", "P12", "P13", "P14", "P15", "P16", 
             "P17", "P18", "P19", "P20", "P21", "P22", "P24", "P25", "P26", "P27", "P28", "P29")
df_list <- list()

# Columns taken from each diff-exp file; the last three are renamed per comparison
join_keys <- c("contig_id", "Patient")
value_cols <- c("pre_count", "post_count", "event")

# iterate over patients
for (p in patients) {
    message("Processing patient: ", p)

    # subset patient-specific wide blood clonotype
    p_blood <- all_blood_data_wide %>%
        filter(Patient == p)

    # list all diff-exp files
    diff_files <- list.files(
        file.path(OS_data_dir, "diff_exp", p),
        pattern = "\\.csv(\\.zip)?$",
        full.names = TRUE,
        recursive = TRUE
    )

    # read + process each file
    for (f in diff_files) {
        # comparison label = the token right before "_event_related_clonotypes.csv"
        sample_id <- str_extract(basename(f), "(?<=_)[^_]+(?=_event_related_clonotypes\\.csv)") # e.g. SCR-C02

        # The listing above accepts any csv; a file with a different naming scheme
        # yields NA here and would otherwise create "NA_pre_count"-style columns
        if (is.na(sample_id)) {
            warning("Skipping file without a comparison label in its name: ", f, call. = FALSE)
            next
        }

        new_cols <- paste0(sample_id, "_", value_cols)

        # A second file for the same comparison would make left_join() emit
        # ".x"/".y" suffixed columns; keep the first one and flag the rest
        if (any(new_cols %in% names(p_blood))) {
            warning("Patient ", p, ": comparison ", sample_id,
                    " already merged; skipping ", f, call. = FALSE)
            next
        }

        dt <- fread(f)
        dt[, Patient := p]

        # keep the join keys and rename the selected columns with the comparison prefix
        cols_to_keep <- c(join_keys, value_cols)
        dt <- dt[, ..cols_to_keep]
        setnames(dt, old = value_cols, new = new_cols)

        # merge
        p_blood <- p_blood %>%
            left_join(dt, by = join_keys)
    }

    df_list[[p]] <- p_blood
}

In [None]:
# Stack the per-patient tables into one data frame and preview its first rows
all_diffexp_data <- df_list %>%
    bind_rows()
head(all_diffexp_data)

In [None]:
# Derive clonotype_id from contig_id (consistency with tumor data):
# "<TRBV...>-<TRBJ...>-<rest>" is rewritten as "<TRBV...>_<TRBJ...>_<rest>";
# values that do not match the pattern are carried over unchanged.
all_diffexp_data <- mutate(
    all_diffexp_data,
    clonotype_id = str_replace(
        string = contig_id,
        pattern = "^(TRBV[0-9A-Z\\-]+)-(TRBJ[0-9A-Z\\-]+)-(.*)$",
        replacement = "\\1_\\2_\\3"
    )
)

In [None]:
# Write the blood clonotype table with differential-expansion columns to the output directory (qs format)
all_diffexp_data %>%
    qsave(file = file.path(data_dir, "SERP_Blood_OS_TCR_DiffExp_Wide_12-2025_v2.qs"))