**<span style="color:darkred; font-size:20px;">01. Data Preparation</span>**

<div style="text-align: right;">
    <p style="text-align: left;">Updated Time: 2025-02-02</p>
</div>

In [None]:
# Clear every object in the global environment before the notebook runs.
rm(list=ls())
library(Seurat)
# Ask Seurat for "v3"-style assays in the objects created below.
# NOTE(review): presumably required by the sceasy/anndata export at the end
# of the notebook - confirm.
options(Seurat.object.assay.version = "v3")
# dplyr supplies left_join() (metadata step); data.table supplies fread().
library(dplyr)
library(data.table)

##### Set Working Directory

In [None]:
# Every relative path used below ("Dataset/...", "Processed Data/...") is
# resolved against this project directory.
setwd("/media/bio/Disk/Research Data/EBV/omicverse")
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
# NOTE(review): VROOM_CONNECTION_SIZE is an environment variable of the vroom
# reader; no cell in this notebook calls vroom directly - confirm it is
# still needed.
Sys.setenv("VROOM_CONNECTION_SIZE" = 9999999)

#### Load GSE150430 Dataset

In [None]:
# GSE150430: read the processed table, move its first column into the row
# names of a matrix, and wrap that matrix in a Seurat object.
Data_GSE150430 <- fread(
  "Dataset/GSE150430/npc_scRNA_hg19_processed_data.txt.gz",
  sep = "\t"
)
geneNames <- Data_GSE150430[[1]]
Data_GSE150430 <- as.matrix(Data_GSE150430[, -1, with = FALSE])
dim(Data_GSE150430)
rownames(Data_GSE150430) <- geneNames
Data_GSE150430[1:5, 1:4]
# orig.ident comes from the first "_"-separated field of each cell name.
scRNA_GSE150430 <- CreateSeuratObject(
  counts = Data_GSE150430,
  project = "GSE150430",
  names.field = 1,
  names.delim = "_",
  min.cells = 3
)
head(scRNA_GSE150430@meta.data)
table(scRNA_GSE150430$orig.ident)
dim(scRNA_GSE150430)
# The dense matrix is no longer needed once the Seurat object exists.
rm(Data_GSE150430)

#### Load GSE150825 Dataset

In [None]:
# GSE150825: read the 10x-format directory and build the Seurat object.
Data_GSE150825 <- Read10X(data.dir = "Dataset/GSE150825")
Data_GSE150825[1:5, 1:4]
# orig.ident comes from the second "-"-separated field of each cell name.
scRNA_GSE150825 <- CreateSeuratObject(
  counts = Data_GSE150825,
  project = "GSE150825",
  names.field = 2,
  names.delim = "-",
  min.cells = 3
)
head(scRNA_GSE150825@meta.data)
table(scRNA_GSE150825$orig.ident)
dim(scRNA_GSE150825)
# Drop the raw count matrix now that it lives inside the Seurat object.
rm(Data_GSE150825)

#### Load GSE162025 Dataset

In [None]:
# GSE162025: build one Seurat object per file found in the dataset directory,
# then merge the ten samples listed in `merge_ids` into a single object.
gse162025_dir <- "Dataset/GSE162025"
sample_list <- list.files(gse162025_dir)
scRNA_list <- list()
for (sample_file in sample_list) {
  scrna_data <- fread(file.path(gse162025_dir, sample_file))
  # The first column becomes the row names; the remaining columns form the
  # count matrix.
  geneNames <- scrna_data[[1]]
  scrna_data <- as.matrix(scrna_data[, -1, with = FALSE])
  rownames(scrna_data) <- geneNames
  # orig.ident is built from the first three "_"-separated fields of each
  # cell name.
  Seurat_object <- CreateSeuratObject(
    counts = scrna_data,
    project = "GSE162025",
    names.field = c(1, 2, 3),
    names.delim = "_",
    min.cells = 3
  )
  rm(scrna_data)
  # Characters 12-22 of the file name are used as the list key; the merge
  # below expects keys of the form "NPC_SC_18xx".
  sample_id <- substring(sample_file, 12, 22)

  scRNA_list[[sample_id]] <- Seurat_object
  rm(Seurat_object)
}
# merge
merge_ids <- c(
  "NPC_SC_1802", "NPC_SC_1805", "NPC_SC_1806", "NPC_SC_1807", "NPC_SC_1808",
  "NPC_SC_1810", "NPC_SC_1811", "NPC_SC_1813", "NPC_SC_1815", "NPC_SC_1816"
)
# Fail loudly if an expected sample was not loaded: a missing list entry is
# NULL and would otherwise be dropped from the merge without any message.
missing_ids <- setdiff(merge_ids, names(scRNA_list))
if (length(missing_ids) > 0) {
  stop(
    "GSE162025 samples not found in ", gse162025_dir, ": ",
    paste(missing_ids, collapse = ", "),
    call. = FALSE
  )
}
scRNA_GSE162025 <- merge(
  scRNA_list[[merge_ids[1]]],
  y = unname(scRNA_list[merge_ids[-1]]),
  project = "GSE162025"
)
head(scRNA_GSE162025@meta.data)
table(scRNA_GSE162025$orig.ident)
dim(scRNA_GSE162025)
rm(scRNA_list)

#### Load ChiCTR2000032317 Dataset

In [None]:
# ChiCTR2000032317: one 10x directory per sample under
# Dataset/ChiCTR2000032317/.
samples <- c("YX92", "YX93", "YX122", "YX124", "YX155", "YX156")

# Keep the per-sample objects in a list instead of creating variables with
# assign() and reading them back with eval(parse()). Each raw count matrix is
# local to the function call, so no separate clean-up of raw data is needed.
seu_obj_list <- lapply(samples, function(sample_id) {
  scs_data <- Read10X(
    data.dir = file.path("Dataset/ChiCTR2000032317", sample_id)
  )
  CreateSeuratObject(counts = scs_data, project = sample_id, min.cells = 3)
})

# merge
# add.cell.ids prefixes each cell name with its sample ID; `samples` is in
# the same order as the objects in seu_obj_list.
scRNA_ChiCTR2000032317 <- merge(
  seu_obj_list[[1]],
  y = seu_obj_list[-1],
  add.cell.ids = samples,
  project = "ChiCTR2000032317"
)
head(scRNA_ChiCTR2000032317@meta.data)
table(scRNA_ChiCTR2000032317$orig.ident)
dim(scRNA_ChiCTR2000032317)
rm(seu_obj_list)

#### Merge All Datasets

In [None]:
# Combine the four per-dataset objects into one Seurat object.
scRNA <- merge(
  scRNA_GSE150430,
  y = c(scRNA_GSE150825, scRNA_GSE162025, scRNA_ChiCTR2000032317),
  project = "scRNA_EBV"
)
# The per-dataset objects are not needed after the merge.
rm(scRNA_GSE150430, scRNA_GSE150825, scRNA_GSE162025, scRNA_ChiCTR2000032317)
table(scRNA$orig.ident)

#### Add Metadata

In [None]:
library(readxl)
# Read the first sheet of the annotation workbook as a plain data.frame.
metatable <- as.data.frame(
  read_excel("Dataset/Add_Metadata.xlsx", sheet = 1)
)
# Columns 1-5 and 7-10 are converted to factors; every other column keeps
# the type it was read with.
factor_cols <- names(metatable)[c(1:5, 7:10)]
metatable[factor_cols] <- lapply(metatable[factor_cols], as.factor)
summary(metatable)

In [None]:
# Build a per-cell table by joining the annotation table onto each cell's
# orig.ident, then attach it to the Seurat object.
metadata <- FetchData(scRNA, vars = "orig.ident")
metadata[["cell_id"]] <- rownames(metadata)
metadata <- left_join(metadata, metatable, by = "orig.ident")
# Put the cell names back as row names after the join so that AddMetaData()
# receives a table keyed by cell.
rownames(metadata) <- metadata[["cell_id"]]
scRNA <- AddMetaData(object = scRNA, metadata = metadata)
summary(scRNA@meta.data)
head(scRNA@meta.data)
table(scRNA$orig.ident)

#### Select Samples

In [None]:
# Keep cells whose Stage is not "rII", then cells whose EBV_status is not the
# string "NA".
scRNA_unfiltered <- subset(scRNA, subset = Stage != "rII")
scRNA_unfiltered <- subset(scRNA_unfiltered, subset = EBV_status != "NA")

head(scRNA_unfiltered)
# Per-sample cell counts of the filtered object, to check which samples
# remain after the two subset() calls.
table(scRNA_unfiltered$orig.ident)

In [None]:
# Persist the selected cells for the downstream notebooks.
saveRDS(
  object = scRNA_unfiltered,
  file = "Processed Data/scRNA_unfiltered.rds"
)

#### Convert Seurat Object to H5AD Format

In [None]:
library(sceasy)
# use_condaenv() is a reticulate function. Calling it through its namespace
# works whether or not library(sceasy) attaches reticulate to the search path.
reticulate::use_condaenv('npc_env')

In [None]:
# Write the filtered Seurat object as an AnnData (.h5ad) file.
sceasy::convertFormat(
  scRNA_unfiltered,
  from = "seurat",
  to = "anndata",
  outFile = "Processed Data/scRNA_unfiltered.h5ad"
)

In [None]:
# Print the R version and the attached/loaded package versions for this run.
sessionInfo()