### read GSE data and filter using metadata (condition is single cell,  HGG, human).  

In [8]:
CANCER_TYPE = "HGG"

In [2]:
getwd()

In [9]:
meta <- read.table("./in/gse_metadata.tsv", header = TRUE)
head(meta)

Unnamed: 0_level_0,GSE_ID,GSM_ID,CANCER_TYPE,SEQUENCING_TYPE,TECHNOLOGY,SEX,LOCATION,AGE
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,GSE163655,GSM4983564,HGG,ATAC,SC,,,
2,GSE163655,GSM4983566,HGG,ATAC,SC,,,
3,GSE163656,GSM4983567,HGG,ATAC,SC,,,
4,GSE163656,GSM4983568,HGG,ATAC,SC,,,
5,GSE163656,GSM4983569,HGG,ATAC,SC,,,
6,GSE210568,GSM6432679,HGG,ATAC,SN,F,PONS,11.0


In [10]:
keep_meta <- subset(subset(subset(meta, subset=SEQUENCING_TYPE %in% c("RNA", "RNA+ATAC")), subset=CANCER_TYPE==CANCER_TYPE), subset=TECHNOLOGY=="SC")
keep_meta

Unnamed: 0_level_0,GSE_ID,GSM_ID,CANCER_TYPE,SEQUENCING_TYPE,TECHNOLOGY,SEX,LOCATION,AGE
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
29,GSE210568,GSM6432703,EPENDYMOMA,RNA,SC,M,POSTERIOR-FOSSA,9.6
30,GSE210568,GSM6432704,EPENDYMOMA,RNA,SC,M,POSTERIOR-FOSSA,0.9
31,GSE210568,GSM6432705,HGG,RNA,SC,F,PARIETO-OCCIPITAL-LOBE,20.0
32,GSE210568,GSM6432706,HGG,RNA,SC,M,CORTEX,8.0
49,GSE210568,GSM6432723,HGG,RNA,SC,M,LEFT-THALAMUS,14.0
50,GSE210568,GSM6432724,HGG,RNA,SC,M,RIGHT-FRONTAL-LOBE,12.0
51,GSE210568,GSM6432725,HGG,RNA,SC,M,BRAIN,9.75
56,GSE210568,GSM6432730,EPENDYMOMA,RNA,SC,M,POSTERIOR-FOSSA,0.5
57,GSE210568,GSM6432731,HGG,RNA,SC,M,PARIETAL-LOBE,12.0
61,GSE210568,GSM6432735,HGG,RNA,SC,M,BRAIN,


In [11]:
keep_meta$GSM_ID

In [12]:
library(Seurat)
library(dplyr)

base_dir <- "./in"

 datasets <- c("GSE231859", "GSE210568", "GSE163656", "GSE163655" ,"GSE222850","GSE244433")
seurat_objects <- list()

find_file <- function(dir, pattern) {
  files <- Sys.glob(file.path(dir, pattern))
  if (length(files) > 0) {
    return(files[1])  # Return the first match
  } else {
    return(NULL)
  }
}

for (gse in datasets) {
  gse_dir <- file.path(base_dir, gse)
  barcodes_files <- Sys.glob(file.path(gse_dir, "*_barcodes.tsv"))
  sample_ids <- unique(sub("_barcodes.tsv$", "", basename(barcodes_files)))

  cat("\nsample_ids ids",sample_ids, "\n")

  for (sample_id in sample_ids) {
    sample_gsm_id <- strsplit(sample_id, "_")[[1]][1]
    if (!(sample_gsm_id %in% keep_meta$GSM_ID)) {
      cat("Skipping sample:", sample_id, "not found in metadata.\n")
    next
    }
    print(sample_id)
    barcodes_file <- find_file(gse_dir, paste0(sample_id, "*_barcodes.tsv"))
    features_file <- find_file(gse_dir, paste0(sample_id, "*_features.tsv"))
    matrix_file   <- find_file(gse_dir, paste0(sample_id, "*_matrix.mtx"))

    if (!is.null(barcodes_file) && !is.null(features_file) && !is.null(matrix_file)) {

      cat("  Found single-cell RNA-seq files for sample:", sample_id, "\n")
      # Adjust feature.column if needed; commonly, column 2 is gene name if 10x format
      counts <- ReadMtx(
        mtx = matrix_file,
        features = features_file,
        cells = barcodes_file,
        feature.column = 2, # Adjust this if needed based on your file format
        cell.column = 1
      )

      # Add a prefix to the cell names to track sample of origin
      colnames(counts) <- paste0(sample_id, "_", colnames(counts))
      # Create a Seurat object
      seurat_obj <- CreateSeuratObject(
        counts = counts,
        project = sample_id,
        min.cells = 3,
        min.features = 200
      )
      seurat_obj@meta.data$CANCER_TYPE = CANCER_TYPE
      # Store the Seurat object in the list
      seurat_objects[[sample_id]] <- seurat_obj
    } else {
      cat("  Missing one or more files for sample:", sample_id, "\n")
    }
  }
}
cat("Processing complete.\n")
print("saving files")




sample_ids ids GSM7305260_834 GSM7305261_868 GSM7305262_905 GSM7305263_910 GSM7305264_954 GSM7305265_968 GSM7305266_1107 GSM7305267_1144 GSM7305268_1187 GSM7305269_1190 GSM7305270_1196 GSM7305271_1214 GSM7305272_1241 GSM7305273_1414 GSM7305274_1421 GSM7305275_1431 GSM7305276_1437 GSM7305277_1458 GSM7305278_1463_B 
[1] "GSM7305260_834"
  Found single-cell RNA-seq files for sample: GSM7305260_834 
[1] "GSM7305261_868"
  Found single-cell RNA-seq files for sample: GSM7305261_868 
[1] "GSM7305262_905"
  Found single-cell RNA-seq files for sample: GSM7305262_905 
[1] "GSM7305263_910"
  Found single-cell RNA-seq files for sample: GSM7305263_910 
[1] "GSM7305264_954"
  Found single-cell RNA-seq files for sample: GSM7305264_954 
[1] "GSM7305265_968"
  Found single-cell RNA-seq files for sample: GSM7305265_968 
[1] "GSM7305266_1107"
  Found single-cell RNA-seq files for sample: GSM7305266_1107 
[1] "GSM7305267_1144"
  Found single-cell RNA-seq files for sample: GSM7305267_1144 
[1] "GSM7305268

"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432704_BT2018022"
  Found single-cell RNA-seq files for sample: GSM6432704_BT2018022 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432705_P-1190_S-1197"
  Found single-cell RNA-seq files for sample: GSM6432705_P-1190_S-1197 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432706_P-1569_S-1569"
  Found single-cell RNA-seq files for sample: GSM6432706_P-1569_S-1569 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


Skipping sample: GSM6432707_P-1713_S-1713 not found in metadata.
Skipping sample: GSM6432708_P-1741_S-2756 not found in metadata.
Skipping sample: GSM6432709_P-1764_S-1766 not found in metadata.
Skipping sample: GSM6432710_P-1775_S-1775 not found in metadata.
Skipping sample: GSM6432711_P-1969_S-3584 not found in metadata.
Skipping sample: GSM6432712_P-2077_S-2077 not found in metadata.
Skipping sample: GSM6432713_P-2687_S-2688 not found in metadata.
Skipping sample: GSM6432714_P-3198_S-3199 not found in metadata.
Skipping sample: GSM6432715_P-3200_S-3254 not found in metadata.
Skipping sample: GSM6432716_P-3200_S-3867 not found in metadata.
Skipping sample: GSM6432717_P-3387_S-3411 not found in metadata.
Skipping sample: GSM6432718_P-3407_S-3447 not found in metadata.
Skipping sample: GSM6432719_P-4198_S-4459 not found in metadata.
Skipping sample: GSM6432720_P-4504_S-4916 not found in metadata.
Skipping sample: GSM6432721_P-5099_S-6218 not found in metadata.
Skipping sample: GSM64327

"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432724_P-6166_S-8321"
  Found single-cell RNA-seq files for sample: GSM6432724_P-6166_S-8321 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432725_P-6240_S-8628"
  Found single-cell RNA-seq files for sample: GSM6432725_P-6240_S-8628 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


Skipping sample: GSM6432726_P-6251_S-8496 not found in metadata.
Skipping sample: GSM6432727_P-6253_S-8498 not found in metadata.
Skipping sample: GSM6432728_P-6254_S-8499 not found in metadata.
Skipping sample: GSM6432729_P-6255_S-8500 not found in metadata.
[1] "GSM6432730_P-6292_S-8579"
  Found single-cell RNA-seq files for sample: GSM6432730_P-6292_S-8579 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432731_P-6308_S-8632"
  Found single-cell RNA-seq files for sample: GSM6432731_P-6308_S-8632 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


Skipping sample: GSM6432732_P-6328_S-8672 not found in metadata.
Skipping sample: GSM6432733_P-6329_S-8673 not found in metadata.
Skipping sample: GSM6432734_P-6331_S-8904 not found in metadata.
[1] "GSM6432735_P-6337_S-8821"
  Found single-cell RNA-seq files for sample: GSM6432735_P-6337_S-8821 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


Skipping sample: GSM6432736_P-6388_S-8770 not found in metadata.
[1] "GSM6432737_P-6431_S-8842"
  Found single-cell RNA-seq files for sample: GSM6432737_P-6431_S-8842 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432738_P-6519_S-9084"
  Found single-cell RNA-seq files for sample: GSM6432738_P-6519_S-9084 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6432739_P-6640_S-9581"
  Found single-cell RNA-seq files for sample: GSM6432739_P-6640_S-9581 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"



sample_ids ids GSM4983567_2937 GSM4983568_3402 GSM4983569_4021 
Skipping sample: GSM4983567_2937 not found in metadata.
Skipping sample: GSM4983568_3402 not found in metadata.
Skipping sample: GSM4983569_4021 not found in metadata.

sample_ids ids GSM4983564_2932 GSM4983566_3749 
Skipping sample: GSM4983564_2932 not found in metadata.
Skipping sample: GSM4983566_3749 not found in metadata.

sample_ids ids GSM6934152_PA1 GSM6934153_PA2 GSM6934154_PA3 GSM6934155_PA4 GSM6934156_PA5 GSM6934157_PMA1 
[1] "GSM6934152_PA1"
  Found single-cell RNA-seq files for sample: GSM6934152_PA1 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6934153_PA2"
  Found single-cell RNA-seq files for sample: GSM6934153_PA2 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


Skipping sample: GSM6934154_PA3 not found in metadata.
[1] "GSM6934155_PA4"
  Found single-cell RNA-seq files for sample: GSM6934155_PA4 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6934156_PA5"
  Found single-cell RNA-seq files for sample: GSM6934156_PA5 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"


[1] "GSM6934157_PMA1"
  Found single-cell RNA-seq files for sample: GSM6934157_PMA1 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"



sample_ids ids GSM7816061_3mthFMC GSM7816062_3mthFMC2 GSM7816063_PBMC GSM7816065_6w GSM7816066_Dura GSM7816067_H590 GSM7816068_H966 GSM7816069_440h1 GSM7816070_875h2 GSM7816071_SPA1 GSM7816072_WT_F GSM7816073_WT_M GSM7816074_HET GSM8488329_6mthFMC2 
Skipping sample: GSM7816061_3mthFMC not found in metadata.
Skipping sample: GSM7816062_3mthFMC2 not found in metadata.
Skipping sample: GSM7816063_PBMC not found in metadata.
Skipping sample: GSM7816065_6w not found in metadata.
Skipping sample: GSM7816066_Dura not found in metadata.
[1] "GSM7816067_H590"
  Found single-cell RNA-seq files for sample: GSM7816067_H590 
[1] "GSM7816068_H966"
  Found single-cell RNA-seq files for sample: GSM7816068_H966 
[1] "GSM7816069_440h1"
  Found single-cell RNA-seq files for sample: GSM7816069_440h1 
[1] "GSM7816070_875h2"
  Found single-cell RNA-seq files for sample: GSM7816070_875h2 
[1] "GSM7816071_SPA1"
  Found single-cell RNA-seq files for sample: GSM7816071_SPA1 
Skipping sample: GSM7816072_WT_F no

In [13]:
names(seurat_objects)

In [14]:

for (name in names(seurat_objects)) {
    print(name)
    fname <- paste0("./out/", name, "_", CANCER_TYPE, ".rds")
    saveRDS(seurat_objects[[name]], file=fname)
}
cat("Files saved.\n")

[1] "GSM7305260_834"
[1] "GSM7305261_868"
[1] "GSM7305262_905"
[1] "GSM7305263_910"
[1] "GSM7305264_954"
[1] "GSM7305265_968"
[1] "GSM7305266_1107"
[1] "GSM7305267_1144"
[1] "GSM7305268_1187"
[1] "GSM7305269_1190"
[1] "GSM7305270_1196"
[1] "GSM7305271_1214"
[1] "GSM7305272_1241"
[1] "GSM7305273_1414"
[1] "GSM7305274_1421"
[1] "GSM7305275_1431"
[1] "GSM7305276_1437"
[1] "GSM7305277_1458"
[1] "GSM7305278_1463_B"
[1] "GSM6432703_BT2016062"
[1] "GSM6432704_BT2018022"
[1] "GSM6432705_P-1190_S-1197"
[1] "GSM6432706_P-1569_S-1569"
[1] "GSM6432723_P-6117_S-8370"
[1] "GSM6432724_P-6166_S-8321"
[1] "GSM6432725_P-6240_S-8628"
[1] "GSM6432730_P-6292_S-8579"
[1] "GSM6432731_P-6308_S-8632"
[1] "GSM6432735_P-6337_S-8821"
[1] "GSM6432737_P-6431_S-8842"
[1] "GSM6432738_P-6519_S-9084"
[1] "GSM6432739_P-6640_S-9581"
[1] "GSM6934152_PA1"
[1] "GSM6934153_PA2"
[1] "GSM6934155_PA4"
[1] "GSM6934156_PA5"
[1] "GSM6934157_PMA1"
[1] "GSM7816067_H590"
[1] "GSM7816068_H966"
[1] "GSM7816069_440h1"
[1] "GSM7816070_87