1. Install and load the required packages.

In [9]:
# Bootstrap BiocManager, the installer used below for the Bioconductor packages.
# requireNamespace() checks availability without attaching the package, which
# is enough here because it is called as BiocManager::install().
# install.packages() has no `version` argument, so the former
# `version = "3.21"` was dropped. To pin a Bioconductor release, call
# BiocManager::install(version = ...) with a release that matches the R version.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

In [2]:
# Install the analysis dependencies: DESeq2 and biomaRt through BiocManager,
# tidyverse and pheatmap through install.packages().
# The argument that reduces install.packages() output is `quiet`, not
# `quietly`. BiocManager::install() forwards its extra arguments to
# install.packages(), so the same spelling applies to both calls.
BiocManager::install(c("DESeq2", "biomaRt"), quiet = TRUE)
install.packages(c("tidyverse", "pheatmap"), quiet = TRUE)

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: http://cran.rstudio.com/

Bioconductor version 3.19 (BiocManager 1.30.23), R 4.4.0 (2024-04-24)

Installing package(s) 'DESeq2'

also installing the dependencies ‘UCSC.utils’, ‘GenomeInfoDbData’, ‘zlibbioc’, ‘SparseArray’, ‘GenomeInfoDb’, ‘XVector’, ‘S4Arrays’, ‘DelayedArray’, ‘S4Vectors’, ‘IRanges’, ‘GenomicRanges’, ‘SummarizedExperiment’, ‘Biobase’, ‘BiocParallel’, ‘MatrixGenerics’


Old packages: 'abind', 'actuar', 'ade4', 'adegraphics', 'adehabitatHR',
  'adehabitatLT', 'adehabitatMA', 'admisc', 'AER', 'afex', 'aglm', 'AID',
  'akc', 'ale', 'AlgDesign', 'AlphaSimR', 'amap', 'Amelia', 'anacor',
  'antiword', 'anytime', 'ape', 'aplot', 'apollo', 'aqp', 'arrow', 'ars',
  'ARTool', 'AsioHeaders', 'askpass', 'ASMap', 'astsa', 'automap', 'baguette',
  'BAS', 'base64', 'bayesplot', 'bayestestR', 'bbotk', 'BDgraph', 'bedr',
 

In [6]:
library(DESeq2)
library(biomaRt)
library(tidyverse)
library(pheatmap)

2. Read in the expression matrix and the metadata file.

In [23]:
# Set the "timeout" option to 1000 before the remote read below.
options(timeout = 1000)

# Input locations: the expression matrix is fetched by URL, the metadata is a
# local CSV file.
matrix_link <- "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_human_m1_10x/matrix.csv"
meta_filepath <- "/kaggle/input/metadata/metadata.csv"

# Load the metadata with read.csv() and the expression matrix with
# data.table::fread().
meta_data <- read.csv(file = meta_filepath)
gene_matrix <- data.table::fread(input = matrix_link)

3. Explore and clean both files.

In [26]:
# Print a compact summary of the structure of meta_data.
str(meta_data)

'data.frame':	76533 obs. of  39 variables:
 $ sample_name                : chr  "AAACCCAAGGATTTCC-LKTX_190129_01_A01" "AAACCCAAGTATGGCG-LKTX_190129_01_A01" "AAACCCACAAAGTGTA-LKTX_190129_01_A01" "AAACCCACACTACTTT-LKTX_190129_01_A01" ...
 $ exp_component_name         : chr  "AAACCCAAGGATTTCC-21L8TX_180927_001_A01" "AAACCCAAGTATGGCG-21L8TX_180927_001_A01" "AAACCCACAAAGTGTA-21L8TX_180927_001_A01" "AAACCCACACTACTTT-21L8TX_180927_001_A01" ...
 $ cluster_label              : chr  "Inh L1-2 SST CCNJL" "Exc L5-6 FEZF2 IFNG-AS1" "Exc L3-5 RORB LINC01202" "Exc L2 LINC00507 GLRA3" ...
 $ cluster_color              : chr  "#fb8d00" "#2c815f" "#547d7a" "#cecd32" ...
 $ cluster_order              : int  50 116 87 75 120 101 111 90 90 100 ...
 $ class_label                : chr  "GABAergic" "Glutamatergic" "Glutamatergic" "Glutamatergic" ...
 $ class_color                : chr  "#FF7373" "#3DCC3D" "#3DCC3D" "#3DCC3D" ...
 $ class_order                : int  1 2 2 2 3 2 2 2 2 2 ...
 $ subclass_label   

In [27]:
# Drop every column whose name ends in "_color" or "_order", then list the
# column names that remain.
meta_data <- select(meta_data, !ends_with(c("_color", "_order")))
colnames(meta_data)

In [24]:
# Print a compact summary of the structure of gene_matrix.
str(gene_matrix)

Classes ‘data.table’ and 'data.frame':	76533 obs. of  50282 variables:
 $ sample_name           : chr  "AAACCCAAGGATTTCC-LKTX_190129_01_A01" "AAACCCAAGTATGGCG-LKTX_190129_01_A01" "AAACCCACAAAGTGTA-LKTX_190129_01_A01" "AAACCCACACTACTTT-LKTX_190129_01_A01" ...
 $ DDX11L1               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ WASH7P                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ MIR6859-1             : int  0 0 0 0 0 0 0 0 0 0 ...
 $ MIR1302-2             : int  0 0 0 0 0 0 0 0 0 0 ...
 $ FAM138A               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ LOC105379212          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ OR4G4P                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ OR4G11P               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ OR4F5                 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ LOC105379213          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CICP27                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ LOC729737             : int  0 0 0 0 0 0 0 0 0 0 ...
 $ LOC100996442          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ LOC1053792

In [28]:
# Preview the first rows of gene_matrix.
head(gene_matrix)

sample_name,DDX11L1,WASH7P,MIR6859-1,MIR1302-2,FAM138A,LOC105379212,OR4G4P,OR4G11P,OR4F5,⋯,ND4,TRNH,TRNS2,TRNL2,ND5,ND6,TRNE,CYTB,TRNT,TRNP
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
AAACCCAAGGATTTCC-LKTX_190129_01_A01,0,0,0,0,0,0,0,0,0,⋯,8,0,0,0,2,0,0,19,0,0
AAACCCAAGTATGGCG-LKTX_190129_01_A01,0,0,0,0,0,0,0,0,0,⋯,5,0,0,0,0,0,0,3,0,0
AAACCCACAAAGTGTA-LKTX_190129_01_A01,0,0,0,0,0,0,0,0,0,⋯,8,0,0,0,1,0,0,5,0,0
AAACCCACACTACTTT-LKTX_190129_01_A01,0,0,0,0,0,0,0,0,0,⋯,6,0,0,0,6,0,0,8,0,0
AAACCCACAGTGAGCA-LKTX_190129_01_A01,0,0,0,0,0,0,0,0,0,⋯,1,0,0,0,0,0,0,0,0,0
AAACCCAGTCACCCTT-LKTX_190129_01_A01,0,0,0,0,0,0,0,0,0,⋯,19,0,0,0,2,0,0,7,0,0


In [29]:
# Preview the first rows of meta_data after the column pruning above.
head(meta_data)

Unnamed: 0_level_0,sample_name,exp_component_name,cluster_label,class_label,subclass_label,donor_sex_label,region_label,cortical_layer_label,cell_type_accession_label,cell_type_alias_label,cell_type_alt_alias_label,cell_type_designation_label,external_donor_name_label,specimen_type,full_genotype_label,outlier_call,outlier_type
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>,<lgl>
1,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,GABAergic,Sst,F,M1,all,CS1912131050,Inh L1-2 SST CCNJL,,Neuron 50,H18.30.001,nucleus,,False,
2,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,Glutamatergic,L5/6 NP,F,M1,all,CS1912131116,Exc L5-6 FEZF2 IFNG-AS1,,Neuron 116,H18.30.001,nucleus,,False,
3,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,Glutamatergic,L5 IT,F,M1,all,CS1912131087,Exc L3-5 RORB LINC01202,,Neuron 87,H18.30.001,nucleus,,False,
4,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,Glutamatergic,L2/3 IT,F,M1,all,CS1912131075,Exc L2 LINC00507 GLRA3,,Neuron 75,H18.30.001,nucleus,,False,
5,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,Non-Neuronal,Oligo,F,M1,all,CS1912131120,Oligo L2-6 OPALIN FTH1P3,,Non-neuron 3,H18.30.001,nucleus,,False,
6,AAACCCAGTCACCCTT-LKTX_190129_01_A01,AAACCCAGTCACCCTT-21L8TX_180927_001_A01,Exc L5-6 FEZF2 C9orf135-AS1,Glutamatergic,L6 CT,F,M1,all,CS1912131101,Exc L5-6 FEZF2 C9orf135-AS1,L6 CT_2,Neuron 101,H18.30.001,nucleus,,False,
