Match genotypes and methylation data using genotype concordance

- Date: 27.10.25

### Setup

In [1]:
R.version

               _                           
platform       x86_64-conda-linux-gnu      
arch           x86_64                      
os             linux-gnu                   
system         x86_64, linux-gnu           
status                                     
major          4                           
minor          5.1                         
year           2025                        
month          06                          
day            13                          
svn rev        88306                       
language       R                           
version.string R version 4.5.1 (2025-06-13)
nickname       Great Square Root           

In [2]:
## load libraries
library(stringr)
library(data.table) 
library(vroom)
library(ggplot2)
library(tidyr)
library(limma)
library(meffil)
library(readxl)
library(dplyr)


Loading required package: illuminaio

Loading required package: MASS

Loading required package: lmtest

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:data.table’:

    yearmon, yearqtr


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric


Loading required package: sandwich

Loading required package: sva

Loading required package: mgcv

Loading required package: nlme

This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.

Loading required package: genefilter


Attaching package: ‘genefilter’


The following object is masked from ‘package:MASS’:

    area


The following object is masked from ‘package:vroom’:

    spec


Loading required package: BiocParallel

Loading required package: plyr

Loading required package: reshape2


Attaching package: ‘reshape2’


The following object is masked from ‘package:tidyr’:

    smiths


The following objects are masked from ‘package:data.table’:

  

In [3]:
# set wd
setwd('/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/vasilis/PHD/EBB_methylation/')

In [4]:
# set # of cores
library(parallel)
cores = detectCores()
cores
options(mc.cores=cores)

In [5]:
# generate sample sheet
samplesheet <- meffil.create.samplesheet('BrainSamples/data//idats_140716', recursive=TRUE)
# samplesheet %>% head
samplesheet %>% dim

In [6]:
# update sex and Sample name
mf  <- fread('metadata/mapping_file.csv')
sex <- 
read_xlsx('metadata/SampleIDs.xlsx') %>%
    dplyr::rename(Sex = Sample_Group)
upd <- 
fread('BrainSamples/data/idats_140716/Samples_Table_140716.csv') %>% 
    mutate(Sample_Name = paste0(`Sentrix Barcode`, "_", `Array`)) %>%
    dplyr::select(c('Sample ID', 'Sample_Name')) %>%
    dplyr::rename('Sample_Name2' = 'Sample ID')

samplesheet <-
inner_join(samplesheet, upd, by = 'Sample_Name') %>% 
    dplyr::mutate(Sample_Name = Sample_Name2) %>%
    dplyr::select(-c(Sample_Name2)) %>%
    dplyr::select(-c(Sex)) %>%
    left_join(., sex, by = 'Sample_Name')
samplesheet %>% head
samplesheet %>% dim

Unnamed: 0_level_0,Sample_Name,Slide,sentrix_row,sentrix_col,Basename,Sex
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,SD001/11B,200514040135,1,1,BrainSamples/data//idats_140716/200514040135_R01C01,M
2,SD033/10,200514040135,2,1,BrainSamples/data//idats_140716/200514040135_R02C01,M
3,SD024/08,200514040135,3,1,BrainSamples/data//idats_140716/200514040135_R03C01,M
4,SD039/08,200514040135,4,1,BrainSamples/data//idats_140716/200514040135_R04C01,M
5,SD043/06,200514040135,5,1,BrainSamples/data//idats_140716/200514040135_R05C01,F
6,SD034/09B,200514040135,6,1,BrainSamples/data//idats_140716/200514040135_R06C01,M


### Check ID missmatch with genotype data

If you have genotype data available on the same individuals with methylation profiles you can check for ID mismatches. The methylation arrays have 65 SNPs which can be extracted from the methylation data. These 65 SNPs can be compared to genotypes measured with genotype arrays.

In [7]:
### Load meffil objects
load('meffil_data/qc.objects.Robj')
load('meffil_data/qcsummary.Robj')

In [8]:
featureset <- qc.objects[[2]]$featureset
featureset
#writeLines(meffil.snp.names(featureset), con="snp-names.txt")

In [9]:
# genotypes extracted using:
# >plink2 --pfile genotypingdata/plink_files/pgen/imputed_allchr --extract snp-names-newID.txt --recode A --out meffil_data/genotypes-imp

#### Original genotypes

In [10]:
## load genotypes (imputed)
genotypes0 <- meffil.extract.genotypes("meffil_data/genotypes-imp.raw")
genotypes_df <- 
    as.data.frame(genotypes0) %>%
    tibble::rownames_to_column("gen.id")
genotypes_df %>% head

Unnamed: 0_level_0,gen.id,Titan_EG0082_10610AT_Plate1_A01.CEL,Titan_EG0082_10610AT_Plate1_A02.CEL,Titan_EG0082_10610AT_Plate1_A03.CEL,Titan_EG0082_10610AT_Plate1_A04.CEL,Titan_EG0082_10610AT_Plate1_A05.CEL,Titan_EG0082_10610AT_Plate1_A06.CEL,Titan_EG0082_10610AT_Plate1_A07.CEL,Titan_EG0082_10610AT_Plate1_A08.CEL,Titan_EG0082_10610AT_Plate1_A09.CEL,⋯,Titan_EG0083_10610AT_Plate2_H03.CEL,Titan_EG0083_10610AT_Plate2_H04.CEL,Titan_EG0083_10610AT_Plate2_H05.CEL,Titan_EG0083_10610AT_Plate2_H06.CEL,Titan_EG0083_10610AT_Plate2_H07.CEL,Titan_EG0083_10610AT_Plate2_H08.CEL,Titan_EG0083_10610AT_Plate2_H09.CEL,Titan_EG0083_10610AT_Plate2_H10.CEL,Titan_EG0083_10610AT_Plate2_H11.CEL,Titan_EG0083_10610AT_Plate2_H12.CEL
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,X1.4131726.A.G,2,2,1,2,1,0,2,0,1,⋯,1,0,0,0,1,2,0,0,1,2
2,X1.11489678.A.G,1,1,0,1,2,0,1,1,0,⋯,1,1,1,2,1,0,1,1,1,1
3,X1.21652177.C.T,1,2,1,1,0,1,1,2,2,⋯,1,2,0,0,1,2,1,2,2,2
4,X1.25277982.C.T,2,1,2,0,1,0,2,1,0,⋯,0,0,1,1,2,1,1,0,2,2
5,X1.82173048.C.T,1,0,2,1,1,1,0,0,0,⋯,1,0,0,0,0,2,0,1,1,2
6,X1.177431168.T.C,2,2,2,1,1,1,2,2,2,⋯,2,2,1,2,2,2,1,2,2,2


In [14]:
## fix SNP names 
rsids <-
fread('snp-names-pvar-table.txt') %>% 
    select(snp, gen.id) %>% 
    mutate(
        gen.id = ifelse(str_detect(gen.id, "^X:"), gen.id, paste0("X", gen.id)),
        gen.id = gsub(":", ".", gen.id)
          )
genotypes_df2 <- left_join(genotypes_df, rsids, by = 'gen.id') %>% select(-c(gen.id))
genotypes     <- as.matrix(genotypes_df2[,!(names(genotypes_df2) %in% 'snp')])
rownames(genotypes) <- genotypes_df2$snp  
genotypes %>% head
genotypes %>% dim

Unnamed: 0,Titan_EG0082_10610AT_Plate1_A01.CEL,Titan_EG0082_10610AT_Plate1_A02.CEL,Titan_EG0082_10610AT_Plate1_A03.CEL,Titan_EG0082_10610AT_Plate1_A04.CEL,Titan_EG0082_10610AT_Plate1_A05.CEL,Titan_EG0082_10610AT_Plate1_A06.CEL,Titan_EG0082_10610AT_Plate1_A07.CEL,Titan_EG0082_10610AT_Plate1_A08.CEL,Titan_EG0082_10610AT_Plate1_A09.CEL,Titan_EG0082_10610AT_Plate1_A10.CEL,⋯,Titan_EG0083_10610AT_Plate2_H03.CEL,Titan_EG0083_10610AT_Plate2_H04.CEL,Titan_EG0083_10610AT_Plate2_H05.CEL,Titan_EG0083_10610AT_Plate2_H06.CEL,Titan_EG0083_10610AT_Plate2_H07.CEL,Titan_EG0083_10610AT_Plate2_H08.CEL,Titan_EG0083_10610AT_Plate2_H09.CEL,Titan_EG0083_10610AT_Plate2_H10.CEL,Titan_EG0083_10610AT_Plate2_H11.CEL,Titan_EG0083_10610AT_Plate2_H12.CEL
rs3936238,2,2,1,2,1,0,2,0,1,1,⋯,1,0,0,0,1,2,0,0,1,2
rs877309,1,1,0,1,2,0,1,1,0,1,⋯,1,1,1,2,1,0,1,1,1,1
rs213028,1,2,1,1,0,1,1,2,2,1,⋯,1,2,0,0,1,2,1,2,2,2
rs11249206,2,1,2,0,1,0,2,1,0,1,⋯,0,0,1,1,2,1,1,0,2,2
rs654498,1,0,2,1,1,1,0,0,0,1,⋯,1,0,0,0,0,2,0,1,1,2
rs715359,2,2,2,1,1,1,2,2,2,1,⋯,2,2,1,2,2,2,1,2,2,2


#### Original methylation data

In [13]:
### extract methylation array genotypes
snp.betas <- meffil.snp.betas(qc.objects)
methyl.genotypes <- meffil:::calculate.beta.genotypes(snp.betas)
methyl.genotypes %>% head
methyl.genotypes %>% dim

Unnamed: 0,SD001/11B,SD033/10,SD024/08,SD039/08,SD043/06,SD034/09B,SD025/13,SD027/11,SD004/06,SD025/09,⋯,SD024/14B,SD008/09,SD032/09,SD022/08B,SD048/12,SD055/12,SD036/12,SD033/08,SD025/08,SD031/09
rs10796216,0,1,1,1,2,2,2,1,1,2,⋯,1,1,1,1,1,2,1,2,2,0
rs213028,2,1,0,1,0,1,1,0,0,0,⋯,0,1,1,0,0,0,0,1,0,1
rs3936238,0,0,1,1,1,2,1,0,0,1,⋯,1,0,1,1,0,0,0,0,1,1
rs6991394,0,0,0,0,1,1,1,0,1,1,⋯,1,1,0,0,1,0,1,0,1,0
rs1520670,0,0,0,0,1,0,0,1,1,1,⋯,1,1,1,1,1,2,2,1,1,0
rs9292570,2,1,1,2,0,2,1,2,0,0,⋯,2,1,1,1,1,1,1,0,0,1


#### Calcuate genotype concordances

In [21]:
## for all test include a sample0 + sample i (code fails when there is only one sample)
sample0 = samplesheet$Sample_Name[1]
sample0_gen = colnames(genotypes)[1]
message(paste('buffer sample names:', sample0, 'and', sample0_gen))

## setup results matrix
conc_mat <- matrix(ncol = length(colnames(snp.betas)), nrow = length(colnames(genotypes)))
colnames(conc_mat) <- colnames(snp.betas)
rownames(conc_mat) <- colnames(genotypes)

for (i in 1:length(samplesheet$Sample_Name)) {
    sample_i = samplesheet$Sample_Name[i]
    #message(sample_i)
    # subset REAL methylation genotype betas for sample i
    snp.betas_i <- snp.betas[, c(sample0, sample_i), drop=FALSE]
    # subset array genotypes 1:186 to check concordance with sample i 
    for (j in 1:length(colnames(genotypes))) {
        sample_j = colnames(genotypes)[j]
        #print(paste0('calculating concordance between: ', sample_i, ' and ', sample_j))
        # subset TESTED genotypes for sample j
        genotypes_j <- genotypes[, c(sample0_gen, sample_j), drop=FALSE]
        # match columns - genotypes
        colnames(genotypes_j) <- colnames(snp.betas_i)
        #genotypes_j <- genotypes_j[ , match(colnames(snp.betas_i), colnames(genotypes_j))]
        # match rows (SNPs) - methylation genotype betas
        snp.betas_i <- snp.betas_i[match(rownames(genotypes_j), rownames(snp.betas_i)), ]
        # calculate concordance
        conc_ij <- meffil.snp.concordance(snp.betas_i, genotypes_j)
        # put result in concordance results matrix
        conc_mat[sample_j,sample_i] <- conc_ij$sample[2]
    }
    cat("\rFinished ", i, " of ", length(samplesheet$Sample_Name))
}

buffer sample names: SD001/11B and Titan_EG0082_10610AT_Plate1_A01.CEL



Finished  136  of  136

In [22]:
conc_mat %>% head

Unnamed: 0,SD001/11B,SD033/10,SD024/08,SD039/08,SD043/06,SD034/09B,SD025/13,SD027/11,SD004/06,SD025/09,⋯,SD024/14B,SD008/09,SD032/09,SD022/08B,SD048/12,SD055/12,SD036/12,SD033/08,SD025/08,SD031/09
Titan_EG0082_10610AT_Plate1_A01.CEL,0.4827586,0.4827586,0.5,0.5,0.4310345,0.5,0.3448276,0.5,0.4655172,0.3965517,⋯,0.4137931,0.5344828,0.2931034,0.4310345,0.4482759,0.2931034,0.4310345,0.4827586,0.4655172,0.4137931
Titan_EG0082_10610AT_Plate1_A02.CEL,0.5172414,0.637931,0.5689655,0.5344828,0.3793103,0.5172414,0.5,0.5172414,0.5862069,0.4482759,⋯,0.5517241,0.5344828,0.4827586,0.5172414,0.5,0.6206897,0.637931,0.5172414,0.5,0.5517241
Titan_EG0082_10610AT_Plate1_A03.CEL,0.5344828,0.5517241,0.5172414,0.6724138,0.4827586,0.5689655,0.4827586,0.4827586,0.4655172,0.4482759,⋯,0.5862069,0.4482759,0.4827586,0.5862069,0.4482759,0.5344828,0.4827586,0.5344828,0.5,0.5
Titan_EG0082_10610AT_Plate1_A04.CEL,0.4655172,0.5689655,0.4827586,0.5172414,0.5,0.5,0.4137931,0.5,0.5,0.3965517,⋯,0.4137931,0.5344828,0.4310345,0.4655172,0.4482759,0.4310345,0.362069,0.5,0.5689655,0.4482759
Titan_EG0082_10610AT_Plate1_A05.CEL,0.4482759,0.4482759,0.5344828,0.4655172,0.4310345,0.5172414,0.5344828,0.5,0.3448276,0.4827586,⋯,0.4310345,0.5172414,0.5172414,0.4482759,0.5517241,0.3965517,0.5,0.4655172,0.4827586,0.4310345
Titan_EG0082_10610AT_Plate1_A06.CEL,0.5,0.4655172,0.4655172,0.5517241,0.3103448,0.4655172,0.3965517,0.3793103,0.4310345,0.4482759,⋯,0.5344828,0.4655172,0.4310345,0.4655172,0.4827586,0.4827586,0.4827586,0.5517241,0.5172414,0.4137931


In [50]:
## max genotype concordance for each sample:
sample.ID.mapping <-
conc_mat %>% 
  as.data.frame() %>%
  tibble::rownames_to_column("gen.sample.ID") %>%
  pivot_longer(
    cols = -gen.sample.ID,
    names_to = "sample.ID",
    values_to = "gen.concordance"
  ) %>%
  group_by(sample.ID) %>%
  slice_max(gen.concordance, n = 1, with_ties = FALSE) 
sample.ID.mapping %>% filter(gen.concordance < 0.75)

gen.sample.ID,sample.ID,gen.concordance
<chr>,<chr>,<dbl>
181,SD010/09,0.637931
178,SD023/11B,0.7068966
22,SD023/13,0.7068966
135,SD030/09,0.6896552
13,SD032/08,0.7068966
18,SD036/10,0.637931
36,SD036/13,0.5862069
183,SD036/14B,0.5689655
83,SD038/08,0.7241379
140,SD042/13,0.6551724


In [26]:
## save
fwrite(sample.ID.mapping, 'meffil_data/sampleID.mapping.genconc.txt', sep = " ")
fwrite(conc_mat, 'meffil_data/genconc.full.mat.csv')


x being coerced from class: matrix to data.table



### Re-run with high-concordance SNPs

In [32]:
# load first run results
sample.ID.mapping <- fread('meffil_data/sampleID.mapping.genconc.txt')
conc_mat <- fread('meffil_data/genconc.full.mat.csv')
# load qc.clean object, see meffil_01_QC.ipynb 
load("meffil_data/qcsummary.clean.Robj")
# extract SNPs to exclude
bad.snps <- qc.summary$genotype.summary$tabs$snps %>% filter(is.concordant == FALSE) %>% pull(snp.name)
cat(length(bad.snps), 'SNPs with low concordance (< 0.9):', bad.snps)

4 SNPs with low concordance (< 0.9): rs1520670 rs2208123 rs11034952 rs6471533

In [44]:
# remove bad SNPs from genotype matrics
snp.betas.high <- snp.betas[!(rownames(snp.betas) %in% bad.snps),]
genotypes.high <- genotypes[!(rownames(genotypes) %in% bad.snps),]
# all(rownames(snp.betas.high)==rownames(genotypes.high))

In [45]:
### re-run

## for all test include a sample0 + sample i (code fails when there is only one sample)
sample0 = samplesheet$Sample_Name[1]
sample0_gen = colnames(genotypes.high)[1]
message(paste('buffer sample names:', sample0, 'and', sample0_gen))

## setup results matrix
conc_mat_high <- matrix(ncol = length(colnames(snp.betas.high)), nrow = length(colnames(genotypes.high)))
colnames(conc_mat_high) <- colnames(snp.betas.high)
rownames(conc_mat_high) <- colnames(genotypes.high)

for (i in 1:length(samplesheet$Sample_Name)) {
    sample_i = samplesheet$Sample_Name[i]
    #message(sample_i)
    # subset REAL methylation genotype betas for sample i
    snp.betas_i <- snp.betas.high[, c(sample0, sample_i), drop=FALSE]
    # subset array genotypes 1:186 to check concordance with sample i 
    for (j in 1:length(colnames(genotypes.high))) {
        sample_j = colnames(genotypes.high)[j]
        #print(paste0('calculating concordance between: ', sample_i, ' and ', sample_j))
        # subset TESTED genotypes for sample j
        genotypes_j <- genotypes.high[, c(sample0_gen, sample_j), drop=FALSE]
        # match columns - genotypes
        colnames(genotypes_j) <- colnames(snp.betas_i)
        #genotypes_j <- genotypes_j[ , match(colnames(snp.betas_i), colnames(genotypes_j))]
        # match rows (SNPs) - methylation genotype betas
        snp.betas_i <- snp.betas_i[match(rownames(genotypes_j), rownames(snp.betas_i)), ]
        # calculate concordance
        conc_ij <- meffil.snp.concordance(snp.betas_i, genotypes_j)
        # put result in concordance results matrix
        conc_mat_high[sample_j,sample_i] <- conc_ij$sample[2]
    }
    cat("\rFinished ", i, " of ", length(samplesheet$Sample_Name))
}

buffer sample names: SD001/11B and Titan_EG0082_10610AT_Plate1_A01.CEL



Finished  136  of  136

In [46]:
conc_mat_high %>% head

Unnamed: 0,SD001/11B,SD033/10,SD024/08,SD039/08,SD043/06,SD034/09B,SD025/13,SD027/11,SD004/06,SD025/09,⋯,SD024/14B,SD008/09,SD032/09,SD022/08B,SD048/12,SD055/12,SD036/12,SD033/08,SD025/08,SD031/09
Titan_EG0082_10610AT_Plate1_A01.CEL,0.4814815,0.4814815,0.4814815,0.5185185,0.4259259,0.5185185,0.3518519,0.5,0.462963,0.4074074,⋯,0.4074074,0.537037,0.2962963,0.4444444,0.4444444,0.2962963,0.4444444,0.4814815,0.4444444,0.4259259
Titan_EG0082_10610AT_Plate1_A02.CEL,0.5185185,0.6111111,0.5555556,0.5185185,0.3703704,0.5,0.5,0.5185185,0.6111111,0.4814815,⋯,0.5555556,0.5740741,0.5,0.537037,0.5185185,0.6111111,0.6296296,0.5185185,0.5185185,0.5740741
Titan_EG0082_10610AT_Plate1_A03.CEL,0.537037,0.5185185,0.5,0.6666667,0.4814815,0.5555556,0.4814815,0.4814815,0.4814815,0.4814815,⋯,0.5925926,0.4814815,0.5,0.6111111,0.462963,0.5185185,0.462963,0.537037,0.5185185,0.5185185
Titan_EG0082_10610AT_Plate1_A04.CEL,0.4814815,0.5740741,0.4814815,0.5185185,0.5,0.5,0.3703704,0.5,0.4814815,0.3888889,⋯,0.4444444,0.5740741,0.4444444,0.4814815,0.462963,0.4074074,0.3703704,0.5,0.5740741,0.4444444
Titan_EG0082_10610AT_Plate1_A05.CEL,0.4074074,0.4444444,0.5185185,0.4814815,0.462963,0.537037,0.5555556,0.537037,0.3703704,0.5,⋯,0.4259259,0.5185185,0.537037,0.462963,0.5555556,0.4074074,0.4814815,0.5,0.5,0.4074074
Titan_EG0082_10610AT_Plate1_A06.CEL,0.5,0.4814815,0.4814815,0.5740741,0.3148148,0.4814815,0.3703704,0.3888889,0.4259259,0.4259259,⋯,0.5555556,0.4814815,0.4259259,0.462963,0.4814815,0.4814815,0.4814815,0.5740741,0.537037,0.3888889


In [51]:
## max genotype concordance for each sample:
sample.ID.mapping.high <-
conc_mat_high %>% 
  as.data.frame() %>%
  tibble::rownames_to_column("gen.sample.ID") %>%
  pivot_longer(
    cols = -gen.sample.ID,
    names_to = "sample.ID",
    values_to = "gen.concordance"
  ) %>%
  group_by(sample.ID) %>%
  slice_max(gen.concordance, n = 1, with_ties = FALSE) 
sample.ID.mapping.high %>% filter(gen.concordance < 0.75)

gen.sample.ID,sample.ID,gen.concordance
<chr>,<chr>,<dbl>
Titan_EG0083_10610AT_Plate2_H07.CEL,SD010/09,0.6296296
Titan_EG0082_10610AT_Plate1_C01.CEL,SD023/11B,0.7037037
Titan_EG0082_10610AT_Plate1_B10.CEL,SD023/13,0.7037037
Titan_EG0083_10610AT_Plate2_D07.CEL,SD030/09,0.6666667
Titan_EG0082_10610AT_Plate1_B01.CEL,SD032/08,0.7037037
Titan_EG0082_10610AT_Plate1_A02.CEL,SD036/10,0.6481481
Titan_EG0082_10610AT_Plate1_G01.CEL,SD036/13,0.6111111
Titan_EG0082_10610AT_Plate1_D06.CEL,SD036/14B,0.5555556
Titan_EG0082_10610AT_Plate1_G12.CEL,SD038/08,0.7407407
Titan_EG0083_10610AT_Plate2_D12.CEL,SD042/13,0.7037037


In [53]:
a <- sample.ID.mapping %>% filter(gen.concordance < 0.75) %>% pull(sample.ID)
b <- sample.ID.mapping.high %>% filter(gen.concordance < 0.75) %>% pull(sample.ID)
intersect(a,b)%>% length

In [54]:
## save
fwrite(sample.ID.mapping.high, 'meffil_data/sampleID.mapping.genconc.high.txt', sep = " ")
fwrite(conc_mat_high, 'meffil_data/genconc.full.mat.high.csv')


x being coerced from class: matrix to data.table

