This notebook prepares the genomic data for imputing brain CpG levels from the Edinburgh Brain Bank (EBB) into UKB genotypes.
- get bgen variant ids (chr:pos_a1_a2) that correspond to EBB variant ids (chr:pos:a1:a2); note: alleles may be swapped, so build both the chr:pos:a1:a2 and chr:pos:a2:a1 EBB variant ids and find their intersection with the bgen ids.
- extract the bgen variant ids (so I don't have to update the large bgen files)
- Sample and variant QC
- update the bgen variant ids with the ebb variant ids
- Date: 03.02.2026


## Setup

In [1]:
%%bash
# Install the extra Python packages used by this notebook.
# NOTE(review): versions are not pinned; pin them if exact reproducibility is required.
pip install openpyxl
pip install --upgrade scikit-learn
pip install statsmodels











[0m

In [13]:
import os
import glob
import pandas as pd
from pandas.core.common import flatten
import re
import numpy as np
import seaborn as sns
from sklearn import datasets, linear_model, metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf

### load EBB weights 

In [30]:
%%bash
# Fetch the EBB weights archive into the working directory, replacing any local copy.
dx download --overwrite vasilis/data/ebb/weights/EBB.BRAIN.METHYL.HERIT.tar.bz2

In [None]:
%%bash
# Unpack the bzip2-compressed weights archive in place.
tar --extract --bzip2 --file=EBB.BRAIN.METHYL.HERIT.tar.bz2

In [None]:
%%bash
# Upload the unpacked weights directory (recursively) to the project weights folder.
dx upload --recursive EBB.BRAIN.METHYL.HERIT --destination vasilis/data/ebb/weights/

### load UKB imputation data

In [5]:
%%bash
# Download the per-chromosome UKB imputation .mfi.txt files into imp/,
# overwriting any copies already present locally.
mfi_folder="Bulk/Imputation/UKB imputation from genotype"
mkdir --parents imp/
dx download --overwrite --output imp/ "${mfi_folder}"/ukb22828_c*_b0_v3.mfi.txt

## Filter samples

- no sex mismatch: p22001 vs p31
- white british ancestry: p22006
- no heterozygosity or missingness outlier: p22027
- no \>\=10 3rd degree relatives in dataset: p22021
- no sex chromosome aneuploidy: p22019


In [50]:
# Read the unfiltered phenotype CSV; later cells index it by field id (e.g. p22001).
# NOTE(review): the name `all` shadows Python's builtin all(); it is kept here
# because other cells refer to it.
pheno_csv = '/mnt/project/vasilis/data/pheno_all_unfiltered.csv'
all = pd.read_csv(pheno_csv)

  all = pd.read_csv('/mnt/project/vasilis/data/pheno_all_unfiltered.csv')


In [94]:
# Participant id plus the fields used by the sample-QC filter.
cols = ['eid', 'p22006', 'p22027', 'p22021', 'p22019', 'p22001', 'p31']

# Store p22001 as a nullable integer; entries that cannot be parsed become <NA>.
p22001_numeric = pd.to_numeric(all['p22001'], errors="coerce")
all['p22001'] = p22001_numeric.astype("Int64")

qc = all.loc[:, cols]
qc.head()

Unnamed: 0,eid,p22006,p22027,p22021,p22019,p22001,p31
0,1000015,1.0,,0.0,,1,1
1,1000053,1.0,,1.0,,0,0
2,1000132,1.0,,0.0,,0,0
3,1000148,1.0,,0.0,,0,0
4,1000163,1.0,,0.0,,1,1


In [113]:
# One boolean mask per sample-QC criterion; a sample is kept only if all hold.
is_white_british = qc['p22006'] == 1                # white British ancestry
sex_matches = qc['p22001'] == qc['p31']             # p22001 agrees with p31
not_ten_plus_relatives = qc['p22021'] != 10         # not coded 10 in p22021
no_het_missing_outlier = qc['p22027'].isna()        # no het./missingness outlier flag
no_sex_aneuploidy = qc['p22019'].isna()             # no sex-chromosome aneuploidy flag

passes_qc = (
    is_white_british
    & sex_matches
    & not_ten_plus_relatives
    & no_het_missing_outlier
    & no_sex_aneuploidy
)
qced = qc.loc[passes_qc]

# Number of samples retained.
len(qced)

407827

In [131]:
# make .keep file
qced[['eid', 'eid']].to_csv('wb.eids', index=False, header=False, sep=" ")
!dx upload wb.eids --dest vasilis/data/ebb/

ID                                file-J60Kx0jJZB71xj86Gg2kFfQ4
Class                             file
Project                           project-GfvP6PQJZB72v2Vk348Bb2yg
Folder                            /vasilis/data/ebb
Name                              wb.eids
State                             [33mclosing[0m
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Feb  2 18:35:47 2026
Created by                        vasilisraptis
 via the job                      job-J607PJ0JZB75g08bgjY771xk
Last modified                     Mon Feb  2 18:35:48 2026
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


## Extract weights' variants & QC variants

In [6]:
%%bash
# Distinct EBB variant ids: first whitespace-separated column of the .variants file.
awk '{ print $1 }' EBB.BRAIN.METHYL.HERIT/EBB.BRAIN.METHYL.HERIT.variants | sort -u > ebb.variants.1
# The same ids with the 3rd and 4th colon-separated fields (the two alleles) exchanged.
awk -F: -v OFS=: '{ print $1, $2, $4, $3 }' ebb.variants.1 | sort -u > ebb.variants.2
# Both lists should have the same number of entries.
wc -l ebb.variants.1
wc -l ebb.variants.2

473982 ebb.variants.1
473982 ebb.variants.2


In [25]:
%%bash
# For each autosome, find the UKB imputed variants that carry EBB weights.
#
# Outputs written to imp/extract/ for chromosome N:
#   imp_cN.bothIDs.extract   "<EBB id> <UKB id>" pairs  (for matching)
#   imp_cN.extract           UKB ids only               (for plink)
#
# `join` silently misses matches unless both inputs are sorted on the join
# field with the same collation that join itself uses. Pin the locale to C for
# this whole cell and re-sort the EBB id lists here, so every sort/join below
# agrees regardless of the locale the earlier cells ran under. In the C locale
# a whole-line sort is also a valid sort on the first field, because the field
# separator (space) orders before every character that occurs in the ids.
export LC_ALL=C

mkdir -p imp/temp
mkdir -p imp/extract

# EBB ids as given (1) and with the alleles swapped (2), sorted under the C locale.
sort -u ebb.variants.1 > imp/temp/ebb.variants.1.sorted
sort -u ebb.variants.2 > imp/temp/ebb.variants.2.sorted

for chr in {1..22}
do
    # filter INFO > 0.8 & MAF > 0.01; create ebb variant format: chr:pos:a1:a2; sort (for joining)
    awk -v chr=$chr '$NF > 0.8 && $6 > 0.01 { print chr":"$3":"$4":"$5, $2}' imp/ukb22828_c${chr}_b0_v3.mfi.txt | sort -u > imp/temp/temp_c${chr}.variants
    # find ebb variants in ukb (intersect), once per allele orientation
    join -1 1 -2 1 imp/temp/ebb.variants.1.sorted imp/temp/temp_c${chr}.variants > imp/extract/imp_c${chr}.extract1
    join -1 1 -2 1 imp/temp/ebb.variants.2.sorted imp/temp/temp_c${chr}.variants > imp/extract/imp_c${chr}.extract2
    # keep both formats (EBB-style id and UKB id), de-duplicated
    sort -u imp/extract/imp_c${chr}.extract1 imp/extract/imp_c${chr}.extract2 > imp/extract/imp_c${chr}.bothIDs.extract
    # keep only ukb format
    awk '{print $2}' imp/extract/imp_c${chr}.bothIDs.extract | sort -u > imp/extract/imp_c${chr}.extract
    wc -l imp/extract/imp_c${chr}.extract
    # the per-orientation intermediates are no longer needed
    rm imp/extract/imp_c${chr}.extract1 imp/extract/imp_c${chr}.extract2
done

44171 imp/extract/imp_c1.extract
37344 imp/extract/imp_c2.extract
28347 imp/extract/imp_c3.extract
22820 imp/extract/imp_c4.extract
26461 imp/extract/imp_c5.extract
29832 imp/extract/imp_c6.extract
26394 imp/extract/imp_c7.extract
22743 imp/extract/imp_c8.extract
15984 imp/extract/imp_c9.extract
25929 imp/extract/imp_c10.extract
25705 imp/extract/imp_c11.extract
23941 imp/extract/imp_c12.extract
13520 imp/extract/imp_c13.extract
16043 imp/extract/imp_c14.extract
15787 imp/extract/imp_c15.extract
17556 imp/extract/imp_c16.extract
19684 imp/extract/imp_c17.extract
9922 imp/extract/imp_c18.extract
17248 imp/extract/imp_c19.extract
13402 imp/extract/imp_c20.extract
6694 imp/extract/imp_c21.extract
10350 imp/extract/imp_c22.extract


In [29]:
%%bash
# Show the first two lines of each output type for chromosome 22.
head -n 2 imp/extract/imp_c22.extract          # -> for plink
echo
head -n 2 imp/extract/imp_c22.bothIDs.extract  # -> for matching

22:17060409_TTTTG_T
22:18223198_CT_C

22:17054103:G:A rs4008588
22:17054720:T:C rs9605903


In [34]:
%%bash
# nebb: number of distinct EBB variant ids.
nebb=$(wc -l < ebb.variants.1)
# n: total number of EBB/UKB id pairs across all chromosomes.
n=$(cat imp/extract/imp_c*.bothIDs.extract | wc -l)

# The percentage uses bash integer arithmetic, so it is truncated to a whole number.
echo "$n out of $nebb ($((100*n/nebb)) %) weights' variants in UKB imputed data (INFO > 0.8 & MAF > 0.01)"

469881 out of 473982 (99 %) weights' variants in UKB imputed data (INFO > 0.8 & MAF > 0.01)


In [None]:
!dx upload -r imp/extract/ --dest vasilis/data/ebb/extract/

In [None]:
# !dx upload -r imp/extract/*bothIDs.extract --dest vasilis/data/ebb/extract/

In [133]:
%%bash
# Upload the helper script that the swiss-army-knife jobs run.
# NOTE(review): mwas_01_extract.sh is not created anywhere in the visible part of
# this notebook; it must already exist in the working directory.
dx upload mwas_01_extract.sh --destination vasilis/SAK_scripts/

ID                                file-J60P7QjJZB79k2xq9v6gqz7Q
Class                             file
Project                           project-GfvP6PQJZB72v2Vk348Bb2yg
Folder                            /vasilis/SAK_scripts
Name                              mwas_01_extract.sh
State                             closing
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Mon Feb  2 18:58:11 2026
Created by                        vasilisraptis
 via the job                      job-J607PJ0JZB75g08bgjY771xk
Last modified                     Mon Feb  2 18:58:12 2026
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"


In [22]:
%%bash
# Submit one swiss-army-knife job per chromosome. Each job receives the helper
# script, that chromosome's bgen + sample files, the sample keep file and the
# variant extract list, and runs `sh mwas_01_extract.sh <chromosome>`.
# NOTE(review): only chromosomes 1 and 2 are submitted here, although extract
# lists exist for 1-22 -- confirm whether the remaining chromosomes are run elsewhere.
bgen_dir="/Bulk/Imputation/UKB imputation from genotype"
datadir="vasilis/data/ebb"
dest="${datadir}/imp_bed/"

for CHR in 1 2; do
    # Common path stem of this chromosome's imputed-genotype files.
    bgen_stem="${bgen_dir}/ukb22828_c${CHR}_b0_v3"

    dx run swiss-army-knife \
        -iin="vasilis/SAK_scripts/mwas_01_extract.sh" \
        -iin="${bgen_stem}.bgen" \
        -iin="${bgen_stem}.sample" \
        -iin="${datadir}/wb.eids" \
        -iin="${datadir}/extract/imp_c${CHR}.extract" \
        -icmd="sh mwas_01_extract.sh ${CHR}" \
        --tag="ext_${CHR}" \
        --instance-type="mem1_ssd1_v2_x72" \
        --destination="${dest}" \
        --brief --yes --priority high
done


job-J60xGx0JZB7F6YGYY5FvzK1P
job-J60xGy0JZB76F8pk230GFZv8
