### **Setup:**

In [1]:
.libPaths(c(.libPaths(), "/exports/eddie3_apps_local/apps/SL7/R/4.3.0/lib64/R/library"))
library("dplyr", warn.conflicts = F)
library("stringr", warn.conflicts = F)
library("data.table", warn.conflicts = F) 
library("vroom", warn.conflicts = F)
library("ggplot2", warn.conflicts = F)

### **Change working directory:**

In [2]:
wd = "/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/vasilis/PHD/"
setwd(wd)
getwd()

### **Load data:**

In [3]:
file <- "ukb_data/ukb44986/extra_data/f21000.ethnicity.csv"
a    <- fread(file, select = c(1,2))
names(a) <- c("eid", "ethnicity_code")

In [4]:
head(a)

eid,ethnicity_code
<int>,<int>
1000015,1001
1000027,1001
1000039,1001
1000040,1001
1000053,1001
1000064,1001


In [7]:
# coding file, see here: https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=1001
codes <- fread("ukb.delirium.gwas/non_eur/data/coding1001.tsv")
codes %>% arrange(coding)

coding,meaning,node_id,parent_id,selectable
<int>,<chr>,<int>,<int>,<chr>
-3,Prefer not to answer,-3,0,Y
-1,Do not know,-1,0,Y
1,White,1,0,Y
2,Mixed,2,0,Y
3,Asian or Asian British,3,0,Y
4,Black or Black British,4,0,Y
5,Chinese,5,0,Y
6,Other ethnic group,6,0,Y
1001,British,1001,1,Y
1002,Irish,1002,1,Y


### Make ethnic background groups:
see here for data-coding: https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=1001

In [9]:
# south asian
sas_names <- c("Asian or Asian British", "Indian", "Pakistani", "Bangladeshi", "Any other Asian background")
sas_codes <- codes %>% filter(meaning %in% sas_names) %>% pull(coding)
# east asian (china)
eas_names <- c("Chinese")
eas_codes <- codes %>% filter(meaning %in% eas_names) %>% pull(coding)
# african 
afr_names <- c("Black or Black British", "Caribbean", "African", "Any other Black background")
afr_codes <- codes %>% filter(meaning %in% afr_names) %>% pull(coding)

In [11]:
a %>% head

eid,ethnicity_code
<int>,<int>
1000015,1001
1000027,1001
1000039,1001
1000040,1001
1000053,1001
1000064,1001


In [12]:
ethnic <-
a %>% 
    mutate(ethnicity = case_when(ethnicity_code %in% sas_codes ~ "SAS",
                                 ethnicity_code %in% eas_codes ~ "EAS",
                                 ethnicity_code %in% afr_codes ~ "AFR", 
                                 TRUE ~ NA)
          ) %>% 
    filter(!is.na(ethnicity)) %>%
    select("eid","ethnicity")

ethnic %>% 
    group_by(ethnicity) %>% 
    summarise(n=n())

ethnicity,n
<chr>,<int>
AFR,8058
EAS,1573
SAS,9878


In [13]:
# extract eids
afr_eids <- ethnic %>% filter(ethnicity=="AFR") %>% pull(eid)
eas_eids <- ethnic %>% filter(ethnicity=="EAS") %>% pull(eid)
sas_eids <- ethnic %>% filter(ethnicity=="SAS") %>% pull(eid)

### Extract delirium phenotype for non-white british:

In [11]:
# load ukb main dataset
dat <- fread("ukb_data/ukb44986/Download#675074/ukb675074_vr.csv")

#### exclude latest withdrawals:

In [12]:
w_eids <- fread("/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/ukb_data/ukb44986/withdraw44986_131_20231013.txt")
w_eids <- w_eids$V1
dat    <- dat %>% filter(!(eid %in% w_eids))

#### get delirium cases (F05):

In [13]:
# get all delirium cases, based on "F05" date of first occurance 
del_eids <- dat %>% filter(!(is.na(`130846-0.0`))) %>% pull(eid)
print(paste0("at least one delirium occurance: n=", length(del_eids)))

[1] "at least one delirium occurance: n=9150"


#### get dead individuals:

In [14]:
# have a non NA age at death (cols 40007*)
dead_eids <-
    dat %>% 
    filter(if_any(starts_with("40007"), ~ !is.na(.))) %>%
    pull(eid)
print(paste0("Individuals who have died: n=", length(dead_eids))) 


[1] "Individuals who have died: n=44500"


#### rename & create columns:

In [15]:
dat2 <- 
dat %>% 
    # remale useful columns
    rename(
        "sex" = "31-0.0",
        "YOB" = "34-0.0",
        "MOB" = "52-0.0",
        "DOA_0" = "53-0.0", # date of attending assessment centre
        "DOA_1" = "53-1.0", 
        "DOA_2" = "53-2.0",
        "DOA_3" = "53-3.0",
        "AAA"   = "21003-0.0", # Age when attended assessment centre
        #"ass_centre" = "54-0.0",
        "batch" = "22000-0.0",
        "g_sex" = "22001-0.0",
        "het_miss_outlier" = "22027-0.0",
        "sex_chr_aneuploidy" = "22019-0.0",
        "g_kinship" = "22021-0.0",
        "g_ethnicity" = "22006-0.0",
        "age_death" = "40007-0.0",
        "delirium_date"= "130846-0.0",
        "delirium_source" = "130847-0.0",
        "provider_0" = "40022-0.0", # provider of inpatient data 
        "provider_1" = "40022-0.1",
        "provider_2" = "40022-0.2",
        #"dementia_date" = "42018-0.0"
        
    ) %>%
    rename_with(~gsub("22009-0.", "PC", x=.), .cols = starts_with("22009-0")) %>%
    # replace empty values with NAs in data provider columns
    mutate(across(starts_with("provider"), ~ na_if(.,""))) %>%
    # replace empty values with NAs in icd10 diagnoses codes columns
    mutate(across(starts_with("41270-0"), ~ na_if(.,""))) %>% 
    # make variable: has the individual any icd10 diagnosis? 
    mutate(hasICD10 = ifelse(!is.na(`41270-0.0`),1,0)) %>% 
    # make an IDate class column for approx. DOB. Day of birth is not accessible so set to 15th
    mutate(approx_dob = as.IDate(ISOdate(year=YOB, month=MOB, day=15))) %>%
    ## make a dummy age at first assessment variable based on approx. dob and date of assessment
    mutate(approx_AAA = round(as.numeric(difftime(DOA_0, approx_dob, units="days")) / 365.25, digits=2)) %>%
    # make isdead variable: dead (1) or alive (0)
    mutate(isdead = ifelse(eid %in% dead_eids, 1, 0)) %>%
    # make outcome variable: delirium case (1) or control (0) 
    mutate(delirium = ifelse(eid %in% del_eids, 1, 0)) %>% 
    # extract selected columns
    select(eid, YOB, MOB, approx_dob, starts_with("DOA"), AAA, approx_AAA, batch, sex, g_sex, het_miss_outlier, sex_chr_aneuploidy, g_ethnicity, g_kinship, 
           paste0("PC", seq(1,20)), starts_with("provider"), hasICD10, isdead, age_death, delirium_source, delirium_date, delirium)


#### make "age" variables:
- for cases: *date of diagnosis - dob* <br>
- for dead controls: *age at death* <br>
- for alive controls: *date of cencoring - dob* <br>

\* cencoring dates are different depending on provider, see these: <br>
https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=40022 (providers) <br>
https://biobank.ndph.ox.ac.uk/ukb/exinfo.cgi?src=Data_providers_and_dates (censoring dates) <br>

In [16]:
# create censoring dates (as of: 15.10.23)

#  HES: 31 October 2022
#  SMR: 31 August 2022
# PEDW: 31 May 2022
# no data: 31 October 2022 (assumed healthy?)

dat2 <-
dat2 %>% 
    mutate(censoring_date = case_when( (provider_0 == "HES")  ~ as.IDate(ISOdate(2022,10,31)),
                                       (provider_0 == "SMR" | provider_1 == "SMR" | provider_2 == "SMR")    ~ as.IDate(ISOdate(2022,08,31)),
                                       (provider_0 == "PEDW" | provider_1 == "PEDW" | provider_2 == "PEDW") ~ as.IDate(ISOdate(2022,05,31)),
                                       TRUE ~ as.IDate(ISOdate(2022,10,31))))


In [17]:
# create age variable
dat2 <-
dat2 %>% 
    mutate(age = case_when((delirium == 1) ~ round(as.numeric(difftime(delirium_date, approx_dob, units="days")) / 365.25, digits=1), # cases
                           (delirium == 0 & isdead == 0) ~ round(as.numeric(difftime(censoring_date, approx_dob, units="days")) / 365.25, digits=1), # alive control
                           (delirium == 0 & isdead == 1) ~ round(as.numeric(age_death), digits = 1))) # dead controls 


**Filter data based on:**
- no sex missmatch 
- **NOT in white british ancestry**
- no heterozygosiry or missingness outlier
- no \>\=10 3rd degree relatives in dataset
- no sex chromosome aneuploidy


In [14]:
dat3 <-
dat2 %>% 
    filter(sex == g_sex, 
           is.na(het_miss_outlier), 
           is.na(g_ethnicity),
           g_kinship != 10,
           is.na(sex_chr_aneuploidy)
           )
print(paste0("n=", dim(dat3)[1], " non-white british individuals remaining"))
print(table(dat3$delirium)) # 

ERROR: Error in eval(expr, envir, enclos): object 'dat2' not found


#### **export file used here:**

In [103]:
outfile3 <- "/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/vasilis/PHD/ukb.delirium.gwas/non_eur/data/prelim_file.txt"
write.table(dat3, file=outfile3, quote=F, sep=" ", row.names=F, col.names=T)

### Make per-ancestry tables:
**Export .txt file with GWAS covariates and phenotype for regenie** <br>
Space separated with header: FID IID C1 C2 C3 P1 <br> 
see:https://rgcgithub.github.io/regenie/options/#covariate-file-format 

#### AFR:

In [5]:
dat3 <- fread("/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/vasilis/PHD/ukb.delirium.gwas/non_eur/data/prelim_file.txt")

In [43]:
afr <-
dat3 %>% 
    rename("IID" = "eid") %>% mutate(FID = IID) %>%
    filter(IID %in% afr_eids) %>%
    select(FID, IID, sex, age, batch, starts_with("PC"), delirium,delirium_date)
# delirium stats
afr %>% group_by(delirium) %>% summarise(n=n(), 
                                         `prev (%)`   = round(100*n()/nrow(afr),2),
                                         `age (mean)` = round(mean(age),2),
                                         `age (sd)` = round(sd(age),2),
                                         `age (median)` = round(median(age),2),
                                         `female (%)` = round(100*(1- sum(sex)/n()),2)

                                        )
#head(afr)

delirium,n,prev (%),age (mean),age (sd),age (median),female (%)
<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,7515,98.49,65.34,7.92,64.0,57.22
1,115,1.51,71.83,8.15,74.4,45.22


In [22]:
outfile_afr <- "/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/vasilis/PHD/ukb.delirium.gwas/non_eur/data/pheno/delirium_afr.phe"
write.table(afr, file=outfile_afr, quote=F, sep=" ", row.names=F, col.names=T)

#### SAS:

In [57]:
sas <-
dat3 %>% 
    rename("IID" = "eid") %>% mutate(FID = IID) %>%
    filter(IID %in% sas_eids) %>%
    select(FID, IID, sex, age, batch, starts_with("PC"), delirium,delirium_date)
# delirium stats
sas %>% group_by(delirium) %>% summarise(n=n(), 
                                         `prev (%)`   = round(100*n()/nrow(afr),2),
                                         `age (mean)` = round(mean(age),2),
                                         `age (sd)` = round(sd(age),2),
                                         `age (median)` = round(median(age),2),
                                         `female (%)` = round(100*(1- sum(sex)/n()),2)
                                        )
#head(sas)

delirium,n,prev (%),age (mean),age (sd),age (median),female (%)
<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,9294,121.81,66.81,8.32,66.3,46.18
1,107,1.4,71.45,8.05,73.5,29.91


In [24]:
outfile_sas <- "/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/vasilis/PHD/ukb.delirium.gwas/non_eur/data/pheno/delirium_sas.phe"
write.table(sas, file=outfile_sas, quote=F, sep=" ", row.names=F, col.names=T)

#### EAS:

In [25]:
eas <-
dat3 %>% 
    rename("IID" = "eid") %>% mutate(FID = IID) %>%
    filter(IID %in% eas_eids) %>%
    select(FID, IID, sex, age, batch, starts_with("PC"), delirium,delirium_date)
# delirium stats
eas %>% group_by(delirium) %>% summarise(n=n(), 
                                         `prev (%)`   = round(100*n()/nrow(eas),1),
                                         `age (mean)` = round(mean(age),1)
                                        )

delirium,n,prev (%),age (mean)
<dbl>,<int>,<dbl>,<dbl>
0,1495,99.7,66.3
1,5,0.3,66.7


In [26]:
outfile_eas <- "/exports/cmvm/eddie/smgphs/groups/Quantgen/Users/vasilis/PHD/ukb.delirium.gwas/non_eur/data/pheno/delirium_eas_LOWCOUNT.phe"
write.table(eas, file=outfile_eas, quote=F, sep=" ", row.names=F, col.names=T)