### Uses a very broad search (Parent term contains string 'Cardiovascular') to select a large number of trait IDs (these are later pruned)

In [4]:
# supress warnings
options(warn=-1)

In [5]:
# get date for time stamping
today=format(Sys.Date(), "%Y-%m-%d")

In [6]:
# load packages
library('gwasrapidd')
library('tidyverse')
library('glue')                     

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6      [32m✔[39m [34mpurrr  [39m 0.3.5 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.2      [32m✔[39m [34mforcats[39m 0.5.1 
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mn()[39m      masks [34mgwasrapidd[39m::n()


In [7]:
# Read in CHD SNPs
snps=read.csv('/nfs/team205/heart/CHD_GWAS/chd_gwas_SNPs.csv')
snps

simplified_classification,phenotype,SNP,study
<chr>,<chr>,<chr>,<chr>
obstructive,Obstructive Heart Defects,rs2360743,Rashkin2021
LH,LH lesions,rs8061121,Mitchell2015
LH,LH lesions,rs1975649,Mitchell2015
LH,LH lesions,rs6495706,Mitchell2015
NOS,NOS,rs185531658,Lahm2021
septal,ASD,rs870142,Lahm2021
septal,ASD II,rs145619574,Lahm2021
septal,ASD II,rs187369228,Lahm2021
septal,ASD II,rs72917381,Lahm2021
LH,LH lesions,rs114503684,Lahm2021


In [18]:
chrom<-list()
pos<-list()
region<-list()

tmp<-get_variants(variant_id=snps$SNP[2])@variants
chrom[]<-tmp$chromosome_name

variant_id,merged,functional_class,chromosome_name,chromosome_position,chromosome_region,last_update_date
<chr>,<int>,<chr>,<chr>,<int>,<chr>,<dttm>
rs8061121,0,intron_variant,16,87159478,16q24.2,2021-06-24 17:50:13


In [27]:
# get metadata for those trait terms, the efo_ids in particular

chrom<-list()
pos<-list()
region<-list()

for (i in seq_along(snps$SNP)) {
    tmp<-get_variants(variant_id=snps$SNP[i])@variants
    chrom[i]<-tmp$chromosome_name[1]
    pos[i]<-tmp$chromosome_position[1]
    region[i]<-tmp$chromosome_region[1]
}

snps$chrom<-chrom
snps$pos<-pos
snps$region<-region

snps

simplified_classification,phenotype,SNP,study,chrom,pos,region
<chr>,<chr>,<chr>,<chr>,<list>,<list>,<list>
obstructive,Obstructive Heart Defects,rs2360743,Rashkin2021,19,10625838.0,19p13.2
LH,LH lesions,rs8061121,Mitchell2015,16,87159478.0,16q24.2
LH,LH lesions,rs1975649,Mitchell2015,3,63453210.0,3p14.2
LH,LH lesions,rs6495706,Mitchell2015,15,34684669.0,15q14
NOS,NOS,rs185531658,Lahm2021,5,113800824.0,5q22.3
septal,ASD,rs870142,Lahm2021,4,4646320.0,4p16.2
septal,ASD II,rs145619574,Lahm2021,,,
septal,ASD II,rs187369228,Lahm2021,3,190084650.0,3q28
septal,ASD II,rs72917381,Lahm2021,18,56878992.0,18q21.31
LH,LH lesions,rs114503684,Lahm2021,,,


In [3]:
%%time

# add chromosomal position of each of the tagged SNPs to output

tagged_pos=[]


for tagged_snp in range(len(out['tagged_snps'])):

    if type(out['tagged_snps'][tagged_snp])==str and out['tagged_snps'][tagged_snp].startswith('rs'):
            server = "https://rest.ensembl.org"
            ext = f"/variation/human/{out['tagged_snps'][tagged_snp]}?"
            r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
            decoded = r.json()

            if len(decoded)>0 and "mappings" in decoded:
                tagged_pos.append(decoded['mappings'][0]['start'])
            else:
                tagged_pos.append('NA')
    else:
        tagged_pos.append('NA')

# add the column
out['tagged_snps_pos']=tagged_pos

# write a file (for each trait)
out.to_csv('/nfs/team205/heart/EBI_GWAS/PITX2/EFO_FAKE_PITX2_index_SNPs_with_SNPs_in_LD_with_pos.csv',index=False)

EFO_0000275
CPU times: user 1.1 s, sys: 39.1 ms, total: 1.14 s
Wall time: 15.1 s


In [51]:
# Make a table of index SNPs for each trait

list_of_tibbles<-list()

for (i in seq_along(efos$trait)) {
    if (file.exists(glue('/nfs/team205/heart/EBI_GWAS/index_snps/{efos$efo_id[i]}_{efos$trait[i]}_index_SNPs.csv'))==FALSE) {
        print(glue('{efos$trait[i]}: file needs to be made'))
        variants_table<-get_variants(efo_id=efos$efo_id[i])@variants #gets variants for the trait
        variants_table<-variants_table[complete.cases(variants_table), ] # removes any variants where SNP coordinates are not known, since these are useless to us
        if (dim(variants_table)[1]>0) { # only keep traits for which there are SNPs
            variants_table$efo_id=efos$efo_id[i]
            variants_table$efo_term=efos$trait[i]
            n_SNPs<-dim(variants_table)[1]
            print(efos$efo_id[i])
            list_of_tibbles[[i]]<-variants_table
            write_csv(list_of_tibbles[[i]],glue('/nfs/team205/heart/EBI_GWAS/index_snps/{efos$efo_id[i]}_{efos$trait[i]}_index_SNPs.csv'))
        }
    } else {
        print(glue('{efos$trait[i]}: file already exists'))
    }
}

mri defined brain infarct: file needs to be made
vascular dementia: file needs to be made
[1] "EFO_0004718"
aortic aneurysm: file needs to be made
[1] "EFO_0001666"
cerebral autosomal dominant arteriopathy with subcortical infarcts and leukoencephalopathy: file needs to be made
[1] "Orphanet_136"
nt-probnp measurement: file needs to be made
[1] "EFO_0004745"
TP segment duration: file needs to be made
ST segment duration: file needs to be made
T wave duration: file needs to be made
carotid-femoral pulse wave velocity: file needs to be made
[1] "EFO_0004724"
congestive heart failure: file needs to be made
[1] "EFO_0000373"
atrial natriuretic factor measurement: file needs to be made
[1] "EFO_0004789"
cardiotoxicity: file needs to be made
[1] "EFO_1001482"
non-obstructive coronary artery disease: file needs to be made
[1] "EFO_1001483"
tissue plasminogen activator measurement: file needs to be made
[1] "EFO_0004791"
vascular endothelial growth factor measurement: file needs to be made
[1]