# In this notebook we will convert the GWAS summary statistics to the VCF file format using the gwasvcf R package from MRCIEU

# Note: at the moment this coloc analysis only uses biallelic SNPs

In [None]:
# install.packages('R.utils')

In [None]:
# Install gwasvcf if needed. This is not included in the original conda envs
#install.packages('gwasvcf', repos = c('https://mrcieu.r-universe.dev', 'https://cloud.r-project.org'))

In [None]:
library(data.table)
library(VariantAnnotation)
library(magrittr)
library("GenomicRanges")
library(rtracklayer)
library(coloc)
library(dplyr)
library(parallel)


In [4]:
library(gwasvcf)

In [33]:
source("Coloc_helper_functions.R")

## If the SNP position is present but the rsID is missing, a mapping file can be used to add it
## The mapping file must contain the following columns:
- chr
- pos
- ref
- alt
- rsid

In [6]:
# Root directory of the coloc pipeline; the reference data and GWAS summary
# statistics paths used in this notebook are built relative to it
working_dir <- "/lustre/groups/itg/teams/zeggini/projects/child_diabesity/Coloc_pipeline/Coloc"

In [7]:
# UKBB variant annotations (hg19 coordinates), loaded as a plain data.frame
variant_ann <- fread(
  file.path(working_dir, "ReferenceData/variants.tsv"),
  data.table = FALSE
)

# Chromosome labels that are not integers become NA here (R emits an
# "NAs introduced by coercion" warning); those rows are removed below
variant_ann[["chr"]] <- as.integer(variant_ann[["chr"]])

# Keep rows with an integer chromosome and single-character ref and alt alleles
variant_ann <- variant_ann[
  !is.na(variant_ann[["chr"]]) &
    nchar(variant_ann[["ref"]]) == 1 &
    nchar(variant_ann[["alt"]]) == 1,
]


“NAs introduced by coercion”


In [8]:
head(variant_ann)

Unnamed: 0_level_0,variant,chr,pos,ref,alt,rsid,varid,consequence,consequence_category,info,⋯,p_hwe,n_called,n_not_called,n_hom_ref,n_het,n_hom_var,n_non_ref,r_heterozygosity,r_het_hom_var,r_expected_het_frequency
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>
1,1:15791:C:T,1,15791,C,T,rs547522712,1:15791_C_T,splice_region_variant,missense,0.861678,⋯,0.5,361194,0,361194,0,0,0,0.0,,0.0
2,1:69487:G:A,1,69487,G,A,rs568226429,1:69487_G_A,missense_variant,missense,0.956975,⋯,0.500004,361194,0,361190,4,0,4,1.10744e-05,,1.10743e-05
3,1:69569:T:C,1,69569,T,C,rs2531267,1:69569_T_C,missense_variant,missense,0.831664,⋯,0.506315,361194,0,361058,136,0,136,0.000376529,,0.000376459
4,1:139853:C:T,1,139853,C,T,rs533633326,1:139853_C_T,splice_region_variant,missense,0.985255,⋯,0.500004,361194,0,361190,4,0,4,1.10744e-05,,1.10743e-05
6,1:693731:A:G,1,693731,A,G,rs12238997,1:693731_A_G,upstream_gene_variant,non_coding,0.875969,⋯,0.164164,361192,2,281659,74692,4841,79533,0.206793,15.429,0.206315
7,1:707522:G:C,1,707522,G,C,rs371890604,1:707522_G_C,intron_variant,non_coding,0.803693,⋯,0.131323,361188,6,294151,63688,3349,67037,0.176329,19.017,0.175886


In [9]:
# Create the output directory for the VCF files (relative to the current
# working directory). Only create it when it is missing, so re-running this
# cell does not raise an "already exists" warning; a genuine creation failure
# (e.g. missing permissions) still produces a warning.
if (!dir.exists("VCF")) {
  dir.create("VCF")
}

# We need to provide the sample size: the total sample size for a quantitative trait, or the number of cases and controls for a case/control study. This can be entered manually or, if the study is present in the GWAS Catalog, extracted programmatically

In [10]:
# If extracted from the GWAScatalog
#install.packages("gwasrapidd")
library(gwasrapidd)


Attaching package: ‘gwasrapidd’


The following object is masked from ‘package:dplyr’:

    n




In [35]:
# Look up the sample size of the study from its GWAS Catalog accession
GWAS_ID <- "GCST009004" # Provide the GWAScatalog ID

# Get_sampleSize_GWAScatalog() is not defined in this notebook; presumably it
# comes from Coloc_helper_functions.R sourced earlier -- TODO confirm
GWAS_n <- Get_sampleSize_GWAScatalog(GWAS_ID)

In [36]:
# Display the retrieved sample size(s) to check them before building the VCF
GWAS_n

In [37]:
# Summary statistics for this study are in hg19 coordinates
GWASfile <- file.path(
  working_dir,
  "GWAS_sumstats/bmi.giant-ukbb.meta-analysis.combined.23May2018.txt.gz"
)

GWAS_prefix <- "BMI"

# Convert the summary statistics to VCF. The string arguments are presumably
# the column names of the summary statistics file -- verify against make_vcf().
make_vcf(
  GWASfile = GWASfile,
  chrom = "CHR",
  pos = "POS",
  nea = "Other_Allele",
  ea = "Tested_Allele",
  snp = "SNP",
  ea_af = "Freq_Tested_Allele",
  effect = "BETA",
  se = "SE",
  pval = "P",
  # If you want to liftover, make sure to specify a path to the chain files
  WantToLiftOver = FALSE,
  # Example path only: with WantToLiftOver = FALSE this chain file is not used
  ch_path = file.path(working_dir, "ReferenceData/hg19ToHg38.over.chain"),
  # A vector of one or two elements: total sample size for a quantitative
  # trait, or number of cases and controls for a case/control study
  GWAS_n = GWAS_n,
  # Reference file to map the missing values
  variant_ann = variant_ann,
  output = paste0("VCF/", GWAS_prefix, ".vcf")
)


[1] "Reading GWAS sumstats"
[1] "Filtering VCF for MAF 5% and known SNPs"
[1] "Creating VCF with 5477563 SNPs"


In [38]:
# Look up the sample size of the study from its GWAS Catalog accession
GWAS_ID <- "GCST009382" # Provide the GWAScatalog ID
# In case of a multi-ancestry study, if you specify a population, such as population="eur", it will extract the sample size of only that population
# NOTE(review): the make_vcf() call for this study reads the EUR_* columns, but no population is passed here -- confirm that the returned sample size matches the European subset
GWAS_n <- Get_sampleSize_GWAScatalog(GWAS_ID)

In [39]:
# GWAS_n was already retrieved for this GWAS_ID in the previous cell, so only
# display it here instead of calling Get_sampleSize_GWAScatalog() a second time
GWAS_n

In [40]:
# Case/control example in which the summary statistics are lifted over to hg38
GWASfile <- file.path(
  working_dir,
  "GWAS_sumstats/CHILDHOOD_OBESITY.TRANS_ANCESTRAL.RESULTS.txt.gz"
)

GWAS_prefix <- "childhood_obesity"

# Convert the summary statistics to VCF. The string arguments are presumably
# the column names of the summary statistics file -- verify against make_vcf().
make_vcf(
  GWASfile = GWASfile,
  chrom = "CHR",
  pos = "POS",
  nea = "OA",
  ea = "EA",
  # No rsID column is supplied, so variant_ann is used to add the rsID
  snp = NULL,
  ea_af = "EUR_FRQ",
  effect = "EUR_BETA",
  se = "EUR_SE",
  pval = "EUR_P",
  # If you want to liftover, make sure to specify a path to the chain files
  WantToLiftOver = TRUE,
  # Chain file for the liftover requested by WantToLiftOver = TRUE
  ch_path = file.path(working_dir, "ReferenceData/hg19ToHg38.over.chain"),
  # A vector of one or two elements: total sample size for a quantitative
  # trait, or number of cases and controls for a case/control study
  GWAS_n = GWAS_n,
  # Reference file to map coordinates. If the position is not provided but the
  # rsID is, it is used to add the position; if the position is given but the
  # rsID is not, it is used to add the rsID
  variant_ann = variant_ann,
  output = paste0("VCF/", GWAS_prefix, "_hg38.vcf")
)


[1] "Reading GWAS sumstats"
[1] "SNP ID not provided, so using reference to map coordinates to rsid"
[1] "Lifting over GWAS"


Discarding unchained sequences: chr23



[1] "Filtering VCF for MAF 5% and known SNPs"
[1] "Creating VCF with 6255594 SNPs"
 [1] "ID"       "chrom"    "pos"      "Allele1"  "Allele2"  "AFR_N"   
 [7] "AFR_FRQ"  "AFR_BETA" "AFR_SE"   "AFR_P"    "ASN_N"    "ASN_FRQ" 
[13] "ASN_BETA" "ASN_SE"   "ASN_P"    "EUR_N"    "Freq1"    "Effect"  
[19] "StdErr"   "p"        "AMR_N"    "AMR_FRQ"  "AMR_BETA" "AMR_SE"  
[25] "AMR_P"    "BAYES"    "SNP"      "SS"       "ncase"   


In [None]:
# The VCF directory should now contain the tabix indexed VCF file(s)
list.files(path = "VCF")

In [None]:
# This is what it looks like
# Runs a shell pipeline (needs zcat and head on the PATH) that decompresses
# VCF/BMI.vcf.bgz and keeps its first 40 lines; system(..., intern = TRUE)
# returns those lines as a character vector, which cat() prints one per line
cat(system("zcat VCF/BMI.vcf.bgz|head -n 40", intern = TRUE), sep = "\n")

# We can now move to 2_run_coloc_abf