# Regenie Interaction

This pipeline implements the regenie interaction test using a custom build of regenie. It was built from the raw source code at [https://github.com/rgcgithub/regenie](https://github.com/rgcgithub/regenie) (2021-11-24), which already contained the code for the interaction test and its options.

The regenie program can be found in `/mnt/mfs/statgen/guangyou/containers/regenie.v.2.3`

## Method

### The interaction model

$$
y = \beta_{0}+\beta_{1} snp_{1}+\beta_{2} snp_{2}+\beta_{12} snp_{1}snp_{2}+\beta_{c} c+g+\varepsilon, \quad g\sim MVN(0,\sigma^{2}_{a}K), \quad \varepsilon\sim MVN(0,\sigma^{2}_{e}I_{N})
$$

Where $K$ is the genetic-relatedness matrix $K=\frac{G_{S}G_{S}^{T}}{M}$, $G_{S}$ is the matrix of standardized genotypes, and $M$ is the number of variants used to build $K$.

Interactions between covariates and genotypes are also supported. Only one phenotype is supported.

In [1]:
sos run Regenie_interaction.ipynb -h

usage: sos run Regenie_interaction.ipynb
               [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  regenie_qc
  regenie

Global Workflow Options:
  --cwd VAL (as path, required)
                        the output directory for generated files
  --sampleFile . (as path)
                        Path to sample file
  --bfile VAL (as path, required)
                        Genotype files in plink binary this is used for
                        computing the GRM
  --genoFile  paths('.')

                        Path to bgen or bed files
  --phenoFile VAL (as path, required)
                        Phenotype file for quantitative trait (BMI)
  --phenoCol VAL VAL ... (as type, required)

In [None]:
[global]
# the output directory for generated files
parameter: cwd = path
# Specific number of threads to use
parameter: numThreads = 2
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# For cluster jobs, wall time
parameter: walltime = '1h'
# For cluster jobs, memory to request per job
parameter: mem = '2G'
# The container with the lmm software. Can be either a dockerhub image or a singularity `sif` file.
# Default is set to using dockerhub image
parameter: container_lmm = 'statisticalgenetics/lmm:2.4'
# The container used by the ANNOVAR annotation step (`annovar_3`)
parameter: container_annovar = 'gaow/gatk4-annovar'
# NOTE(review): container_marp is not referenced by any step visible in this notebook
parameter: container_marp = 'gaow/marp'

In [None]:
# Select the SNPs and samples to be used based on maf, geno, hwe and mind options
[regenie_qc]
# Genotype files in plink binary this is used for computing the GRM
parameter: bfile = path
# Value passed to plink2 `--maf`; the option is omitted when the value is not > 0
parameter: maf_filter = 0.0
# Value passed to plink2 `--geno`; the option is omitted when the value is not > 0
parameter: geno_filter = 0.0
# Value passed to plink2 `--hwe`; the option is omitted when the value is not > 0
parameter: hwe_filter = 0.0
# Value passed to plink2 `--mind`; the option is omitted when the value is not > 0
parameter: mind_filter = 0.0
input: bfile
# Two outputs under {cwd}/cache sharing one prefix: the retained sample IDs (`.id`, from
# --write-samples) and the retained variant IDs (`.snplist`, from --write-snplist)
output: f'{cwd}/cache/{bfile:bn}.qc_pass.id', f'{cwd}/cache/{bfile:bn}.qc_pass.snplist' 
task: trunk_workers = 1, walltime = '10h', mem = '5G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout' 
    plink2 \
      --bfile ${bfile:n} --mac 1 \
      ${('--maf %s' % maf_filter) if maf_filter > 0 else ''} ${('--geno %s' % geno_filter) if geno_filter > 0 else ''} ${('--hwe %s' % hwe_filter) if hwe_filter > 0 else ''} ${('--mind %s' % mind_filter) if mind_filter > 0 else ''} \
      --write-snplist --write-samples --no-id-header \
      --threads ${numThreads} \
      --out ${_output[0]:n} 

In [None]:
# extract and prepare phenotype & covariate files
[regenie_1]
# Phenotype file for quantitative trait (BMI)
parameter: phenoFile = path
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = list
# Covariate file path. Will use phenoFile if empty
parameter: covarFile = path('.')
# Qualitative covariates to be used in the analysis
parameter: covarCol = []
# Quantitative covariates to be used in the analysis
parameter: qCovarCol = []
# Path to bgen or bed files
parameter: genoFile = list
# Interacting environment covariates to be used in the analysis
parameter: covariates = list
if not covarFile.is_file():
    covarFile = phenoFile
cwd = path(f"{cwd:a}")
import pandas as pd
# Phenotype table: every column is read as text, missing values are written back as "NA".
pheno = pd.read_csv(phenoFile, header=0, sep=r'\s+', dtype=str)
pheno = pheno.fillna("NA")
if len(phenoCol) > 0:
    pheno.to_csv(f"{cwd}/{phenoFile:bn}.regenie_phenotype", sep='\t', index=False, columns = ['FID', 'IID'] + phenoCol)
# Base covariate table (no interacting covariate), consumed by the null-model step.
# Rows with a missing value in any requested covariate are dropped, so no NA remains to recode.
covar = pd.read_csv(covarFile, header=0, sep=r'\s+', dtype=str)
if len(covarCol) > 0 or len(qCovarCol) > 0:
    # dict.fromkeys de-duplicates while preserving order (a covariate listed twice is written once)
    base_cols = list(dict.fromkeys(covarCol + qCovarCol))
    covar = covar.dropna(subset=base_cols)
    # Direct column selection replaces the former self-merge on IID, which multiplied
    # rows whenever an IID occurred more than once.
    covar[['FID', 'IID'] + base_cols].to_csv(f"{cwd}/{phenoFile:bn}.regenie_covar", sep=' ', index=False)
# One substep per interacting covariate; each writes its own covariate file that also holds that covariate.
input: for_each = dict(cov=covariates)
output: f"{cwd}/{phenoFile:bn}.{cov}.regenie_covar"
task: trunk_workers = 1, trunk_size = 2, walltime = '2h', mem = '2G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
python: container=container_lmm, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    import pandas as pd
    # Column order: qualitative covariates, the interacting covariate, quantitative covariates.
    # De-duplicated so an interacting covariate that is also a regular covariate appears once.
    covar_cols = list(dict.fromkeys(${covarCol} + ["${cov}"] + ${qCovarCol}))
    # Whitespace-delimited, matching how the step body reads the same file
    # (the previous sep='\t' failed on space-delimited covariate files).
    dat = pd.read_csv("${covarFile}", header=0, sep=r'\s+', dtype=str)
    dat = dat.dropna(subset=covar_cols)
    # Always written: the interacting covariate is present even when no other covariate is requested,
    # so the declared output can no longer be missing.
    dat[['FID', 'IID'] + covar_cols].to_csv("${_output}", sep=' ', index=False)

In [None]:
# Run REGENIE step 1: fitting the null
[regenie_2]
# Path to sample file
# NOTE(review): sampleFile, formatFile, covarCol, qCovarCol, genoFile and covariates are declared
# here but not referenced by this step's command
parameter: sampleFile = path('.')
# Genotype files in plink binary this is used for computing the GRM
parameter: bfile = path
# Phenotype file for quantitative trait (BMI)
parameter: phenoFile = path
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = list
# Summary statistics format file path used for unifying output column names. Will not unify names if empty
parameter: formatFile = path('.')
# Qualitative covariates to be used in the analysis
parameter: covarCol = []
# Quantitative covariates to be used in the analysis
parameter: qCovarCol = []
# Path to bgen or bed files
parameter: genoFile = list
# Interacting environment covariates to be used in the analysis
parameter: covariates = list
# Path to regenie v2.3
parameter: regenieFile = path
# Size of the genotype blocks to be used
parameter: bsize = 400
# Path to temporarily store block predictions
parameter: lowmem_dir = cwd
# Trait type. The default 'bt' adds `--bt` to the command (binary traits with 0=control,1=case,NA=missing);
# any other value omits the flag, leaving regenie's default of a quantitative trait
parameter: trait = 'bt'
# The QC lists are also listed as dependencies so that this step waits for them to exist
depends: f'{cwd}/cache/{bfile:bn}.qc_pass.snplist', f'{cwd}/cache/{bfile:bn}.qc_pass.id'
# Named inputs: genotypes, the phenotype and base covariate files written by regenie_1,
# and the two outputs of regenie_qc (qc[0] is used for --keep, qc[1] for --extract)
input: geno = bfile, pheno = f"{cwd}/{phenoFile:bn}.regenie_phenotype", covar = f"{cwd}/{phenoFile:bn}.regenie_covar", qc = output_from("regenie_qc")
# The prediction list; `--out` below is this path with its last two extensions replaced by `.regenie`
output: f'{cwd}/{phenoFile:bn}_' + "_".join([x for x in phenoCol]) + f'.regenie_pred.list'
task: trunk_workers = 1, trunk_size = 1, walltime = '24h', mem = '50G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', volumes = [f"{lowmem_dir:a}:{lowmem_dir:a}"]
    ${regenieFile} \
      --step 1 \
      --bed ${_input["geno"]:n} \
      --phenoFile ${_input["pheno"]} \
      --covarFile ${_input["covar"]} \
      --keep ${_input["qc"][0]} \
      --extract ${_input["qc"][1]} \
      ${('--' + trait) if trait in ['bt'] else ''} \
      --bsize ${bsize} \
      --lowmem --lowmem-prefix ${lowmem_dir:a}/${_output:bn} \
      --threads ${numThreads} \
      --out ${_output:nn}.regenie

In [1]:
# Run REGENIE step 2: association analysis
[regenie_3]
# Path to sample file (only passed to regenie for bgen input, and only when provided)
parameter: sampleFile = path('.')
# Phenotype file for quantitative trait (BMI)
parameter: phenoFile = path
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = list
# Qualitative covariates to be used in the analysis
parameter: covarCol = []
# Quantitative covariates to be used in the analysis
parameter: qCovarCol = []
# Path to bgen or bed files
parameter: genoFile = list
# Interacting environment covariates to be used in the analysis
parameter: covariates = list
# Path to regenie v2.3
parameter: regenieFile = path
# Size of the genotype blocks to be used
parameter: bsize = 400
# Minimum MAF to be used
# NOTE(review): bgenMinMAF is declared but never passed to the regenie command below
parameter: bgenMinMAF = 0.001
# Minimum info score to be used
parameter: bgenMinINFO = 0.8
# Minimum allele count to be used
parameter: minMAC = int
# Trait type. The default 'bt' adds `--bt` to the command; any other value omits the flag
parameter: trait = 'bt'
# in the case of bgen data from UKBB ref_first should be set to true
parameter: ref_first= False
# One substep per (genotype file, interacting covariate) combination
input: for_each = [dict(geno = genoFile), dict(cov = covariates)]
# `--sample` is skipped when sampleFile is still the `path('.')` placeholder (a directory);
# previously a literal `--sample .` was handed to regenie in that case.
sample_option = '' if sampleFile.is_dir() else f" --sample {sampleFile}"
input_options = f"--bgen {geno}{sample_option}" if geno.endswith('.bgen') else "--bed " + geno.split('.bed')[0]
# Prediction list written by step 1
info = f'{cwd}/{phenoFile:bn}_' + "_".join(phenoCol) + '.regenie_pred.list'
# Computed once so that the regenie `--out` prefix and the declared output cannot drift apart
outputprefix = f'{cwd}/cache/' + geno.split('/')[-1].split('.bgen')[0] + f'.{cov}'
output: outputprefix + '_' + "_".join(phenoCol) + '.regenie.gz'
# NOTE(review): upstream regenie documents `--firth` as a flag with the threshold given via `--pThresh`;
# confirm that this custom build accepts the value following `--firth` in the command below.
# cores follows numThreads so the reservation matches the `--threads` value given to regenie
task: trunk_workers = 1, trunk_size = 1, walltime = '200h', mem = '10G', max_walltime = '200h', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash:container=container_lmm, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    set -e
    ${regenieFile} \
     --step 2 \
     ${input_options} \
     --phenoFile ${cwd}/${phenoFile:bn}.regenie_phenotype \
     --covarFile ${cwd}/${phenoFile:bn}.${cov}.regenie_covar \
     --phenoColList ${','.join(phenoCol)} \
     ${('--' + trait) if trait in ['bt'] else ''} \
     ${('--ref-first') if ref_first else ''} \
     --interaction ${cov} \
     --firth 0.01 --approx \
     --pred ${info} \
     --bsize ${bsize} \
     --minMAC ${minMAC} \
     --minINFO ${bgenMinINFO} \
     --threads ${numThreads} \
     --out ${outputprefix} && \
     gzip -f --best ${_output:n}

## Annotation for variants in the top interactions

In [None]:
# Merge Regenie sumstats, extract the top interaction terms.
[annovar_1]
# P-value threshold: keep interaction terms with p-value <= p_filter. 0 disables this filter.
# The default is a float so that values such as 5e-8 can be given on the command line.
parameter: p_filter = 0.0
# Top k interaction terms for annotations. Only used when p_filter is 0
parameter: k = 0
# Path sumstats file
parameter: sumstatsFile = list
input: sumstatsFile
output: f'{cwd}/sumstats.snp_annotate'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}'
python: container=container_lmm, expand='${ }', stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import math
    import pandas as pd

    p_filter = ${p_filter}
    top_k = ${k}
    # Fail with a clear message instead of printing and leaving the declared output missing.
    if p_filter == 0 and top_k == 0:
        raise ValueError("Please enter either top k or p value threshold to filter the variants for annotation")

    columns = ['CHROM', 'GENPOS', 'ID', 'ALLELE0', 'ALLELE1', 'A1FREQ', 'INFO', 'N','TEST', 'BETA', 'SE', 'CHISQ', 'LOG10P', 'EXTRA']
    # Collect the interaction rows of every sumstats file and concatenate once
    # (DataFrame.append no longer exists in pandas 2).
    frames = []
    for file in [${_input:r,}]:
        tmp = pd.read_csv(file, compression="gzip", header=0, sep=" ")
        frames.append(tmp[tmp["TEST"].str.startswith("ADD-INT_SNPx")])
    result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
    result = result.sort_values("LOG10P", axis=0, ascending=False)
    if p_filter != 0:
        # LOG10P holds -log10(p), so p <= p_filter is LOG10P >= -log10(p_filter)
        result = result[result["LOG10P"] >= -math.log10(p_filter)]
    else:
        result = result.iloc[:top_k, :]
    result.to_csv("${_output}", sep=" ", index=False)

In [None]:
# Get chr, start, end, ref_allele, alt_allele format for ANNOVAR input
[annovar_2]
output: f'{_input:n}.avinput'
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
python: container=container_lmm, expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    top = pd.read_csv("${_input}", sep=" ", header=0)
    # .copy() so the column assignments below act on an independent frame
    top = top[['CHROM', 'GENPOS', 'ALLELE1', 'ALLELE0']].copy()
    # ANNOVAR coordinates are 1-based and inclusive, so end = start + len(ref) - 1
    # (a single-nucleotide variant has end == start). The length is taken from the allele
    # that is written in the ref column (4th column) below.
    # NOTE(review): the column layout is kept as it was (ALLELE1 in the ref position, ALLELE0 in
    # the alt position); confirm that this matches the allele orientation of the regenie run.
    top["end"] = top["GENPOS"] + top["ALLELE1"].str.len() - 1
    top = top.astype("string")
    # Variant identifier carried through ANNOVAR as the extra (6th) column
    top["varID"] = top["CHROM"].str.cat(others=[top["GENPOS"], top["ALLELE1"], top["ALLELE0"]], sep=':')
    top = top[['CHROM', 'GENPOS', "end", 'ALLELE1', 'ALLELE0', "varID"]]
    top.to_csv("${_output}", index=False, header=False, sep=" ")

In [None]:
# Annotate variants file using ANNOVAR
[annovar_3]
# humandb path for ANNOVAR
parameter: humandb = path
# Path to x-ref file
parameter: xref_path = path
# Human genome build hg19 or hg38
parameter: build = 'hg38'
# Annovar protocol
# `protocol`, `operation` and `arg` are parallel lists (one entry per protocol); they are
# comma-joined into the -protocol, -operation and -arg options of table_annovar.pl below.
# Any build value other than 'hg19' uses the second (hg38) set.
if build == 'hg19':
    protocol = ['refGene', 'refGeneWithVer', 'knownGene', 'ensGene', 'phastConsElements46way', 'gwasCatalog', 'gnomad211_exome', 'avsnp150', 'dbnsfp42a', 'dbscsnv11', 'gene4denovo201907']
    operation = ['g', 'g', 'g', 'g', 'r', 'r', 'f', 'f', 'f', 'f', 'f']
    arg = ['"-splicing 12 -exonicsplicing"', '"-splicing 30"', '"-splicing 12 -exonicsplicing"', '"-splicing 12"', '', '', '', '', '', '', '']
else:
    protocol = ['refGene', 'refGeneWithVer', 'knownGene', 'ensGene', 'phastConsElements30way', 'encRegTfbsClustered', 'gwasCatalog', 'gnomad30_genome', 'gnomad211_exome', 'gme', 'kaviar_20150923', 'avsnp150', 'dbnsfp41a', 'dbscsnv11', 'clinvar_20200316', 'gene4denovo201907']
    operation = ['g', 'g', 'g', 'gx', 'r', 'r', 'r', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']
    arg = ['"-splicing 12 -exonicsplicing"', '"-splicing 30"', '"-splicing 12 -exonicsplicing"', '"-splicing 12"', '', '', '', '', '', '', '', '', '', '', '', '']
    
#add xreffile to option without -exonicsplicing
#mart_export_2019_LOFtools3.txt #xreffile latest option -> Phenotype description,HGNC symbol,MIM morbid description,CGD_CONDITION,CGD_inh,CGD_man,CGD_comm,LOF_tools
parameter: x_ref = path(f"{xref_path}/mart_export_2021_LOFtools.txt")
# Named output `anno_file`; `-out` below receives this path with its last two extensions removed
output: anno_file = f'{cwd}/{_input:bn}.{build}_multianno.csv'
# NOTE(review): `annovar_module` is not defined in the visible part of this notebook; the fallback
# branch of `template` is taken only when annotate_variation.pl is not found
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}', template = '{cmd}' if executable('annotate_variation.pl').target_exists() else annovar_module
bash: container=container_annovar, volumes=[f'{humandb:a}:{humandb:a}', f'{x_ref:ad}:{x_ref:ad}'], expand="${ }", stderr=f'{_output:n}.err', stdout=f'{_output:n}.out'
    #do not add -intronhgvs as option -> writes cDNA variants as HGVS but creates issues (+2 splice site reported only)
    #-nastring . can only be . for VCF files
    #regsnpintron might cause shifted lines (be carefull using)
    table_annovar.pl \
        ${_input} \
        ${humandb} \
        -buildver ${build} \
        -out ${_output:nn}\
        -otherinfo\
        -remove \
        -polish \
        -nastring . \
        -protocol ${",".join(protocol)}\
        -operation ${",".join(operation)} \
        -arg ${",".join(arg)} \
        -csvout \
        -xreffile ${x_ref} 

In [None]:
# Re-format the annovar csv to have the BETA, SE and P in the front and with headers
[annovar_4]
input: named_output('anno_file')
output: f'{cwd}/{_input:bn}.formatted.csv'
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
python: expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    # Names given, in order, to the space-separated fields of the Otherinfo1 column
    names = ["alternate_id", "BETA", "SE", "P"]
    df = pd.read_csv('${_input}', header=0)
    # At most len(names) - 1 splits, so the result never has more columns than names
    # (the previous n = 4 could yield five columns for four names).
    parts = df["Otherinfo1"].astype(str).str.split(" ", n=len(names) - 1, expand=True)
    # When Otherinfo1 carries fewer fields, only the leading names are used instead of
    # failing on a column-count mismatch.
    parts.columns = names[:parts.shape[1]]
    # Put the extracted columns in front of the ANNOVAR columns
    df = parts.join(df)
    df.to_csv('${_output}', index=False)

In [None]:
# Annotate snps to gene
[snp_to_gene]
# Column name for BP
parameter: bp = 'POS'
# Column name for p-value
parameter: pval = 'P'
# Column name for SNP
parameter: snp = 'SNP'
# Path sumstats file
parameter: sumstatsFile = path
# Genome assembly number passed to getSNPMap as GRCh (e.g. 37 or 38)
parameter: hg = int
# Only variants with a p-value below this threshold are mapped to genes
parameter: p_threshold = 5e-8
input: sumstatsFile
output: f'{_input:nn}.gene_ann'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}'
R: expand='${ }', stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library('snpGeneSets')
    library('dplyr')
    # Import the sumstats file as dataframe
    data <- read.table(gzfile('${_input}'), header = TRUE)
    head(data)
    # Keep the variants below the p-value threshold and
    # subset data to obtain only chr, pos and snp for gene mapping
    sig.p <- data %>%
      filter(${pval} < ${p_threshold}) %>%
      mutate(chr = CHR,
             pos = ${bp},
             snp = as.character(${snp})) %>%
      select(chr, pos, snp)
    head(sig.p)
    # Get the annotation of SNPs with different genome assemblies
    snpMapAnn <- getSNPMap(sig.p$snp, GRCh = ${hg})
    # Mapping SNPs to genes (define gene boundary ‘up’ for the upstream region and ‘down’ for the downstream region with default value of 2,000 bp for both)
    # NOTE(review): snp2Gene and getGeneMap are called with their package defaults; confirm
    # whether the genome assembly should be passed to them as well.
    snpGeneMapAnn <- snp2Gene(snpMapAnn$rsid_map$snp)
    cat("The unique number of genes is", length(unique(snpGeneMapAnn$map$gene_id)), "\n")
    cat("The number of variants that could not be mapped to a gene is:", length(snpGeneMapAnn$other), "\n")
    # Get the gene-name and gene-id for the mapped variants
    gene_mapped <- getGeneMap(snpGeneMapAnn$map$gene_id)$gene_map
    # Merge the datasets: SNP map -> gene ids -> gene names -> summary statistics
    snp_gene <- merge(x = snpMapAnn$rsid_map, y = snpGeneMapAnn$map[, c("snp", "gene_id")], by = "snp", all.x = TRUE)
    snp_gene_2 <- merge(x = snp_gene, y = gene_mapped[, c("gene_id", "gene_name")], by = "gene_id", all.x = TRUE)
    # Rename the key to the sumstats SNP column so both tables can be joined on it
    names(snp_gene_2)[names(snp_gene_2) == 'snp'] <- '${snp}'
    snp_gene_3 <- merge(x = snp_gene_2, y = data[, c("A1", "A2", "N", "AF1", "${pval}", "BETA", "SE", "INFO", "${snp}")], by = "${snp}", all.x = TRUE)
    # Get the final table with ordered pval
    final_gene_set <- snp_gene_3 %>%
      select(chr, ${snp}, pos, A1, A2, N, AF1, BETA, SE, ${pval}, INFO, gene_id, gene_name) %>%
      arrange(${pval})
    names(final_gene_set)[names(final_gene_set) == 'chr'] <- 'CHR'
    names(final_gene_set)[names(final_gene_set) == 'pos'] <- 'POS'
    # Write results to a table
    write.table(final_gene_set, '${_output}', sep = "\t", quote = FALSE, row.names = FALSE)