# Gene symbol annotation


This is the phenotype annotation step of the data processing pipeline for the xQTL workflow; it generates the annotated phenotype file.

### Input
The input for this workflow is the collection of data for 1 condition, as described in the README of this git repo:
1. 1 complete, unresidualized molecular phenotype data file
2. (Optional) 1 file listing the genes to be analyzed, used to filter out unneeded genes

### Output
For each collection, the output is 
3. 1 region list file documenting chr, start, end, gene_ID
4. 1 complete and annotated unresidualized phenotype file (bed + index), suitable to be fed into factor analysis

In [2]:
[global]
import os
# Work directory & output directory; the outputs of the annotation step are written under it
parameter: wd = path
# Container image passed to the `container` option of the R and bash actions
parameter: container = 'gaow/twas'
# Name of the analysis; used as the leading component of the output file names
parameter: name= 'ROSMAP'

# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Path to the input molecular phenotype data.
parameter: molecular_pheno_whole = path
# A list of genes to keep, containing 1 column of gene_ID.
# The default is the string "FALSE" (presumably meaning that no list is supplied -- confirm).
parameter: keep_gene_list = "FALSE"

# Alias of `name`
Prefix = name


## Annotation of molecular phenotype file
This workflow adds chr, pos (TSS, where start = end - 1) and gene_ID annotations to the bed file.

In [None]:
[annotation]
# Annotate every gene of the molecular phenotype table with Ensembl coordinates.
# Produces a bgzip-compressed, tabix-indexed BED file plus a region list
# (#chr, start, end, gene_ID) of the genes that were kept.
input: molecular_pheno_whole
output: f'{wd}/{name}.{_input:bn}.annotated.bed.gz',
        f'{wd}/{name}.{_input:bn}.region_list'

R:  expand= "$[ ]", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout' ,container = container
    library("biomaRt")
    library(dplyr)
    library(readr)
    # Clear the local biomaRt cache before querying Ensembl
    biomartCacheClear()

    gene_exp <- readr::read_delim("$[_input[0]]", delim = "\t")
    # Input that already carries a "#chr" column is treated as BED-like: its first
    # three columns (presumably #chr/start/end) are dropped and re-derived below.
    if ("#chr" %in% colnames(gene_exp)) {
      gene_exp <- gene_exp[, 4:ncol(gene_exp)]
    }

    # Gene coordinates for all human genes known to Ensembl
    ensembl <- useDataset("hsapiens_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL"))
    ensembl_df <- getBM(
      attributes = c("ensembl_gene_id", "chromosome_name", "start_position", "end_position"),
      mart = ensembl
    )

    my_genes <- gene_exp$gene_ID

    # Optional gene filter. The sentinel string "FALSE" means "keep every gene";
    # any other value is taken as the path of a file with one gene_ID per line.
    # A header line, if present, is harmless because it matches no gene.
    keep_gene_list <- "$[keep_gene_list]"
    if (identical(keep_gene_list, "FALSE")) {
      keep_genes <- my_genes
    } else {
      keep_genes <- trimws(readLines(keep_gene_list))
    }

    # Genes absent from Ensembl come back from match() as all-NA rows and are
    # removed by the chromosome filter, which also restricts to numbered chromosomes.
    my_genes_ann <- ensembl_df[match(my_genes, ensembl_df$ensembl_gene_id), ] %>%
      filter(chromosome_name %in% 1:23) %>%
      dplyr::rename(
        "#chr" = chromosome_name,
        "start" = start_position,
        "end" = end_position,
        "gene_ID" = ensembl_gene_id
      ) %>%
      filter(gene_ID != "NA", gene_ID %in% keep_genes)

    # Region list keeps the full Ensembl start/end of each gene.
    # The output path is passed positionally so the call works whether the
    # installed readr names that argument `path` or `file`.
    my_genes_ann %>%
      select(`#chr`, start, end, gene_ID) %>%
      write_delim("$[_output[1]]", delim = "\t")

    # BED file uses a 1-bp interval starting at `start` (end = start + 1),
    # joined to the phenotype values and sorted by chromosome and position.
    my_gene_bed <- my_genes_ann %>%
      mutate(end = start + 1) %>%
      select(`#chr`, start, end, gene_ID) %>%
      inner_join(gene_exp, by = "gene_ID") %>%
      arrange(`#chr`, start)

    # Written uncompressed (output name without ".gz"); compressed by the bash action below.
    my_gene_bed %>%
      readr::write_tsv("$[_output[0]:n]", na = "NA", append = FALSE, col_names = TRUE)

bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
        # Compress the BED file in place, then build its tabix index (overwriting any old one)
        bgzip -f $[_output[0]:n]
        tabix -f -p bed $[_output[0]]