# Phenotype data formatting


This module implements a collection of workflows used to format molecular phenotype data.



## Input
The input for this workflow is the collection of data for one condition, as described in the README of this git repository:
1. One complete residual molecular phenotype data file
2. One region list
Both of these inputs can be generated by the annotation module of this pipeline.

## Output
For each collection, the output is:
1. A list of phenotype files (bed + index), one per chromosome, suitable to be fed into both APEX and tensorQTL, annotated with chrom and pos
2. A list of phenotype files (bed + index), one per gene, annotated with chrom and tss

In [None]:
 # Example: run the `reformat` workflow of this notebook; the trailing `&` sends the command to the background.
 sos run /home/hs3163/GIT/xqtl-pipeline/pipeline/data_preprocessing/phenotype/phenotype_formatting.ipynb reformat \
--region_list /home/hs3163/GIT/ADSPFG-xQTL/MWE/mwe_region_long \
--phenoFile /mnt/mfs/statgen/xqtl_workflow_testing/success_example/testing_10/Data_Processing/Phenotype/AC.mol_phe.bed  \
--cwd ./  \
--name "Dry" --container "/mnt/mfs/statgen/containers/apex.sif" &

In [2]:
[global]
import os
# Work directory & output directory
parameter: cwd = path
# Container image in which the workflow steps are executed
parameter: container = 'gaow/twas'
# An index text file with 5 columns specifying the chr, start, end, gene_id and gene_name of regions to analyze
parameter: region_list = path
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Path to the input molecular phenotype data.
parameter: phenoFile = path
# name for the analysis output
parameter: name= f'{phenoFile:bn}'
# NOTE(review): not used by any step visible in this file - purpose to be confirmed
parameter: pop_file = "None"
# Parse the region list: skip blank lines and '#' comment lines and split the
# remaining lines on whitespace. The file is closed as soon as it has been read.
with open(region_list) as region_handle:
    regions = [x.strip().split() for x in region_handle if x.strip() and not x.strip().startswith('#')]
# Return the first field (the chromosome) of every region.
def extract(lst):
    return [item[0] for item in lst]
# Get the unique chromosomes that have regions to be analyzed, in order of first
# appearance. A set would give an arbitrary order, whereas later steps pair this
# list positionally with the per-chromosome output files.
chrom = list(dict.fromkeys(extract(regions)))
# Whether the input data is named by gene_id or gene_name. By default it is gene_id, if not, please change it to gene_name
parameter: gene_name_as_phenotype_id = False

## Region List generation

Partitioning the data by gene requires a region list file that:

    1. has 5 columns: chr, start, end, gene_id, gene_name
    2. contains the same genes as the bed file, or a subset of them
    
Input:

    1. The GTF file that was used to generate the bed file
    2. A phenotype bed file, which must have a gene_id column identifying the genes.

In [None]:
[generate_region_list]
# Build the 5-column region list (chr, start, end, gene_id, gene_name) restricted
# to the genes that are present in the phenotype file.

#  gene gtf annotation table
parameter: annotation_gtf = path
input: phenoFile, annotation_gtf
output: f'{cwd:a}/{_input[0]:bnn}.region_list'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}'  
python: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container
    import pandas as pd
    import qtl.io
    # Get the five column data: the same transcript-level table is built twice,
    # once labelled by gene_id and once by gene_name, then joined on the coordinates.
    bed_template_df_id = qtl.io.gtf_to_tss_bed(${_input[1]:ar}, feature='transcript',phenotype_id = "gene_id" )
    bed_template_df_name = qtl.io.gtf_to_tss_bed(${_input[1]:ar}, feature='transcript',phenotype_id = "gene_name" )
    # NOTE(review): rows that share identical chr/start/end are cross-joined by this merge - confirm this is acceptable
    bed_template_df = bed_template_df_id.merge(bed_template_df_name, on = ["chr","start","end"])
    # Retain only the autosomes (chr1 .. chr22)
    bed_template_df = bed_template_df[bed_template_df.chr.isin(["chr" + str(x) for x in (range(1,23))])]
    bed_template_df.columns = ["#chr","start","end","gene_id","gene_name"]
    pheno = pd.read_csv(${_input[0]:r}, sep = "\t")
    # Region-list column that holds the identifiers used in the phenotype file.
    # SoS substitutes the workflow parameter as a Python literal (True / False).
    id_column = "gene_name" if ${gene_name_as_phenotype_id} else "gene_id"
    # Retaining only the genes in the data
    region_list = bed_template_df[bed_template_df[id_column].isin(pheno.gene_id)]
    region_list.to_csv("${_output}", sep = "\t",index = 0)

## Process of molecular phenotype file
This workflow produces a bed + tabix index file for all the molecular phenotype data included in the region list, to be fed into downstream analysis.

In [None]:
[reformat_1,partition_by_chrom_1]
# Split the phenotype file into one bgzipped, tabix-indexed bed file per chromosome.
# The input has to be compressed and tabix-indexed, because it is read with
# `zcat` and queried with `tabix <file> <chrom>`.
input: phenoFile ,for_each = "chrom"
output: f'{cwd:a}/{name}.{_chrom}.mol_phe.bed.gz'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}'  
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    # Paths are shell-quoted (`q`) so that directories containing spaces or
    # shell metacharacters do not break the commands.
    # Header line first, then every record located on this chromosome.
    zcat $[_input:q] | head -1 > $[_output:nq]
    tabix $[_input:q] $[_chrom] >> $[_output:nq]
    # Compress and index the result. Options are given before the file name so
    # they are parsed as options even where arguments are not permuted.
    bgzip -f $[_output:nq]
    tabix -f -p bed $[_output:q]

In [None]:
[reformat_2,partition_by_chrom_2]
# Gather all files arriving as step input into a single tab-separated recipe
# table with one row per chromosome.
input: group_by = "all"
output: f'{cwd:a}/{name}.processed_phenotype.per_chrom.recipe'
import pandas as pd
# Chromosome IDs and input files are paired by position; no row index is written.
pd.DataFrame({"#id": chrom, "#dir": _input}).to_csv(_output, sep="\t", index=False)

In [None]:
[partition_by_gene_1]
# Write one bgzipped, tabix-indexed bed file per region (gene) of the region list.
input: phenoFile ,for_each = "regions"
output: f'{cwd:a}/{name}.{_regions[3]}.{_regions[4]}.mol_phe.bed.gz'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}'  
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    # Header line first.
    zcat $[_input] | head -1 > $[_output:n]
    # Region list columns are chr, start, end, gene_id, gene_name, so field 3 is
    # the gene_id and field 4 the gene_name; pick the one used as phenotype ID.
    # -F matches the ID literally (a '.' is not a wildcard) and -w matches whole
    # words only, so an ID does not select rows of a longer ID containing it.
    zcat $[_input] | grep -w -F -- $[_regions[4] if gene_name_as_phenotype_id else _regions[3]] >> $[_output:n]
    # Compress and index the result; options are given before the file name.
    bgzip -f $[_output:n]
    tabix -f -p bed $[_output]

In [None]:
[partition_by_gene_2]
# Gather all files arriving as step input into a single tab-separated recipe
# table with one row per region.
input: group_by = "all"
output: f'{cwd:a}/{name}.processed_phenotype.per_gene.recipe'
import pandas as pd
# The ID is the fourth field of each region; IDs and input files are paired by position.
gene_ids = [region[3] for region in regions]
pd.DataFrame({"#id": gene_ids, "dir": _input}).to_csv(_output, sep="\t", index=False)