# Molecular phenotype residuals
This is the residual-computation step of the data processing pipeline for the xQTL workflow. It generates:
1. A molecular phenotype file with the supplied factors/covariates regressed out

### Input
The input for this workflow is the collection of data for one condition, as described in the README of this git repo:
1. One complete molecular phenotype file in BED format
2. One file containing the factors to be regressed out, with a `#id` column giving the factor names; all other column names are sample names

### Output

1 residual expression file with everything in the factors+covariate file regressed out

In [2]:
[global]
import os
# Work directory & output directory; the step output is written to {wd}/{name}.mol_phe.resid.bed.gz
parameter: wd = "./"
# Path to the container image handed to the R and bash actions of this workflow
parameter: container = '/mnt/mfs/statgen/containers/apex.sif'
# Name of the analysis; used as the prefix of the output file name
parameter: name = 'ROSMAP'
# For cluster jobs, number commands to run per job
# NOTE(review): the Residual_Expression task visible in this file hard-codes trunk_size = 1 and does not read this value
parameter: job_size = 1
# Wall clock time expected
# NOTE(review): the Residual_Expression task visible in this file hard-codes walltime = '4h' and does not read this value
parameter: walltime = "5h"
# Memory expected
# NOTE(review): the Residual_Expression task visible in this file hard-codes mem = '20G' and does not read this value
parameter: mem = "16G"
# Number of threads
# NOTE(review): not referenced in the visible part of this file
parameter: numThreads = 20
# NOTE(review): purpose undocumented and not referenced in the visible part of this file;
# the default is the string "None", not the Python value None
parameter: pop_file = "None"

In [None]:
[Residual_Expression]
# Regress the supplied factors/covariates out of every molecular phenotype and
# write the residuals as a bgzip-compressed, tabix-indexed BED file.
# NOTE(review): the task below hard-codes walltime, mem and trunk_size instead of
# using the walltime, mem and job_size parameters declared in [global]; it is left
# unchanged here so that the default resource requests stay the same.

# Path to the input molecular phenotype data.
parameter: molecular_pheno_whole = path
# Path to the factor file 
parameter: factor_covariate = path
input: molecular_pheno_whole,factor_covariate
output: f'{wd}/{name}.mol_phe.resid.bed.gz'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    library("dplyr")
    library("tibble")
    library("readr")
    library("purrr")
    # Factor/covariate table: one row per factor, sample names as column names.
    covariates = read_delim("$[_input[1]]", delim = "\t")
    # Phenotype table: the first four columns are kept as feature annotation,
    # every later column is treated as one sample.
    pheno = read_delim("$[_input[0]]", delim = "\t")
    pheno_id = pheno[, 1:4]
    # Samples present in both files. Only the phenotype columns after the four
    # annotation columns are eligible, so an annotation column can never be
    # mistaken for a sample.
    shared_samples = intersect(colnames(covariates), colnames(pheno)[-(1:4)])
    if (length(shared_samples) == 0) {
        stop("No sample columns are shared between the phenotype file and the factor/covariate file")
    }
    # Index by name (instead of select() with an external vector) so a data
    # column that happens to be called like the helper variable cannot be
    # picked by accident. Both matrices end up with samples as rows, in the
    # same order.
    covariates = covariates[, shared_samples] %>% as.matrix() %>% t()
    # pheno is a matrix with samples as rows and features (e.g. genes) as columns
    pheno = pheno[, shared_samples] %>% as.matrix() %>% t()
    # .lm.fit uses the design matrix exactly as given and does not add an
    # intercept, so a constant column is supplied explicitly; otherwise the
    # covariates would be regressed through the origin.
    pheno_resid = .lm.fit(x = cbind(1, covariates), y = pheno)$residuals
    # Back to features as rows, with the annotation columns in front.
    pheno_output = cbind(pheno_id, pheno_resid %>% t())
    # Written uncompressed (output name without the .gz suffix); compressed below.
    pheno_output %>% write_delim("$[_output[0]:n]", delim = "\t")

bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',container = container
    bgzip -f $[_output[0]:n]
    # Options are placed before the file name: getopt implementations that do
    # not permute arguments would otherwise not see -f as an option.
    tabix -f -p bed $[_output[0]]