## Generate posterior output for mash
This notebook is an extraction of the original mashr_flashr_workflow.ipynb. This part was extracted because otherwise mashr_flashr_workflow would run the MASH steps again when asked to only compute the covariate.

## Input and output
Input: 
    A path to:
        1. The Vhat RDS object generated by mashr_flashr_workflow
        2. The model RDS object generated by mashr_flashr_workflow
    A space-separated collection of paths to the data objects, one for each gene

Output: 
    A list of paths to RDS objects, each of which contains
        1. The output of the mash_compute_posterior_matrices function based on the input.
        2. The SNP names, in the form chr:pos_alt_ref, for each variant in the input.

In [None]:
[global]
# Working directory; derived file names and step outputs are placed under it
parameter: cwd = path('./mashr_flashr_workflow_output')
# Input summary statistics data
parameter: data = path("fastqtl_to_mash_output/FastQTLSumStats.mash.rds")
# Prefix of output files. If not specified, it will derive it from data.
# If it is specified, for example, `--output-prefix AnalysisResults`
# It will save output files as `{cwd}/AnalysisResults*`.
parameter: output_prefix = ''
# Exchangeable effect (EE) or exchangeable z-scores (EZ)
parameter: effect_model = 'EZ'
# Identifier of $\hat{V}$ estimate file
# Options are "identity", "simple", "mle", "vhat_corshrink_xcondition", "vhat_simple_specific"
parameter: vhat = 'simple'
# Names of mixture components; not referenced by any step visible in this notebook
# (presumably kept for command-line compatibility with mashr_flashr_workflow -- TODO confirm)
parameter: mixture_components = ['flash', 'flash_nonneg', 'pca',"canonical"]
# Container setting; the default is the type `str` rather than a value, which
# presumably makes it a required option -- TODO confirm. Not referenced by visible steps.
parameter: container = str
# Resolve both locations to absolute paths before deriving file names from them
data = data.absolute()
cwd = cwd.absolute()
# Fall back to a prefix derived from the input data file when none was given
if len(output_prefix) == 0:
    output_prefix = f"{data:bn}"
# File targets whose names encode the prefix, the effect model and (for the
# last two) the Vhat identifier. `vhat_data` is the default Vhat file read by
# posterior_1 when no --posterior-vhat-files are supplied.
prior_data = file_target(f"{cwd:a}/{output_prefix}.{effect_model}.prior.rds")
vhat_data = file_target(f"{cwd:a}/{output_prefix}.{effect_model}.V_{vhat}.rds")
mash_model = file_target(f"{cwd:a}/{output_prefix}.{effect_model}.V_{vhat}.mash_model.rds")

def sort_uniq(seq):
    """Return the distinct items of ``seq`` in order of first appearance.

    Despite the name, no sorting is performed: the relative order of the
    first occurrence of each item is preserved. Items must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

In [11]:
# Apply posterior calculations: one substep, and one output RDS file, per
# posterior input file.
[posterior_1]
# Path to the fitted mash model RDS file (read with readRDS in the R script below)
parameter: mash_model = path
# Data files to compute posterior matrices for
parameter: posterior_input = paths()
# Vhat files to use with the inputs. Accepted forms:
#   - none: fall back to `vhat_data`
#   - exactly one: the same file is used for every input
#   - one per input, in the same order as --posterior-input
parameter: posterior_vhat_files = paths()
# eg, if data is saved in R list as data$strong, then
# when you specify `--data-table-name strong` it will read the data as
# readRDS('{_input:r}')$strong
parameter: data_table_name = ''
# Names of the list elements holding the effect estimates and their standard errors
parameter: bhat_table_name = 'bhat'
parameter: shat_table_name = 'sbhat'

skip_if(len(posterior_input) == 0, msg = "No posterior input data to compute on. Please specify it using --posterior-input.")
# A single Vhat file is allowed for any number of inputs; otherwise counts must match
fail_if(len(posterior_vhat_files) > 1 and len(posterior_vhat_files) != len(posterior_input), msg = "length of --posterior-input and --posterior-vhat-files do not agree.")
for p in posterior_input:
    fail_if(not p.is_file(), msg = f'Cannot find posterior input file ``{p}``')

depends: R_library("mashr")
input: posterior_input, group_by = 1
output: f"{cwd}/mash_rds/{_input:bn}.posterior.rds"
task: trunk_workers = 1, walltime = '20h', trunk_size = 1, mem = '20G', cores = 1, tags = f'{_output:bn}'
R: expand = "${ }", workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    library(mashr)
    # Load the input data, optionally descending into a named sub-table
    data = readRDS(${_input:r})${('$' + data_table_name) if data_table_name else ''}
    # Pick the Vhat file: the default one when none was given, the single shared
    # file when exactly one was given (indexing it by substep would run past the
    # end of the list for every input after the first), else the one matching
    # this substep.
    vhat = readRDS("${vhat_data if len(posterior_vhat_files) == 0 else posterior_vhat_files[0 if len(posterior_vhat_files) == 1 else _index]}")
    # alpha is 1 for the EZ effect model and 0 otherwise
    mash_data = mash_set_data(data$${bhat_table_name}, Shat=data$${shat_table_name}, alpha=${1 if effect_model == 'EZ' else 0}, V=vhat, zero_Bhat_Shat_reset = 1E3)
    mash_output = mash_compute_posterior_matrices(readRDS(${mash_model:r}), mash_data)
    # Carry the variant names from the input along with the posterior matrices
    mash_output$snps = data$snps
    saveRDS(mash_output, ${_output:r})

In [None]:
# Collect the paths of all per-input posterior RDS files into one list file
[posterior_2]
input: group_by = "all"
output:f"{cwd}/mash_output_list"
python: expand = "$[ ]", workdir = cwd, stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
    import pandas as pd
    # Write one absolute path per line, without header or index column.
    # The separator must be a real tab ("\t"): with the bare letter "t" as the
    # delimiter, the CSV writer wraps every path containing a "t" in quotes.
    pd.DataFrame({"#mash_result" :  [$[_input:ar,]] }).to_csv("$[_output]", index = False, header = False, sep = "\t")