# Meta analysis
This notebook documents the meta-analysis workflow.

The outputs of QTL association are first grouped according to the `METAL_theme_str` input, which specifies which themes are to be meta-analyzed together.

The themes that are grouped together are fed into METAL, which generates one sumstat file for each chromosome. These sumstats are then treated as the sumstats of a new theme, whose name is the original theme names joined by `-`.

All the non-METALed themes and the new theme are then partitioned from per-chromosome files into per-gene files, and merged together.

For example, in a three-theme study with AC, DLPFC, and PCC,
a METAL_theme_str of "AC,DLPFC" indicates that AC and DLPFC are to be METALed. Their METAL result will be named AC-DLPFC.

Then the AC-DLPFC and the PCC will each be partitioned and then merged together for downstream analysis.

AC and DLPFC will not be analyzed individually anymore.

In [None]:
[global]
# Shared parameters and derived theme tables used by the steps of this workflow.
import os
# Work directory & output directory
parameter: wd = "./"
# Container name, forwarded as --container to the sub-workflows
parameter: container = 'gaow/twas'
# Name for the analysis output
# NOTE(review): not referenced by any step visible in this file
parameter: name = 'ROSMAP'
# For cluster jobs, number of commands to run per job
# NOTE(review): job_size, walltime, mem and numThreads are not referenced by any step visible in this file
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Directory holding the sub-workflow notebooks that each step invokes with `sos run`
parameter: exe_dir = path("~/GIT/ADSPFG-xQTL/workflow")
# yml template, forwarded as -c to the sub-workflows
parameter: yml = f'{exe_dir:d}/code/csg.yml'
# Queue for analysis, forwarded as -q to the sub-workflows
parameter: queue = "csg"
# Number of submissions, forwarded as -J to the sub-workflows
parameter: J = 200
# MASH option, forwarded as --vhat to the MASH sub-workflows and used in their output file names
parameter: vhat = 'simple'
# Region list, forwarded as --region_list to the partitioning sub-workflow (declared without a default value)
parameter: region_list = path
import csv
import pandas as pd
## A tab-delimited multi-column file: each row is one chromosome, the first column is chr and each subsequent column is a theme.
## Each cell is the path to a sumstat file with the following column format: variant_id, alt, ref, pval, beta, se
parameter: sumstat_list_path = path
sumstat_list =  pd.read_csv(sumstat_list_path,delimiter="\t")
## A comma-separated string naming the themes that go into METAL
parameter: METAL_theme_str = "."


## Names of the themes to be METALed; their joined name (with "-") becomes the name of the new theme
METAL_theme = METAL_theme_str.split(",")
METAL_theme_prefix = "-".join(METAL_theme)
## Retain the chr column as well, as the last element of each METAL_list row
METAL_theme.append("chr")
METAL_list = sumstat_list[METAL_theme].values.tolist()
## Everything except the METALed theme columns (chr is kept)
Non_METAL_list= sumstat_list.drop(METAL_theme_str.split(","),axis = 1)
chrom = sumstat_list["chr"].values.tolist()
## Add one column for the new theme, pointing at the per-chromosome files declared as output by the METAL step
METALed_sumstat_list = Non_METAL_list.assign(**{METAL_theme_prefix : [f'{wd}/METAL/{METAL_theme_prefix}.chr{x}.METAL.txt' for x in chrom ]})
## Final theme names: all columns of METALed_sumstat_list except chr
Theme = METALed_sumstat_list.drop(["chr"],axis = 1).columns.values.tolist()
## One sumstat directory per theme; written to Theme_list.txt by the merge step
Theme_list = pd.DataFrame({"#Theme" : [f'{wd}/sumstat/{x}' for x in Theme]})
## Prefix used to name the merged and MASH outputs
Theme_prefix = "_".join(Theme)


## METAL

In [None]:
[METAL]
# Run the METAL sub-workflow once for every row of METAL_list.
# Each row holds the sumstat paths of the themes to be METALed, followed by the chr value as its last element.
input: for_each = "METAL_list"
# The last element of the row (chr) names the per-chromosome output file
output: METAL_output = f'{wd}/METAL/{METAL_theme_prefix}.chr{_METAL_list[-1]}.METAL.txt'
task: trunk_workers = 1, trunk_size = 20, walltime = '4h',  mem = '6G', tags = f'{step_name}_{_output:bn}'  
# All elements of the row except the last one (chr) are passed as --sumstat_list
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
        sos run $[exe_dir]/Meta_Analysis/METAL/METAL.ipynb METAL \
            --wd $[wd]/METAL/ \
            --name $[METAL_theme_prefix].chr$[_METAL_list[-1]] \
            --sumstat_list $[" ".join(_METAL_list[0:-1])]   

## Partitioning
Partition into each gene first, then merge. Each input is a list of sumstats per chromosome; each output is a sumstat in rds format per gene.
The input is an n*p matrix, where n is the number of chromosomes and p is the number of themes that did not go through METAL, plus 1.

The idea is to turn the n*p matrix into p n*2 matrices, each with one theme column and the chr column.

In [None]:
[Partition_1]
# For each theme, write its two-column (theme, chr) sumstat list and hand it to the
# Sumstat_Annotation workflow of Partitioned.ipynb.
depends: sos_step("METAL")
input: for_each = "Theme"
output: f'{wd}/sumstat/{_Theme}/sumstat_list',
        Partition_list = f'{wd}/sumstat/{_Theme}/analysis_unit.txt'
import pandas as pd
# Keep only the column of the current theme plus the chr column
theme_columns = [_Theme, "chr"]
theme_sumstats = METALed_sumstat_list[theme_columns]
theme_sumstats.to_csv(_output[0], sep = "\t", index = 0)
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
            sos run $[exe_dir]/Meta_Analysis/Reformatting/Partitioned.ipynb Sumstat_Annotation \
            --wd $[wd] \
            --container $[container] \
            --name $[_Theme] \
            --sumstat_list $[_output[0]] \
            --region_list $[region_list] \
            -J $[J] -q $[queue] -c $[yml]

In [None]:
## Get only the intersection
[Partition_2]
# Keep only the analysis units that are listed in the analysis_unit.txt of every theme.
input: output_from("Partition_1")["Partition_list"], group_by = "all"
output: f'{wd}/sumstat/Partition_analysis_unit.txt'
python: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd
    # One set of analysis units per theme, taken from the first column (files are read without a header row)
    unit_sets = []
    for unit_file in [$[_input:r,]]:
        unit_table = pd.read_csv(unit_file, header = None)
        unit_sets.append(set(unit_table[0].tolist()))
    # Units shared by all themes
    shared_units = list(set.intersection(*unit_sets))
    shared_table = pd.DataFrame({"#Analysis_Unit": shared_units})
    shared_table.to_csv("$[_output]", sep = "\t", index = 0)

## Merging

In [None]:
[merge_and_alleleQC]
# Write the theme list to disk, then call the merge_and_alleleQC workflow of Merged.ipynb
# on the partitioned analysis units.
depends: sos_step("METAL")
# Single place where the theme list path is built; the same file is passed as --theme_list below
theme_list_file = f'{wd}/Theme_list.txt'
Theme_list.to_csv(theme_list_file, sep = "\t", index = 0)
input: output_from("Partition")
output: merged_analysis_unit = f'{wd}/sumstat/merged_analysis_unit.txt'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
            sos run $[exe_dir]/Meta_Analysis/Reformatting/Merged.ipynb merge_and_alleleQC \
            --wd $[wd]/sumstat/ \
            --container $[container] \
            --theme_list $[theme_list_file] \
            --analysis_units $[_input] \
            -J $[J] -q $[queue] -c $[yml]

## Extract effect


In [None]:
[extract_effects]
# Call the extract_effects workflow of Signal_Extraction.ipynb on the merged analysis units.
# The declared output is a single rds file named after Theme_prefix.
input: output_from("merge_and_alleleQC")["merged_analysis_unit"]
output: extracted_effect = f'{wd}/sumstat/{Theme_prefix}.rds'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
            sos run $[exe_dir]/Meta_Analysis/Reformatting/Signal_Extraction.ipynb extract_effects \
            --cwd $[wd]/sumstat/ \
            --container $[container] \
            --name $[Theme_prefix] \
            --analysis_units $[_input] \
            -J $[J] -q $[queue] -c $[yml]

# Factor analysis and MASH Model

In [None]:
[MASH_FLASH]
# Call the mash workflow of mashr_flashr_workflow.ipynb on the extracted effects.
# Effect model, forwarded as --effect_model and used in the output file names
parameter: effect_model = 'EZ'
# NOTE(review): only used here to name the expected flash_output files; it is not forwarded to the sub-workflow
parameter: mixture_components = ['flash', 'flash_nonneg', 'pca',"canonical"]
input: output_from("extract_effects")["extracted_effect"]
# Three named outputs: the MASH model, the file exposed as resid_corr, and one rds per mixture component
output: MASH_model = f"{wd}/sumstat/{Theme_prefix}.{effect_model}.V_{vhat}.mash_model.rds",
        resid_corr = f"{wd}/sumstat/{Theme_prefix}.{effect_model}.V_{vhat}.rds",
        flash_output = [f"{wd}/sumstat/{Theme_prefix}.{m}.rds" for m in mixture_components]
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
            sos run $[exe_dir]/Meta_Analysis/MASH/mashr_flashr_workflow.ipynb mash \
            --cwd $[wd]/sumstat/ \
            --container $[container] \
            --effect_model $[effect_model] \
            --vhat $[vhat] \
            --output_prefix $[Theme_prefix] \
            --data $[_input] \
            -J $[J] -q $[queue] -c $[yml]

## MASH Posterior

In [None]:
[MASH_posterior]
# Call the posterior workflow of mash_posterior.ipynb with the fitted MASH model and the
# first field of every line of merged_analysis_unit.txt.
parameter: effect_model = 'EZ'
parameter: mixture_components = ['flash', 'flash_nonneg', 'pca',"canonical"]
analysis_units = f'{wd}/sumstat/merged_analysis_unit.txt'
# Read inside a context manager so the file handle is closed as soon as the list is built.
# Blank lines and lines starting with '#' are skipped; double quotes are stripped before splitting on whitespace.
with open(analysis_units) as unit_file:
    regions = [x.replace("\"","").strip().split() for x in unit_file if x.strip() and not x.strip().startswith('#')]
# First field of each line; passed to the sub-workflow as --posterior_input
gene = [x[0] for x in regions]
input: output_from("MASH_FLASH")["MASH_model"], regions
output: mash_output_list = f'{wd}/sumstat/mash_output_list'
# _input[0] is the MASH model produced by MASH_FLASH
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
            sos run $[exe_dir]/Meta_Analysis/MASH/mash_posterior.ipynb  posterior \
            --cwd $[wd]/sumstat/ \
            --container $[container] \
            --mash_model $[_input[0]] \
            --vhat $[vhat] \
            --output_prefix $[Theme_prefix] \
            --posterior_input  $[" ".join(gene)] \
            -J $[J] -q $[queue] -c $[yml]

## Mixture Prior

In [None]:
[Mixture_prior]
# Call the workflow named by mixture_prior_method in mixture_prior.ipynb on the extracted effects.
parameter: effect_model = 'EZ'
parameter: mixture_components = ['flash', 'flash_nonneg', 'pca',"canonical"]
# Name of the sub-workflow to run; also part of the output file name
parameter: mixture_prior_method = "ed_bovy"
# The MASH model is listed as input but only _input[1] (the extracted effects) is passed to the script.
# The previous version also read merged_analysis_unit.txt into an unused `regions` list through an
# unclosed file handle; that dead read has been removed.
input: output_from("MASH_FLASH")["MASH_model"], output_from("extract_effects")["extracted_effect"]
output: mixture_prior = f'{wd}/../Fine_Mapping/Mixture_Prior/{Theme_prefix}.{mixture_prior_method}.V_{vhat}.rds'
# --name is passed once (it was previously given twice with the same value)
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
            sos run $[exe_dir]/Meta_Analysis/MASH/mixture_prior.ipynb $[mixture_prior_method] \
            --cwd $[wd] \
            --container $[container] \
            --name $[Theme_prefix] \
            --vhat $[vhat] \
            --data  $[_input[1]] \
            -J $[J] -q $[queue] -c $[yml]

## Recipe for next step

In [None]:
[Recipe]
# Write a one-row, tab-separated recipe file listing the merged analysis units, the resid_corr
# file, the mixture prior matching mixture_prior_method, and Theme_prefix.
depends: sos_step("RDS2VCF")
parameter: mixture_prior_method = "ed_bovy"
input:  output_from("merge_and_alleleQC")["merged_analysis_unit"],output_from("MASH_FLASH")["resid_corr"], output_from("Mixture_prior")["mixture_prior"]
output: f'{wd}/Fine_mapping_recipe.txt'
python: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd
    # Candidate prior files, filtered down to those whose path matches the chosen method
    candidates = pd.DataFrame({"mixture_prior_list" : [$[_input[2]:r,]]})
    selected = candidates.query('mixture_prior_list.str.contains("$[mixture_prior_method]")', engine = "python")
    prior = selected["mixture_prior_list"].values.tolist()
    # Columns are added in the order they appear in the output file
    recipe_columns = {}
    recipe_columns["merged_analysis_unit"] = ["$[_input[0]]"]
    recipe_columns["resid_corr"] = ["$[_input[1]]"]
    recipe_columns["prior"] = prior
    recipe_columns["Theme_prefix"] = ["$[Theme_prefix]"]
    recipe_table = pd.DataFrame(recipe_columns)
    recipe_table.to_csv('$[_output]', sep = "\t", index = 0)

## RDS to VCF

In [None]:
[RDS2VCF]
# Call the rds_to_vcf workflow of RDS_to_vcf.ipynb on the mash_output_list produced by MASH_posterior.
# Forwarded as --data_dir to the sub-workflow
parameter: data_dir = path("/")
input: output_from("MASH_posterior")["mash_output_list"]
output: f'{wd}/mash_vcf/vcf_output_list.txt'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
        sos run $[exe_dir]/Meta_Analysis/Reformatting/RDS_to_vcf.ipynb rds_to_vcf \
            --wd $[wd]/ \
            --container $[container] \
            --name $[Theme_prefix] \
            --analysis_units $[_input] \
            --data_dir $[data_dir]  \
            -J $[J] -q $[queue] -c $[yml]