# Extracting summary statistics (sumstat) for each gene from the per-chromosome sumstat files of each study

Input:
    1. A list of genes whose sumstat need to be packaged in RDS format
    2. A list of sumstat files, one for each chromosome
Output:
    1. A collection of RDS files, each containing a list with `bhat` and `sbhat` matrices whose row names are the SNP names.
    2. A file documenting the names of the files in Output.1

## Global parameters

In [None]:
[global]
# An index text file with 4 columns specifying the chr, start, end and names of regions to analyze
parameter: region_list = path
# Path to the work directory
parameter: wd = path('./')
# Radius of the scanning window up- and downstream of the region of interest.
# NOTE: the value is expected in the same units as the region coordinates, i.e. base pairs
# (the default of 500000 corresponds to 500 Kb), not in Kb.
parameter: window = 500000
# Name for the theme or theme combined (Meta_SS1_SS2)
parameter: name = "geneTpmResidualsAgeGenderAdj_rename"

# Container option for software to run the analysis: docker or singularity
parameter: container = 'gaow/twas'

# Get regions of interest to focus on: one whitespace-split list of fields per line of
# region_list, skipping blank lines and lines starting with '#'. All fields stay strings.
# The file is read inside a `with` block so the handle is closed as soon as parsing is done.
with open(region_list) as region_file:
    regions = [x.strip().split() for x in region_file if x.strip() and not x.strip().startswith('#')]


## Sumstat annotation
Each `sumstat_list` is an index that the code uses to look up the correct sumstat file, so that only the chromosome overlapping each region is analyzed.


In [None]:
[Sumstat_Annotation_1]
# Index file for the sumstat files of one study. As used below it is a tab-delimited table
# with a `chr` column and, in its first column, the path of that chromosome's sumstat file.
parameter: sumstat_list = path
# name of column for chr and pos
# NOTE(review): chr_col is not referenced by the script below; the index file is always
# filtered on a column literally named `chr`.
parameter: chr_col = "chr"
# The position column is (re)computed from the SNP names, so it need not exist in the file.
parameter: pos_col = "pos"
# name of column for beta, SE_beta, and SNP names
parameter: beta_col = "beta"
parameter: se_beta_col = "se"
# SNP names in the form of chr:pos_alt_ref
parameter: snp_col = "variant_id"
# One substep per region; _regions holds the fields (chr, start, end, name) of that region.
input: sumstat_list, for_each = "regions"
output: f'{wd}/sumstat/{name}/{_regions[3]}.rds'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '10G', tags = f'{step_name}_{_output:bn}'
R: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout',container = container
    library("dplyr")
    library("readr")

    # Look up the sumstat file of the chromosome this region lies on. The chromosome is
    # compared as a string so that labels such as "X" or "chr1" work as well as plain numbers.
    sumstat_list = read_delim("$[_input]", "\t")
    sumstat_index = sumstat_list %>% filter(chr == "$[_regions[0]]")
    if (nrow(sumstat_index) == 0) {
        stop("No sumstat file is listed for chromosome $[_regions[0]] in $[_input]")
    }
    sumstat_path = sumstat_index[[1, 1]]
    sumstat = read_delim(sumstat_path, delim = "\t")

    sumstat_ftr = sumstat %>%
        # Z is only used to detect unusable records. The position is parsed from the SNP name
        # (chr:pos_alt_ref) by stripping everything up to the first ':' and from the first '_' on.
        mutate(Z = `$[beta_col]` / `$[se_beta_col]`,
               `$[pos_col]` = as.numeric(gsub("^[^:]*:|_.*$", "", `$[snp_col]`))) %>%
        # Keep variants from `window` upstream of the region start to `window` downstream of its end
        filter(`$[pos_col]` >= $[_regions[1]] - $[window], `$[pos_col]` <= $[_regions[2]] + $[window]) %>%
        # Remove all NA, NaN and Inf sumstat: is.finite() is FALSE for each of them and is
        # evaluated per row (a scalar `&&` would not be)
        filter(is.finite(Z))

    # Package effect sizes and standard errors as one-column matrices with SNP names as row names
    snp_names = as.character(unlist(sumstat_ftr[["$[snp_col]"]]))
    output = list()
    output$bhat = as.matrix(sumstat_ftr[["$[beta_col]"]])
    rownames(output$bhat) = snp_names
    output$sbhat = as.matrix(sumstat_ftr[["$[se_beta_col]"]])
    rownames(output$sbhat) = snp_names
    output %>% saveRDS("$[_output]")
  
  

In [None]:
[Sumstat_Annotation_2]
# No input source is given, so the step receives the output of the preceding step;
# group_by = "all" handles all of those files in a single substep.
input: group_by = "all"
output: f'{wd}/sumstat/{name}/analysis_unit.txt'
python: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    # Write the base name of every input file, one per line, without header or index.
    # The separator must be a real tab: with the letter "t" as separator, every name
    # containing a "t" would be wrapped in quotes in the output file.
    pd.DataFrame({"analysis_unit" : [$[_input:br,]]}).to_csv("$[_output]",index = False ,header = False, sep = "\t")