# Leave one chromosome out GRM
This is the GRM part of the data processing pipeline for the xQTL workflow. It generates:
- a leave-one-chromosome-out (LOCO) GRM from genotype data

### Input
The input for this workflow is the collection of data for one condition, as described in the README of this git repo:
1. One collection of genotype data in PLINK format, partitioned by chromosome.
2. A list of genotype files documenting the location of the files in item 1.

### Output
For each collection, the output is 23 sets of:
1. GRM in matrix form
2. GRM in table form for APEX



In [2]:
[global]
import os
# Work directory & output directory
parameter: wd = path
# Container image in which the R and bash actions of each step are run
parameter: container = 'gaow/twas'
# Name for the analysis output; used as the prefix of the output file names
parameter: name = str
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# List of genotype files in PLINK trio format: one whitespace-delimited record
# per line with the chromosome in the first column; lines starting with '#' are skipped
parameter: genotype_list = path

def _read_genotype_list(list_file):
    """Return the whitespace-split fields of every data line in list_file.

    Blank lines and lines whose first non-blank character is '#' are skipped.
    The file is read inside a `with` block so the handle is always closed.
    """
    with open(list_file) as handle:
        return [line.strip().split() for line in handle if line.strip() and not line.strip().startswith('#')]

# One record (list of fields) per data line of the genotype list.
chrom_list = _read_genotype_list(genotype_list)
# Get the unique chromosomes that have regions to be analyzed.
def extract(lst):
    """Return a list holding the first element of each item in lst, in order."""
    first_fields = []
    for record in lst:
        first_fields.append(record[0])
    return first_fields
# Unique chromosome labels taken from the first field of each record; going
# through set() drops duplicates but does not keep the order of the list file.
chrom = list(set(extract(chrom_list)))

import os
def get_genotype_file(chrom, genotype_list, geno_inventory):
    """Locate the genotype file for one chromosome.

    chrom is formatted to a string and a leading 'chr' is removed. The result
    is looked up in geno_inventory; when there is no entry, the label itself
    is used as the file name. A name that is not an existing file is retried
    relative to the directory of genotype_list.

    Returns the located file wrapped in path(); raises ValueError when the
    file is found neither as given nor next to genotype_list.
    """
    label = f'{chrom}'
    if label.startswith('chr'):
        label = label[3:]
    geno_file = geno_inventory[label] if label in geno_inventory else f'{label}'
    # Usable as given (absolute, or relative to the current directory).
    if os.path.isfile(geno_file):
        return path(geno_file)
    # Otherwise treat it as relative to the directory holding the genotype list.
    beside_list = f'{genotype_list:ad}/' + geno_file
    if not os.path.isfile(beside_list):
        raise ValueError(f"Cannot find genotype file {geno_file}")
    return path(beside_list)


## Process of Genotype data

In [None]:
Generate a recipe to document the QC-ed genotype

### LOCO GRM by GCTA 
GCTA is used to compute the GRM because separate sets of bfiles can be fed to it to generate the GRM without merging them first. The bfiles used to generate each GRM are written to a list file.

In [None]:
[GRM_1]
# For each chromosome, write the list of genotype files of all OTHER chromosomes
# (leave one chromosome out). Each path is cut at its first "." and the list is
# written without a header line.
input: genotype_list, for_each = "chrom"
output: f'{wd:a}/GRM/{name}_chr{_chrom}.loco.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genotype_list:ad}:{genotype_list:ad}']
    library("dplyr")
    library("tibble")
    library("readr")
    library("purrr")
    geno = read_delim("$[_input]",delim = "\t")
    # The chromosome label is quoted so that non-numeric labels (e.g. X or chr1)
    # are compared as strings instead of being parsed as undefined R symbols;
    # a numeric column is coerced to character by the comparison.
    geno = geno%>%filter(`#chr` != "$[_chrom]")
    # Keep only the part of each path before its first "."
    geno = geno%>%mutate(dir = map_chr(dir,~read.table(text = .x[[1]], sep = ".")$V1%>%as.character))%>%select(`#dir` = dir)
    geno%>%write_delim("$[_output[0]]",delim = "\t",col_names = FALSE)

Compute the LOCO GRM and reformat the output for APEX

In [None]:
[GRM_2]
# Run GCTA on the list of genotype files given as step input (--mbfile;
# presumably the list written by GRM_1) and write a gzipped GRM (--make-grm-gz)
input: group_with = "chrom"
output: f'{wd:a}/GRM/{name}_chr{_chrom}.grm.gz'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '60G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genotype_list:ad}:{genotype_list:ad}']

   gcta64 \
   --mbfile $[_input] \
   --make-grm-gz \
   --out $[_output:nn]
   
[GRM_3]
# Decompress the gzipped GRM (keeping the .gz file), then rewrite it in place as a
# tab-delimited table with columns #id1, id2 and kinship (columns 1, 2 and 4 of
# the GRM), replacing the numeric indices with the IDs found in the second
# column of the accompanying .id file
input: group_with = "chrom"
output: f'{wd:a}/GRM/{name}_chr{_chrom}.grm'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '60G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genotype_list:ad}:{genotype_list:ad}']
   gunzip -f -k $[_input]

R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genotype_list:ad}:{genotype_list:ad}']
   library("dplyr")
   library("tibble")
   library("readr")
   library("modelr")
   library("purrr")
    grm = read_delim("$[_output]","\t",col_names = FALSE)
    id = unlist(read_delim("$[_output].id","\t",col_names = FALSE)[,2])
    grm = grm[,c(1,2,4)]
    colnames(grm) = c("#id1","id2","kinship")
    grm = grm%>%mutate(`#id1` = map_chr(`#id1`, ~id[.x]),`id2` =  map_chr(`id2`, ~id[.x]))
    grm%>%write_delim("$[_output]","\t")

Generate the list of GRM files, one per chromosome, for use by downstream steps

In [None]:
[GRM_4]
# Write a tab-delimited table with one row per chromosome (#chr) and the path of
# the corresponding reformatted GRM file (dir), sorted by chromosome
input: group_by = "all"
output: f'{wd:a}/GRM/{name}.grm_list.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genotype_list:ad}:{genotype_list:ad}']
    library("dplyr")
    library("tibble")
    library("readr")
    library("modelr")
    library("purrr")
    # Numeric labels are emitted as numbers, any other label (e.g. X or chr1) as
    # a quoted string, so non-numeric chromosomes no longer break this script.
    chrom = c($[",".join(x if x.isdigit() else repr(x) for x in chrom)])
    # Common prefix of the per-chromosome GRM files
    dir = "$[_output:nn]"
    grm_list = tibble(`#chr` = chrom, dir = map_chr(`#chr`,~paste(c(dir,"_chr",.x,".grm"),collapse ="")))%>%arrange(`#chr`)
    grm_list%>%write_delim("$[_output]","\t")