# Genotype data reformatting
This is the PLINK-to-VCF part of the data processing pipeline for the xQTL workflow. It generates:
- genotype files in VCF format

### Input
The input for this workflow is the collection of data for one condition, as described in the README of this git repository:
1. One collection of genotype data in PLINK format, partitioned by chromosome
2. A genotype list file documenting the location of each file in item 1

### Output
For each collection, the output is 23 sets of:
1. A genotype file in compressed VCF format
2. A tbi index for the compressed VCF



In [2]:
[global]
import os
# Work directory; all outputs are written underneath it
parameter: wd = path
# Container image used to run the bash and R actions of this workflow
parameter: container = 'gaow/twas'
# Name of the analysis; used as prefix of the output directory and output files
parameter: name= str
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Genotype list file: one line per chromosome giving the chromosome label and the PLINK genotype file for it
parameter: genotype_list = path
# Parse the genotype list once and close the file afterwards: every non-blank
# line that does not start with '#' is split on whitespace into its fields
# (chromosome label first, genotype file second).
with open(genotype_list) as list_handle:
    chrom_list = [
        entry.split()
        for entry in map(str.strip, list_handle)
        if entry and not entry.startswith('#')
    ]

# Chromosome label -> genotype file, built from the same parsed records.
geno_inventory = dict(chrom_list)
# Get the unique chromosomes that have genotype files to be processed.
def extract(lst):
    """Return a list holding the first element of every record in ``lst``."""
    first_fields = []
    for record in lst:
        first_fields.append(record[0])
    return first_fields
# Unique chromosome labels, kept in the order they first appear in the
# genotype list. dict.fromkeys de-duplicates while preserving that order,
# whereas a set of strings may iterate in a different order on every run.
chrom = list(dict.fromkeys(extract(chrom_list)))

import os
def get_genotype_file(chrom, genotype_list, geno_inventory):
    """Return the genotype file registered for one chromosome.

    Parameters
    ----------
    chrom
        Chromosome label; converted to a string, with or without a leading
        ``chr``.
    genotype_list
        The genotype list file. It is only used through the ``:ad`` format
        specifier to obtain the directory against which relative genotype
        file names are resolved.
    geno_inventory
        Mapping from chromosome label to genotype file name.

    Returns
    -------
    ``path(...)`` of the genotype file, either as registered or prefixed
    with the directory of ``genotype_list``.

    Raises
    ------
    ValueError
        If the file exists neither as given nor relative to the directory
        of ``genotype_list``.
    """
    label = f'{chrom}'
    stripped = label[3:] if label.startswith('chr') else label

    # The inventory may be keyed either as "1" or as "chr1". Try the stripped
    # label first (the historical behaviour), then the label exactly as given,
    # so that lists written with a "chr" prefix are found as well.
    for key in (stripped, label):
        if key in geno_inventory:
            geno_file = geno_inventory[key]
            break
    else:
        # No entry at all: fall back to using the stripped label itself as
        # the file name, as before.
        geno_file = stripped

    if os.path.isfile(geno_file):
        return path(geno_file)

    # Not found as given: retry relative to the directory of the list file.
    candidate = f'{genotype_list:ad}/' + geno_file
    if not os.path.isfile(candidate):
        raise ValueError(f"Cannot find genotype file {geno_file}")
    return path(candidate)


## Process of Genotype data

### Plink to VCF transformation


In [None]:
[plink2vcf_1]
# Convert the PLINK genotype data of every chromosome into a bgzip-compressed VCF with a tabix index
input: genotype_list, for_each = "chrom"
# Genotype file registered for the chromosome handled by this substep
geno_file = get_genotype_file(_chrom,genotype_list,geno_inventory)
output: f'{wd:a}/{name}_vcf_geno/{name}_chr{_chrom}.vcf.gz',
        f'{wd:a}/{name}_vcf_geno/{name}_chr{_chrom}.vcf.gz.tbi'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genotype_list:ad}:{genotype_list:ad}']
    # plink appends .vcf to the --out prefix, so the prefix is the output name without .vcf.gz
    plink --bfile $[geno_file:n] \
    --recode vcf-iid       --out $[_output[0]:nn]
    # -f lets a rerun overwrite a stale .vcf.gz instead of aborting, matching tabix -f below
    bgzip -f $[_output[0]:n]
    tabix -f -p vcf $[_output[0]]
    

In [None]:
[plink2vcf_2]
# Write a tab-delimited list giving, for every chromosome, the path of the VCF file produced by the previous step
input: group_by = "all"
output: f'{wd:a}/{name}_vcf_geno/{name}.vcf_geno_list.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{genotype_list:ad}:{genotype_list:ad}']
    library("dplyr")
    library("tibble")
    library("readr")
    library("modelr")
    library("purrr")
    # Labels are passed as quoted strings so that non-numeric chromosomes (X, chr1, ...) are valid R
    chrom = c($[",".join(repr(str(x)) for x in chrom)])
    # Common prefix of the per-chromosome VCF files: the output path without its two trailing extensions
    dir = "$[_output:nn]"
    geno_list = tibble(`#chr` = chrom, dir = map_chr(`#chr`,~paste(c(dir,"_chr",.x,".vcf.gz"),collapse ="")))
    # Numeric labels are sorted numerically, as before; non-numeric labels follow in alphabetical order
    geno_list = geno_list%>%arrange(suppressWarnings(as.numeric(`#chr`)), `#chr`)
    geno_list%>%write_delim("$[_output]","\t")
    