# Genotype and covariate data preprocessing
This is the data-processing pipeline for the xQTL workflow. It generates:
1. Factor from expression
2. PCA from genotype
3. GRM from genotype
4. LD from genotype, filtered by grm [TBD]
5. Molecular_phenotype per chromosome within the selected regions, in the formats that APEX and tensorQTL take

### Input
The input for this workflow is the collection of data for one condition, as described in the README of this git repo:
1. One complete molecular_phenotype data file
2. One collection of genotype data in PLINK format, partitioned by chromosome
3. One file documenting the list of regions to be analyzed
4. 

### Output
For each collection, the output is 23 sets of:
1. EXP file for selected region
2. genotype from vcf file

1 sets of
1. PCA + Factor + Covariate file

### Executable:
This notebook depends on scripts from multiple other notebooks; the directory containing them is specified by `exe_dir`.

In [None]:
# Example invocation of this notebook, launched in the background with nohup.
# NOTE(review): the workflow name given here is `region_extraction`, while the steps defined
# in this notebook are `[Region_extraction_1]` / `[Region_extraction_2]` — confirm the spelling.
# NOTE(review): `container_lmm`, `container_apex` and `region_list` have no default in [global]
# and are not passed here — presumably they must be added to the command; verify.
nohup sos run /home/hs3163/GIT/ADSPFG-xQTL/workflow/Data_Processing/Data_Processing.ipynb region_extraction \
            --wd $[wd] \
            --container $[container] \
            --name $[name] \
            --numThreads $[numThreads] \
            --yml $[yml] \
            --queue $[queue] \
            --J $[J] \
            --exe_dir $[exe_dir] -s build &

In [2]:
[global]
import os
# Work directory & output directory
parameter: wd = path
# Container forwarded to sub-workflows through their --container option
parameter: container = 'gaow/twas'
# name for the analysis output
parameter: name = str
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "24h"
# Memory expected
parameter: mem = "60G"
# Number of threads
parameter: numThreads = 20
# Directory to the executable notebooks
parameter: exe_dir = path("~/GIT/ADSPFG-xQTL/workflow")
# yml template
parameter: yml = '/home/hs3163/GIT/ADSPFG-xQTL/code/csg.yml'
# queue for analysis
parameter: queue = "csg"
# Number of submission
parameter: J = 200
# Factor analysis workflow to run; used as the workflow name passed to factor.ipynb
parameter: factor_option = "APEX"

## Temp
parameter: container_lmm = str
parameter: container_apex = str

# File listing the regions to analyze; blank lines and lines starting with '#' are ignored
parameter: region_list = path
# Read the region list once and close the file handle when done.
with open(region_list) as region_handle:
    regions = [x.strip().split() for x in region_handle if x.strip() and not x.strip().startswith('#')]
# Get the unique chromosomes that have regions to be analyzed (first column of each row).
def extract(lst):
    return [item[0] for item in lst]
# Sorted so that every step sees the chromosomes in the same, deterministic order.
chrom = sorted(set(extract(regions)))

## Process of molecular phenotype file
This workflow produces a bed.gz + tabix file for all the molecular phenotype data included in the region list, to feed into the APEX factor analysis.

This workflow also produces a bed.gz + tabix file for each chromosome for downstream QTL association analysis (bed.gz + tabix for APEX and bed.gz for tensorQTL).

In [None]:
[Region_extraction_1]
# Delegate region extraction to the `region_extraction` workflow of
# Data_Processing/Phenotype/Region_extraction.ipynb. Declared outputs are one bed.gz for the
# whole phenotype plus one bed.gz per entry of the global `chrom` list.
# NOTE(review): output names are built as `chr{x}`, so this assumes the first column of
# region_list holds chromosome names without a 'chr' prefix — TODO confirm.

# Path to the input molecular phenotype data.
parameter: molecular_pheno_whole = path

input: molecular_pheno_whole,region_list
output: molecular_pheno_whole_bed= f'{wd}/Phenotype/{name}.mol_phe.bed.gz',
        molecular_pheno_chr = [f'{wd}/Phenotype/{name}.chr{x}.mol_phe.bed.gz' for x in chrom]
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
        sos run $[exe_dir]/Data_Processing/Phenotype/Region_extraction.ipynb region_extraction \
            --wd $[wd]/Phenotype/ \
            --container $[container_apex] \
            --name $[name] \
            --numThreads $[numThreads] \
            --molecular_pheno_whole $[molecular_pheno_whole] \
            --region_list $[region_list] \
            -J $[J] -q $[queue] -c $[yml]

In [None]:
[Region_extraction_2]
# Write the per-chromosome phenotype file paths (the `molecular_pheno_chr` named output)
# into a single-column, tab-separated list file without header or index.
input: named_output("molecular_pheno_chr")
output: molecular_pheno_chr_list = f'{wd}/Phenotype/{name}.mol_phe.chr_list'
python:expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd
    df = pd.DataFrame({ "molecular_pheno_chr" : [$[_input:r,]]})
    df.to_csv("$[_output]",sep = "\t",header = 0, index = 0)

## Process of Genotype data

### QC for Missingness and MAF


In [None]:
[genotype_qc_1]
# Run the `qc:1` workflow of Data_Processing/Genotype/GWAS_QC.ipynb once per chromosome
# found in the first column of `genotype_list`.

# List of Genotype file in plink trio format
parameter: genotype_list = path
# Rows of the genotype list, skipping blank lines and lines starting with '#'
chrom_list = [x.strip().split() for x in open(genotype_list).readlines() if x.strip() and not x.strip().startswith('#')]

# Return the first column of every row
def extract(lst):
    return [item[0] for item in lst]
# Unique chromosome identifiers; this step-level `chrom` is what `for_each` iterates over.
# NOTE(review): a set has no guaranteed order, so the iteration order is not sorted.
chrom = list(set(extract(chrom_list)))
# Mapping built from the rows of the list via dict(); this requires exactly two
# whitespace-separated columns per row (first column -> second column)
geno_inventory = dict([x.strip().split() for x in open(genotype_list).readlines() if x.strip() and not x.strip().startswith('#')])
# Resolve the genotype file for one chromosome:
#   1. strip a leading 'chr' from the chromosome name,
#   2. look it up in geno_inventory, falling back to the chromosome string itself,
#   3. if that path is not an existing file, retry relative to the directory of genotype_list,
#   4. raise ValueError when neither location exists.
# NOTE(review): the keys of geno_inventory are NOT stripped of 'chr', so a list whose first
# column reads 'chr1' would miss the lookup in step 2 — confirm the expected list format.
def get_genotype_file(chrom, genotype_list, geno_inventory):
    chrom = f'{chrom}'
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    if chrom not in geno_inventory:
        geno_file = f'{chrom}'
    else:
        geno_file = geno_inventory[chrom]
    if not os.path.isfile(geno_file):
        # relative path
        if not os.path.isfile(f'{genotype_list:ad}/' + geno_file):
            raise ValueError(f"Cannot find genotype file {geno_file}")
        else:
            geno_file = f'{genotype_list:ad}/' + geno_file
    return path(geno_file)

# The path to the file that contains the list of samples to remove (format FID, IID)
parameter: remove_samples = path('.')
# The path to the file that contains the list of samples to keep (format FID, IID)
parameter: keep_samples = path('.')
# The path to the file that contains the list of variants to keep
parameter: keep_variants = path('.')
# minimum MAF filter to use. Notice that PLINK default is 0.01
parameter: maf_filter = 0.01
# maximum MAF filter to use
parameter: maf_max_filter = 0.0
# Maximum missingness per-variant
parameter: geno_filter = 0.01
# Maximum missingness per-sample
parameter: mind_filter = 0.02
# HWE filter
parameter: hwe_filter = 5e-08



# path('.') is the default meaning "not provided"; any other value must be an existing file
fail_if(not (keep_samples.is_file() or keep_samples == path('.')), msg = f'Cannot find ``{keep_samples}``')
fail_if(not (keep_variants.is_file() or keep_variants == path('.')), msg = f'Cannot find ``{keep_variants}``')
fail_if(not (remove_samples.is_file() or remove_samples == path('.')), msg = f'Cannot find ``{remove_samples}``')

input: genotype_list, for_each = "chrom"
geno_file = get_genotype_file(_chrom, genotype_list, geno_inventory)
# The '.extracted' suffix is added to the output name only when a keep_variants file is given
output: f'{wd}/Genotype/{geno_file:bn}.{name}.filtered{".extracted" if keep_variants.is_file() else ""}.bed'
bash:  expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
        sos run $[exe_dir]/Data_Processing/Genotype/GWAS_QC.ipynb qc:1 \
            --cwd $[wd]/Genotype/ \
            --container_lmm $[container_lmm] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genoFile $[geno_file:a] \
            --remove_samples   $[remove_samples] \
            --keep_samples   $[keep_samples] \
            --keep_variants    $[keep_variants ] \
            --maf_filter   $[maf_filter] \
            --maf_max_filter   $[maf_max_filter] \
            --geno_filter   $[geno_filter] \
            --mind_filter   $[mind_filter] \
            --hwe_filter   $[hwe_filter] 

In [None]:
Generate a recipe to document the filtered genotype files.

In [None]:
[genotype_qc_2]
# Write a tab-delimited table with columns `#chr` and `dir` that pairs each chromosome
# with one input file path.
# NOTE(review): `chrom` is rebuilt here from an unordered set and paired positionally with
# `_input`; presumably both follow the same order as the previous step — verify, since a
# mismatch would label files with the wrong chromosome.

# List of Genotype file in plink trio format
parameter: genotype_list = path
# Rows of the genotype list, skipping blank lines and lines starting with '#'
chrom_list = [x.strip().split() for x in open(genotype_list).readlines() if x.strip() and not x.strip().startswith('#')]
# Return the first column of every row
def extract(lst):
    return [item[0] for item in lst]
chrom = list(set(extract(chrom_list)))
# All inputs are handled together as a single group
input: group_by = "all"
output: qced_plink_genotype_list = f'{wd:a}/Genotype/{name}.filtered.plink.genotype_list'
R: expand= "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    library("dplyr")
    library("tibble")
    library("readr")
    library("purrr")
    chrom = c('$["','".join(chrom)]')
    file = c($[_input:r,])
    table = tibble(`#chr` = chrom, dir = file)
    table%>%write_delim($[_output:r],"\t")

### PLINK2VCF
This step takes the genotype_list documenting the filtered PLINK files (output from genotype QC), converts them into VCF format so that APEX can use them, and also creates a list documenting them.

In [None]:
[plink2vcf]
# Delegate the conversion to the `plink2vcf` workflow of Data_Processing/Genotype/plink2VCF.ipynb,
# passing the QCed plink genotype list as --genotype_list.

# The output genotype list from genotype qc 2: 1.list of plink, 2.passed basic filtering
input: named_output("qced_plink_genotype_list")
output: qced_vcf_genotype_list = f'{wd:a}/Genotype/{name}_vcf_geno/{name}.vcf_geno_list.txt'
bash:  expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    sos run $[exe_dir]/Data_Processing/Genotype/plink2VCF.ipynb plink2vcf \
            --wd $[wd]/Genotype/ \
            --container $[container_apex] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genotype_list $[_input] \
            -J $[J] -q $[queue] -c $[yml]

### PCA
This is a five-step workflow that acts on a list of QCed genotype files to generate a genome-wide PCA:
1. Merge all the PLINK files from the raw data via QC (same criteria as genotype_filtering) [FIXME: can this be simplified by using the QCed files directly?].
2. Estimate whole-genome kinship and filter by it
3. LD pruning
4. PCA
5. Project back the PCA result

In [None]:
[merge_plink]
# Merge the genotype files listed in `genotype_list` into a single plink file set by calling
# the `merge_plink` workflow of Data_Processing/Genotype/GWAS_QC.ipynb.

# List of unfiltered Genotype file in plink trio format, not output from genotype_2
parameter: genotype_list = path
# Read the list once (closing the handle); skip blank lines and lines starting with '#'.
with open(genotype_list) as inventory_handle:
    geno_inventory = [x.strip().split() for x in inventory_handle if x.strip() and not x.strip().startswith('#')]
# Take the second column (the file path) of EVERY row. The previous form indexed only the
# first len(unique chromosomes) rows, silently dropping files when a chromosome repeated.
geno_file = [entry[1] for entry in geno_inventory]

input: geno_file
output: merge_plink = f"{wd}/Genotype/PCA/{name}.bed"
bash:  expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    sos run $[exe_dir]/Data_Processing/Genotype/GWAS_QC.ipynb merge_plink \
            --cwd $[wd]/Genotype/PCA/ \
            --container_lmm $[container_lmm] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genoFile $[_input] \
            --merged_prefix $[name] 


In [None]:
[king]
# Call the `king` workflow of Data_Processing/Genotype/GWAS_QC.ipynb on the merged plink file;
# the declared output is a `.related_id` file next to it.
# NOTE(review): `maximize_unrelated` is declared below but is not forwarded in the command.

# Merged Genotype file in plink trio format
# Name of the analysis
# kinship cutoff
parameter: kinship = 0.0625
# Otherwise (use `--no-maximize-unrelated`) the entire family will be removed
parameter: maximize_unrelated = "F"
input: output_from("merge_plink")["merge_plink"]
output: king =  f"{wd}/Genotype/PCA/{_input:bn}.{name}.related_id"
bash:  expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    sos run $[exe_dir]/Data_Processing/Genotype/GWAS_QC.ipynb king \
            --cwd $[wd]/Genotype/PCA/ \
            --container_lmm $[container_lmm] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genoFile $[_input:a] \
            --kinship $[kinship] 





In [None]:
[pca_unrelated_sample]
# Run the `qc` workflow of GWAS_QC.ipynb on the merged genotype (first input), passing the
# king output (second input) as --remove_samples. Declares two outputs: the `.unrelated.bed`
# file and a `.filtered.prune.in` variant list.

# Merged Genotype file in plink trio format
# minimum MAF filter to use. Notice that PLINK default is 0.01
parameter: maf_filter = 0.01
# maximum MAF filter to use
parameter: maf_max_filter = 0.0
# Maximum missingness per-variant
parameter: geno_filter = 0.01
# Maximum missingness per-sample
parameter: mind_filter = 0.02
# HWE filter
parameter: hwe_filter = 5e-08

input: output_from("merge_plink") ,  output_from("king")
output: pca_unrelated_sample = f'{wd}/Genotype/PCA/{_input[0]:bn}.{name}.unrelated.bed', 
        keep_variants = f'{wd}/Genotype/PCA/{_input[0]:bn}.{name}.filtered.prune.in'
bash:  expand= "$[ ]", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
        sos run $[exe_dir]/Data_Processing/Genotype/GWAS_QC.ipynb qc \
            --cwd $[wd]/Genotype/PCA/ \
            --container_lmm $[container_lmm] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genoFile $[_input[0]:a] \
            --remove_samples   $[_input[1]:a] \
            --maf_filter   $[maf_filter] \
            --maf_max_filter   $[maf_max_filter] \
            --geno_filter   $[geno_filter] \
            --mind_filter   $[mind_filter] \
            --hwe_filter   $[hwe_filter] \
            --merged_name $[_output[0]:n] 

In [None]:
[pca_related_sample]
# Run `qc:1` of GWAS_QC.ipynb on the merged genotype with all MAF/missingness/HWE filters set
# to 0, keeping the samples in the king output (third input) and the variants in the
# `keep_variants` output of pca_unrelated_sample (second input).
# NOTE(review): the `keep_samples` and `geno_file` parameters below are declared but the
# command uses the step inputs instead — confirm whether they are still needed.

# The path to the file that contains the list of samples to keep (format FID, IID)
parameter: keep_samples = f"{wd}/Genotype/PCA/cache/{name}.related_id"
parameter: geno_file = f"{wd}/Genotype/PCA/cache/{name}.bed"
# The path to the file that contains the list of variants to keep
input: output_from("merge_plink"), output_from("pca_unrelated_sample")["keep_variants"], output_from("king")
output: f'{_input[0]:ann}.related.bed'
bash:  expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
        sos run $[exe_dir]/Data_Processing/Genotype/GWAS_QC.ipynb qc:1 \
            --cwd $[wd]/Genotype/PCA/ \
            --container_lmm $[container_lmm] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genoFile $[_input[0]] \
            --keep_samples   $[_input[2]] \
            --keep_variants    $[_input[1]] \
            --maf_filter  0 \
            --maf_max_filter   0 \
            --geno_filter   0 \
            --mind_filter   0 \
            --hwe_filter   0 \
            --merged_name $[_output[0]:n] 

In [None]:
[phenoFile_mod]
# Reshape the whole-phenotype bed file into a sample-by-gene table with an `IID` column and
# write it tab-delimited. Columns 5 onward of the bed file are taken as the data and the
# `gene_ID` column as row names before transposing.
# When pop_file is the string "None", a constant column RACE = "RACE_1" is added; otherwise
# pop_file is read and inner-joined to the table by `IID`.
# NOTE(review): `phenoMtr` is assigned in the R script but not used afterwards.

# The phenotypic file, in bed
# Population label: 2 columns: IID + RACE
parameter: pop_file = "None"
input: output_from("Region_extraction_1")["molecular_pheno_whole_bed"]
output:  phenoFile_mod = f'{wd}/Genotype/PCA/{_input[0]:bn}.exp'
R: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
        library("tibble")
        library("readr")
        library("dplyr") 
        phenoFile = read_delim("$[_input]","\t")
        mtx = phenoFile[,5:ncol(phenoFile)]
        rownames(mtx) = phenoFile$gene_ID
        # each row is a sample each column is a gene
        mtx <- t(as.matrix(mtx, rownames = T))
        mtx = mtx%>%as_tibble(rownames = "IID") 
        phenoMtr = phenoFile%>%t()
        if("$[pop_file]" == "None"){
        # Make artificial pop lable
          output = mtx%>%mutate(RACE = "RACE_1")
          }else{
          pop = read_delim("$[pop_file]", "\t")
          output = inner_join(pop,mtx,by = "IID")
          }
          output%>%write_delim("$[_output]","\t")

In [None]:
[pca_model]
# Call `flashpca:1` of Data_Processing/Genotype/PCA.ipynb with the unrelated-sample genotype
# (first input) and the modified phenotype file (second input); "RACE" is passed as both the
# label column and the population column.
# NOTE(review): `keep_samples` is declared below but not forwarded in the command.
# NOTE(review): `homogeneous` is expanded into the command as the text of a Python bool —
# confirm this is the form PCA.ipynb expects.

# Plink binary file
## The phenotypic file
# Number of Principal Components to output. Default is 10
parameter: k = 3
# Number of Principal Components based on which outliers should be evaluated. Default is 5 but this should be based on examine the scree plot
parameter: maha_k = 3
# POPS in pca
parameter: pops = "RACE_1"
# Homogeneity of populations. Set to --homogeneous when true and --no-homogeneous when false
parameter: homogeneous = False
# The path to the file that contains the list of samples to keep (format FID, IID)
parameter: keep_samples = f"{wd}/Genotype/PCA/cache/{name}.related_id"

input: output_from("pca_unrelated_sample")["pca_unrelated_sample"] ,  output_from("phenoFile_mod")["phenoFile_mod"]
output: pca_model =  f'{wd}/Genotype/PCA/{_input[1]:bn}.exp.pca.rds'
bash: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
        sos run $[exe_dir]/Data_Processing/Genotype/PCA.ipynb flashpca:1 \
            --cwd $[wd]/Genotype/PCA/ \
            --container_lmm $[container_lmm] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genoFile $[_input[0]:a] \
            --phenoFile $[_input[1]:a] \
            --label_col "RACE" \
            --k $[k] \
            --maha_k $[maha_k] \
            --pop_col "RACE" \
            --pops $[pops]   \
            --homogeneous $[homogeneous] 

In [None]:
[project_sample]
# Call `project_samples:1` of Data_Processing/Genotype/PCA.ipynb with the genotype (first
# input), the modified phenotype file (second input) and the PCA model (third input).
# The output path is derived from the second input with its extension replaced.
# NOTE(review): `homogeneous` is declared below but not forwarded in the command.

## Plink binary file
## The phenotypic file, in bed
## The PCA model, in rds
## Population label: 2 columns: iid + RACE
# Number of Principal Components to output. Default is 10
parameter: k = 3
# Number of Principal Components based on which outliers should be evaluated. Default is 5 but this should be based on examine the scree plot
parameter: maha_k = 3
# Homogeneity of populations. Set to --homogeneous when true and --no-homogeneous when false
parameter: homogeneous = False
parameter: prob = 0.8
parameter: pval = 0.05

input: output_from("pca_unrelated_sample")["pca_unrelated_sample"],  output_from("phenoFile_mod")["phenoFile_mod"] ,  output_from("pca_model")["pca_model"]
output: project_sample = f'{_input[1]:n}.pca.projected.rds'
bash: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
        sos run $[exe_dir]/Data_Processing/Genotype/PCA.ipynb project_samples:1 \
            --cwd $[wd]/Genotype/PCA/ \
            --container_lmm $[container_lmm] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genoFile $[_input[0]:a] \
            --phenoFile $[_input[1]:a] \
            --pca_model $[_input[2]:a] \
            --pop_col "RACE" \
            --label_col "RACE" \
            --prob $[prob] \
            --pval $[pval] \
            --k $[k] \
            --maha_k $[maha_k]

In [None]:
[pca_factor]
# Append the projected principal components to the factor covariate table.
# The R script reads `pc_scores` from the projected-PCA rds (first input), keeps the columns
# whose name contains "PC", transposes them so samples (taken from `IID`) become columns,
# adds a leading `#id` column holding the PC names, row-binds the result under the factor
# covariate file (second input) and writes it tab-delimited.

## PCA models
input: output_from("project_sample")["project_sample"], output_from("Factor_analysis")["Factor_analysis"]
output: pca_factor = f'{_input[1]:n}.pca.cov'
R: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
        library("dplyr")
        library("tibble")
        library("readr")
        library("modelr")
        library("purrr")
        pca_output = readRDS("$[_input[0]]")$pc_scores
        mtx = pca_output%>%select(contains("PC"))%>%t()
        colnames(mtx) <- pca_output$IID
        mtx = mtx%>%as_tibble()%>%mutate("#id" = rownames(mtx))%>%select("#id",everything())
        factor_cov = read_delim("$[_input[1]]","\t")
        output = bind_rows(factor_cov,mtx)
        output%>%write_delim("$[_output]","\t")

### LOCO GRM by GCTA 
The GRM is computed with GCTA because GCTA can take separate sets of bfiles to generate the GRM result without merging them. The bfiles used to generate the GRM are given as a list.

In [None]:
[GRM]
# Delegate to the `GRM` workflow of Data_Processing/Genotype/LOCO_GRM.ipynb, passing the
# genotype list; the declared output is a `.grm_list.txt` file under Genotype/GRM.

# List of Genotype file in plink trio format
parameter: genotype_list = path
input: genotype_list
output: GRM =  f'{wd:a}/Genotype/GRM/{name}.grm_list.txt'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
        sos run $[exe_dir]/Data_Processing/Genotype/LOCO_GRM.ipynb GRM \
            --wd $[wd]/Genotype/ \
            --container $[container] \
            --name $[name] \
            --numThreads $[numThreads] \
            --genotype_list $[genotype_list] \
            -J $[J] -q $[queue] -c $[yml]

### Whole genome QC for PCA

## Process of Factor analysis
Based on the method selected for factor analysis (PEER vs. APEX), the input file will be different. For APEX, the input file is a bed.gz file with a tbi index. For PEER, the molecular phenotype file itself suffices.

In [None]:
[Factor_analysis]
# Run the workflow named by `factor_option` in Data_Processing/Factor_and_Covariate/factor.ipynb
# on the whole-phenotype bed file (first input) and the VCF genotype list (second input).
# NOTE(review): `max_iter` is declared below but not forwarded in the command.

# Optional covariate file; `--covariate` is appended to the command only when this path exists
parameter: covariate = ""
# N PEER factors, If do not specify or specified as 0, default values suggested by 
# UCSC (based on different sample size) Will be used
parameter: N = 4
# Default values from PEER:
## The number of iteration
parameter: max_iter = 30
## Prior parameters
parameter: Alpha_a = 0.001
parameter: Alpha_b = 0.1
parameter: Eps_a = 0.1
parameter: Eps_b = 10.
## Tolerance parameters
parameter: tol = 0.001
parameter: var_tol = 1e-08
input: output_from("Region_extraction_1")["molecular_pheno_whole_bed"], output_from("plink2vcf")["qced_vcf_genotype_list"]
output: Factor_analysis = f'{wd:a}/Factor_and_Covariate/{name}.{factor_option}.cov'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
        sos run $[exe_dir]/Data_Processing/Factor_and_Covariate/factor.ipynb $[factor_option]  \
            --wd $[wd]/Factor_and_Covariate/ \
            --container_apex $[container_apex] \
            --name $[name] \
            --numThreads $[numThreads] \
            --molecular_pheno $[_input[0]] \
            --genotype_list $[_input[1]] \
            --N $[N] \
            --Alpha_a $[Alpha_a]  \
            --Alpha_b $[Alpha_b] \
            --Eps_a  $[Eps_a] \
            --Eps_b  $[Eps_b] \
            --tol  $[tol] \
            --var_tol $[var_tol] \
            -J $[J] -q $[queue] -c $[yml] $[f'--covariate {covariate}' if os.path.exists(covariate) else f'']

## Output Recipe Generation
This step generates a list documenting all the outputs of this notebook, so that they are easily accessible by the next step.

In [1]:
[Recipe]
# Write a one-row, tab-separated recipe whose columns hold the five input paths, in the order
# they are listed under `input:`, plus a `name` column set to the analysis name.

# Molecular_pheno
input:  output_from("Region_extraction_2")["molecular_pheno_chr_list"],
        output_from("GRM")["GRM"],
        output_from("genotype_qc_2")["qced_plink_genotype_list"],
        output_from("plink2vcf")["qced_vcf_genotype_list"],
        output_from("pca_factor")
output: f'{wd}/{name}.data_proc_output_recipe.tsv'
python: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd 
    data = [[$[_input:r,]]]
    df = pd.DataFrame(data,columns = ["molecular_pheno_chr_list","grm_list",
                                        "qced_plink_genotype_list","qced_vcf_genotype_list","covariate_factor_pca"]  ).assign(name = "$[name]")
    df.to_csv("$[_output]",index = 0,sep = "\t" )