# Exome filtering

Filter the exome PLINK BED files so that they contain only biallelic SNPs (A/C/G/T alleles) and no variants that share a position with another variant (duplicates).

This final file will contain both common and rare variants

In [None]:
[global]
# Output directory for all generated files
parameter: cwd = path
# PLINK BED files for the exome data (one or more paths)
parameter: bedfiles = paths
# For cluster jobs, the number of commands to run per job (used as the task `trunk_size`)
parameter: job_size = 1
# Number of threads (used for the task `cores` request and for PLINK's --threads)
parameter: numThreads = 1
# Software container option: image in which the bash actions are executed
parameter: container_lmm = 'statisticalgenetics/lmm:1.5'

In [None]:
# Filter SNPs and remove variants at duplicated positions with PLINK 2.
# Runs once per input BED file (group_by=1). With <prefix> = cwd/<input name without .bed> it writes:
#   <prefix>.dup_pos                      positions (.bim column 4) that occur on more than one line
#   <prefix>.dup_vars                     IDs (.bim column 2) of every variant at one of those positions
#   <prefix>.snps_nondups.{bed,bim,fam}   the filtered fileset
[plink_filter]
input: bedfiles, group_by=1
output: f'{cwd}/{_input:bn}.snps_nondups.bed'
# trunk_size = job_size lets the job_size parameter batch the per-file substeps, as in the other steps
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '90G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.err', stdout = f'{_output:n}.out'
    # Positions that appear more than once in the .bim file
    awk -F' ' '{print $4}' ${_input:n}.bim | sort | uniq -d > ${_output:nn}.dup_pos
    # IDs of all variants located at one of those positions (every copy is listed, not just the extras)
    awk 'NR==FNR{a[$1]; next} ($4 in a) {print $2}' ${_output:nn}.dup_pos ${_input:n}.bim > ${_output:nn}.dup_vars
    # Keep A/C/G/T SNPs only and drop the variants listed above
    plink2 \
      --bfile ${_input:n} \
      --snps-only just-acgt \
      --exclude ${_output:nn}.dup_vars \
      --threads ${numThreads} \
      --make-bed \
      --out ${_output:n} \
      --memory 90000

In [None]:
# Merge all the BED filesets given in `bedfiles` (e.g. the filtered per-chromosome files)
# into a single fileset with PLINK 1.9. The first input is the base fileset (--bfile);
# every remaining input is listed in the merge list.
[plink_merge]
input: bedfiles, group_by = 'all'
output: f'{cwd}/{_input[0].name.split("_c")[0]}.merged_allchr.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '90G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.err', stdout = f'{_output:n}.out'
    # One fileset prefix per line: each path with its last 4 characters (".bed") removed
    printf '%s\n' ${' '.join([str(x)[:-4] for x in _input[1:]])} > ${_output:n}.merge_list
    plink \
      --bfile ${_input[0]:n} \
      --merge-list ${_output:n}.merge_list \
      --threads ${numThreads} \
      --memory 90000 \
      --make-bed \
      --out ${_output:n}

In [None]:
# Write the IDs of the samples that pass QC so they can be selected in following steps.
# Samples are filtered with --mind 0.1 and the remaining IDs are written, without a
# header line, to <cwd>/<input name minus two extensions>.keep_samples.id
[plink_qc]
parameter: bfile = path
input: bfile
output: f'{cwd}/{_input:bnn}.keep_samples.id'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '90G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.err', stdout = f'{_output:n}.out'
    # --threads/--memory keep plink2 within the cores and memory requested by the task,
    # matching the other steps of this workflow
    plink2 \
      --bfile ${_input:n} \
      --mind 0.1 \
      --write-samples --no-id-header \
      --threads ${numThreads} \
      --memory 90000 \
      --out ${_output:n}

### Scripts to run the pipeline

In [2]:
# Generate an sbatch script that runs the SNP-filtering step on the Yale farnam cluster.
tpl_file=../farnam.yml
# Exome BED files for chromosomes 1-22, expanded into one space-separated list
bedfiles=$(echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bed)
plink_sos=~/project/UKBB_GWAS_dev/workflow/plink_snps_only.ipynb
# Output script, prefixed with today's date
plink_sbatch=../output/$(date +"%Y-%m-%d")_plink_snps_only.sbatch
cwd=/gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/exome_files_snpsonly
job_size=1
numThreads=22
container_lmm=/gpfs/gibbs/pi/dewan/data/UKBiobank/lmm_v1_5.sif
# The first word is the workflow to run and must name a step section of $plink_sos.
# The filtering step is [plink_filter]; there is no workflow called plain "plink".
# NOTE(review): assumes $plink_sos is the workflow notebook shown here - confirm.
plink_args="""plink_filter
    --cwd $cwd 
    --bedfiles $bedfiles  
    --job_size $job_size
    --numThreads $numThreads
    --container_lmm $container_lmm
"""

sos run ~/project/bioworkflows/GWAS/Get_Job_Script.ipynb farnam \
    --template-file $tpl_file \
    --workflow-file $plink_sos \
    --to-script $plink_sbatch \
    --args "$plink_args"

INFO: Running [32mfarnam[0m: Configuration for Yale `farnam` cluster
INFO: [32mfarnam[0m (index=0) is [32mignored[0m due to saved signature
INFO: [32mfarnam[0m output:   [32m../output/2021-01-08_plink_snps_only.sbatch[0m
INFO: Workflow farnam (ID=1333be6e8ff392a0) is ignored with 1 ignored step.



In [1]:
# Generate an sbatch script that runs the SNP-filtering step on the Columbia csg cluster.
tpl_file=/mnt/mfs/statgen/pbs_template/csg.yml 
# Exome BED files for chromosomes 1-22, expanded into one space-separated list
bedfiles=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/ukb23155_c{1..22}_b0_v1.bed)
plink_sos=~/project/UKBB_GWAS_dev/workflow/plink_snps_only.ipynb
# Output script, suffixed with today's date
plink_sbatch=../output/plink_snps_only_$(date +"%Y-%m-%d").sbatch
cwd=/mnt/mfs/statgen/UKBiobank/data/exome_files/exome_files_snpsonly
job_size=1
numThreads=22
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
# The first word is the workflow to run and must name a step section of $plink_sos.
# The filtering step is [plink_filter]; there is no workflow called plain "plink".
# NOTE(review): assumes $plink_sos is the workflow notebook shown here - confirm.
plink_args="""plink_filter
    --cwd $cwd 
    --bedfiles $bedfiles  
    --job_size $job_size
    --numThreads $numThreads
    --container_lmm $container_lmm
"""

sos run ~/project/bioworkflows/GWAS/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $plink_sos \
    --to-script $plink_sbatch \
    --args "$plink_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m../output/plink_snps_only_2021-01-29.sbatch[0m
INFO: Workflow csg (ID=05e57d59b0bd60d5) is executed successfully with 1 completed step.



## Running the VCF-QC'ed data

In [2]:
# Generate an sbatch script that merges the per-chromosome VCF-QC'ed filesets (plink_merge step).
user_path=/home/dmc2245
tpl_file=${user_path}/project/bioworkflows/admin/csg.yml
# Filtered BED files for chromosomes 1-22, expanded into one space-separated list
bedfiles=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/ukb23156_c{1..22}.merged.filtered.bed)
plink_sos=~/project/UKBB_GWAS_dev/workflow/plink_snps_only.ipynb
# Output script, suffixed with today's date
plink_sbatch=../output/plink_merge_allchrs_$(date +"%Y-%m-%d").sbatch
cwd=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/
job_size=1
numThreads=22
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
# Workflow name followed by the workflow's own parameters
plink_args="plink_merge
    --cwd ${cwd} 
    --bedfiles ${bedfiles}  
    --job_size ${job_size}
    --numThreads ${numThreads}
    --container_lmm ${container_lmm}
"

sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb  csg \
    --template-file "$tpl_file" \
    --workflow-file "$plink_sos" \
    --to-script "$plink_sbatch" \
    --args "$plink_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m../output/plink_merge_allchrs_2021-05-21.sbatch[0m
INFO: Workflow csg (ID=w3c9030245032932a) is executed successfully with 1 completed step.



## Running the QC on the samples and getting a list of samples that pass QC

In [4]:
# Generate an sbatch script that runs sample QC on the merged fileset (plink_qc step).
user_path=/home/dmc2245
tpl_file=${user_path}/project/bioworkflows/admin/csg.yml
# Merged all-chromosome fileset whose samples are QC'ed
bfile=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/ukb23156_c1.merged.filtered.bed.merged_allchr.bed
# Still passed because the workflow's global `bedfiles` parameter has no default
bedfiles=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/ukb23156_c{1..22}.merged.filtered.bed)
plink_sos=~/project/UKBB_GWAS_dev/workflow/plink_snps_only.ipynb
# Output script, suffixed with today's date
plink_sbatch=../output/plink_qc_allchrs_$(date +"%Y-%m-%d").sbatch
cwd=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/
job_size=1
numThreads=22
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
# Workflow name followed by the workflow's own parameters
plink_args="plink_qc
    --cwd ${cwd} 
    --bedfiles ${bedfiles} 
    --bfile ${bfile} 
    --job_size ${job_size}
    --numThreads ${numThreads}
    --container_lmm ${container_lmm}
"

sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb  csg \
    --template-file "$tpl_file" \
    --workflow-file "$plink_sos" \
    --to-script "$plink_sbatch" \
    --args "$plink_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m../output/plink_qc_allchrs_2021-05-24.sbatch[0m
INFO: Workflow csg (ID=w461ddcee18ec0d41) is executed successfully with 1 completed step.



In [None]:
#!/bin/bash
# Count the lines of the duplicate-position lists, the duplicate-variant lists and the
# .bim files before and after filtering (one count file per category), then paste the
# first three side by side into stats_vars_removed.txt.
# Every count file is rewritten from scratch, so rerunning the script cannot append
# stale rows and misalign the columns produced by paste.

# Duplicated positions: the filter step names these files <prefix>.dup_pos
for i in $(ls -v *.dup_pos); do
   wc -l "$i"
done > dups_pos_counts.txt

# IDs of the variants located at duplicated positions
for i in $(ls -v *.dup_vars); do
   wc -l "$i"
done > dups_vars_counts.txt

# Variants per chromosome before filtering
for i in $(ls -v ukb23155_c{1..22}_b0_v1.bim); do
   wc -l "$i"
done > bim_counts.txt

# Variants per chromosome after filtering (not included in the combined table below)
for i in $(ls -v ukb23155_c{1..22}_b0_v1.snps_nondups.bim); do
   wc -l "$i"
done > bim_counts_snps_nondups.txt

paste bim_counts.txt dups_pos_counts.txt dups_vars_counts.txt > stats_vars_removed.txt
