# Extract regions from imputed data using the coordinates of the 1725 LD blocks

In [None]:
[global]
# Working directory; every workflow output is written under this path
parameter: cwd = path
# BGEN file from which the regions are extracted
parameter: genofile = path
# Tab-separated file of regions to extract (columns are read as chr, start, stop)
parameter: region_file = path
# Number of substeps grouped into a single task (passed to `trunk_size`)
parameter: job_size = 20
# Wall-clock time limit requested for each task
parameter: walltime = '2h'
# Memory requested for each task
parameter: mem = '10G'
# Number of cores requested for each task
parameter: numThreads = 1
# Container image in which the bgenix commands are run
parameter: container = '/mnt/vast/hpc/csg/containers/lmm.sif'
# Stop early when the region file does not exist
fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. Please specify them using ``--region-file`` option.')

In [None]:
[bgenix_1 (extract region)]
# Turn every row of the region file into a bgenix range (CHR:START-STOP) and
# extract each range from the genotype file into its own BGEN file.
import pandas as pd
# The first line of the region file is consumed as a header (header=0) and the
# columns are renamed; everything is read as text so coordinates are kept verbatim.
df = pd.read_csv(region_file, header=0, sep="\t", names=["chr", "start", "stop"], dtype=str)
# Literal (non-regex) replacements: drop the 'chr' prefix and any embedded spaces.
df['regions'] = df['chr'].str.replace('chr', '', regex=False) + ':' + df['start'] + '-' + df['stop']
df['regions'] = df['regions'].str.replace(' ', '', regex=False)
# A row with a missing field yields a missing region; report it explicitly.
fail_if(df['regions'].isna().any(), msg = f'{region_file} contains rows with a missing chr, start or stop value.')
# Make sure the output directory exists before anything is written into it.
cwd.mkdir(parents=True, exist_ok=True)
df['regions'].to_csv(f"{cwd}/{region_file:bn}.bgenix", index=False, header=False)
# Unique regions in file order, so the substep order is reproducible between runs.
regions = df['regions'].drop_duplicates().tolist()
fail_if(len(regions) == 0, msg = f'No regions could be read from {region_file}.')

input: genofile, for_each = 'regions'
output: region_bgen=f'{cwd}/{_input:bn}.{_regions}.bgen'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    bgenix \
    -g ${_input} \
    -incl-range ${_regions} > ${_output}

In [None]:
[bgenix_2 (create index)]
# Build the bgenix index for each extracted region BGEN file.
input: named_output('region_bgen')
# `bgenix -g X.bgen -index` writes the index next to its input as X.bgen.bgi,
# so the declared output must keep the .bgen extension of the input file.
output: f'{_input}.bgi'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    bgenix -g ${_input} -index

In [None]:
[bgenix_3 (create variants)]
# List the variants contained in each extracted region BGEN file.
input: named_output('region_bgen')
output:  f'{cwd}/{_input:bn}.variants'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# Log names keep the full output name: with `{_output:n}` they resolved to the same
# <cwd>/<region bgen stem>.stderr/.stdout paths that bgenix_1 uses and overwrote them.
bash: container=container, expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    bgenix -g ${_input} -list > ${_output}

In [None]:
[bgenix_4 (select specific snps)]
# Subset each extracted region BGEN file with `bgenix -incl-rsids`.
# Value handed to `-incl-rsids`.
# NOTE(review): the default is the current directory ('.'), not a file of rsids, and
# nothing validates it before it reaches bgenix - confirm whether this step is meant
# to run when --rsid_file is not supplied.
parameter:rsid_file = path('.')
input: named_output('region_bgen')
output:  f'{cwd}/{_input:bn}.subset.bgen'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    bgenix \
    -g ${_input} \
    -incl-rsids ${rsid_file} > ${_output}

In [66]:
import pandas as pd

# Load the chromosome 1 LD-block table; the file's first line is treated as a
# header and replaced by the three column names, and every field is kept as text.
region_file = pd.read_csv(
    '/mnt/vast/hpc/csg/UKBiobank/data/ldblocks/EUR/fourier_ls-chr1.bed',
    sep="\t",
    header=0,
    names=["chr", "start", "stop"],
    dtype=str,
)

In [67]:
# Display the parsed LD-block table (columns chr, start, stop).
region_file

Unnamed: 0,chr,start,stop
0,chr1,10583,1892607
1,chr1,1892607,3582736
2,chr1,3582736,4380811
3,chr1,4380811,5913893
4,chr1,5913893,7247335
...,...,...,...
128,chr1,241582220,242071602
129,chr1,242071602,244109499
130,chr1,244109499,245506746
131,chr1,245506746,247344518


In [68]:
# Assemble "<chr without the 'chr' prefix>:<start>-<stop>" for every row.
region_file['regions'] = (
    region_file['chr']
    .str.replace(r'chr', '')
    .str.cat(region_file['start'], sep=':')
    .str.cat(region_file['stop'], sep='-')
)

In [69]:
# Remove every space character from the assembled region strings.
region_file['regions']= region_file['regions'].str.replace(' ', '')

In [72]:
# Display the resulting region strings (CHR:START-STOP).
region_file['regions']

0            1:10583-1892607
1          1:1892607-3582736
2          1:3582736-4380811
3          1:4380811-5913893
4          1:5913893-7247335
               ...          
128    1:241582220-242071602
129    1:242071602-244109499
130    1:244109499-245506746
131    1:245506746-247344518
132    1:247344518-249239466
Name: regions, Length: 133, dtype: object

In [73]:
# Directory that receives the region list written by this notebook.
cwd = "/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen"

In [74]:
# File name used as the stem of the exported region list.
region_filename = "fourier_ls-chr1.bed"

In [75]:
# Write only the region strings, one per line, without index or header.
region_file['regions'].to_csv(
    f"{cwd}/{region_filename}.bgenix",
    index=False,
    header=False,
)

In [76]:
# From here on `region_file` holds the path of the exported region list
# (it previously held the DataFrame).
region_file = "/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/fourier_ls-chr1.bed.bgenix"

In [95]:
# Settings used to generate the cluster submission script for the bgenix workflow.
cwd=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen
sbatch=$cwd/regions_test$(date +"%Y-%m-%d").sbatch
genofile=/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen
tpl_file=~/project/bioworkflows/admin/csg.yml
region_file=/mnt/vast/hpc/csg/UKBiobank/data/ldblocks/EUR/fourier_ls-chr22.bed
bgenix_sos=~/project/UKBB_GWAS_dev/workflow/113022_bgenix_ldblocks.ipynb
job_size=10
# $container was expanded below without ever being set, which left --container
# without a value; use the same image as the workflow's default.
container=/mnt/vast/hpc/csg/containers/lmm.sif

# Arguments forwarded to the bgenix workflow (a plain double-quoted string).
bgenix_args="bgenix
    --cwd $cwd
    --genofile $genofile
    --region_file $region_file
    --job_size $job_size
    --container $container
"

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $bgenix_sos \
    --to-script $sbatch \
    --args "$bgenix_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/regions_test2022-12-01.sbatch
INFO: Workflow csg (ID=wa628d1d493b0f831) is executed successfully with 1 completed step.



In [2]:
# Print the generated submission script for inspection.
cat /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/regions_test2022-12-01.sbatch

#!/bin/sh
#$ -l h_rt=36:00:00
#$ -l h_vmem=16G
#$ -N regions_test2022-12-01
#$ -o /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/regions_test2022-12-01-$JOB_ID.out
#$ -e /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/regions_test2022-12-01-$JOB_ID.err  
#$ -j y
#$ -q csg.q
#$ -S /bin/bash
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity/3.5.3
sos run /home/dmc2245/project/UKBB_GWAS_dev/workflow/113022_bgenix_ldblocks.ipynb \
    bgenix\
    --cwd /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen\
    --genofile /mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen\
    --region_file /mnt/vast/hpc/csg/UKBiobank/data/ldblocks/EUR/fourier_ls-chr22.bed \
    -c /home/dmc2245/project/bioworkflows/admin/csg.yml \
    -q csg -s force \
    &> /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/regions_test2022-12-01.log




In [3]:
# Submit the generated script to the scheduler.
qsub /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/regions_test2022-12-01.sbatch

Your job 3181518 ("regions_test2022-12-01") has been submitted



In [None]:
# Put the miniconda binaries first on PATH and load Singularity, then invoke the
# workflow in dryrun mode with the `bgenix:3` step selector and `-s build`,
# using the same cwd, genotype file and region file as the submitted job.
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity/3.5.3
sos dryrun /home/dmc2245/project/UKBB_GWAS_dev/workflow/113022_bgenix_ldblocks.ipynb \
    bgenix:3\
    --cwd /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen\
    --genofile /mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen\
    --region_file /mnt/vast/hpc/csg/UKBiobank/data/ldblocks/EUR/fourier_ls-chr22.bed \
    -s build