### Download multiple files from NCBI using multiple CPUs in SLURM

- **Developed by:** Daniel Reisenbüchler
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v230309

### Rationale:
- Speed up downloads by submitting one sbatch job per SRA run, so that many runs are fetched in parallel
- https://www.ncbi.nlm.nih.gov/sra/docs/sradownload/

### Import required modules

In [None]:
import os 
# Write the bytes b"text\n" directly to file descriptor 1 (standard output).
# NOTE(review): looks like a leftover output check — confirm it is still needed.
os.write(1, b"text\n")
import glob
import pandas as pd

### Setup working environment

In [None]:
# Placeholder for environment setup, e.g. setting random seeds.

### Download data using a list of SRA run accessions

In [None]:
# Location of the raw metadata table; its 'Run' column is used further down
# to build the list of runs to download.
raw_file_path = '/lustre/groups/talaveralopez/datasets/tuberculosis/rna-seq/das2021/metadata/das2021_metadata_raw.txt'

# Parsed with pandas' default settings (comma-separated, first row as header).
raw_data = pd.read_csv(filepath_or_buffer=raw_file_path)

# Show the first record to check that the columns were parsed as expected.
raw_data.iloc[0]

In [None]:
# Display the loaded metadata table for visual inspection.
raw_data

In [None]:
# Collect the values of the 'Run' column as a plain Python list; every entry
# becomes one download job further down.
download_run_list = raw_data['Run'].to_list()

# Preview the first five entries.
download_run_list[0:5]

In [None]:
### SBATCH SCRIPT FUNCTION

def submit_job_script(job_id, job_dir, SRA, nice=10000, out_dir=''):
    """Write a SLURM batch script that downloads one SRA run and submit it.

    The script is written to
    ``{job_dir}/job_download_{job_id}_job_id_{job_id}.cmd`` and handed to
    ``sbatch``. The job changes into ``out_dir`` and runs ``fasterq-dump``
    on ``SRA`` with 16 threads, writing its output to ``out_dir``.

    Args:
        job_id: Identifier used in the job name and in the script/log names.
        job_dir: Existing directory for the job script and for the job's
            stdout (``out_download_{job_id}.txt``) and stderr
            (``err_download_{job_id}.txt``) files.
        SRA: Run accession passed to ``fasterq-dump``.
        nice: Value written to the ``#SBATCH --nice`` directive.
        out_dir: Destination directory for the download. When empty, the
            current working directory at submission time is used.

    Returns:
        The exit status of ``sbatch`` (0 on success), or 127 when ``sbatch``
        could not be started at all. Failures are reported on stderr rather
        than raised, so a loop submitting many jobs keeps going.
    """
    # Function-scope imports keep this notebook cell self-contained.
    import shlex
    import subprocess
    import sys

    # An empty out_dir would otherwise render as a bare `cd` and a dangling
    # `-O` without a value in the generated script.
    out_dir = out_dir or os.getcwd()

    job_name = f'download_{job_id}'
    job_file = f'{job_dir}/job_{job_name}_job_id_{job_id}.cmd'
    out_file = f'{job_dir}/out_{job_name}.txt'
    err_file = f'{job_dir}/err_{job_name}.txt'

    fasterq_dump = '/home/haicu/reisenbuechler/sratoolkit/sratoolkit.3.0.0-ubuntu64/bin/fasterq-dump'

    # Quote everything that is interpolated into shell commands so paths with
    # spaces or shell metacharacters survive; shlex.quote leaves plain paths
    # and accessions unchanged.
    quoted_out_dir = shlex.quote(str(out_dir))
    quoted_sra = shlex.quote(str(SRA))

    # No environment activation (bashrc / conda) is emitted; fasterq-dump is
    # invoked through its absolute path.
    script_lines = [
        '#!/bin/bash',
        f'#SBATCH -J {job_name}',
        f'#SBATCH -o {out_file}',
        f'#SBATCH -e {err_file}',
        '#SBATCH -t 47:00:00',
        '#SBATCH -p cpu_p',
        '#SBATCH -c 16',
        '#SBATCH --mem=64GB',
        f'#SBATCH --nice={nice}',
        'cd /',
        f'cd {quoted_out_dir}',
        f'{fasterq_dump} {quoted_sra} --threads 16 -O {quoted_out_dir}',
    ]

    with open(job_file, 'w') as handle:
        handle.write('\n'.join(script_lines) + '\n')

    # Argument-list form: no shell is involved, so job_file needs no quoting.
    try:
        completed = subprocess.run(['sbatch', job_file], check=False)
    except OSError as exc:
        print(f'sbatch could not be started for {job_file}: {exc}', file=sys.stderr)
        return 127

    if completed.returncode != 0:
        print(
            f'sbatch exited with status {completed.returncode} for {job_file}',
            file=sys.stderr,
        )
    return completed.returncode

############

# Directory handed to submit_job_script as job_dir (job scripts and their logs).
sbatch_job_dir = '/lustre/groups/talaveralopez/projects/mairi.mcclean/bin'
# Directory handed to every job as the download destination.
out_dir = '/lustre/groups/talaveralopez/datasets/tuberculosis/rna-seq/das2021/rna-seq_raw_reads'

# Submit one job per entry of download_run_list; the position in the list
# serves as the job id.
for job_id, SRA in enumerate(download_run_list):
    submit_job_script(
        job_id=job_id,
        SRA=SRA,
        job_dir=sbatch_job_dir,
        out_dir=out_dir,
        nice=20000,
    )

In [None]:
# IPython shell escape: list the SLURM queue entries of user mairi.mcclean.
!squeue -u mairi.mcclean