In [1]:
from glob import glob
import os
import pandas as pd
import json

# Notebook display settings: show up to 100 rows and do not truncate long
# cell contents (the fastq paths below are several hundred characters long).
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 1000

# Prepare ATAC-seq configuration file of .fastq data 
Parses the library directory names and .fastq files in the project data directory, extracting metadata (condition, sample ID and lane).
The script generates one config .json file per condition, which is subsequently used as input for the ENCODE ATAC-seq pipeline.

In [2]:
# Directory holding one sub-directory per sequenced library/lane, each of which
# contains the paired-end fastq files.
# Defaults to the original project location; set the ATAC_DATA_DIR environment
# variable to run the notebook against a different data set without editing it.
data_dir = os.environ.get(
    "ATAC_DATA_DIR",
    "/lustre/scratch126/cellgen/team205/sk29/matthias_fb/data/atac_simon/proj068/FASTQ_Generation_2022-12-17_02_34_43Z-640535898",
)

In [6]:
# Load library directories (one sub-directory per library and lane).
# Stray files sitting next to the library directories are ignored.
lib_dirs = [path for path in glob(data_dir + "/*") if os.path.isdir(path)]
if not lib_dirs:
    raise FileNotFoundError(f"No library directories found in {data_dir}")
libs = pd.DataFrame(lib_dirs, columns=["path"])

# Get directory names, which are parsed for sample information
libs['basename'] = libs.path.apply(os.path.basename)


def first_fastq(lib_dir, read):
    """Return the file name of the fastq file for `read` ('R1' or 'R2') in `lib_dir`.

    Raises FileNotFoundError (naming the directory) when no file matches,
    instead of an uninformative IndexError.
    NOTE: if several files match, only the alphabetically first one is used.
    """
    matches = sorted(glob(lib_dir + f"/*_{read}_*.fastq*"))
    if not matches:
        raise FileNotFoundError(f"No {read} fastq file found in {lib_dir}")
    return os.path.basename(matches[0])


# Get filename lists of paired-end R1 and R2 fastq files
libs['fastq_R1'] = [first_fastq(path, "R1") for path in libs.path]
libs['fastq_R2'] = [first_fastq(path, "R2") for path in libs.path]

# name extracted columns from regex capture groups
# expected directory name layout: <condition>_<sample_id>_<lane>...
metadata_cols = ['condition', 'sample_id', 'lane']
libs[metadata_cols] = libs.basename.str.extract(r'(.*)_([0-9]+)_(L[0-9]*)')

# A directory name that does not match the pattern yields NaN metadata, which
# would otherwise propagate silently into the generated config files.
unparsed = libs[libs[metadata_cols].isna().any(axis=1)]
if not unparsed.empty:
    raise ValueError(
        "Directory names not matching '<condition>_<sample_id>_<lane>': "
        + ", ".join(unparsed.basename)
    )

libs.head()

Unnamed: 0,path,basename,fastq_R1,fastq_R2,condition,sample_id,lane
0,/lustre/scratch126/cellgen/team205/sk29/matthias_fb/data/atac_simon/proj068/FASTQ_Generation_2022-12-17_02_34_43Z-640535898/unconditioned_5_L001-ds.46edbd96addd4c7bbcf669fbdd201e29,unconditioned_5_L001-ds.46edbd96addd4c7bbcf669fbdd201e29,unconditioned-5_S5_L001_R1_001.fastq.gz,unconditioned-5_S5_L001_R2_001.fastq.gz,unconditioned,5,L001
1,/lustre/scratch126/cellgen/team205/sk29/matthias_fb/data/atac_simon/proj068/FASTQ_Generation_2022-12-17_02_34_43Z-640535898/conditioned_DMSO_22_L002-ds.3f381fa67ef345fe81257f9d7fc3ed00,conditioned_DMSO_22_L002-ds.3f381fa67ef345fe81257f9d7fc3ed00,conditioned-DMSO-22_S16_L002_R1_001.fastq.gz,conditioned-DMSO-22_S16_L002_R2_001.fastq.gz,conditioned_DMSO,22,L002
2,/lustre/scratch126/cellgen/team205/sk29/matthias_fb/data/atac_simon/proj068/FASTQ_Generation_2022-12-17_02_34_43Z-640535898/conditioned_DMSO_21_L002-ds.d6b480e7b7104216b998a896f63665dd,conditioned_DMSO_21_L002-ds.d6b480e7b7104216b998a896f63665dd,conditioned-DMSO-21_S15_L002_R1_001.fastq.gz,conditioned-DMSO-21_S15_L002_R2_001.fastq.gz,conditioned_DMSO,21,L002
3,/lustre/scratch126/cellgen/team205/sk29/matthias_fb/data/atac_simon/proj068/FASTQ_Generation_2022-12-17_02_34_43Z-640535898/conditioned_Belinostat_18_L004-ds.898be0a8019544ce8e9206a811da1756,conditioned_Belinostat_18_L004-ds.898be0a8019544ce8e9206a811da1756,conditioned-Belinostat-18_S12_L004_R1_001.fastq.gz,conditioned-Belinostat-18_S12_L004_R2_001.fastq.gz,conditioned_Belinostat,18,L004
4,/lustre/scratch126/cellgen/team205/sk29/matthias_fb/data/atac_simon/proj068/FASTQ_Generation_2022-12-17_02_34_43Z-640535898/conditioned_DMSO_19_L004-ds.f14158dcc3324931b3edfeb0245a862c,conditioned_DMSO_19_L004-ds.f14158dcc3324931b3edfeb0245a862c,conditioned-DMSO-19_S13_L004_R1_001.fastq.gz,conditioned-DMSO-19_S13_L004_R2_001.fastq.gz,conditioned_DMSO,19,L004


In [8]:
# shared arguments to pipeline
# for additional arguments and documentation: https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/docs/input.md
# Boolean flags are Python booleans so that json.dump writes JSON `true`
# rather than the string "true".
fixed_config = {
    'atac.pipeline_type': 'atac',
    'atac.genome_tsv': 'https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv',
    'atac.paired_end': True,
    'atac.auto_detect_adapter': True,
    'atac.enable_xcor': True
}

# Output directory for the generated files; created up front so that open()
# does not fail when the notebook is run from a fresh working directory.
config_dir = 'config'
os.makedirs(config_dir, exist_ok=True)

# Loop over each condition, generate a config file to submit to ENCODE ATAC-seq pipeline
# (rows without a parsed condition are skipped; sorted for a reproducible run order)
for cond in sorted(libs.condition.dropna().unique()):
    # subset of biological replicates and lanes associated with treatment condition
    libs_sub = libs[libs.condition == cond]

    config = {**fixed_config, 'atac.title': cond}  # ** dictionary unpacking into a new dict

    # Sample ids are digit strings; sort them numerically so that e.g. sample 5
    # becomes a lower replicate number than sample 18.
    sample_ids = sorted(set(libs_sub.sample_id), key=int)

    # Loop over biological replicates, parsing fastq files into technical and biological replicates
    for k, sample in enumerate(sample_ids):
        # all lanes (technical replicates) of this sample, in a fixed lane order;
        # R1 and R2 come from the same rows, so the two lists stay paired
        replicate = libs_sub[libs_sub.sample_id == sample].sort_values('lane')

        # format list of paths of fastq files for replicate, paired-end
        config[f'atac.fastqs_rep{k+1}_R1'] = (replicate.path + "/" + replicate.fastq_R1).to_list()
        config[f'atac.fastqs_rep{k+1}_R2'] = (replicate.path + "/" + replicate.fastq_R2).to_list()

    with open(os.path.join(config_dir, f'{cond}.json'), 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=4)