In [1]:
import csv
import pandas as pd
import glob
import math
from pathlib import Path

In [2]:
# Local directory that receives the generated HPC job scripts.
hpc_codes_dir = "/Volumes/Huitian/Projects/T_Cell_ChIP/codes_hpc"

#--- Get list of sra files, retrieve Run IDs and library layouts
sra_dir = "/Volumes/Huitian/Projects/T_Cell_ChIP/202012_ChIP/1_SRA_Run_Table_simplified"
# glob order depends on the file system; sorting keeps the run order (and
# therefore the numbering of the generated scripts) reproducible between runs.
sra_files = sorted(glob.glob("%s/*.csv"%sra_dir))

# Parallel lists: srr_list[k] and srr_layout[k] describe the same run.
srr_list = []
srr_layout = []
for file in sra_files:
    # Parse each run table once and take both columns from the same frame.
    run_table = pd.read_csv(file)
    srr_list += run_table['Run'].tolist()
    srr_layout += run_table['LibraryLayout'].tolist()

## 0_0_fastq-dump

In [3]:
#--- Write Hpc script
out_file = "0_0_fastq-dump"
out_dir = hpc_codes_dir + "/" + out_file
Path(out_dir).mkdir(parents=True, exist_ok=True)

hpc_wkdir = "/gpfs/group/pipkin/hdiao/T_Cell_ChIP/0_fastq"

# One job script per batch of 16 runs. Inside a script the downloads are
# started in waves of 8 (keep in sync with "--ntasks=8" below): all but the
# last job of a wave go to the background, and a `wait` closes the wave.
runs_per_script = 16
runs_per_wave = 8

for i in range(0, math.ceil(len(srr_list)/runs_per_script)):
    i_outname = "%s/%s-%s.sh"%(out_dir, out_file,i)
    batch = srr_list[i*runs_per_script:(i+1)*runs_per_script]

    script_lines = [
        "#!/bin/bash",
        "#SBATCH --nodes=1",
        "#SBATCH --ntasks=8",
        "#SBATCH --mem=16gb",
        "",
        "module load sra-tools",
        "cd %s"%hpc_wkdir,
        "",
    ]

    for pos, j_srr in enumerate(batch, start=1):
        if (pos % runs_per_wave != 0) and (pos != len(batch)):
            script_lines.append("fastq-dump -I --split-files %s &"%j_srr)
        else:
            # Last job of a wave, or of the whole script: run it in the
            # foreground and then wait for the background jobs of this wave.
            # The `wait` is also emitted after the very last job, so the
            # script cannot exit while background downloads are still running.
            script_lines.append("fastq-dump -I --split-files %s "%j_srr)
            script_lines.append("wait")

    # Plain text output: a csv writer would quote any line that contains a
    # tab or a double quote, which would corrupt a shell script.
    with open(i_outname, "w") as fout:
        fout.write("\n".join(script_lines) + "\n")

## 1_0_trim_alignment_convert_filterBlacklist

In [11]:
#--- Write Hpc script
out_file = "1_0_trim_alignment_flb"
out_dir = hpc_codes_dir + "/" + out_file
Path(out_dir).mkdir(parents=True, exist_ok=True)

#--- HPC setup
hpc_wkdir = "/gpfs/group/pipkin/hdiao/T_Cell_ChIP/1_bowtie2"
hpc_inputdir = "/gpfs/group/pipkin/hdiao/T_Cell_ChIP/0_fastq"
bowtie2_index = "/gpfs/group/pipkin/hdiao/ref_resources/mm/release102/GRCm38"
blacklisted_bed = "/gpfs/group/pipkin/hdiao/ref_resources/mm/mm10_blacklisted_2016_nochr.bed"

# Write one script per run; the script index is the position of the run in srr_list.
# Each script does: fastqc -> trim_galore -> fastqc -> bowtie2 -> samtools
# convert/sort/rmdup -> bedtools blacklist filter -> cleanup of intermediates.
for i in range(0, len(srr_list)):
    i_outname = "%s/%s-%s.sh"%(out_dir, out_file,i)
    srr_i = srr_list[i]
    srr_layout_i = srr_layout[i]
    paired = srr_layout_i == "PAIRED"

    script_lines = [
        "#!/bin/bash",
        "#SBATCH --nodes=1",
        "#SBATCH --ntasks=16",
        "#SBATCH --mem=8gb",
        "",
        "module load fastqc",
        "module load trimgalore",
        "module load samtools",
        "module load bowtie2",
        "module load bedtools",
    ]

    ### Fastqc untrimmed (file names are relative to hpc_inputdir)
    script_lines += [
        "",
        "### Fastqc for untrimmed files",
        "cd %s"%hpc_inputdir,
    ]
    if paired:
        script_lines += [
            "fastq_untrimmed_1=%s_1.fastq"%srr_i,
            "fastq_untrimmed_2=%s_2.fastq"%srr_i,
            "fastqc $fastq_untrimmed_1",
            "fastqc $fastq_untrimmed_2",
        ]
    else:
        # Non-paired runs also use the "_1" suffix here.
        script_lines += [
            "fastq_untrimmed_1=%s_1.fastq"%srr_i,
            "fastqc $fastq_untrimmed_1",
        ]

    ### Trim galore (trimmed files are referenced by absolute path)
    script_lines += [
        "",
        "### Trim Galore",
    ]
    if paired:
        script_lines += [
            "trim_galore --paired --length 24 --stringency 3 $fastq_untrimmed_1 $fastq_untrimmed_2",
            "trim_fastq_end1=%s/%s_1_val_1.fq"%(hpc_inputdir, srr_i),
            "trim_fastq_end2=%s/%s_2_val_2.fq"%(hpc_inputdir, srr_i),
        ]
    else:
        script_lines += [
            "trim_galore --length 24 --stringency 3 $fastq_untrimmed_1",
            "trim_fastq_end1=%s/%s_1_trimmed.fq"%(hpc_inputdir, srr_i),
        ]

    ### Trimmed file fastqc
    script_lines += [
        "",
        "### Fastqc for trimmed files",
        "fastqc $trim_fastq_end1",
    ]
    if paired:
        script_lines.append("fastqc $trim_fastq_end2")

    ### Bowtie2 alignment (run from hpc_wkdir, so outputs land there)
    script_lines += [
        "",
        "### Bowtie2 alignment",
        "cd %s"%hpc_wkdir,
        "bowtie2_index=%s"%bowtie2_index,
        "sam_name=%s.sam"%srr_i,
    ]
    if paired:
        script_lines.append("bowtie2 -p 16 -x $bowtie2_index -X 1000 --fr -1 $trim_fastq_end1 -2 $trim_fastq_end2 -S $sam_name")
    else:
        script_lines.append("bowtie2 -p 16 -x $bowtie2_index -U $trim_fastq_end1 -S $sam_name")

    ### Convert & Sort & Filter
    script_lines += [
        "",
        "### Convert/sort/filter",
        "bam_name=%s.bam"%srr_i,
        "bam_name_srt=%s_srt.sam"%srr_i,
        "sam_name_srt_dupr=%s_srt_dupr.sam"%srr_i,
        "bam_name_srt_dupr=%s_srt_dupr.bam"%srr_i,
        "flb_bam_name=%s_srt_dupr_flb.bam"%srr_i,
        "blacklist_bed=%s"%blacklisted_bed,
        "",
        "samtools view -bS $sam_name > $bam_name",
        "samtools sort $bam_name -o $bam_name_srt",
        "samtools rmdup -S $bam_name_srt $sam_name_srt_dupr",
        "samtools view -bS $sam_name_srt_dupr > $bam_name_srt_dupr",
        "bedtools intersect -abam $bam_name_srt_dupr -b $blacklist_bed -v > $flb_bam_name",
    ]

    ### Delete intermediate files
    # Intermediates are removed only when the final filtered BAM is larger
    # than 10000 bytes. The "%s" below is the stat format string and is
    # written literally (no Python formatting is applied to that line).
    script_lines += [
        "",
        "### Remove intermediate files",
        "filesize=$(stat -c%s $flb_bam_name)",
        "if (( filesize > 10000 )) ",
        "then",
        "    rm $sam_name",
        "    rm $bam_name",
        "    rm $bam_name_srt",
        "    rm $sam_name_srt_dupr",
        "    rm $bam_name_srt_dupr",
        "    rm $trim_fastq_end1",
    ]
    if paired:
        script_lines.append("    rm $trim_fastq_end2")
    script_lines.append("fi")

    # Plain text output: a csv writer would quote any line that contains a
    # tab or a double quote, which would corrupt a shell script.
    with open(i_outname, "w") as fout:
        fout.write("\n".join(script_lines) + "\n")
                
        

## 1_0_trim_alignment_convert_filterBlacklist
**Redo for failed alignments: retry without trimming**

In [6]:
# Table of runs whose alignment failed, read from the
# 1_0_trim_alignment_flb_check folder; only the 'Run' column is used.
failed_alignment_csv = (
    "/Volumes/Huitian/Projects/"
    "T_Cell_ChIP/codes_hpc/1_0_trim_alignment_flb_check/failed_alignments.csv"
)
failed_alignment = pd.read_csv(failed_alignment_csv)
failed_alignment_run = failed_alignment.loc[:, 'Run'].tolist()

In [10]:
#--- Write Hpc script
out_file = "1_0_trim_alignment_flb_redo"
out_dir = hpc_codes_dir + "/" + out_file
Path(out_dir).mkdir(parents=True, exist_ok=True)

#--- HPC setup
hpc_wkdir = "/gpfs/group/pipkin/hdiao/T_Cell_ChIP/1_bowtie2"
hpc_inputdir = "/gpfs/group/pipkin/hdiao/T_Cell_ChIP/0_fastq"
bowtie2_index = "/gpfs/group/pipkin/hdiao/ref_resources/mm/release102/GRCm38"
blacklisted_bed = "/gpfs/group/pipkin/hdiao/ref_resources/mm/mm10_blacklisted_2016_nochr.bed"

#--- Select the failed runs together with their layouts
# The run IDs and layouts are filtered as pairs so that every run keeps its
# own LibraryLayout. Order follows srr_list and each run is kept once.
# srr_list / srr_layout themselves are left untouched, so this cell can be
# re-run and other cells still see the full, aligned lists.
failed_runs = set(failed_alignment_run)
redo_srr_list = []
redo_srr_layout = []
already_selected = set()
for run_id, run_layout in zip(srr_list, srr_layout):
    if run_id in failed_runs and run_id not in already_selected:
        already_selected.add(run_id)
        redo_srr_list.append(run_id)
        redo_srr_layout.append(run_layout)

# Write one script per failed run; alignment uses the untrimmed fastq files.
for i in range(0, len(redo_srr_list)):
    i_outname = "%s/%s-%s.sh"%(out_dir, out_file,i)
    srr_i = redo_srr_list[i]
    srr_layout_i = redo_srr_layout[i]
    paired = srr_layout_i == "PAIRED"

    script_lines = [
        "#!/bin/bash",
        "#SBATCH --nodes=1",
        "#SBATCH --ntasks=16",
        "#SBATCH --mem=8gb",
        "",
        "module load fastqc",
        "module load trimgalore",
        "module load samtools",
        "module load bowtie2",
        "module load bedtools",
    ]

    ### Fastqc untrimmed
    # The fastqc calls are written commented out ("#fastqc ..."), so the
    # generated script only defines the input file variables here.
    script_lines += [
        "",
        "### Fastqc for untrimmed files",
    ]
    if paired:
        script_lines += [
            "fastq_untrimmed_1=%s/%s_1.fastq"%(hpc_inputdir,srr_i),
            "fastq_untrimmed_2=%s/%s_2.fastq"%(hpc_inputdir,srr_i),
            "#fastqc $fastq_untrimmed_1",
            "#fastqc $fastq_untrimmed_2",
        ]
    else:
        script_lines += [
            "fastq_untrimmed_1=%s/%s_1.fastq"%(hpc_inputdir,srr_i),
            "#fastqc $fastq_untrimmed_1",
        ]

    ### Bowtie2 alignment (directly on the untrimmed reads)
    script_lines += [
        "",
        "### Bowtie2 alignment",
        "cd %s"%hpc_wkdir,
        "bowtie2_index=%s"%bowtie2_index,
        "sam_name=%s.sam"%srr_i,
    ]
    if paired:
        script_lines.append("bowtie2 -p 16 -x $bowtie2_index -X 1000 --fr -1 $fastq_untrimmed_1 -2 $fastq_untrimmed_2 -S $sam_name")
    else:
        script_lines.append("bowtie2 -p 16 -x $bowtie2_index -U $fastq_untrimmed_1 -S $sam_name")

    ### Convert & Sort & Filter
    script_lines += [
        "",
        "### Convert/sort/filter",
        "bam_name=%s.bam"%srr_i,
        "bam_name_srt=%s_srt.sam"%srr_i,
        "sam_name_srt_dupr=%s_srt_dupr.sam"%srr_i,
        "bam_name_srt_dupr=%s_srt_dupr.bam"%srr_i,
        "flb_bam_name=%s_srt_dupr_flb.bam"%srr_i,
        "blacklist_bed=%s"%blacklisted_bed,
        "",
        "samtools view -bS $sam_name > $bam_name",
        "samtools sort $bam_name -o $bam_name_srt",
        "samtools rmdup -S $bam_name_srt $sam_name_srt_dupr",
        "samtools view -bS $sam_name_srt_dupr > $bam_name_srt_dupr",
        "bedtools intersect -abam $bam_name_srt_dupr -b $blacklist_bed -v > $flb_bam_name",
    ]

    ### Delete intermediate files
    # BAM sizes before/after blacklist filtering are appended to a per-run
    # text file. Intermediates are removed only when the filtered BAM is
    # larger than 10000 bytes. The "%s" in the stat lines is the stat format
    # string and is written literally. The untrimmed fastq inputs are kept.
    script_lines += [
        "",
        "### Remove intermediate files",
        "filesize_preFlb=$(stat -c%s $bam_name_srt_dupr)",
        "filesize=$(stat -c%s $flb_bam_name)",
        "echo $filesize_preFlb $filesize >> %s_bamSizes_pre_post_flb.txt"%srr_i,
        "if (( filesize > 10000 )) ",
        "then",
        "    rm $sam_name",
        "    rm $bam_name",
        "    rm $bam_name_srt",
        "    rm $sam_name_srt_dupr",
        "    rm $bam_name_srt_dupr",
        "fi",
    ]

    # Plain text output: a csv writer would quote any line that contains a
    # tab or a double quote, which would corrupt a shell script.
    with open(i_outname, "w") as fout:
        fout.write("\n".join(script_lines) + "\n")

## 1_0_trim_alignment_convert_filterBlacklist
**Redo for solid sequences: try with trimming, fall back to the untrimmed reads if trimming fails**

In [3]:
#--- Get list of sra files, retrieve Run IDs
# From here on srr_list / srr_layout hold only the runs listed in the
# 2015_IMMUNITY_Martinez run table (they replace any earlier values).
sra_dir = "/Volumes/Huitian/Projects/T_Cell_ChIP/202012_ChIP/1_SRA_Run_Table_simplified"
solid_file = sra_dir + "/2015_IMMUNITY_Martinez_simplified.csv"
# Parse the table once and take both columns from the same frame.
solid_run_table = pd.read_csv(solid_file)
srr_list = solid_run_table['Run'].tolist()
srr_layout = solid_run_table['LibraryLayout'].tolist()

In [4]:
#--- Write Hpc script
out_file = "1_0_trim_alignment_flb_solid"
out_dir = hpc_codes_dir + "/" + out_file
Path(out_dir).mkdir(parents=True, exist_ok=True)

#--- HPC setup
hpc_wkdir = "/gpfs/group/pipkin/hdiao/T_Cell_ChIP/1_bowtie2"
hpc_inputdir = "/gpfs/group/pipkin/hdiao/T_Cell_ChIP/0_fastq"
bowtie2_index = "/gpfs/group/pipkin/hdiao/ref_resources/mm/release102/GRCm38"
blacklisted_bed = "/gpfs/group/pipkin/hdiao/ref_resources/mm/mm10_blacklisted_2016_nochr.bed"

# Write one script per run; the script index is the position of the run in srr_list.
# Inputs are the "*_solidCvt.fastq" files. Trimming is attempted first; if the
# trimmed output is smaller than 10000 bytes the untrimmed reads are aligned instead.
for i in range(0, len(srr_list)):
    i_outname = "%s/%s-%s.sh"%(out_dir, out_file,i)
    srr_i = srr_list[i]
    srr_layout_i = srr_layout[i]
    paired = srr_layout_i == "PAIRED"

    script_lines = [
        "#!/bin/bash",
        "#SBATCH --nodes=1",
        "#SBATCH --ntasks=16",
        "#SBATCH --mem=8gb",
        "",
        "module load fastqc",
        "module load trimgalore",
        "module load samtools",
        "module load bowtie2",
        "module load bedtools",
    ]

    ### Fastqc untrimmed
    script_lines += [
        "",
        "### Fastqc for untrimmed files",
        "cd %s"%hpc_inputdir,
    ]
    if paired:
        script_lines += [
            "fastq_untrimmed_1=%s/%s_1_solidCvt.fastq"%(hpc_inputdir, srr_i),
            "fastq_untrimmed_2=%s/%s_2_solidCvt.fastq"%(hpc_inputdir, srr_i),
            "fastqc $fastq_untrimmed_1",
            "fastqc $fastq_untrimmed_2",
        ]
    else:
        script_lines += [
            "fastq_untrimmed_1=%s/%s_1_solidCvt.fastq"%(hpc_inputdir, srr_i),
            "fastqc $fastq_untrimmed_1",
        ]

    ### Trim galore
    script_lines += [
        "",
        "### Trim Galore",
    ]
    if paired:
        script_lines += [
            "trim_galore --paired --length 24 --stringency 3 $fastq_untrimmed_1 $fastq_untrimmed_2",
            "trim_fastq_end1=%s/%s_1_solidCvt_val_1.fq"%(hpc_inputdir, srr_i),
            "trim_fastq_end2=%s/%s_2_solidCvt_val_2.fq"%(hpc_inputdir, srr_i),
        ]
    else:
        script_lines += [
            "trim_galore --length 24 --stringency 3 $fastq_untrimmed_1",
            "trim_fastq_end1=%s/%s_1_solidCvt_trimmed.fq"%(hpc_inputdir, srr_i),
        ]

    ### Trimmed file fastqc
    script_lines += [
        "",
        "### Fastqc for trimmed files",
        "fastqc $trim_fastq_end1",
    ]
    if paired:
        script_lines.append("fastqc $trim_fastq_end2")

    ### Test if trimming is successful
    # The alignment input gets its own variables (align_fastq_end*), so
    # trim_fastq_end* always keep pointing at the trimmed files. The cleanup
    # step below removes $trim_fastq_end*; if those variables were re-pointed
    # at the untrimmed files on fallback, cleanup would delete the original
    # input reads.
    # The "%s" in the stat lines is the stat format string, written literally.
    script_lines += [
        "",
        "### Test if trimming is successful",
    ]
    if paired:
        script_lines.append("filesize=$(stat -c%s $trim_fastq_end2)")
    else:
        script_lines.append("filesize=$(stat -c%s $trim_fastq_end1)")
    script_lines.append("align_fastq_end1=$trim_fastq_end1")
    if paired:
        script_lines.append("align_fastq_end2=$trim_fastq_end2")
    # If trimming did not succeed, align the original files instead
    script_lines += [
        "if (( filesize < 10000 )) ",
        "then",
        "    align_fastq_end1=$fastq_untrimmed_1",
    ]
    if paired:
        script_lines.append("    align_fastq_end2=$fastq_untrimmed_2")
    script_lines.append("fi")

    ### Bowtie2 alignment
    script_lines += [
        "",
        "### Bowtie2 alignment",
        "cd %s"%hpc_wkdir,
        "bowtie2_index=%s"%bowtie2_index,
        "sam_name=%s.sam"%srr_i,
    ]
    if paired:
        script_lines.append("bowtie2 -p 16 -x $bowtie2_index -X 1000 --fr -1 $align_fastq_end1 -2 $align_fastq_end2 -S $sam_name")
    else:
        script_lines.append("bowtie2 -p 16 -x $bowtie2_index -U $align_fastq_end1 -S $sam_name")

    ### Convert & Sort & Filter
    script_lines += [
        "",
        "### Convert/sort/filter",
        "bam_name=%s.bam"%srr_i,
        "bam_name_srt=%s_srt.sam"%srr_i,
        "sam_name_srt_dupr=%s_srt_dupr.sam"%srr_i,
        "bam_name_srt_dupr=%s_srt_dupr.bam"%srr_i,
        "flb_bam_name=%s_srt_dupr_flb.bam"%srr_i,
        "blacklist_bed=%s"%blacklisted_bed,
        "",
        "samtools view -bS $sam_name > $bam_name",
        "samtools sort $bam_name -o $bam_name_srt",
        "samtools rmdup -S $bam_name_srt $sam_name_srt_dupr",
        "samtools view -bS $sam_name_srt_dupr > $bam_name_srt_dupr",
        "bedtools intersect -abam $bam_name_srt_dupr -b $blacklist_bed -v > $flb_bam_name",
    ]

    ### Delete intermediate files
    # BAM sizes before/after blacklist filtering are appended to a per-run
    # text file. Intermediates (including the trimmed fastq files, never the
    # untrimmed inputs) are removed only when the filtered BAM is larger than
    # 10000 bytes.
    script_lines += [
        "",
        "### Remove intermediate files",
        "filesize_preFlb=$(stat -c%s $bam_name_srt_dupr)",
        "filesize=$(stat -c%s $flb_bam_name)",
        "echo $filesize_preFlb $filesize >> %s_bamSizes_pre_post_flb.txt"%srr_i,
        "if (( filesize > 10000 )) ",
        "then",
        "    rm $sam_name",
        "    rm $bam_name",
        "    rm $bam_name_srt",
        "    rm $sam_name_srt_dupr",
        "    rm $bam_name_srt_dupr",
        "    rm $trim_fastq_end1",
    ]
    if paired:
        script_lines.append("    rm $trim_fastq_end2")
    script_lines.append("fi")

    # Plain text output: a csv writer would quote any line that contains a
    # tab or a double quote, which would corrupt a shell script.
    with open(i_outname, "w") as fout:
        fout.write("\n".join(script_lines) + "\n")
                
        