In [1]:
import numpy as np
import pandas as pd
import os
import glob
import csv

In [6]:
# Derive the project root from the notebook's current working directory.
code_dir = os.getcwd()
# Cut the path at the "/z_References/" component instead of stripping one specific
# sub-folder: later cells os.chdir() into z_References/codes_hpc, and stripping only
# "/z_References/codes_local" would then silently return the wrong root when this
# cell is re-run. If the component is absent, base_dir falls back to code_dir
# (same as before).
base_dir = code_dir.partition("/z_References/")[0]
base_dir

'/media/pipkin/Yolanda/SARS2_Sc'

## 0. Fastq dump

In [2]:
###----- First trial
# Generate the PBS script that converts every SRA run listed in the
# *SraRunTable.csv files to FASTQ, running fastq-dump in batches of 16
# background jobs.
wkdir = base_dir + '/z_References/codes_hpc'
os.chdir(wkdir)

input_dir = base_dir + '/z_References/0_Analysis_from_GEO'
# glob returns matches in arbitrary order; sort so the generated script is reproducible.
sra_files = sorted(glob.glob("%s/*SraRunTable.csv"%input_dir))

# Run accessions are taken from the first column of every run table.
srr_info = []
for i in sra_files:
    i_df = pd.read_csv(i)
    srr_info += i_df.iloc[:,0].values.tolist()

dump_script = "0_0_fastq-dump.sh"
dump_dir = '/gpfs/group/pipkin/hdiao/SARS_Sc_references/fastq'

# Number of fastq-dump jobs started before a "wait"; equals the ppn requested below.
n_parallel = 16

script_lines = [
    "#!/bin/bash",
    "#PBS -l nodes=1:ppn=16",
    "#PBS -l mem=32gb",
    "",
    "cd %s" % dump_dir,
    "module load sra-tools",
    "",
]
for n, srr in enumerate(srr_info, start=1):
    if n % n_parallel != 0:
        script_lines.append("fastq-dump -I --split-files %s &" % srr)
    else:
        # Last job of a batch runs in the foreground, then wait for the rest.
        script_lines.append("fastq-dump -I --split-files %s" % srr)
        script_lines.append("wait")
# A final partial batch consists only of background jobs; without this "wait"
# the script would exit while those jobs are still running.
if len(srr_info) % n_parallel != 0:
    script_lines.append("wait")

# Written as plain text with LF endings: csv.writer terminates rows with "\r\n",
# which puts a carriage return at the end of every shell line.
with open(dump_script, "w", newline="\n") as fout:
    fout.write("\n".join(script_lines) + "\n")

In [31]:
###----- Redo
# Same as the first trial, but for the runs listed in the *SraRunTable_redo.csv
# files; output goes to a separate script and a separate FASTQ directory.
wkdir = base_dir + '/z_References/codes_hpc'
os.chdir(wkdir)

input_dir = base_dir + '/z_References/0_Analysis_from_GEO'
# glob returns matches in arbitrary order; sort so the generated script is reproducible.
sra_files = sorted(glob.glob("%s/*SraRunTable_redo.csv"%input_dir))

# Run accessions are taken from the first column of every redo table.
srr_info = []
for i in sra_files:
    i_df = pd.read_csv(i)
    srr_info += i_df.iloc[:,0].values.tolist()

dump_script = "0_0_fastq-dump.redo.sh"
dump_dir = '/gpfs/group/pipkin/hdiao/SARS_Sc_references/fastq_redo'

# Number of fastq-dump jobs started before a "wait"; equals the ppn requested below.
n_parallel = 16

script_lines = [
    "#!/bin/bash",
    "#PBS -l nodes=1:ppn=16",
    "#PBS -l mem=32gb",
    "",
    "cd %s" % dump_dir,
    "module load sra-tools",
    "",
]
for n, srr in enumerate(srr_info, start=1):
    if n % n_parallel != 0:
        script_lines.append("fastq-dump -I --split-files %s &" % srr)
    else:
        # Last job of a batch runs in the foreground, then wait for the rest.
        script_lines.append("fastq-dump -I --split-files %s" % srr)
        script_lines.append("wait")
# A final partial batch consists only of background jobs; without this "wait"
# the script would exit while those jobs are still running.
if len(srr_info) % n_parallel != 0:
    script_lines.append("wait")

# Written as plain text with LF endings: csv.writer terminates rows with "\r\n",
# which puts a carriage return at the end of every shell line.
with open(dump_script, "w", newline="\n") as fout:
    fout.write("\n".join(script_lines) + "\n")

## 1. Salmon quant

In [34]:
###----- First trial
# Write one PBS script per dataset (0_1_salmon_<dataset>.sh) that runs
# `salmon quant` on every run listed in that dataset's SraRunTable.
wkdir = base_dir + '/z_References/codes_hpc'
os.chdir(wkdir)

input_dir = base_dir + '/z_References/0_Analysis_from_GEO'

info_file = base_dir + '/z_References/codes_local/GEO_dataset_info.csv'
info_df = pd.read_csv(info_file).set_index('Dataset')

# Cluster-side paths; identical for every dataset, so defined once outside the loop.
hpc_wkdir = "/gpfs/group/pipkin/hdiao/SARS_Sc_references/salmon"
hpc_inputdir = "/gpfs/group/pipkin/hdiao/SARS_Sc_references/fastq"
hpc_hs_idx = "/gpfs/group/pipkin/hdiao/ref_resources/hs/release100/GRCh38.salmon.index"
hpc_mm_idx = "/gpfs/group/pipkin/hdiao/ref_resources/mm/release100/GRCm38.salmon.index"
salmon_idx_by_species = {"Human": hpc_hs_idx, "Mouse": hpc_mm_idx}

for ds in info_df.index.values:
    ds_out = "0_1_salmon_%s.sh"%ds

    # Read the run table before opening the output file, so a missing or
    # unreadable table does not leave an empty script behind.
    ds_run = input_dir + "/" + ds + '_SraRunTable.csv'
    ds_run_df = pd.read_csv(ds_run)
    runs = ds_run_df.iloc[:,0].values

    # The info table is read by position: column 0 holds "Single"/"Paired",
    # column 1 holds "Human"/"Mouse". .iloc is used because integer keys on a
    # label-indexed Series are deprecated as positional lookups.
    ds_info = info_df.loc[ds]
    ds_layout = ds_info.iloc[0]
    ds_species = ds_info.iloc[1]

    # Fail loudly instead of writing a script with no index or no commands.
    if ds_species not in salmon_idx_by_species:
        raise ValueError("Dataset %s: unsupported species %r (expected Human or Mouse)" % (ds, ds_species))
    if ds_layout not in ("Single", "Paired"):
        raise ValueError("Dataset %s: unsupported layout %r (expected Single or Paired)" % (ds, ds_layout))

    script_lines = [
        "#!/bin/bash",
        "#PBS -l nodes=1:ppn=12",
        "#PBS -l mem=32gb",
        "",
        "###----- Load module",
        "module load salmon",
        "",
        "###----- Working directory",
        "cd %s"%hpc_wkdir,
        # -p: do not fail when the dataset directory already exists
        "mkdir -p %s"%ds,
        "cd %s" %ds,
        "",
        "###----- Reference",
        "salmon_index=%s"%salmon_idx_by_species[ds_species],
        "",
        "# Run",
    ]
    for run in runs:
        run_file = hpc_inputdir + "/" + run
        if ds_layout == "Single":
            script_lines.append("salmon quant -i $salmon_index -l A -r %s_1.fastq  -p 12 --validateMappings -o %s" %(run_file, run))
        else:
            script_lines.append("salmon quant -i $salmon_index -l A -1 %s_1.fastq -2 %s_2.fastq -p 12 --validateMappings -o %s" %(run_file, run_file, run))

    # Written as plain text with LF endings: csv.writer terminates rows with
    # "\r\n", which puts a carriage return at the end of every shell line.
    with open(ds_out, "w", newline="\n") as fout:
        fout.write("\n".join(script_lines) + "\n")

In [38]:
###----- Redo
# Write one PBS script per dataset (0_1_salmon_<dataset>.redo.sh) for the runs
# listed in <dataset>_SraRunTable_redo.csv. Datasets without a redo table are skipped.
wkdir = base_dir + '/z_References/codes_hpc'
os.chdir(wkdir)

input_dir = base_dir + '/z_References/0_Analysis_from_GEO'

info_file = base_dir + '/z_References/codes_local/GEO_dataset_info.csv'
info_df = pd.read_csv(info_file).set_index('Dataset')

# Cluster-side paths; identical for every dataset, so defined once outside the loop.
hpc_wkdir = "/gpfs/group/pipkin/hdiao/SARS_Sc_references/salmon"
hpc_inputdir = "/gpfs/group/pipkin/hdiao/SARS_Sc_references/fastq_redo"
hpc_hs_idx = "/gpfs/group/pipkin/hdiao/ref_resources/hs/release100/GRCh38.salmon.index"
hpc_mm_idx = "/gpfs/group/pipkin/hdiao/ref_resources/mm/release100/GRCm38.salmon.index"
salmon_idx_by_species = {"Human": hpc_hs_idx, "Mouse": hpc_mm_idx}

for ds in info_df.index.values:
    ds_out = "0_1_salmon_%s.redo.sh"%ds
    ds_run = input_dir + "/" + ds + '_SraRunTable_redo.csv'

    # Nothing to redo for this dataset.
    if not os.path.exists(ds_run):
        continue

    # Read the run table before opening the output file, so an unreadable
    # table does not leave an empty script behind.
    ds_run_df = pd.read_csv(ds_run)
    runs = ds_run_df.iloc[:,0].values

    # The info table is read by position: column 0 holds "Single"/"Paired",
    # column 1 holds "Human"/"Mouse". .iloc is used because integer keys on a
    # label-indexed Series are deprecated as positional lookups.
    ds_info = info_df.loc[ds]
    ds_layout = ds_info.iloc[0]
    ds_species = ds_info.iloc[1]

    # Fail loudly instead of writing a script with no index or no commands.
    if ds_species not in salmon_idx_by_species:
        raise ValueError("Dataset %s: unsupported species %r (expected Human or Mouse)" % (ds, ds_species))
    if ds_layout not in ("Single", "Paired"):
        raise ValueError("Dataset %s: unsupported layout %r (expected Single or Paired)" % (ds, ds_layout))

    script_lines = [
        "#!/bin/bash",
        "#PBS -l nodes=1:ppn=12",
        "#PBS -l mem=32gb",
        "",
        "###----- Load module",
        "module load salmon",
        "",
        "###----- Working directory",
        "cd %s"%hpc_wkdir,
        # -p: the dataset directory normally exists already from the first
        # pass; plain mkdir would report an error here.
        "mkdir -p %s"%ds,
        "cd %s" %ds,
        "",
        "###----- Reference",
        "salmon_index=%s"%salmon_idx_by_species[ds_species],
        "",
        "# Run",
    ]
    for run in runs:
        run_file = hpc_inputdir + "/" + run
        if ds_layout == "Single":
            script_lines.append("salmon quant -i $salmon_index -l A -r %s_1.fastq  -p 12 --validateMappings -o %s" %(run_file, run))
        else:
            script_lines.append("salmon quant -i $salmon_index -l A -1 %s_1.fastq -2 %s_2.fastq -p 12 --validateMappings -o %s" %(run_file, run_file, run))

    # Written as plain text with LF endings: csv.writer terminates rows with
    # "\r\n", which puts a carriage return at the end of every shell line.
    with open(ds_out, "w", newline="\n") as fout:
        fout.write("\n".join(script_lines) + "\n")

## 2. Check salmon output

In [43]:
# For every dataset, compare the runs in its SraRunTable against the run
# folders under 1_salmon/<dataset>/ that contain a quant.sf file. Runs without
# one are printed and written to <dataset>_SraRunTable_redo2.csv.
# NOTE: relies on `info_df` and `input_dir` defined by the salmon cells above.
salmon_dir = base_dir + '/z_References/1_salmon'

redo_n = 0
for ds in info_df.index.values:
    ds_run = input_dir + "/" + ds + '_SraRunTable.csv'
    ds_run_df = pd.read_csv(ds_run)
    runs = ds_run_df.iloc[:,0].values

    # The pattern has no "**", so a plain (non-recursive) glob is all that is needed.
    ds_salmon_files = glob.glob("%s/%s/*/quant.sf"%(salmon_dir, ds))
    # The run name is the folder that directly contains quant.sf.
    ds_salmon_files_runs = {os.path.basename(os.path.dirname(x)) for x in ds_salmon_files}
    # Keep run-table order (de-duplicated) so the redo table is reproducible;
    # a set difference would give an arbitrary order.
    incomplete_runs = [r for r in dict.fromkeys(runs) if r not in ds_salmon_files_runs]

    if len(incomplete_runs) >= 1:
        redo_n += 1
        print(incomplete_runs)
        ds_run_df_redo = ds_run_df.set_index(ds_run_df.columns[0]).loc[incomplete_runs]
        ds_run_redo = input_dir + "/" + ds + '_SraRunTable_redo2.csv'
        ds_run_df_redo.to_csv(ds_run_redo)

print("Incomplete number: %s"%redo_n)

Incomplete number: 0
