# Variables

In [1]:
### Managing work ###

NAME = 'ChickenPhages'

#!ws_allocate {NAME} 30
#!ws_list -a
#!ws_find {NAME}
#!ws_extend {NAME}
#!ws_release {NAME} #delete, remove content first


VARS = (
    f'WS=/pfs/10/work/ho_graaf20-{NAME}/\n'
    f'PS=/pfs/10/project/bw18d010/ho_graaf20/{NAME}/\n'
    f'DB=/pfs/10/project/bw18d010/ho_graaf20/Databases/\n'
    f'REFS=/pfs/10/project/bw18d010/ho_graaf20/Databases/Reference_Genomes\n'
    f'LOGS=/pfs/10/project/bw18d010/ho_graaf20/{NAME}/Logs\n'
    f'RR=/pfs/10/project/bw18d010/ho_graaf20/{NAME}/Raw_reads\n'
    f'CR=/pfs/10/project/bw18d010/ho_graaf20/{NAME}/Clean_reads\n'
)

CONDA = '$HOME/miniforge3/etc/profile.d/conda.sh'

###############################################################################
################################ DO NOT CHANGE ################################
###############################################################################

import pandas as pd
import numpy as np
import os

#extract bash variables for python
for var in VARS.strip().split('\n'):
    vars()[var.split('=')[0]]=var.split('=')[1]

!mkdir -p {CR} {LOGS} {RR} {DB}
!ln -sf -T {PS} ./'PS'
!ln -sf -T {WS} ./'WS'
!ln -sf -T {DB} ~/'DB'

    
#Binac function
source = f'#!/bin/bash\nsource {CONDA}'
COMM = 'export TMPDIR=$TMPDIR\n#Custom variables and commands\ncd $WS\n'
STRING = '\n'.join([source, VARS, COMM])

# binac function that will submit the job to server
def binac2(tasks=1, ppn=1, time='00:15:00', mem=2, name='myjob', logdir=LOGS,
          queue='compute', todo='echo "Command"', STRING=STRING, nodes=1):
    text = '\n'.join([STRING, todo])
    script = f'{logdir}/{name}.sh'
    !mkdir -p {logdir}
    with open(script, "w") as text_file:
        text_file.write(text)
    !chmod +x {script}
    run = f"""sbatch -t {time} -p {queue} --mem={mem}g -J {name} --output={logdir}/{name}.log \
              --error={logdir}/{name}.error --ntasks-per-node={tasks} -N {nodes} {script}"""
    !$run

cleaning = !rm -r $WS/*/*/.ipynb_c* $WS/*/.ipynb_c* $WS/.ipynb_c* $PS/*/*/.ipynb_c* $PS/*/.ipynb_c* $PS/.ipynb_c*

###############################################################################
######################## Here you can change something ########################
###############################################################################

meta = pd.read_csv("PS/MAGs_dataset_hosts.tsv", sep='\t', index_col=0)
hosts = meta.isolation_host.unique().tolist()
sras = meta.index.tolist()
b = 50
batches = {f'batch{i+1}-{i+b}': sras[i:i + b] for i in range(0, len(sras), b)}

raws = []
if os.path.exists(RR): 
    raws = [f.rsplit('_', 1)[0] for f in os.listdir(RR) if '.gz' in f]
    raws = sorted(list(set(raws)))
    print('Number of raw samples: ', len(raws))

cleans = []
if os.path.exists(CR): 
    cleans = [f.rsplit('_', 1)[0] for f in os.listdir(CR) if '.gz' in f]
    cleans = sorted(list(set(cleans)))
    print('Number of clean samples: ', len(cleans))

bad = []

Number of raw samples:  0
Number of clean samples:  242


In [2]:
len(meta)

242

In [33]:
for host in hosts:
    print(host, len(meta.loc[meta.isolation_host==host]))

cow 37
duck 42
human 45
pig 48
turkey 70


# Download samples by Hosts

In [None]:
#conda create -n SRA -c bioconda sra-tools pigz

In [44]:
#use only 1 thread or bad things may happen (Binac will freeze)!

THREADS = 2
logdir = f'{LOGS}/SRAs'

todo = '''
conda activate SRA

mkdir -p sra_temp

echo "Processing {BATCH}"
sras=({IDS})
for sra in ${{sras[@]}}; do

    #skip processed files
    if [ -f $RR/${{sra}}_1.fastq.gz ]; then
        echo "$sra already exists. Skipping."
        continue
    fi
    
    if [ -f $RR/${{sra}}.fastq.gz ]; then
        echo "$sra already exists. Skipping."
        continue
    fi
    
    if [ -f $CR/${{sra}}_1.fq.gz ]; then
        echo "$sra already exists. Skipping."
        continue
    fi
    
    if [ -f $CR/${{sra}}.fq.gz ]; then
        echo "$sra already exists. Skipping."
        continue
    fi
    
    #download files
    echo "Prefetch $sra"
    prefetch $sra -O sra_temp --verify yes --max-size 50G

    echo "Convert to fastq $sra"
    fasterq-dump sra_temp/$sra --split-files --outdir $RR --threads 1
    rm -r sra_temp/$sra

    echo "Gzip $sra"
    pigz $RR/${{sra}}_*.fastq
    pigz $RR/${{sra}}.fastq

    
done
echo "All done, check the outputs"
'''

for batch, ids in batches.items():
    if batch in ['batch1-50']:
        continue
        
    IDS = ' '.join([i for i in ids if i not in bad])
    binac2(tasks=THREADS, time='90:00:00', mem=8, name=f'{batch}_SRA', logdir=logdir, 
           todo=todo.format(BATCH=batch, IDS=IDS))


Submitted batch job 38549
Submitted batch job 38550
Submitted batch job 38551
Submitted batch job 38552


# Remove host DNA

In [None]:
#conda create -n QC -c bioconda -c conda-forge trim-galore bowtie2 -y

In [22]:
#create index

THREADS = 8
logdir = f'{LOGS}/Bowtie2'

todo = '''
conda activate QC
cd $REFS/{index}

#skip if index exists
if [ -f $REFS/{index}/*.bt2 ]
then
    echo "Index {index} already exists."
else
    pwd
    echo "Creating index {index}..."
    bowtie2-build ./*.fna.gz {index} --threads {THREADS}
fi
'''

REF_GENS = ['Chicken', 'Corn', 'Cow', 'Duck', 'Human', 'Pig', 'Turkey']

for index in REF_GENS:
    if index == REF_GENS[0]:
        continue
    binac2(tasks=THREADS, time='10:00:00', mem=16, name=f'{index}_BW2_ind', logdir=logdir, 
           todo=todo.format(index=index, THREADS=THREADS))

Submitted batch job 38620
Submitted batch job 38621
Submitted batch job 38622
Submitted batch job 38623
Submitted batch job 38624
Submitted batch job 38625


In [128]:
#Read QC and host DNA removal

THREADS = 8
logdir = f"{LOGS}/QC"

todo = '''
conda activate QC

echo "Processing {BATCH}"
sras=({IDS})
for name in ${{sras[@]}}; do
    ind_host={ind_host}
    ind_feed={ind_feed}
    preQC=$PS/FastQC/preQC/$name
    postQC=$PS/FastQC/postQC/$name
    outdir=$CR/$name
    
    echo
    echo "############################################################"
    echo "Processing $name "
    echo "Check if reads are paired or single" 

    #skip processed files
    if [ -d $outdir ]; then
        echo "$name already busy. Skipping."
        continue
    fi

    cd $WS
    if [ -f $RR/${{name}}_1.f*q.gz ]; then
        echo "$name are paired reads."
        RR1=$RR/${{name}}_1.f*q.gz
        RR2=$RR/${{name}}_2.f*q.gz
        CR1=$CR/${{name}}_1.fq.gz
        CR2=$CR/${{name}}_2.fq.gz
    
        #skip processed files
        if [ -f $CR1 ]; then
            echo "$name already cleaned. Skipping."
            continue
        fi
        
        rm -rf $outdir $preQC $postQC $CR1 $CR2
        mkdir -p $preQC $postQC $outdir 
    
        # Fastq report before QC
        echo "Report before QC..."
        fastqc -q -t {THREADS} -o $preQC -f fastq $RR1 $RR2
        
        echo
        echo "Running Trim-Galore for QC"
        echo   
        cd $outdir
        trim_galore --paired $RR1 $RR2 -j {THREADS}
    
        # Fastq report after QC
        echo "Report after QC..."
        cd
        cd $WS
        fastqc -q -t {THREADS} -o $postQC -f fastq $outdir/*_val_1.f*q.gz $outdir/*_val_2.f*q.gz
        
        echo
        echo "Removing Host and Feed DNA from reads"
        echo "Mapping reads to the host reference..."
        cd
        cd $REFS/$ind_host
        bowtie2 -p {THREADS} -x $ind_host -1 $outdir/*_val_1.f*q.gz -2 $outdir/*_val_2.f*q.gz \
        --un-conc-gz $outdir/no_host > $outdir/host.sam
        echo "Host DNA removed"
        
        echo "Mapping reads to the feed reference..."
        cd
        cd $REFS/$ind_feed
        bowtie2 -p {THREADS} -x $ind_feed -1 $outdir/no_host.1 -2 $outdir/no_host.2 \
        --un-conc-gz $outdir/no_feed > $outdir/feed.sam
        echo "Feed DNA removed"
        
        echo
        echo "Move and clean!"
        cd
        cd $WS
        mv $outdir/no_feed.1 $CR1
        mv $outdir/no_feed.2 $CR2
        rm -rf $outdir
    
        #remove processed files
        if [ -f $CR1 ]; then
            echo "Removing raw read since $name successfully cleaned."
            rm $RR1 $RR2
        fi
    elif [ -f $RR/${{name}}.f*q.gz ]; then
        echo "$name are single reads."
        RR1=$RR/${{name}}.f*q.gz
        CR1=$CR/${{name}}.fq.gz
    
        #skip processed files
        if [ -f $CR1 ]; then
            echo "$name already cleaned. Skipping."
            continue
        fi
        
        rm -rf $outdir $preQC $postQC $CR1 $CR2
        mkdir -p $preQC $postQC $outdir 
    
        # Fastq report before QC
        echo "Report before QC..."
        fastqc -q -t {THREADS} -o $preQC -f fastq $RR1
        
        echo
        echo "Running Trim-Galore for QC"
        echo   
        cd $outdir
        trim_galore $RR1 -j {THREADS}
    
        # Fastq report after QC
        echo "Report after QC..."
        cd
        cd $WS
        fastqc -q -t {THREADS} -o $postQC -f fastq $outdir/*_trimmed.f*q.gz
        
        echo
        echo "Removing Host and Feed DNA from reads"
        echo "Mapping reads to the host reference..."
        cd
        cd $REFS/$ind_host
        bowtie2 -p {THREADS} -x $ind_host -U $outdir/*_trimmed.f*q.gz \
        --un-gz $outdir/no_host > $outdir/host.sam
        echo "Host DNA removed"
        
        echo "Mapping reads to the feed reference..."
        cd
        cd $REFS/$ind_feed
        bowtie2 -p {THREADS} -x $ind_feed -U $outdir/no_host \
        --un-gz $outdir/no_feed > $outdir/feed.sam
        echo "Feed DNA removed"
        
        echo
        echo "Move and clean!"
        cd
        cd $WS
        mv $outdir/no_feed $CR1
        rm -rf $outdir
    
        #remove processed files
        if [ -f $CR1 ]; then
            echo "Removing raw read since $name successfully cleaned."
            rm $RR1
        fi
        
    else
        echo "There is something wrong with that sample (neither paired reads or single read were found)"
        continue
        
    fi
done

echo
echo "############################################################"
echo "###                   All done! Enjoy!                   ###"
echo "### Citations:                                           ###"
echo "### 1. Cutadapt: https://doi.org/10.14806/ej.17.1.200    ###"
echo "### 2. Bowtie: https://doi.org/10.1038/nmeth.1923        ###"
echo "### 3. FastQC (optional):                                ###"
echo "###    https://doi.org/10.1038/nmeth.1923                ###"
echo "### 4. TrimGalore (optional):                            ###"
echo "###    https://github.com/FelixKrueger/TrimGalore        ###"
echo "############################################################"
'''


#launch samples
for host in hosts:
    #if host not in ['turkey']:
    #    continue
        
    IDS = ' '.join([i for i in meta.loc[meta.isolation_host == host].index if i in raws])
            
    binac2(tasks=THREADS, time='100:00:00', mem=32, name=f'{host}_QC8', logdir=logdir, 
           todo=todo.format(BATCH=host, IDS=IDS, ind_host=host.capitalize(), ind_feed='Corn', THREADS=THREADS))


Submitted batch job 38719
Submitted batch job 38720
Submitted batch job 38721
Submitted batch job 38722
Submitted batch job 38723


# CoverM

In [50]:
#conda install -c bioconda -c conda-forge coverm 

Channels:
 - bioconda
 - conda-forge
Platform: linux-64
doneecting package metadata (repodata.json): - 
doneing environment: / 

## Package Plan ##

  environment location: /home/ho/ho_genetics/ho_graaf20/miniforge3

  added / updated specs:
    - coverm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    bwa-0.7.18                 |       h577a1d6_2         193 KB  bioconda
    coverm-0.7.0               |       hb4818e0_2         2.2 MB  bioconda
    dashing-1.0                |       h5b0a936_3         2.9 MB  bioconda
    fastani-1.34               |       hb66fcc3_5         133 KB  bioconda
    gsl-2.7                    |       he838d99_0         3.2 MB  conda-forge
    htslib-1.21                |       h566b1c6_1         3.0 MB  bioconda
    k8-1.2                     |       h6618dc6_3         7.2 MB  bioconda
    kernel-headers_linux-64-3.10.0|      he073ed8_18         921 KB  co

In [16]:
THREADS = 14
logdir = f'{LOGS}/CoverM'

todo = '''
outdir={OUTDIR}
mkdir -p $outdir

echo "Processing {BATCH}"
sras=({IDS})
for sra in ${{sras[@]}}; do
    out=$outdir/${{sra}}.txt
    CR1=$CR/${{sra}}_1.fq.gz
    CR2=$CR/${{sra}}_2.fq.gz

    #skip processed files
    if [ -f $out ]; then
        echo "$sra already exists. Skipping."
        continue
    else
        echo "Processing $sra"
        coverm contig --methods rpkm -t {THREADS} -c $CR1 $CR2 -r $PS/representative_viral.fasta -o $out \
            --min-read-percent-identity 90 --min-read-aligned-percent 75 --min-covered-fraction 75
    fi
done
echo "All done, check the outputs"
'''

for host in hosts:
    if host not in ['turkey']:
        continue
        
    IDS = ' '.join([i for i in meta.loc[meta.isolation_host == host].index]) # if i in cleans])
    binac2(tasks=THREADS, time='60:00:00', mem=32, name=f'{host}_CoverM', logdir=logdir, 
           todo=todo.format(BATCH=host, IDS=IDS, OUTDIR=f'{PS}/CoverM', THREADS=THREADS))


Submitted batch job 38736


In [181]:
#!rm -r {PS}/CoverM/CoverM_mapped.tsv

In [35]:
#collect mapping states

coverm_mapped = pd.DataFrame()

for log in os.listdir('PS/Logs/CoverM'):
    if log.startswith('.'):
        continue
    elif log.endswith('.error'):
        host = log.split('_')[0]
        with open(f'PS/Logs/CoverM/{log}') as error:
            maps = [m.split('/CoverM/')[-1] for m in error.read().split(' total (')[:-1] if 'Skipping' not in m]
        for mp in maps:
            sample = mp.split('.txt')[0].strip()
            mapped = mp.split('found ')[-1].split(' reads')[0]
            total = mp.split('out of ')[-1].strip()
            coverm_mapped.loc[sample, ['Host', 'Mapped', 'Total']] = host, int(mapped), int(total)

coverm_mapped = coverm_mapped.loc[coverm_mapped['Total'] != 0]
coverm_mapped['Perc_mapped'] = coverm_mapped.Mapped*100/coverm_mapped.Total
coverm_mapped['Perc_mapped'] = coverm_mapped['Perc_mapped'].apply(lambda x: round(x, 3))
coverm_mapped.sort_values(by=['Host', 'Perc_mapped'], ascending=[True, False], inplace=True)

coverm_mapped.to_csv('PS/CoverM_mapped.tsv', sep='\t')

In [32]:
len(coverm_mapped)

239

In [33]:
#collect rpkm values

dfs = []
for txt in os.listdir('PS/CoverM'):
    if txt.startswith('.') or os.path.getsize(f'PS/CoverM/{txt}') == 0:
        continue
    elif txt.endswith('.txt'):
        df = pd.read_csv(f'PS/CoverM/{txt}', sep='\t', index_col=0)
        df.columns = [c.split('/')[-1].split('_')[0] for c in df.columns]
        dfs.append(df)

coverm_rpkm = pd.concat(dfs, axis=1)        
coverm_rpkm.to_csv('PS/CoverM_RPKM.tsv', sep='\t')

In [28]:
len(os.listdir('PS/CoverM'))

243

# Status

In [30]:
!squeue -l -u ho_graaf20

Fri Jan 24 16:00:35 2025
             JOBID PARTITION     NAME     USER    STATE       TIME TIME_LIMI  NODES NODELIST(REASON)


In [140]:
!scancel 38632

In [None]:
#
!scontrol show job 38626

In [27]:
!scontrol show job 38626

JobId=38626 JobName=cow_QC
   UserId=ho_graaf20(900741) GroupId=ho_genetics(500055) MCS_label=N/A
   Priority=7000 Nice=0 Account=bw18d010 QOS=normal
   JobState=COMPLETED Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:01 TimeLimit=10-00:00:00 TimeMin=N/A
   SubmitTime=2025-01-20T13:30:52 EligibleTime=2025-01-20T13:30:52
   AccrueTime=2025-01-20T13:30:52
   StartTime=2025-01-20T13:30:53 EndTime=2025-01-20T13:30:54 Deadline=N/A
   PreemptEligibleTime=2025-01-20T13:30:53 PreemptTime=None
   SuspendTime=None SecsPreSuspend=0 LastSchedEval=2025-01-20T13:30:53 Scheduler=Main
   Partition=compute AllocNode:Sid=login02.cluster:100071
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=node2-092
   BatchHost=node2-092
   NumNodes=1 NumCPUs=8 NumTasks=8 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   ReqTRES=cpu=8,mem=32G,node=1,billing=520
   AllocTRES=cpu=8,mem=32G,node=1,billing=520
   Socks/Node=* NtasksPerN:B:S:C=8:0:*:* CoreSpec=*
   MinCPUsNod

In [11]:
time = 1 
!echo sbatch -t $time -p {queue} --mem={mem}g -J {name} --output={logdir} --error={logdir} \
     --ntasks-per-node={tasks} -N {nodes} {script}

sbatch -t -p {queue} --mem={mem}g -J {name} --output={logdir} --error={logdir} --ntasks-per-node={tasks} -N {nodes} {script}


In [5]:
!ln --help

Usage: ln [OPTION]... [-T] TARGET LINK_NAME
  or:  ln [OPTION]... TARGET
  or:  ln [OPTION]... TARGET... DIRECTORY
  or:  ln [OPTION]... -t DIRECTORY TARGET...
In the 1st form, create a link to TARGET with the name LINK_NAME.
In the 2nd form, create a link to TARGET in the current directory.
In the 3rd and 4th forms, create links to each TARGET in DIRECTORY.
Create hard links by default, symbolic links with --symbolic.
By default, each destination (name of new link) should not already exist.
When creating hard links, each TARGET must exist.  Symbolic links
can hold arbitrary text; if later resolved, a relative link is
interpreted in relation to its parent directory.

Mandatory arguments to long options are mandatory for short options too.
      --backup[=CONTROL]      make a backup of each existing destination file
  -b                          like --backup but does not accept an argument
  -d, -F, --directory         allow the superuser to attempt to hard link
                       