In [1]:
%%bash
# Record when and where this notebook was run: current date, OS release,
# host name, CPU details and memory.
divider="------------"

printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "${divider}"

# Operating system release information
lsb_release -a
printf '\n%s\n' "${divider}"

# Host name is printed on the line after its label
printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "${divider}"

# CPU details
printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "${divider}"

# Memory usage
printf '%s\n\n' "Memory Specs"
free -mh

TODAY'S DATE:
Wed Nov 28 12:01:27 PST 2018
------------

Distributor ID:	Ubuntu
Description:	Ubuntu 16.04.5 LTS
Release:	16.04
Codename:	xenial

------------
HOSTNAME: 
swoose

------------
Computer Specs:

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                24
On-line CPU(s) list:   0-23
Thread(s) per core:    2
Core(s) per socket:    6
Socket(s):             2
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 44
Model name:            Intel(R) Xeon(R) CPU           X5670  @ 2.93GHz
Stepping:              2
CPU MHz:               2925.866
BogoMIPS:              5851.93
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              12288K
NUMA node0 CPU(s):     0-23
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr

No LSB modules are available.


### Set variables

In [None]:
# Set data directories
# These %env magics set environment variables that the later %%bash cells
# read (e.g. ${out_dir}, ${fasta}, ${primer3}).
# data_dir: directory that contains the genes FastA below
%env data_dir=/home/sam/data/P_generosa
# fasta: genes FastA; written by the download cell, read by the extraction cell
%env fasta=/home/sam/data/P_generosa/Panopea-generosa-vv0.74.a4.5d9637f372b5d-publish.genes.fna
# out_dir: directory created by the "Make directories" cell; the extraction
# cell changes into it before writing its output files
%env out_dir=/home/sam/analyses/20200730_pgen_primer_design

# Needed for primer3-2.4.0
# Written into the Primer3 parameters file as PRIMER_THERMODYNAMIC_PARAMETERS_PATH.
# NOTE(review): this path is under /home/sam/software/ while the primer3 binary
# below is under /home/sam/programs/ - confirm which location is correct.
%env thermo_params_dir=/home/sam/software/primer3-2.4.0/src/primer3_config/

# Programs
# Paths to the executables invoked by the extraction/primer design cell
%env primer3=/home/sam/programs/primer3-2.4.0/src/primer3_core
%env primersearch=/home/sam/programs/EMBOSS-6.6.0/emboss/primersearch

#### Make directories

In [2]:
%%bash
# Create the output directory, including any missing parent directories.
# Succeeds without error if the directory already exists.
mkdir -p "${out_dir}"

### Download the P. generosa genes FastA file from the OSF repo (https://osf.io/ct623/)

In [5]:
%%bash
# Download the genes FastA and save it to the path held in ${fasta}.
# --quiet suppresses wget's own messages, so a failed download is reported
# explicitly here instead of passing unnoticed.
# ${fasta} is quoted so the path is always passed as a single argument.
if ! wget --quiet "https://files.osf.io/v1/resources/yem8n/providers/osfstorage/5db35d9abc32f4000e0b70c2?action=download&direct&version=1" \
--output-document "${fasta}"
then
    echo "ERROR: download to '${fasta}' failed" >&2
    exit 1
fi

# List the current working directory.
# NOTE(review): this lists the notebook's working directory, not the directory
# that holds ${fasta} - confirm this is the listing that was intended.
ls -lh

-------------------
total 271M
-rw-r--r-- 1 sam users 271M Nov 23 15:42 20180827_trinity_geoduck.fasta.transdecoder.cds
drwxrwxr-x 2 sam sam   4.0K Nov 28 12:04 transdecoder_fasta_splits


### Extract target sequences from FastA

In [6]:
%%bash

# Extract each target gene from the genes FastA into its own file, design
# primers for it with Primer3, then search the full genes FastA for matches
# to the first primer pair with EMBOSS primersearch.
#
# Environment variables read (set in the "Set variables" cell):
#   out_dir            working directory for all output files
#   fasta              genes FastA to extract from and to search against
#   primer3            path to the primer3_core executable
#   primersearch       path to the EMBOSS primersearch executable
#   thermo_params_dir  written to PRIMER_THERMODYNAMIC_PARAMETERS_PATH
# Also requires a `faidx` command on the PATH (presumably pyfaidx - confirm).

timestamp=$(date +"%Y%m%d")

# Stop here if the output directory is unavailable; otherwise every file
# below would be written into whatever the current directory happens to be.
cd "${out_dir}" || { echo "ERROR: cannot cd to '${out_dir}'" >&2; exit 1; }

# Associative array to associate gene names with sequence ids
# Requires >= Bash 4.0
declare -A seqid_array

# Populate associative array [gene_abbreviation]=seqid
seqid_array=(
[TIF3s6b]=PGEN_.00g000750-vv0.74.a
[TIF3s12]=PGEN_.00g025890-vv0.74.a
[APLP]=PGEN_.00g070040-vv0.74.a
[TIF3s7]=PGEN_.00g079690-vv0.74.a
[TIF3s5]=PGEN_.00g082590-vv0.74.a
[NFIP1]=PGEN_.00g088260-vv0.74.a
[GSK3B]=PGEN_.00g114060-vv0.74.a
[TIF3s8-1]=PGEN_.00g132030-vv0.74.a
[TIF3s8-2]=PGEN_.00g132040-vv0.74.a
[FEN1]=PGEN_.00g188130-vv0.74.a
[ECHD3]=PGEN_.00g194630-vv0.74.a
[GLYG]=PGEN_.00g224740-vv0.74.a
[TIF3s10]=PGEN_.00g245080-vv0.74.a
[SPTN1]=PGEN_.00g280110-vv0.74.a
[RPL5]=PGEN_.00g287540-vv0.74.a
[TIF3s4a]=PGEN_.00g288180-vv0.74.a
[NSF]=PGEN_.00g338640-vv0.74.a
)

# Individual FastAs array
fasta_array=()

# Extract sequences to individual FastA files
for gene_name in "${!seqid_array[@]}"
do
    # Output file name: <seqid>_<gene_name>.fna
    out_file="${seqid_array[$gene_name]}_${gene_name}.fna"

    # Run faidx; skip this gene if extraction fails so that no primer
    # design is attempted on a missing or partial file
    if ! faidx "${fasta}" "${seqid_array[$gene_name]}" \
    --out "${out_file}"
    then
        echo "WARNING: faidx failed for ${gene_name} (${seqid_array[$gene_name]}); skipping" >&2
        continue
    fi

    # Add FastA to array (quoted so the name stays a single element)
    fasta_array+=("${out_file}")

    ## Check output
    # Count number of entries in output FastA (should be = 1)
    # -H displays filename - is compatible with OSX
    grep --count -H ">" "${out_file}"

    # Check each FastA header
    echo ""
    echo "${out_file}: $(head -n1 "${out_file}")"

done

# Run Primer3
for fna in "${fasta_array[@]}"
do

    # Store sequence only from desired FastA.
    # Print all lines after the first line and then delete newlines
    # because sequence needs to be on single line for Primer3 params file
    sequence=$(tail -n +2 "${fna}" | tr -d '\n')

    # Remove only the trailing ".fna" to get the sequence ID.
    # The file names contain earlier dots (e.g. "PGEN_.00g..."), so stripping
    # from the first dot (%%.*) would reduce every ID to "PGEN_" and make all
    # per-gene output files overwrite one another.
    seq_id=${fna%.fna}

    params_out="${timestamp}_${seq_id}_primer3_params.txt"
    primer3_def_out="${timestamp}_${seq_id}_primers_default_format.txt"
    emboss_primers="${timestamp}_${seq_id}_emboss_primers.txt"
    primersearch_out="${timestamp}_${seq_id}_primersearch.txt"

    # Use heredoc to create Primer3 parameters file.
    # The heredoc body and its closing EOF must start in column 0: a
    # space-indented EOF does not terminate the heredoc, and indented
    # TAG=VALUE lines would be written to the params file with the indent.
    # NOTE(review): PRIMER_PICK_LEFT_PRIMER/PRIMER_PICK_RIGHT_PRIMER are set
    # to 3; Primer3 documents these as on/off flags - confirm that
    # PRIMER_NUM_RETURN was not the intended setting.
    cat << EOF > "${params_out}"
SEQUENCE_ID=${seq_id}
SEQUENCE_TEMPLATE=${sequence}
PRIMER_TASK=generic
PRIMER_PICK_LEFT_PRIMER=3
PRIMER_PICK_RIGHT_PRIMER=3
PRIMER_OPT_SIZE=18
PRIMER_MIN_SIZE=15
PRIMER_MAX_SIZE=21
PRIMER_MAX_NS_ACCEPTED=1
PRIMER_PRODUCT_SIZE_RANGE=75-150
P3_FILE_FLAG=1
PRIMER_EXPLAIN_FLAG=1
PRIMER_THERMODYNAMIC_PARAMETERS_PATH=${thermo_params_dir}
=
EOF

    # Run Primer3 with default output format
    "${primer3}" \
    --output="${primer3_def_out}" \
    "${params_out}"

    # Pull the first primer pair from the Primer3 output file. The params
    # file is only Primer3's input and never contains PRIMER_*_0_SEQUENCE tags.
    left_primer=$(sed -n 's/^PRIMER_LEFT_0_SEQUENCE=//p' "${primer3_def_out}")
    right_primer=$(sed -n 's/^PRIMER_RIGHT_0_SEQUENCE=//p' "${primer3_def_out}")

    # Without a complete primer pair there is nothing for primersearch to test
    if [[ -z "${left_primer}" || -z "${right_primer}" ]]; then
        echo "WARNING: no primer pair found in ${primer3_def_out}; skipping primersearch" >&2
        continue
    fi

    # Create tab-delimited primer file for primersearch:
    # name, left primer, right primer, ending with the required newline
    printf '%s\t%s\t%s\n' "${seq_id}" "${left_primer}" "${right_primer}" > "${emboss_primers}"

    # Run EMBOSS primersearch
    "${primersearch}" \
    -seqall "${fasta}" \
    -infile "${emboss_primers}" \
    -mismatchpercent 20 \
    -outfile "${primersearch_out}" \
    -auto

done

-------------------
NUMBER OF SEQUENCES IN ORIGINAL FASTA
210586
-------------------


-------------------
NUMBER OF INDIVIDUAL FASTA FILES
210586
-------------------



real	0m43.789s
user	0m36.340s
sys	0m6.656s
