In [3]:
import pandas as pd
import numpy as np
import subprocess

In [4]:
sra = pd.read_csv("../SraRunTable.txt", sep = ",")

Here, we use the single-end data only as it contains a variety of treatments (and it slims down the RNAseq dataset which is, otherwise, way too large to run...)

In [None]:
for i in sra["Run"][sra["LibraryLayout"] == "SINGLE"]:
    subprocess.run(["mv /storage/home/nsl5160/scratch/sanger_reannotate/sra/" + 
                    i + "_1.fastq /storage/home/nsl5160/scratch/sanger_reannotate/sra/" + 
                    i + "_single.fastq"], shell = True)

In [25]:
!sed '/^>/ s/ .*//' ../repeatmasked/GCA_964030605.1_jaAcrPala1.1_genomic.fa.masked \
    > ../repeatmasked/GCA_964030605.1_jaAcrPala1.1_genomic_shortened_headers.fa

In [36]:
!sed '/^>/ s/ .*//' ../repeatmasked/GCA_964030605.1_jaAcrPala1.1_genomic.fa.masked | sed '/^>/ s/\..*//' \
    > ../repeatmasked/GCA_964030605.1_jaAcrPala1.1_genomic_shortened_headers.fa

In [14]:
task = "trimgalore_single"
mem = "256"
cpus = "40"
inpath = "/storage/home/nsl5160/scratch/sanger_reannotate/sra"
outpath = "/storage/home/nsl5160/scratch/sanger_reannotate/trimmed"
runtable = "../SraRunTable.txt"
subprocess.run(["sbatch --mem=" + mem + "g --ntasks=" + 
     cpus + " ../" + task + ".sh " + 
     inpath + " " + outpath + " " + runtable], shell=True)

Submitted batch job 14028015


CompletedProcess(args=['sbatch --mem=256g --ntasks=40 ../trimgalore_single.sh /storage/home/nsl5160/scratch/sanger_reannotate/sra /storage/home/nsl5160/scratch/sanger_reannotate/trimmed ../SraRunTable.txt'], returncode=0)

In [1]:
!cat ../funannotate_train.sh ; sbatch ../funannotate_train.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=40
#SBATCH --mem=400GB
#SBATCH --time=48:00:00
#SBATCH --job-name=funan

source ~/.bashrc
conda activate funannotate_fixed
cd /storage/group/ibb3/default/sanger_reannotation/
mkdir -p ./funannotate_annotation
funannotate train \
	-i /storage/group/ibb3/default/sanger_reannotation/repeatmasked/GCA_964030605.1_jaAcrPala1.1_genomic_shortened_headers.fa \
	-o ./funannotate_annotation \
	--single /scratch/nsl5160/sanger_reannotate/trimmed/*_single_trimmed.fq \
	--memory 400G \
	--no_trimmomatic \
	--pasa_db mysql \
	--max_intronlen 100000 \
	--species "Acropora palmata" \
	--cpus 40


In [2]:
!cat ../funannotate_predict.sh ; sbatch ../funannotate_predict.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=35
#SBATCH --mem=400GB
#SBATCH --time=48:00:00
#SBATCH --job-name=funan

source ~/.bashrc
conda activate funannotate_aug34
cd /storage/group/ibb3/default/sanger_reannotation/

funannotate predict \
    -i /storage/group/ibb3/default/sanger_reannotation/repeatmasked/GCA_964030605.1_jaAcrPala1.1_genomic_shortened_headers.fa \
    -o ./funannotate_annotation \
    --species "Acropora palmata" \
    --repeats2evm \
    --repeat_filter none \
    --transcript_evidence /storage/group/ibb3/default/sanger_reannotation/Apal_transcriptome_Polato.fa \
    /storage/group/ibb3/default/sanger_reannotation/Osborne2023_Apalmata_transcriptome.fasta \
    /storage/group/ibb3/default/AP_AC_genome_seqs/dovetail_Apalm/HiC_improvement/Apalm_assembly_v3.1_200911.mrna.fasta \
    --organism other \
    --max_intronlen 100000 \
    --busco_db metazoa \
    --busco_seed_species amphimedon \
    --GENEMARK_PATH /storage/group/ibb3/default/san

In [8]:
!cat ../funannotate_update.sh ; sbatch ../funannotate_update.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=20
#SBATCH --mem=256GB
#SBATCH --time=48:00:00
#SBATCH --job-name=funan

source ~/.bashrc
conda activate funannotate_fixed
cd /storage/group/ibb3/default/sanger_reannotation/

funannotate update \
    -i ./funannotate_annotation \
    --cpus 20 \
    --memory 250G \
    --pasa_db mysql \
    --max_intronlen 100000 \
    --PASAHOME /storage/group/ibb3/default/.conda/envs/funannotate/opt/pasa-2.5.2/
Submitted batch job 15640769


In [None]:
%%bash
cd ~/scratch/
mkdir -p ./interproscan
cd ./interproscan
wget https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.68-100.0/interproscan-5.68-100.0-64-bit.tar.gz
wget https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.68-100.0/interproscan-5.68-100.0-64-bit.tar.gz.md5
md5sum -c interproscan-5.68-100.0-64-bit.tar.gz.md5
tar -pxvzf interproscan-5.68-100.0-*-bit.tar.gz
python3 setup.py -f interproscan.properties

In [4]:
%%bash
cd /storage/home/nsl5160/scratch/interproscan/interproscan-5.68-100.0
python3 setup.py -f interproscan.properties

In [5]:
!cat ../funannotate_iprscan.sh ; sbatch ../funannotate_iprscan.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=48
#SBATCH --mem=256GB
#SBATCH --time=48:00:00
#SBATCH --job-name=funan

source ~/.bashrc
conda activate funannotate
cd /storage/group/ibb3/default/sanger_reannotation

funannotate iprscan \
    -i ./funannotate_annotation \
    --method local \
    --out ./funannotate_annotation/Acropora_palmata.proteins.fa.xml \
    --iprscan_path /storage/home/nsl5160/scratch/interproscan/interproscan-5.68-100.0/interproscan.sh \
    --cpus 48
Submitted batch job 16000352


In [32]:
!cat ../proteinfer_apal.sh ; sbatch ../proteinfer_apal.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=64GB
#SBATCH --time=48:00:00
#SBATCH --job-name=proteinfer

source ~/.bashrc
conda activate proteinfer
cd /storage/group/ibb3/default/acer_cnat_dcyl_ssid_assembly/proteinfer
python proteinfer.py \
    --i /storage/group/ibb3/default/sanger_reannotation/funannotate_annotation/update_results/Acropora_palmata.proteins.fa \
    --o /storage/group/ibb3/default/sanger_reannotation/funannotate_annotation/update_results/Acropora_palmata.proteinfer.tsv \
Submitted batch job 16004684


In [57]:
!cat ../funannotate_annotate.sh ; sbatch ../funannotate_annotate.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=40
#SBATCH --mem=256GB
#SBATCH --time=12:00:00
#SBATCH --job-name=funan

source ~/.bashrc
conda activate funannotate
cd /storage/group/ibb3/default/sanger_reannotation

export EGGNOG_DATA_DIR=/storage/group/ibb3/default/acer_cnat_dcyl_ssid_assembly/eggnog-map/data

funannotate annotate \
    -i ./funannotate_annotation \
    --iprscan ./funannotate_annotation/Acropora_palmata.proteins.fa.xml \
    --cpus 40 \
    --busco_db metazoa
Submitted batch job 16025910


In [75]:
import pandas as pd
import numpy as np
import subprocess

In [80]:
proteinfer = pd.read_csv("../funannotate_annotation/update_results/Acropora_palmata.proteinfer.tsv", sep = "\t")
proteinfer.columns = ["TranscriptID", "predicted_label", "confidence", "description"]
annotations = pd.read_csv("../funannotate_annotation/annotate_results/Acropora_palmata.annotations.txt", sep = "\t")
annotations_adjusted = annotations.set_index("TranscriptID").copy()
goterms = proteinfer[proteinfer["predicted_label"].str.contains("^GO:")].reset_index(drop=True)
goterms["annotation_format"] = "GO_component: " + goterms["predicted_label"] + " - " + goterms["description"] + " [ProteInfer]"
goterms_dict = goterms.groupby("TranscriptID")["annotation_format"].apply(list).apply(";".join).to_dict()
goterms = pd.DataFrame(goterms_dict.items())
goterms.columns = ["TranscriptID", "GO Terms"]
goterms = goterms.set_index("TranscriptID")
pfam = proteinfer[proteinfer["predicted_label"].str.contains("^Pfam:PF")].reset_index(drop=True)
pfam["annotation_format"] = pfam["predicted_label"].str.split(":", expand = True)[1]
pfam_dict = pfam.groupby("TranscriptID")["annotation_format"].apply(list).apply(";".join).to_dict()
pfam = pd.DataFrame(pfam_dict.items())
pfam.columns = ["TranscriptID", "PFAM"]
pfam = pfam.set_index("TranscriptID")
annotations_adjusted = annotations_adjusted.fillna(goterms).fillna(pfam)
annotations_adjusted = annotations_adjusted.reset_index(drop=False)
annotations_adjusted = annotations_adjusted[['GeneID', 'TranscriptID', 'Feature', 'Contig', 'Start', 'Stop',
       'Strand', 'Name', 'Product', 'Alias/Synonyms', 'EC_number', 'BUSCO',
       'PFAM', 'InterPro', 'EggNog', 'COG', 'GO Terms', 'Secreted', 'Membrane',
       'Protease', 'CAZyme', 'Notes', 'gDNA', 'mRNA', 'CDS-transcript',
       'Translation']]
annotations_adjusted.to_csv("../funannotate_annotation/annotate_results/Acropora_palmata.annotations_proteinfer.txt", sep = "\t", index = False)