In [1]:
import pandas as pd
import numpy as np
import subprocess

In [22]:
# Print the SLURM batch script for the A. cervicornis ProteInfer run, then
# submit it with sbatch. Both commands go to the shell via the IPython `!`
# escape, so every execution of this cell submits a new job.
!cat ../proteinfer_acer.sh ; sbatch ../proteinfer_acer.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=64GB
#SBATCH --time=48:00:00
#SBATCH --job-name=proteinfer

source ~/.bashrc
conda activate proteinfer
cd /storage/group/ibb3/default/acer_cnat_dcyl_ssid_assembly/proteinfer
python proteinfer.py \
    --i ../funannotate_annotation/acer_lepwrap_annotation/annotate_results/Acropora_cervicornis.proteins.fa \
    --o ../funannotate_annotation/acer_lepwrap_annotation/annotate_results/Acropora_cervicornis.proteinfer.tsv
Submitted batch job 14350444


In [100]:
# Print the SLURM batch script for the A. palmata ProteInfer run, then
# submit it with sbatch. Both commands go to the shell via the IPython `!`
# escape, so every execution of this cell submits a new job.
!cat ../proteinfer_apal.sh ; sbatch ../proteinfer_apal.sh

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=64GB
#SBATCH --time=48:00:00
#SBATCH --job-name=proteinfer

source ~/.bashrc
conda activate proteinfer
cd /storage/group/ibb3/default/acer_cnat_dcyl_ssid_assembly/proteinfer
python proteinfer.py \
    --i ../../AP_AC_genome_seqs/dovetail_Apalm/HiC_improvement/v3.1_noasterisk.protein.fasta \
    --o ../../AP_AC_genome_seqs/dovetail_Apalm/HiC_improvement/Apalm_assembly_v3.1_200911.proteinfer.tsv
Submitted batch job 14517960


In [79]:
# Back-fill missing GO and Pfam annotations in each species' funannotate
# annotation table with ProteInfer predictions.
#
# For every path prefix `spp` this reads `<spp>.proteinfer.tsv` and
# `<spp>.annotations.txt`, collapses the ProteInfer rows to one ";"-joined
# string per transcript, fills only the cells of "GO Terms" / "PFAM" that are
# empty in the annotation table, and writes `<spp>.annotations_proteinfer.txt`.
for spp in ["../funannotate_annotation/acer_lepwrap_annotation/annotate_results/Acropora_cervicornis"]:
    proteinfer = pd.read_csv(spp + ".proteinfer.tsv", sep="\t")
    # Rename the four ProteInfer columns so the ID column matches the
    # annotation table's key.
    proteinfer.columns = ["TranscriptID", "predicted_label", "confidence", "description"]
    annotations = pd.read_csv(spp + ".annotations.txt", sep="\t")

    # --- GO predictions: one ";"-joined string per transcript -------------
    # na=False keeps the mask strictly boolean even if a label is missing.
    go_rows = proteinfer[proteinfer["predicted_label"].str.startswith("GO:", na=False)]
    # NOTE(review): every term is labelled "GO_component" regardless of its
    # GO namespace; the namespace is not available in this table, so the
    # prefix is kept as-is. Confirm this is acceptable downstream.
    go_rows = go_rows.assign(
        annotation_format="GO_component: "
        + go_rows["predicted_label"]
        + " - "
        + go_rows["description"]
        + " [ProteInfer]"
    )
    # Aggregating straight to a named one-column frame also works when there
    # are no GO rows at all (the old dict -> DataFrame round-trip raised a
    # length-mismatch error in that case).
    goterms = (
        go_rows.groupby("TranscriptID")["annotation_format"]
        .agg(";".join)
        .rename("GO Terms")
        .to_frame()
    )

    # --- Pfam predictions: keep the accession after the first ":" ---------
    pfam_rows = proteinfer[proteinfer["predicted_label"].str.startswith("Pfam:PF", na=False)]
    pfam_rows = pfam_rows.assign(
        # .str[1] (rather than expand=True ... [1]) is safe on an empty subset.
        annotation_format=pfam_rows["predicted_label"].str.split(":").str[1]
    )
    pfam = (
        pfam_rows.groupby("TranscriptID")["annotation_format"]
        .agg(";".join)
        .rename("PFAM")
        .to_frame()
    )

    # fillna with a DataFrame aligns on index (TranscriptID) and column name,
    # so only missing "GO Terms" / "PFAM" cells are filled; annotations that
    # funannotate already provided are left untouched.
    annotations_adjusted = (
        annotations.set_index("TranscriptID")
        .fillna(goterms)
        .fillna(pfam)
        .reset_index(drop=False)
    )
    # Restore the column order of the original annotation table
    # (reset_index puts TranscriptID first).
    annotations_adjusted = annotations_adjusted[['GeneID', 'TranscriptID', 'Feature', 'Contig', 'Start', 'Stop',
           'Strand', 'Name', 'Product', 'Alias/Synonyms', 'EC_number', 'BUSCO',
           'PFAM', 'InterPro', 'EggNog', 'COG', 'GO Terms', 'Secreted', 'Membrane',
           'Protease', 'CAZyme', 'Notes', 'gDNA', 'mRNA', 'CDS-transcript',
           'Translation']]
    annotations_adjusted.to_csv(spp + ".annotations_proteinfer.txt", sep="\t", index=False)

In [23]:
# Attach ProteInfer GO and Pfam predictions to the A. palmata v3.1 gene
# annotation table as two new columns ("GO Terms", "PFAM"), keyed on
# "Gene ID v3", and write the combined table to the resources directory.
proteinfer = pd.read_csv("../../AP_AC_genome_seqs/dovetail_Apalm/HiC_improvement/Apalm_assembly_v3.1_200911.proteinfer.tsv", sep="\t")
# Rename the four ProteInfer columns so the ID column matches the merge key.
proteinfer.columns = ["Gene ID v3", "predicted_label", "confidence", "description"]
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a warning
# raised on this read (presumably the mixed-type DtypeWarning - confirm).
annotations = pd.read_csv(
    "../../AP_AC_genome_seqs/dovetail_Apalm/HiC_improvement/APdov_v3.1_GeneAnnotation_combined.txt",
    sep="\t",
    low_memory=False,
)

# --- GO predictions: one ";"-joined string per gene -----------------------
# na=False keeps the mask strictly boolean even if a label is missing.
go_rows = proteinfer[proteinfer["predicted_label"].str.startswith("GO:", na=False)]
# NOTE(review): every term is labelled "GO_component" regardless of its GO
# namespace; the namespace is not available in this table, so the prefix is
# kept as-is. Confirm this is acceptable downstream.
go_rows = go_rows.assign(
    annotation_format="GO_component: "
    + go_rows["predicted_label"]
    + " - "
    + go_rows["description"]
    + " [ProteInfer]"
)
# Aggregating straight to a two-column frame also works when there are no GO
# rows at all (the old dict -> DataFrame round-trip raised a length-mismatch
# error in that case).
goterms = (
    go_rows.groupby("Gene ID v3")["annotation_format"]
    .agg(";".join)
    .rename("GO Terms")
    .reset_index()
)

# --- Pfam predictions: keep the accession after the first ":" -------------
pfam_rows = proteinfer[proteinfer["predicted_label"].str.startswith("Pfam:PF", na=False)]
pfam_rows = pfam_rows.assign(
    # .str[1] (rather than expand=True ... [1]) is safe on an empty subset.
    annotation_format=pfam_rows["predicted_label"].str.split(":").str[1]
)
pfam = (
    pfam_rows.groupby("Gene ID v3")["annotation_format"]
    .agg(";".join)
    .rename("PFAM")
    .reset_index()
)

# Left merges keep every annotation row; genes without a prediction get NaN.
# NOTE(review): if the annotation table already has a "GO Terms" or "PFAM"
# column, merge will emit _x/_y suffixed columns - verify against the input.
annotations_adjusted = annotations.merge(goterms, how="left", on="Gene ID v3")
annotations_adjusted = annotations_adjusted.merge(pfam, how="left", on="Gene ID v3")
annotations_adjusted.to_csv("../../apal_acer_genomes/resources/Apalmata.annotations.proteinfer.txt", sep="\t", index=False)

  annotations = pd.read_csv("../../AP_AC_genome_seqs/dovetail_Apalm/HiC_improvement/APdov_v3.1_GeneAnnotation_combined.txt", sep = "\t")
