Idea is to read in the annotation file of emapper.py and pull out the annotations.
This was run like:
python /home/benjamin/anaconda3/envs/funannotate/eggnog-mapper-0.99.2/emapper.py -i ../DK_0911_v01_p_ctg.proteiniprs.fa -d euk --output DK_0911_v01_p_ctg --cpu 12


In [2]:
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from pybedtools import BedTool
import numpy as np
import pybedtools
import time
import matplotlib.pyplot as plt
import sys
import subprocess
import shutil

In [61]:
p_genome = 'DK_0911_v01_p_ctg'
p_genome_filtered = 'DK_0911_v04LT_p_ctg'
BASE_FOLDER = '/home/benjamin/genome_assembly/Warrior/annotation/DK_0911_v01_p_ctg/eggnog/'
EGGNOG_BLAST_FILE = '%s.emapper.annotations' % (p_genome)
#EGGNOG_DIAMON_FILE = '%s_diamond.emapper.annotations' % (p_genome)
BASE_AA_PATH = '/home/benjamin/genome_assembly/Warrior/DK0911_v04'
BASE_A_PATH = '/home/benjamin/genome_assembly/Warrior/genome_v04'
OUT_PATH = os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome_filtered, 'eggnog-mapper', 'parsed')
EGGNOG_PATH = OUT_PATH
INTERPRO_PATH = os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome_filtered, 'interpro', 'parsed')
FUN_PATH = os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome_filtered, 'funannotate', 'parsed')
COMB_OUT_PATH = os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome_filtered, 'combined')
if not os.path.exists(OUT_PATH):
    os.makedirs(OUT_PATH)
if not os.path.exists(OUT_PATH_COMB):
    os.makedirs(OUT_PATH_COMB)
if not os.path.exists(OUT_PATH):
    os.makedirs(OUT_PATH)
if not os.path.exists(COMB_OUT_PATH):
    os.makedirs(COMB_OUT_PATH)
if not os.path.exists(FUN_PATH):
    print('Please run funnanotate notebook before running the last steps of this notebook.')
if not os.path.exists(INTERPRO_PATH):
    print('Please run the interpro parser notebook before running the last stpes of this notebook.')

In [9]:
#pull out all proteins that are in the final assembly
p_protein_list = []
protein_fa_file = [x for x in os.listdir(BASE_A_PATH) if p_genome_filtered in x and x.endswith('.protein.fa')][0]
for protein in SeqIO.parse(os.path.join(BASE_A_PATH, protein_fa_file) , 'fasta'):
    p_protein_list.append(protein.id)

In [13]:
eggnog_blast_header = 'query_name seed_eggNOG_ortholog seed_ortholog_evalue seed_ortholog_score predicted_gene_name \
GO_terms KEGG_pathways Annotation_tax_scope OGs bestOG|evalue|score COG cat eggNOG annot'.split(' ')

In [30]:
eggnog_blast_df = pd.read_csv(os.path.join(BASE_FOLDER, EGGNOG_BLAST_FILE), sep ='\t', header=None, names=eggnog_blast_header, skiprows=3)

In [31]:
eggnog_blast_df.shape

(10205, 14)

In [32]:
eggnog_blast_df.iloc[1,:]

query_name                                                   DK0911_18058
seed_eggNOG_ortholog                                        5297.EFP84543
seed_ortholog_evalue                                             1.7e-264
seed_ortholog_score                                                 872.3
predicted_gene_name                                                  AAT2
GO_terms                GO:0003674,GO:0003824,GO:0004069,GO:0005575,GO...
KEGG_pathways           map00250,map00270,map00330,map00350,map00360,m...
Annotation_tax_scope                                            fuNOG[21]
OGs                     0928H@basNOG,0PGES@fuNOG,12PGV@opiNOG,COG1448@...
bestOG|evalue|score                                KOG1411|1.7e-199|668.9
COG                                                                     E
cat                                            Aspartate aminotransferase
eggNOG                                                                NaN
annot                                 

In [33]:
eggnog_blast_df.fillna(0, inplace =True)

In [34]:
#here filter out all the proteins that are TE related and not in genome version v04 
eggnog_blast_df = eggnog_blast_df[eggnog_blast_df.query_name.isin(p_protein_list)].reset_index(drop=True)

In [35]:
eggnog_blast_df.shape

(7295, 14)

In [36]:
eggnog_blast_df.columns

Index(['query_name', 'seed_eggNOG_ortholog', 'seed_ortholog_evalue',
       'seed_ortholog_score', 'predicted_gene_name', 'GO_terms',
       'KEGG_pathways', 'Annotation_tax_scope', 'OGs', 'bestOG|evalue|score',
       'COG', 'cat', 'eggNOG', 'annot'],
      dtype='object')

In [37]:
#pick all annotation columns. One at a time and write them out as tab file or annotations files. The later can be used
#to annotate gff files using gag.py
DBs = [x for x in eggnog_blast_df.columns.tolist()[4:] if x not in ['GO_terms','KEGG_pathways', 'OGs','bestOG|evalue|score' ] ]
eggnog_blast_df['note'] = 'note'
for db in DBs:
    if len(eggnog_blast_df[eggnog_blast_df[db] !=0]) > 0:
        eggnog_blast_df[eggnog_blast_df[db] !=0].loc[:,['query_name',db]]\
        .to_csv(os.path.join(OUT_PATH, db+'_terms.tab'), sep='\t', header =None, index = None)
        eggnog_blast_df[eggnog_blast_df[db] !=0].loc[:,['query_name', 'note', db]]\
        .to_csv(os.path.join(OUT_PATH, 'annotations.' +db+'.txt'), sep='\t', header =None, index = None)
    
    

In [38]:
#now look after the remaining columns which have potentially multiple entries as values eg. GOterm, GOterm, GOterm
#those values need to be split into a list and this made into a new dataframe before writting it out again.
eggnog_blast_df = pd.read_csv(os.path.join(BASE_FOLDER, EGGNOG_BLAST_FILE), sep ='\t', header=None, names=eggnog_blast_header, skiprows=3)
eggnog_blast_df = eggnog_blast_df[eggnog_blast_df.query_name.isin(p_protein_list)].reset_index(drop=True)
#['GO_terms','KEGG_pathways', 'OGs']
for db in ['GO_terms','KEGG_pathways', 'OGs']:
    eggnog_blast_df[db] = eggnog_blast_df[db].str.split(',')
    all_term_list = []
    all_query_list = []
    eggnog_blast_df[db].fillna(0, inplace=True)
    for key in eggnog_blast_df[eggnog_blast_df[db] != 0 ].index:
        for term in eggnog_blast_df.loc[key,db]:
            all_term_list.append(term)
            all_query_list.append(eggnog_blast_df.loc[key,'query_name'])
    tmp_df = pd.DataFrame([all_query_list, all_term_list]).T
    tmp_df.rename(columns={0:'query_name', 1:'DB_ID'}, inplace=True)
    tmp_df.to_csv(os.path.join(OUT_PATH, db+'_terms.tab'), sep='\t', header =None, index = None)
    tmp_df['note'] = 'note'
    tmp_df.loc[:, ['query_name', 'note', 'DB_ID']].to_csv(os.path.join(OUT_PATH, 'annotations.' +db+'.txt'), sep='\t', header =None, index = None)

In [45]:
#combine KEGG annotations, and GO terms
egg_GO_file = [os.path.join(OUT_PATH, x) for x in os.listdir(OUT_PATH) if x == 'annotations.GO_terms.txt'][0] 
egg_KEGG_file = [os.path.join(OUT_PATH, x) for x in os.listdir(OUT_PATH) if x == 'annotations.KEGG_pathways.txt'][0]
interpro_GO_file = [os.path.join(INTERPRO_PATH, x)\
                      for x in os.listdir(INTERPRO_PATH) if x == 'annotations.GO.txt'][0]
interpro_KEGG_file = [os.path.join(INTERPRO_PATH, x)\
                      for x in os.listdir(INTERPRO_PATH) if x == 'annotations.Pathway.txt'][0]

In [46]:
GO_df = pd.concat([pd.read_csv(egg_GO_file, header=None, sep='\t'), pd.read_csv(interpro_GO_file, header=None, sep='\t')])

In [47]:
GO_df.drop_duplicates().to_csv(os.path.join(OUT_PATH, 'annotations.GO_combined.txt'), header=None, index =None, sep ='\t')

In [48]:
interpro_pathway_df = pd.read_csv(interpro_KEGG_file, header = None, sep='\t', names=['query_name','note', 'DB'])

In [49]:
interpro_pathway_df.head()

Unnamed: 0,query_name,note,DB
0,DK0911_03949,note,Reactome: R-HSA-168276
1,DK0911_03949,note,Reactome: R-HSA-3108214
2,DK0911_03949,note,Reactome: R-HSA-159236
3,DK0911_03949,note,Reactome: R-HSA-4615885
4,DK0911_03949,note,Reactome: R-HSA-170822


In [50]:
interpro_KEGG_df = interpro_pathway_df[interpro_pathway_df.DB.str.contains('KEGG')]

In [51]:
interpro_KEGG_df['new_KEGG'] = 'map'+interpro_KEGG_df.DB.str.extract(r'KEGG: ([0-9]+)+')

  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [53]:
KEGG_df = pd.concat([interpro_KEGG_df.loc[:,['query_name', 'note', 'new_KEGG']],\
                     pd.read_csv(egg_KEGG_file, header=None, sep='\t', names=['query_name','note', 'new_KEGG'])])

In [54]:
KEGG_df.drop_duplicates().to_csv(os.path.join(OUT_PATH, 'annotations.KEGG_combined.txt'), header=None, index =None, sep ='\t')

In [56]:
interpro_pathway_df[~interpro_pathway_df.DB.str.contains('KEGG')].to_csv(os.path.join(INTERPRO_PATH,\
                                                                         'annotations.Pathway_no_KEGG.txt'), header=None, index=None, sep='\t')

In [104]:
#filter out other annotations files including busco, dbCAN, merops, swissprot
funannotate_files = [os.path.join(FUN_PATH, x) for x in os.listdir(FUN_PATH) \
                    if x.startswith('annotations') and (x.endswith('busco.txt') or x.endswith('dbCAN.txt') or x.endswith('merops.txt') or x.endswith('swissprot.txt'))]
#now get all eggnog files to write out the annotations as 'gene ID;ID'
eggnog_files = [os.path.join(EGGNOG_PATH, x) for x in os.listdir(EGGNOG_PATH) if x.startswith('annotations')\
                and ('GO_terms' not in x and 'KEGG_pathways' not in x)]
interpro_files = [os.path.join(INTERPRO_PATH, x) for x in os.listdir(INTERPRO_PATH) if x.startswith('annotations')\
                 and ('.GO.' not in x and '.Pathway.' not in x)]

In [107]:
#do the writing out as 'gene \t ID;ID'
for file in (funannotate_files + eggnog_files + interpro_files):
    tmp_df = pd.read_csv(file, header = None, sep ='\t', names=['query_name', 'note', 'annotation'])
    tmp_dict = {}
    #now do a quick check if the annotations  contains some DATABASE:ID description
    #if that is the case remove the database name.
    if any(':' in x for x in tmp_df['annotation']):
        for name, group in tmp_df.groupby('query_name'):
            tmp_list = group['annotation'].tolist()
            for index, value in enumerate(tmp_list):
                if ':' in value:
                    tmp_list[index] = value[value.index(':')+1:].lstrip()
            tmp_dict[name] = ';'.join(tmp_list)
    else:
        for name, group in tmp_df.groupby('query_name'):
            tmp_dict[name] = ';'.join(group['annotation'].tolist())
    fn = file.split('/')[-1]
    fn = fn.replace('annotations', p_genome_filtered)
    fn = fn.replace('.txt', '.tablist')
    fn = os.path.join(COMB_OUT_PATH, fn)
    tmp_fn = open(fn, 'w')
    for key, value in tmp_dict.items():
        print('%s\t%s' %(key, value) ,file=tmp_fn)
    tmp_fn.close()