The idea is to read in the .tsv file produced by InterProScan and pull out the annotation terms (member database hits, InterPro IDs, pathway IDs and GO terms) into separate files.
InterProScan was run with the following options:
$INTPRO/interproscan.sh -i DK_0911_v01_p_ctg.proteiniprs.fa -iprlookup -goterms -pa -d intrpro/
and produced the following file:
DK_0911_v01_p_ctg.proteiniprs.fa.tsv 


In [1]:
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from pybedtools import BedTool
import numpy as np
import pybedtools
import time
import matplotlib.pyplot as plt
import sys
import subprocess
import shutil

In [27]:
#define some variables up front. E.g. where to find files and what to parse out
#folder that holds the interproscan output
BASE_FOLDER = '/home/benjamin/genome_assembly/Warrior/annotation/DK_0911_v01_p_ctg/intrpro'
p_genome = 'DK_0911_v01_p_ctg'
#name of the filtered (final) assembly; used to find the protein fasta and to build OUT_PATH
p_genome_filtered = 'DK_0911_v04LT_p_ctg'
#interproscan result table inside BASE_FOLDER
INTERPRO_TSV_FILE = 'DK_0911_v01_p_ctg.proteiniprs.fa.tsv'
BASE_AA_PATH = '/home/benjamin/genome_assembly/Warrior/DK0911_v04'
#folder that is searched for the '.protein.fa' file of the final assembly
BASE_A_PATH = '/home/benjamin/genome_assembly/Warrior/genome_v04'
#all parsed tables are written below this folder
OUT_PATH = os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome_filtered, 'interpro', 'parsed')
#exist_ok avoids the separate exists() check (and the race between check and creation)
os.makedirs(OUT_PATH, exist_ok=True)

In [4]:
#pull out all proteins that are in the final assembly
#candidate fasta files: name contains the filtered genome name and ends with '.protein.fa'
_protein_fa_candidates = [x for x in os.listdir(BASE_A_PATH)
                          if p_genome_filtered in x and x.endswith('.protein.fa')]
if not _protein_fa_candidates:
    #fail with an explicit message instead of an opaque IndexError from [0]
    raise FileNotFoundError('No *%s*.protein.fa file found in %s' % (p_genome_filtered, BASE_A_PATH))
#as before, the first match is used
protein_fa_file = _protein_fa_candidates[0]
#ids of all fasta records; used further down to filter the interproscan table
p_protein_list = [protein.id
                  for protein in SeqIO.parse(os.path.join(BASE_A_PATH, protein_fa_file), 'fasta')]

In [5]:
#column names assigned to the interproscan .tsv when it is read in (one name per column, in order)
interpro_header = [
    'Protein_ID',
    'MD5',
    'Length',
    'DB',
    'DB_accession',
    'DB_description',
    'Start_position',
    'Stop_position',
    'e-value',
    'Match Status',
    'date',
    'InterPro_ID',
    'InterPro_description',
    'GO_terms',
    'Pathway_IDs',
]

In [6]:
#read the interproscan table; no header row is parsed, the column names come from interpro_header
interpro_df = pd.read_csv(
    os.path.join(BASE_FOLDER, INTERPRO_TSV_FILE),
    sep='\t',
    names=interpro_header,
    header=None,
)

In [7]:
#dimensions of the table as read from the .tsv: (rows, columns)
interpro_df.shape

(118923, 15)

In [9]:
#show the column names that were assigned from interpro_header
interpro_df.columns

Index(['Protein_ID', 'MD5', 'Length', 'DB', 'DB_accession', 'DB_description',
       'Start_position', 'Stop_position', 'e-value', 'Match Status', 'date',
       'InterPro_ID', 'InterPro_description', 'GO_terms', 'Pathway_IDs'],
      dtype='object')

In [12]:
#keep only rows whose protein is part of the final genome annotation and renumber the index
_in_final_annotation = interpro_df.Protein_ID.isin(p_protein_list)
interpro_df = interpro_df.loc[_in_final_annotation].reset_index(drop=True)

In [13]:
#dimensions after restricting the table to proteins of the final annotation
interpro_df.shape

(94593, 15)

In [14]:
#number of distinct Protein_ID values that are left in the table
len(interpro_df.Protein_ID.unique())

13538

In [15]:
#number of protein ids collected from the protein fasta, for comparison with the count above
len(p_protein_list)

15070

In [16]:
#replace missing GO terms, pathway IDs and InterPro IDs with the placeholder 0.
#The filled column is assigned back explicitly: fillna(inplace=True) on an attribute
#selection acts on an intermediate object and is not guaranteed to update interpro_df
#(it does not under pandas Copy-on-Write).
for _col in ('GO_terms', 'Pathway_IDs', 'InterPro_ID'):
    interpro_df[_col] = interpro_df[_col].fillna(0)

In [17]:
#group all rows by protein; reused below to collect the terms of each protein
interpro_by_protein = interpro_df.groupby(by='Protein_ID')

In [18]:
#write two tables per database found in the DB column:
#  <db>_terms.tab        -> Protein_ID, DB_accession
#  annotations.<db>.txt  -> Protein_ID, 'Dbxref', DB_accession
interpro_df['Dbxref'] = 'Dbxref'
DBs = interpro_df.DB.unique()
for db in DBs:
    #rows that belong to the current database
    _db_hits = interpro_df[interpro_df.DB == db]
    _terms_file = os.path.join(OUT_PATH, db+'_terms.tab')
    _annotation_file = os.path.join(OUT_PATH, 'annotations.' +db+'.txt')
    _db_hits.loc[:, ['Protein_ID', 'DB_accession']]\
    .to_csv(_terms_file, sep='\t', header=None, index=None)
    _db_hits.loc[:, ['Protein_ID', 'Dbxref', 'DB_accession']]\
    .to_csv(_annotation_file, sep='\t', header=None, index=None)
    

In [19]:
#write out the interpro domains
interpro_df['InterPro'] = 'InterPro'
#rows that carry an InterPro ID (0 is the placeholder for a missing ID)
_with_ipr = interpro_df[interpro_df.InterPro_ID != 0]
_ipr_terms_file = os.path.join(OUT_PATH, 'iprscan_terms.tab')
_with_ipr.loc[:, ['Protein_ID', 'InterPro_ID']]\
.to_csv(_ipr_terms_file, sep='\t', header=None, index=None)
#write out annotations
_ipr_annotation_file = os.path.join(OUT_PATH, 'annotations.iprscan.txt')
_with_ipr.loc[:, ['Protein_ID', 'InterPro', 'InterPro_ID']]\
.to_csv(_ipr_annotation_file, sep='\t', header=None, index=None)

In [24]:
#process the pathway files
#Keep only rows whose Pathway_IDs mention one of the pathway databases. A plain
#'KEGG' or 'MetaCyc' or 'Reactome' evaluates to just 'KEGG', hence the regex alternation.
#astype(str) turns non-string placeholders (0 or NaN) into text that cannot match.
PATHWAY_DB_REGEX = 'KEGG|MetaCyc|Reactome'
_has_pathway = interpro_df.Pathway_IDs.astype(str).str.contains(PATHWAY_DB_REGEX)

#group the filtered rows (not the unfiltered interpro_by_protein) so the filter takes effect
interpro_by_protein_KEGG = interpro_df[_has_pathway].groupby('Protein_ID')

#pull out all the KEGG terms and write them out as annotation files
#one set of raw Pathway_IDs strings per protein
interpro_by_protein_KEGG = interpro_by_protein_KEGG.Pathway_IDs.apply(set)

#remove everything without KEGG term attached
_has_terms = interpro_by_protein_KEGG.map(lambda terms: any(isinstance(x, str) for x in terms))
interpro_by_protein_KEGG = interpro_by_protein_KEGG[_has_terms.astype(bool)]

interpro_by_protein_KEGG_dict = dict(zip(interpro_by_protein_KEGG.index, interpro_by_protein_KEGG))

#build two parallel lists: one (protein, pathway ID) pair per position
ALL_KEGG_LIST = []
ALL_PROTEIN_INDEX_LIST = []
for key, terms in interpro_by_protein_KEGG_dict.items():
    #a field can hold several IDs separated by '|'; split them, drop duplicates and
    #sort so that the written files are identical between runs
    new_value = sorted({y for x in terms if isinstance(x, str) for y in x.split('|')})
    ALL_KEGG_LIST += new_value
    ALL_PROTEIN_INDEX_LIST += [key]*len(new_value)


KEGG_df = pd.DataFrame({'Protein_ID': ALL_PROTEIN_INDEX_LIST, 'DB_ID': ALL_KEGG_LIST},
                       columns=['Protein_ID', 'DB_ID'])
KEGG_df.to_csv(os.path.join(OUT_PATH, 'Pathway_terms_ipr.tab') , sep = '\t', header=False, index=False)
#write out annotations
KEGG_df['Transfer_ID'] = 'note'

KEGG_df.loc[:,['Protein_ID', 'Transfer_ID','DB_ID']]\
.to_csv(os.path.join(OUT_PATH, 'annotations.Pathway.txt') , sep = '\t', header=False, index=False)

In [22]:
#pull out all the GO terms and write them out as annotation files
#one set of raw GO_terms values per protein
interpro_by_protein_GO = interpro_by_protein.GO_terms.apply(set)

#remove everything without GO term attached
#(only string values are GO annotations; rows without one carry a non-string placeholder)
_has_go = interpro_by_protein_GO.map(lambda terms: any(isinstance(x, str) for x in terms))
interpro_by_protein_GO = interpro_by_protein_GO[_has_go.astype(bool)]

interpro_by_protein_GO_dict = dict(zip(interpro_by_protein_GO.index, interpro_by_protein_GO))

#build two parallel lists: one (protein, GO term) pair per position
ALL_GO_LIST = []
ALL_PROTEIN_INDEX_LIST = []
for key, terms in interpro_by_protein_GO_dict.items():
    #a field can hold several terms separated by '|'; split them, drop duplicates and
    #sort so that the written files are identical between runs
    new_value = sorted({y for x in terms if isinstance(x, str) for y in x.split('|')})
    ALL_GO_LIST += new_value
    ALL_PROTEIN_INDEX_LIST += [key]*len(new_value)


GO_df = pd.DataFrame({'Protein_ID': ALL_PROTEIN_INDEX_LIST, 'DB_ID': ALL_GO_LIST},
                     columns=['Protein_ID', 'DB_ID'])

GO_df.to_csv(os.path.join(OUT_PATH, 'GO_terms_ipr.tab') , sep = '\t', header=False, index=False)
#write out annotations
GO_df['Transfer_ID'] = 'note'

GO_df.loc[:,['Protein_ID', 'Transfer_ID','DB_ID']].to_csv(os.path.join(OUT_PATH, 'annotations.GO.txt') , sep = '\t', header=False, index=False)