In [1]:
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from BCBio import GFF
from Bio import SeqIO

from IPython.display import display

In [2]:
# Raise pandas' display limits so long annotation strings are not truncated
# and tables of up to 500 rows are rendered in full.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 500)

In [3]:
# Pre-computed homologue candidates. scaf_to_homologues() indexes this by gene
# id and expects, per isolate, a list of hits carrying 'scaffold', 'start' and
# 'end' entries.
with open('data/candidate_homologues.json') as homologues_json:
    candidate_homologues = json.load(homologues_json)

# Pre-computed candidate scaffolds (structure not shown in this notebook
# section -- inspect the JSON before relying on a particular layout).
with open('data/candidate_scaffolds.json') as scaffolds_json:
    candidate_scaffolds = json.load(scaffolds_json)

In [4]:
# Parse one GFF3 annotation per isolate into {scaffold id: SeqRecord}.
GENOMES = {
    isolate: SeqIO.to_dict(GFF.parse(gff_path))
    for isolate, gff_path in [
        ('MNH120', 'data/augustus.hints.GeneWithUTR.withNCBI_Fungi.gff3'),
        ('B04', 'data/B04.genes.gff3'),
        ('USR5', 'data/I5V.PredictedPass.gff3'),
    ]
}

# Keep every scaffold's features ordered by start coordinate so that later
# lookups walk the genes in positional order.
for records in GENOMES.values():
    for record in records.values():
        record.features.sort(key=lambda feature: feature.location.start)

In [5]:
# Column names for the header-less annotation tables loaded below.
tpsi_cols = [
    'qid',
    'qlen',
    'search_method',
    'db_name',
    'sid',
    'qstart',
    'qend',
    'sstart',
    'send',
    'pid',
    'psim',
    'HSP_score',
    'bitscore',
    'description',
    'qframe',
    'qstrand',
    'slen',
    'evalue',
    'pvalue',
    ]
cazy_cols = ['seqid', 'subfamily', 'family_name', 'desc', 'notes']
location_cols = ['seqid', 'location', 'membrane']
panther_cols = ['seqid', 'family_id', 'family_name']
superfamilies_cols = ['seqid', 'category', 'short_catname',
                      'long_catname', 'id', 'name']
go_cols = ['seqid', 'goid', 'goname', 'godomain']
ips_cols = ['seqid', 'md5', 'length', 'analysis', 'accession',
            'description', 'start', 'end', 'evalue', 'status',
            'date', 'ipracc', 'iprdesc', 'goterms', 'pathterms']
pfr_cols = ['seqid', 'description']

# (analysis name, path template, column names). The list order is the order in
# which the tables are stored per isolate, and therefore the order in which
# get() reports them.
ANALYSIS_FILES = [
    ('cazy', 'data/{}.cazy_fams.tsv', cazy_cols),
    ('tpsi', 'data/{}.combined.TPSI.topHits', tpsi_cols),
    ('location', 'data/{}.location.tsv', location_cols),
    ('panther', 'data/{}.panther_fams.tsv', panther_cols),
    ('superfamilies', 'data/{}.superfamilies.tsv', superfamilies_cols),
    ('go', 'data/{}.goterms.tsv', go_cols),
    ('ips', 'data/{}.combined.tsv', ips_cols),
    ]

# Isolate name -> prefix used in that isolate's file names (USR5 files are
# stored under the 'I5V' prefix).
FILE_PREFIXES = {'MNH120': 'MNH120', 'B04': 'B04', 'USR5': 'I5V'}

# Free-text description tables do not follow the prefix scheme. Only MNH120
# and B04 have one; no USR5 description table is loaded.
PFR_FILES = {
    'MNH120': 'data/atg.Description.txt',
    'B04': 'data/B04.Description.txt',
    }

# ANALYSES[isolate][analysis] -> DataFrame of that annotation table.
ANALYSES = defaultdict(dict)
for isolate, prefix in FILE_PREFIXES.items():
    for analysis, template, columns in ANALYSIS_FILES:
        ANALYSES[isolate][analysis] = pd.read_table(
            template.format(prefix),
            names=columns
            )
    if isolate in PFR_FILES:
        ANALYSES[isolate]['pfr'] = pd.read_table(
            PFR_FILES[isolate],
            names=pfr_cols
            )

In [6]:
def get(seqid):
    """ Show every annotation row that mentions a sequence id.

    Walks all tables in ANALYSES (every isolate, every analysis) and, for
    each table with at least one matching row, prints the isolate and
    analysis names and displays the matching rows.

    Keyword arguments:
    seqid -- the sequence id to look up.

    returns:
    None; results are printed/displayed.
    """

    # Column holding the sequence id in each table; the 'tpsi' tables are
    # matched on their 'sid' column instead of 'seqid'.
    id_column = {
        'panther': 'seqid',
        'cazy': 'seqid',
        'tpsi': 'sid',
        'location': 'seqid',
        'superfamilies': 'seqid',
        'go': 'seqid',
        'ips': 'seqid',
        'pfr': 'seqid'
        }
    for isolate, analyses in ANALYSES.items():
        for analysis, table in analyses.items():
            hits = table[table[id_column[analysis]] == seqid]
            if len(hits) > 0:
                print(isolate, analysis)
                display(hits)

def getblast(seqid, isolate, thresh=1e-10):
    """ Collect the BLAST-style tabular hits for one query sequence.

    Streams the isolate's tab-separated hit table and keeps the rows whose
    query id equals `seqid` exactly and whose e-value is strictly below
    `thresh`.

    Keyword arguments:
    seqid -- the query sequence id to look up.
    isolate -- which isolate's table to read ('MNH120', 'B04' or 'USR5').
    thresh -- e-value cut-off; hits must be strictly below it.

    returns:
    A DataFrame with a subset of the columns (values are kept as the
    strings read from the file), or None after printing 'No matches' when
    nothing passes the filter.
    """

    blast_tables = {
        'MNH120': 'data/MNH120.swiss.combined.tsv',
        'B04': 'data/B04.swiss.combined.tsv',
        'USR5': 'data/I5V.swiss.combined.tsv',
        }
    column_names = (
        "qseqid qlen sallseqid sgi sacc saccver slen qstart "
        "qend sstart send qseq sseq evalue bitscore score "
        "length pident nident mismatch positive gapopen gaps "
        "ppos frames qframe sframe btop staxids sscinames "
        "scomnames sblastnames sskingdoms stitle salltitles "
        "sstrand qcovs qcovhsp"
        ).split(' ')
    reported_columns = [
        'qseqid', 'qlen', 'sacc', 'slen', 'qstart', 'qend',
        'sstart', 'send', 'evalue', 'bitscore', 'sscinames',
        'salltitles',
        ]

    hits = []
    with open(blast_tables[isolate], 'r') as handle:
        for raw in handle:
            # Cheap prefix test first, so most lines are never split.
            if not raw.startswith(seqid):
                continue
            record = dict(zip(column_names, raw.rstrip('\n').split('\t')))
            # The prefix test also lets through longer ids (e.g. 'g10' for
            # 'g1'), so confirm the exact query id.
            if record['qseqid'] != seqid:
                continue
            if float(record['evalue']) < thresh:
                hits.append(record)

    if not hits:
        print('No matches')
        return None
    return pd.DataFrame(hits)[reported_columns]

In [7]:
def subset_features(record, start, end):
    """ Select the features of a record that touch a coordinate window.

    SeqRecord slicing does not deal with features well, so the features
    are filtered by hand instead.

    Keyword arguments:
    record -- a SeqRecord object containing features.
    start -- the lower bound to include.
    end -- the upper bound to include.

    returns:
    A list of features, in the order they appear in record.features.
    """

    def touches_window(location):
        lower, upper = location.start, location.end
        return (
            (start <= lower < end) or       # begins inside the window
            (start < upper <= end) or       # finishes inside the window
            (lower < start and upper > end)  # spans the whole window
            )

    return [
        feature for feature in record.features
        if touches_window(feature.location)
        ]

In [28]:
def scaf_to_homologues(scaf):
    """ Tabulate candidate homologues for every gene on a scaffold.

    The isolate owning `scaf` is looked up in GENOMES. For each 'gene'
    feature on that scaffold (in start order) that has an entry in
    candidate_homologues, one row is built: the query isolate's own
    scaffold and gene id, plus, for every other isolate in GENOMES with at
    least one hit, the scaffold of the first hit and the id of the first
    gene feature overlapping that hit's start/end interval.

    Keyword arguments:
    scaf -- the scaffold id to tabulate; must exist in one of the genomes.

    returns:
    A DataFrame with '<isolate>_scaffold' and '<isolate>_id' columns, one
    row per query gene that has candidate homologues.

    raises:
    KeyError -- if `scaf` is not a scaffold of any genome in GENOMES.
    """

    # Find the isolate that owns the scaffold. There is deliberately no
    # early exit: should two genomes share a scaffold name, the last one
    # wins, exactly as before.
    qisolate = None
    for name, scaffolds in GENOMES.items():
        if scaf in scaffolds:
            qisolate = name
    if qisolate is None:
        # Previously this fell through to an UnboundLocalError on `qisolate`.
        raise KeyError('scaffold {!r} not found in any genome'.format(scaf))

    seq = GENOMES[qisolate][scaf]
    genes = [f for f in seq.features if f.type == 'gene']
    genes.sort(key=lambda f: f.location.start)

    table = list()
    for id_ in [gene.id for gene in genes]:
        try:
            homologues = candidate_homologues[id_]
        except KeyError:
            # No candidate homologues recorded for this gene.
            continue

        group = dict()
        for isolate, hlogs in homologues.items():
            if isolate not in GENOMES or len(hlogs) == 0:
                continue
            elif isolate == qisolate:
                # The query isolate contributes the query gene itself.
                group[qisolate + '_scaffold'] = scaf
                group[qisolate + '_id'] = id_
                continue

            # Only the first listed hit is considered for each isolate.
            hlog = hlogs[0]
            scaffold = hlog['scaffold']
            if scaffold not in GENOMES[isolate]:
                continue
            overlapping = subset_features(
                GENOMES[isolate][scaffold], hlog['start'], hlog['end']
                )
            gene_ids = [f.id for f in overlapping if f.type == 'gene']
            if len(gene_ids) == 0:
                continue
            group[isolate + '_scaffold'] = scaffold
            group[isolate + '_id'] = gene_ids[0]
        table.append(group)
    return pd.DataFrame(table)

# Region surrounding atg140

### scaffold 38

In [417]:
"""{
"MNH120 scaffold":, "MNH120 id":, "MNH120 function":, 
"B04 scaffold":, "B04 id":, "B04 function":, 
"USR5 scaffold":, "USR5 id":, "USR5 function":,
}"""
# Hand-curated gene-order table for MNH120 scaffold_38 up to atg140.
# Each dict is one output row; keys that are left out become missing values
# in the DataFrame, i.e. that isolate has no gene placed in that row.
table = [
    {
     "MNH120 scaffold": 'scaffold_38', "MNH120 id": 'atg136', "MNH120 function": "Unknown",
     "B04 scaffold": 'B04S72', "B04 id": 'B04S72.g5566', "B04 function": 'Unknown', 
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04813', "USR5 function": 'Unknown',
    },
    {
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04814', "USR5 function": 'Unknown',
    },
    {
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04815', "USR5 function": 'Unknown',
    },
    {
     "MNH120 scaffold": 'scaffold_38', "MNH120 id": 'atg137', "MNH120 function": "Membrane associated protein with unknown function",
     "B04 scaffold": 'B04S72', "B04 id": 'B04S72.g5565', "B04 function": 'Membrane associated protein with unknown function', 
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04816', "USR5 function": 'Membrane associated protein with unknown function',
    },
    {
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04817', "USR5 function": 'Unknown',
    },
    {
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04818', "USR5 function": 'Unknown',
    },
    {
     "MNH120 scaffold": 'scaffold_38', "MNH120 id": 'atg138', "MNH120 function": "DNA binding cell division control protein",
     "B04 scaffold": 'B04S72', "B04 id": 'B04S72.g5564', "B04 function": 'DNA binding cell division control protein', 
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04819', "USR5 function": 'DNA binding cell division control protein',
    },
    {
     "MNH120 scaffold": 'scaffold_38', "MNH120 id": 'atg139', "MNH120 function": "Putative V-type ATP synthase subunit I homologue",
     "B04 scaffold": 'B04S72', "B04 id": 'B04S72.g5563', "B04 function": 'Putative V-type ATP synthase subunit I homologue', 
     "USR5 scaffold": 'NODE_28888_length_496775_cov_15.218104', "USR5 id": 'NS.04820', "USR5 function": 'Unknown',
    },
    # atg140 row: the B04 and USR5 genes listed here sit on different
    # scaffolds (B04S196, NODE_35582...) from the rows above.
    {
     "MNH120 scaffold": 'scaffold_38', "MNH120 id": 'atg140', "MNH120 function": "Secreted protein with unknown function",
     "B04 scaffold": 'B04S196', "B04 id": 'B04S196.g9853', "B04 function": 'Secreted protein with unknown function', 
     "USR5 scaffold": 'NODE_35582_length_606476_cov_15.460932', "USR5 id": 'exon.CUFF.10985.1.77', "USR5 function": 'Secreted protein with unknown function',
    },
]
table = pd.DataFrame(table)
# Fix the column order explicitly: scaffold/id pairs per isolate first, then the functions.
table = table[["MNH120 scaffold", "MNH120 id", "USR5 scaffold", "USR5 id", "B04 scaffold", "B04 id", "MNH120 function", "USR5 function", "B04 function"]]
# Export the table as TSV and as a LaTeX longtable (missing values written as blanks).
table.to_csv('scaffold_38.function.tsv', sep='\t', index=False)
table.to_latex('scaffold_38.function.tex', index=False, na_rep='', longtable=True)
table

Unnamed: 0,MNH120 scaffold,MNH120 id,USR5 scaffold,USR5 id,B04 scaffold,B04 id,MNH120 function,USR5 function,B04 function
0,scaffold_38,atg136,NODE_28888_length_496775_cov_15.218104,NS.04813,B04S72,B04S72.g5566,Unknown,Unknown,Unknown
1,,,NODE_28888_length_496775_cov_15.218104,NS.04814,,,,Unknown,
2,,,NODE_28888_length_496775_cov_15.218104,NS.04815,,,,Unknown,
3,scaffold_38,atg137,NODE_28888_length_496775_cov_15.218104,NS.04816,B04S72,B04S72.g5565,Membrane associated protein with unknown function,Membrane associated protein with unknown function,Membrane associated protein with unknown function
4,,,NODE_28888_length_496775_cov_15.218104,NS.04817,,,,Unknown,
5,,,NODE_28888_length_496775_cov_15.218104,NS.04818,,,,Unknown,
6,scaffold_38,atg138,NODE_28888_length_496775_cov_15.218104,NS.04819,B04S72,B04S72.g5564,DNA binding cell division control protein,DNA binding cell division control protein,DNA binding cell division control protein
7,scaffold_38,atg139,NODE_28888_length_496775_cov_15.218104,NS.04820,B04S72,B04S72.g5563,Putative V-type ATP synthase subunit I homologue,Unknown,Putative V-type ATP synthase subunit I homologue
8,scaffold_38,atg140,NODE_35582_length_606476_cov_15.460932,exon.CUFF.10985.1.77,B04S196,B04S196.g9853,Secreted protein with unknown function,Secreted protein with unknown function,Secreted protein with unknown function


### scaffolds containing most probable atg140 homologue

In [416]:
"""{
"MNH120 scaffold":, "MNH120 id":, "MNH120 function":, 
"B04 scaffold":, "B04 id":, "B04 function":, 
"USR5 scaffold":, "USR5 id":, "USR5 function":,
}"""
table = [
    {
    "MNH120 scaffold": 'scaffold_997', "MNH120 id": 'atg10842', "MNH120 function": "Membrane associated PQ loop repeat protein",
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9843', "B04 function": "Membrane associated PQ loop repeat protein", 
    "USR5 scaffold": 'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06676', "USR5 function":"Membrane associated PQ loop repeat protein",
    },
    {
    "MNH120 scaffold":'atg10843', "MNH120 id":'scaffold_997', "MNH120 function":"Pyridine nucleotide-disulphide oxidoreductase", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9844', "B04 function":"Pyridine nucleotide-disulphide oxidoreductase",
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06675', "USR5 function":"Pyridine nucleotide-disulphide oxidoreductase",
    },
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10844', "MNH120 function":"Sodium-dependent phosphate transporter", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9845', "B04 function":"Sodium-dependent phosphate transporter", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06674', "USR5 function":"Sodium-dependent phosphate transporter",
    }, ## Possibly need to split into USR5 and MNH120 vs Bo4
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06673',
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06672', "USR5 function":"Transcription factor",
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06671',
    }, # Possible split with NS.06670
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10846', "MNH120 function":"NMT1/THI5 like protein, pyrimidine biosynthesis", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9846', ## Unknown
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06670', "USR5 function":'NMT1/THI5 like protein, pyrimidine biosynthesis',
    },
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10847', "MNH120 function":'DNA replication licensing factor MCM5', 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9847', "B04 function":"DNA replication licensing factor MCM5", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10994.1.84', "USR5 function":"DNA replication licensing factor MCM5",
    },
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10848', "MNH120 function":"Cysteine proteinase", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9848', "B04 function":"Cysteine proteinase", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06669', "USR5 function":"Cysteine proteinase",
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10998.1.83',
    },
    {
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9849',
    },
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10849', "MNH120 function":"Eukaryotic translation initiation factor 4E", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9850', "B04 function":"Eukaryotic translation initiation factor 4E",
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06668', "USR5 function":"Eukaryotic translation initiation factor 4E",
    },
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10850', "MNH120 function": "Protein phosphatase inhibitor", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9851', "B04 function":'Protein phosphatase inhibitor', 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.11014.2.81', "USR5 function":'Protein phosphatase inhibitor',
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06667',
    },
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10851', 
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06666', "USR5 function":"Aminopeptidase",
    }, # possible split with 'exon.CUFF.11014.2.80'
    {
    "MNH120 scaffold":'scaffold_997', "MNH120 id":'atg10852', "MNH120 function":"Aminopeptidase", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9852', "B04 function":"Aminopeptidase", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.11014.2.80', "USR5 function":"Aminopeptidase",
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06665',
    },
    {
    "MNH120 scaffold": 'scaffold_38', "MNH120 id": 'atg140', "MNH120 function": "Secreted protein with unknown function",
    "B04 scaffold": 'B04S196', "B04 id": 'B04S196.g9853', "B04 function": 'Secreted protein with unknown function', 
    "USR5 scaffold": 'NODE_35582_length_606476_cov_15.460932', "USR5 id": 'exon.CUFF.10985.1.77', "USR5 function": 'Secreted protein with unknown function',
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06664',
    },
    {
    "MNH120 scaffold":'scaffold_887', "MNH120 id":'atg7428', "MNH120 function":"Transmembrane ion transporter", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9854', "B04 function":"Transmembrane ion transporter", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06663', "USR5 function":"Transmembrane ion transporter",
    },
    {
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9855', 
    },
    {
    "MNH120 scaffold":'scaffold_887', "MNH120 id":'atg7429', "MNH120 function":"Methylenetetrahydrofolate reductase", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9856', "B04 function":"Methylenetetrahydrofolate reductase", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10988.1.76',
    },
    {
    "MNH120 scaffold":'scaffold_887', "MNH120 id":'atg7430', "MNH120 function":'Mannosyltransferase', 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10988.1.75', "USR5 function":'Mannosyltransferase',
    }, ## Split with over 'B04S196.g9856'
    {
    "MNH120 scaffold":'scaffold_887', "MNH120 id":'atg7431', "MNH120 function":"Gamma-tubulin protein", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9857', "B04 function":"Gamma-tubulin protein", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10983.1.74', "USR5 function":"Gamma-tubulin protein",
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06662',
    },
    {
    "MNH120 scaffold":'scaffold_887', "MNH120 id":'atg7432', 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10983.1.73',
    },
    {
    "MNH120 scaffold":'scaffold_244', "MNH120 id":'atg1389', 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9858', 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06661',
    },
    {
    "MNH120 scaffold":'scaffold_244', "MNH120 id":'atg1390', "MNH120 function":'Transcription factor', 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9859', "B04 function":'Transcription factor', 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10989.1.72', "USR5 function":'Transcription factor',
    },
    {
    "MNH120 scaffold":'scaffold_244', "MNH120 id":'atg1391',
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06660',
    },
    {
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.10986.1.71', "USR5 function": "DNA binding protein",
    },
    {
    "MNH120 scaffold":'scaffold_878', "MNH120 id":'atg7619', "MNH120 function":'Dimethylaniline monooxygenase', 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9860', "B04 function": 'Flavin mononucleotide binding protein', 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'exon.CUFF.11171.1.68', "USR5 function":'Flavin mononucleotide binding protein',
    },
    {
    "MNH120 scaffold":'scaffold_878', "MNH120 id":'atg7620', "MNH120 function": "Zn/Fe transporter", 
    "B04 scaffold":'B04S196', "B04 id":'B04S196.g9861', "B04 function": "Zn/Fe transporter", 
    "USR5 scaffold":'NODE_35582_length_606476_cov_15.460932', "USR5 id":'NS.06659', "USR5 function": "Zn/Fe transporter",
    },
    ]
table = pd.DataFrame(table)
table = table[["MNH120 scaffold", "MNH120 id", "USR5 scaffold", "USR5 id", "B04 scaffold", "B04 id", "MNH120 function", "USR5 function", "B04 function"]]
table.to_csv('atg140.function.tsv', sep='\t', index=False)
table.to_latex('atg140.function.tex', index=False, na_rep='', longtable=True)
table

Unnamed: 0,MNH120 scaffold,MNH120 id,USR5 scaffold,USR5 id,B04 scaffold,B04 id,MNH120 function,USR5 function,B04 function
0,scaffold_997,atg10842,NODE_35582_length_606476_cov_15.460932,NS.06676,B04S196,B04S196.g9843,Membrane associated PQ loop repeat protein,Membrane associated PQ loop repeat protein,Membrane associated PQ loop repeat protein
1,atg10843,scaffold_997,NODE_35582_length_606476_cov_15.460932,NS.06675,B04S196,B04S196.g9844,Pyridine nucleotide-disulphide oxidoreductase,Pyridine nucleotide-disulphide oxidoreductase,Pyridine nucleotide-disulphide oxidoreductase
2,scaffold_997,atg10844,NODE_35582_length_606476_cov_15.460932,NS.06674,B04S196,B04S196.g9845,Sodium-dependent phosphate transporter,Sodium-dependent phosphate transporter,Sodium-dependent phosphate transporter
3,,,NODE_35582_length_606476_cov_15.460932,NS.06673,,,,,
4,,,NODE_35582_length_606476_cov_15.460932,NS.06672,,,,Transcription factor,
5,,,NODE_35582_length_606476_cov_15.460932,NS.06671,,,,,
6,scaffold_997,atg10846,NODE_35582_length_606476_cov_15.460932,NS.06670,B04S196,B04S196.g9846,"NMT1/THI5 like protein, pyrimidine biosynthesis","NMT1/THI5 like protein, pyrimidine biosynthesis",
7,scaffold_997,atg10847,NODE_35582_length_606476_cov_15.460932,exon.CUFF.10994.1.84,B04S196,B04S196.g9847,DNA replication licensing factor MCM5,DNA replication licensing factor MCM5,DNA replication licensing factor MCM5
8,scaffold_997,atg10848,NODE_35582_length_606476_cov_15.460932,NS.06669,B04S196,B04S196.g9848,Cysteine proteinase,Cysteine proteinase,Cysteine proteinase
9,,,NODE_35582_length_606476_cov_15.460932,exon.CUFF.10998.1.83,,,,,


### atg12487

In [412]:
"""{
"MNH120 scaffold":, "MNH120 id":, "MNH120 function":, 
"B04 scaffold":, "B04 id":, "B04 function":, 
"USR5 scaffold":, "USR5 id":, "USR5 function":,
}"""
# Hand-curated gene-order table for the region around atg12487 (MNH120
# scaffold_978). Each dict is one output row; keys that are left out become
# missing values in the DataFrame. Rows are written from atg12496 down to
# atg12483 and reversed when the DataFrame is built below.
table = [
    { ## Very weak match to Fbox domain
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12496",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11385.1.31",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12495", "MNH120 function": "WD40 repeat domain-containing protein", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6037", "B04 function": "WD40 repeat domain-containing protein", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11385.1.32", "USR5 function": "WD40 repeat domain-containing protein",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12494", "MNH120 function": "Cell division control protein", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6038", "B04 function": "Cell division control protein", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11385.1.33", "USR5 function": "Cell division control protein",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12493", "MNH120 function": "Cell division control protein", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6038", "B04 function": "Cell division control protein", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11385.1.34", "USR5 function": "Cell division control protein",
    },
    { # B04S84.g6039 is a small gene within B04S84.g6038, no rearrangement
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6039",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12492", "MNH120 function": "WD40 repeat domain-containing protein, possibly ribosome associated, possible protease activity", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6040", "B04 function": "WD40 repeat domain-containing protein, possibly ribosome associated, possible protease activity",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06862", "USR5 function": "WD40 repeat domain-containing protein, possibly ribosome associated, possible protease activity",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12491",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06861",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12490", "MNH120 function": "GATS-like protein", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6041", "B04 function": "GATS-like protein", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11309.1.28", "USR5 function": "GATS-like protein",
    },
    {
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06860",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12489", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6042", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06859",
    },
    # atg12488 is listed in two consecutive rows, paired with two different USR5 genes.
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12488",  
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6043",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06858",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12488",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06857",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12487", "MNH120 function": "Cellobiose dehydrogenase", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6044", "B04 function": "Cellobiose dehydrogenase", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11325.4.26", "USR5 function": "Cellobiose dehydrogenase",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12486", "MNH120 function": "GDP-mannose transporter", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6045", "B04 function": "GDP-mannose transporter", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11301.1.19", "USR5 function": "GDP-mannose transporter",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12485",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12484", "MNH120 function": "Dimeric alpha-beta barrel protein, secondary metabolite associated", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6046", "B04 function": "Dimeric alpha-beta barrel protein, secondary metabolite associated", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06856", "USR5 function": "Dimeric alpha-beta barrel protein, secondary metabolite associated",
    },
    # atg12483 / B04S84.g6047 are listed in three rows, paired with three different USR5 genes.
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12483", "MNH120 function": "Cytochrome P450", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6047", "B04 function": "Cytochrome P450", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06855", "USR5 function": "Cytochrome P450",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12483", "MNH120 function": "Cytochrome P450", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6047", "B04 function": "Cytochrome P450", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11297.1.18", "USR5 function": "Cytochrome P450",
    },
    {
    "MNH120 scaffold": "scaffold_978", "MNH120 id": "atg12483", "MNH120 function": "Cytochrome P450", 
    "B04 scaffold": "B04S84", "B04 id": "B04S84.g6047", "B04 function": "Cytochrome P450", 
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06854",
    },
    ]
# Reverse the curated list so the output runs from atg12483 up to atg12496.
table = pd.DataFrame(table[::-1])
# Fix the column order explicitly: scaffold/id pairs per isolate first, then the functions.
table = table[["MNH120 scaffold", "MNH120 id", "USR5 scaffold", "USR5 id", "B04 scaffold", "B04 id", "MNH120 function", "USR5 function", "B04 function"]]
# Export the table as TSV and as a LaTeX longtable (missing values written as blanks).
table.to_csv('atg12487.function.tsv', sep='\t', index=False)
table.to_latex('atg12487.function.tex', index=False, na_rep='', longtable=True)
table

Unnamed: 0,MNH120 scaffold,MNH120 id,USR5 scaffold,USR5 id,B04 scaffold,B04 id,MNH120 function,USR5 function,B04 function
0,scaffold_978,atg12483,NODE_36160_length_249094_cov_15.165792,NS.06854,B04S84,B04S84.g6047,Cytochrome P450,,Cytochrome P450
1,scaffold_978,atg12483,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11297.1.18,B04S84,B04S84.g6047,Cytochrome P450,Cytochrome P450,Cytochrome P450
2,scaffold_978,atg12483,NODE_36160_length_249094_cov_15.165792,NS.06855,B04S84,B04S84.g6047,Cytochrome P450,Cytochrome P450,Cytochrome P450
3,scaffold_978,atg12484,NODE_36160_length_249094_cov_15.165792,NS.06856,B04S84,B04S84.g6046,"Dimeric alpha-beta barrel protein, secondary metabolite associated","Dimeric alpha-beta barrel protein, secondary metabolite associated","Dimeric alpha-beta barrel protein, secondary metabolite associated"
4,scaffold_978,atg12485,,,,,,,
5,scaffold_978,atg12486,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11301.1.19,B04S84,B04S84.g6045,GDP-mannose transporter,GDP-mannose transporter,GDP-mannose transporter
6,scaffold_978,atg12487,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11325.4.26,B04S84,B04S84.g6044,Cellobiose dehydrogenase,Cellobiose dehydrogenase,Cellobiose dehydrogenase
7,scaffold_978,atg12488,NODE_36160_length_249094_cov_15.165792,NS.06857,,,,,
8,scaffold_978,atg12488,NODE_36160_length_249094_cov_15.165792,NS.06858,B04S84,B04S84.g6043,,,
9,scaffold_978,atg12489,NODE_36160_length_249094_cov_15.165792,NS.06859,B04S84,B04S84.g6042,,,


### rest

In [414]:
table = [
    {
    "MNH120 scaffold": "scaffold_48", "MNH120 id": "atg162",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10047",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06847",
    },
    {
    "MNH120 scaffold": "scaffold_48", "MNH120 id": "atg161",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10048",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06846",
    },
    {
    "MNH120 scaffold": "scaffold_48", "MNH120 id": "atg160", "MNH120 function": "Sulfite oxidase", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10049", "B04 function": "Sulfite oxidase",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11287.1.9", "USR5 function": "Sulfite oxidase",
    },
    {
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10050",
    },
    {
    "MNH120 scaffold": "scaffold_48", "MNH120 id": "atg159", "MNH120 function": "Amun-like protein", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10051", "B04 function": "Amun-like protein",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11284.2.7", "USR5 function": "Amun-like protein",
    },
    {
    "MNH120 scaffold": "scaffold_893", "MNH120 id": "atg7462", "MNH120 function": "Ubiquitin carboxyl-terminal hydrolase", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10052", "B04 function": "Ubiquitin carboxyl-terminal hydrolase",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11303.1.5", "USR5 function": "Ubiquitin carboxyl-terminal hydrolase",
    },
    {
    "MNH120 scaffold": "scaffold_893", "MNH120 id": "atg7463", "MNH120 function": "Pyruvate dehydrogenase kinase", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10053", "B04 function": "Pyruvate dehydrogenase kinase",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11302.1.4", "USR5 function": "Pyruvate dehydrogenase kinase",
    },
    {
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06845",
    },
    {
    "MNH120 scaffold": "scaffold_893", "MNH120 id": "atg7464",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10054",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11302.1.3",
    },
    {
    "MNH120 scaffold": "scaffold_893", "MNH120 id": "atg7465", "MNH120 function": "Beta-glucosidase family 1", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10055", "B04 function": "Beta-glucosidase family 1",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11282.1.2", "USR5 function": "Beta-glucosidase family 1",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10702", "MNH120 function": "Membrane protein with unknown function", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10056", "B04 function": "Membrane protein with unknown function",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06844", "USR5 function": "Membrane protein with unknown function",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10703", "MNH120 function": "Secreted protein with unknown function", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10057", "B04 function": "Secreted protein with unknown function",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "exon.CUFF.11283.1.0", "USR5 function": "Secreted protein with unknown function",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10704",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10058",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06843", # Possible RNI-like superfamily
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10705", "MNH120 function": "GPI transamidase component PIG-U", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10059", "B04 function": "GPI transamidase component PIG-U",
    "USR5 scaffold": "NODE_36160_length_249094_cov_15.165792", "USR5 id": "NS.06842", "USR5 function": "GPI transamidase component PIG-U",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08710", "USR5 function": "Oxidoreductase",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14260.1.0", "USR5 function": "Aromatic-L-amino acid decarboxylase",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10706", "MNH120 function": "Possible glycosyltransferase", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10060", "B04 function": "Possible glycosyltransferase",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08711", ## 3' tail of atg10706
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08712",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08713",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08714",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08715",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08716", "USR5 function": "Partial Cytochrome P450 match",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08717", "USR5 function": "Partial ribonuclease H/Integrase, possible Gypsy TE",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08718", "USR5 function": "Partial polymerase, possible Gypsy TE",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08719", "USR5 function": "Partial polymerase, possible Gypsy TE",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08720", "USR5 function": "Weak partial Gypsy TE match",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08721", "USR5 function": "Partial phospholipase",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08722",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08723",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10706", "MNH120 function": "Possible glycosyltransferase",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10060", "B04 function": "Possible glycosyltransferase",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08724", "USR5 function": "Possible glycosyltransferase",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10707", "MNH120 function": "MM3350-like protein, possible DNA-binding", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10062", "B04 function": "MM3350-like protein, possible DNA-binding",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14230.1.3", "USR5 function": "MM3350-like protein, possible DNA-binding",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10708",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10063",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08725",
    },
    {
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10064", "B04 function": "NCS1 family purine/pyrimidine transporter",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08726", "USR5 function": "NCS1 family purine/pyrimidine transporter",
    },
    {
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10065", "B04 function": "Clavaminate synthase-like protein", #### Split?
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14211.1.7", "USR5 function": "Transcriptional regulatory protein",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10709",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08729", ## Partial plexin repeat
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14232.2.11",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10710", "MNH120 function": "Chloride conductance regulatory protein ICLN", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10066", "B04 function": "Chloride conductance regulatory protein ICLN",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14232.2.10", "USR5 function": "Chloride conductance regulatory protein ICLN",
    },
    {
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10067", "B04 function": "Glycosyltransferase family 25 member",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14214.1.12", "USR5 function": "Glycosyltransferase family 25 member, possibly inactive",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10711", "MNH120 function": "Glycosyltransferase family 25 member, possibly inactive", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10067", "B04 function": "Glycosyltransferase family 25 member",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08730", "USR5 function": "Glycosyltransferase family 25 member, possibly inactive",
    },
    {
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10068", "B04 function": "Glycosyltransferase family 25 member",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08730", "USR5 function": "Glycosyltransferase family 25 member, possibly inactive",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10712",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10069",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08731",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10713",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10070",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14217.1.14",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10713",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10070",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14217.2.15",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10713",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10070",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14217.3.16",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08732",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10714", "MNH120 function": "Vacuolar protein sorting-associated protein 54", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10071", "B04 function": "Vacuolar protein sorting-associated protein 54",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14228.1.17", "USR5 function": "Vacuolar protein sorting-associated protein 54",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10715",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10072",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08733",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08734",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10716",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10073",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08735",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10717",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10074",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08736",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10718", "MNH120 function": "MFS transporter", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10075", "B04 function": "MFS transporter",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08737", "USR5 function": "MFS transporter",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10719", "MNH120 function": "Secreted protein with unknown function", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10076", "B04 function": "Secreted protein with unknown function",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08738", "USR5 function": "Secreted protein with unknown function",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10720",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10721", "MNH120 function": "Possible M-phase phosphoprotein 6", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10077", "B04 function": "Possible M-phase phosphoprotein 6",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08739", "USR5 function": "Possible M-phase phosphoprotein 6",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10722", "MNH120 function": "NADH-ubiquinone oxidoreductase", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10078", "B04 function": "NADH-ubiquinone oxidoreductase",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08740", "USR5 function": "NADH-ubiquinone oxidoreductase",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08741", "USR5 function": "",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10723", "MNH120 function": "DNA damage repair protein, possible tRNA-splicing endonuclease",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10079", "B04 function": "DNA damage repair protein, possible tRNA-splicing endonuclease",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14234.1.22", "USR5 function": "DNA damage repair protein, possible tRNA-splicing endonuclease",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08742",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10724", "MNH120 function": "Secreted protein with possible transmembrane domain, unknown function", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10080", "B04 function": "Secreted protein with possible transmembrane domain, unknown function",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08743", "USR5 function": "Possible transmembrane protein, unknown function", # has extra bit on 5' end
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10725", "MNH120 function": "YjbQ-like protein, possible secondary thiamine-phosphate synthase activity", 
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10082", "B04 function": "YjbQ-like protein, possible secondary thiamine-phosphate synthase activity",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08744", "USR5 function": "YjbQ-like protein, possible secondary thiamine-phosphate synthase activity",
    },
    {
    "MNH120 scaffold": "scaffold_996", "MNH120 id": "atg10726",
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10083",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14238.1.25",
    },
    {
    "B04 scaffold": "B04S209", "B04 id": "B04S209.g10084",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.16500", "USR5 id": "NS.08745",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.16500", "USR5 id": "NS.08746",
    },
    {
    "MNH120 scaffold": "scaffold_554", "MNH120 id": "atg4020", "MNH120 function": "Possible nuclear transport factor 2-like protein", 
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9491", "B04 function": "Possible nuclear transport factor 2-like protein",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08747", "USR5 function": "Possible nuclear transport factor 2-like protein",
    },
    {
    "MNH120 scaffold": "scaffold_987", "MNH120 id": "atg10943", "MNH120 function": "Nuclear movement protein NUDC, possible HSP20-like chaperone", 
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9490", "B04 function": "Nuclear movement protein NUDC, possible HSP20-like chaperone",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14241.1.26", "USR5 function": "Nuclear movement protein NUDC, possible HSP20-like chaperone",
    },
    {
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.16500", "USR5 id": "exon.CUFF.14242.1.27",
    },
    {
    "MNH120 scaffold": "scaffold_987", "MNH120 id": "atg10942", "MNH120 function": "Probable lipid transporter/ligase, possible Acetyl-CoA synthetase-like protien", 
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9489", "B04 function": "Probable lipid transporter/ligase, possible Acetyl-CoA synthetase-like protien",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08748", "USR5 function": "Probable lipid transporter/ligase, possible Acetyl-CoA synthetase-like protien",
    },
    {
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9488",        
    },
    {
    "MNH120 scaffold": "scaffold_987", "MNH120 id": "atg10941", "MNH120 function": "NCS1 family purine-pyrimidine transporter/permease",
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9487", "B04 function": "NCS1 family purine-pyrimidine transporter/permease",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08749", "USR5 function": "NCS1 family purine-pyrimidine transporter/permease",
    },
    {
    "MNH120 scaffold": "scaffold_987", "MNH120 id": "atg10941", "MNH120 function": "NCS1 family purine-pyrimidine transporter/permease", 
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9487", "B04 function": "NCS1 family purine-pyrimidine transporter/permease",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08750", "USR5 function": "NCS1 family purine-pyrimidine transporter/permease",
    },
    {
    "MNH120 scaffold": "scaffold_987", "MNH120 id": "atg10940", "MNH120 function": "Sterol reductase/Lamin B receptor",
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9486", "B04 function": "Sterol reductase/Lamin B receptor",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14250.2.30", "USR5 function": "Sterol reductase/Lamin B receptor",
    },
    {
    "MNH120 scaffold": "scaffold_987", "MNH120 id": "atg10939", "MNH120 function": "Zinc finger C2H2 type domain protein", 
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9485", # Truncated at 5' end compared to USR5 and MNH120
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "exon.CUFF.14248.1.31", "USR5 function": "Zinc finger C2H2 type domain protein",
    },
    {
    "MNH120 scaffold": "scaffold_987", "MNH120 id": "atg10906",
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9484",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08751",
    },
    {
    "B04 scaffold": "B04S181", "B04 id": "B04S181.g9484",
    "USR5 scaffold": "NODE_5090_length_307152_cov_15.165000", "USR5 id": "NS.08752",
    },
    ]
table = pd.DataFrame(table)
table = table[["MNH120 scaffold", "MNH120 id", "USR5 scaffold", "USR5 id", "B04 scaffold", "B04 id", "MNH120 function", "USR5 function", "B04 function"]]
table.to_csv('group.function.tsv', sep='\t', index=False)
table.to_latex('group.function.tex', index=False, na_rep='', longtable=True)
table

Unnamed: 0,MNH120 scaffold,MNH120 id,USR5 scaffold,USR5 id,B04 scaffold,B04 id,MNH120 function,USR5 function,B04 function
0,scaffold_48,atg162,NODE_36160_length_249094_cov_15.165792,NS.06847,B04S209,B04S209.g10047,,,
1,scaffold_48,atg161,NODE_36160_length_249094_cov_15.165792,NS.06846,B04S209,B04S209.g10048,,,
2,scaffold_48,atg160,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11287.1.9,B04S209,B04S209.g10049,Sulfite oxidase,Sulfite oxidase,Sulfite oxidase
3,,,,,B04S209,B04S209.g10050,,,
4,scaffold_48,atg159,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11284.2.7,B04S209,B04S209.g10051,Amun-like protein,Amun-like protein,Amun-like protein
5,scaffold_893,atg7462,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11303.1.5,B04S209,B04S209.g10052,Ubiquitin carboxyl-terminal hydrolase,Ubiquitin carboxyl-terminal hydrolase,Ubiquitin carboxyl-terminal hydrolase
6,scaffold_893,atg7463,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11302.1.4,B04S209,B04S209.g10053,Pyruvate dehydrogenase kinase,Pyruvate dehydrogenase kinase,Pyruvate dehydrogenase kinase
7,,,NODE_36160_length_249094_cov_15.165792,NS.06845,,,,,
8,scaffold_893,atg7464,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11302.1.3,B04S209,B04S209.g10054,,,
9,scaffold_893,atg7465,NODE_36160_length_249094_cov_15.165792,exon.CUFF.11282.1.2,B04S209,B04S209.g10055,Beta-glucosidase family 1,Beta-glucosidase family 1,Beta-glucosidase family 1


In [405]:
# NOTE(review): leftover IPython help lookup (`obj?` displays the docstring);
# it produces no result the analysis uses — remove before sharing the notebook.
pd.DataFrame.from_dict?