In [None]:
import pandas as pd
import pickle
import ete3

# Project root directory; later cells open paths such as 'core_data/...' relative to it.
root = '/home/tobiassonva/data/eukgen/'
# IPython magics: change the kernel's working directory to `root`, then print it via the shell.
%cd {root}
!pwd

In [None]:
# Reformat FASTA headers of the form "accession|rest": keep only the accession as the
# record id and save the stripped remainder in a separate TSV file for later lookup.

from Bio import SeqIO

fastafile = '/home/tobiassonva/data/polvir/polb_reduced_set.faa'
seqs = SeqIO.parse(fastafile, 'fasta')
header_dict = {}  # accession -> header text that followed the first '|'

print('Writing')
with open(fastafile+'.no_header', 'w') as out:
    for seq in seqs:
        accession, separator, remainder = seq.description.partition('|')
        if not separator:
            # No '|' in this header: fall back to the parsed record id so that the id
            # written to the FASTA and the key stored in the mapping are the same.
            # (Previously the whole description line was written as the id here while
            # the mapping was keyed by seq.id.)
            accession, remainder = seq.id, 'no_header'
        header_dict[accession] = remainder

        seq.id = accession
        seq.description = ''
        SeqIO.write(seq, out, 'fasta')
print('Done')

# One "accession<TAB>old header" line per distinct accession.
with open(fastafile+'.header_mapping', 'w') as out:
    for key, value in header_dict.items():
        out.write(f'{key}\t{value}\n')


In [None]:
# Reformat UniProt-style FASTA headers ("xx|accession|description"): keep only the
# accession as the record id and save the description in a separate TSV file for lookup.

from Bio import SeqIO

fastafile = '/home/tobiassonva/data/databases/human_protein_atlas_localisation/merged_uniprot_acc.fasta'
seqs = SeqIO.parse(fastafile, 'fasta')
header_dict = {}  # accession -> description text that followed the second '|'

print('Writing')
with open(fastafile+'.no_header', 'w') as out:
    for seq in seqs:
        fields = seq.description.split('|', 2)
        if len(fields) == 3:
            accession, description = fields[1], fields[2]
        elif len(fields) == 2:
            # "xx|accession" without a description field.
            accession, description = fields[1], 'no_header'
        else:
            # No '|' at all. The old code recorded 'no_header' and then crashed with an
            # uncaught IndexError on `header[1]`; keep the parsed record id instead.
            accession, description = seq.id, 'no_header'
        # Key the mapping by the same id that is written to the FASTA.
        header_dict[accession] = description

        seq.id = accession
        seq.description = ''
        SeqIO.write(seq, out, 'fasta')
print('Done')

# One "accession<TAB>old description" line per distinct accession.
with open(fastafile+'.header_mapping', 'w') as out:
    for key, value in header_dict.items():
        out.write(f'{key}\t{value}\n')


In [None]:
# Scratch check: show how an example UniProt header splits on its first two '|' separators.
header = '>sp|Q05655|KPCD_HUMAN Protein kinase C delta type OS=Homo sapiens OX=9606 GN=PRKCD PE=1 SV=2'

header.split(sep='|', maxsplit=2)

In [None]:
# Brute-force taxon assignment: for every taxid in the index of `eukprot_species_taxa`
# (which must already exist in the kernel), try a list of rank labels in priority order
# and record the taxid of the first rank that occurs in the NCBI lineage.

# Open the taxonomy database once instead of twice per species.
ncbi = ete3.NCBITaxa()

# Ranks are tried in this order. 'kingdom' was previously misspelled 'kindom', so that
# fallback could never match.
fallback_ranks = ['class', 'phylum', 'superphylum', 'subkingdom', 'kingdom', 'family', 'clade']

classes = {}
for index in eukprot_species_taxa.index:
    lineage = ncbi.get_lineage(index)
    ranks = ncbi.get_rank(lineage)

    tax_class = -1  # sentinel: no usable rank found
    for wanted_rank in fallback_ranks:
        matches = [taxid for taxid, rank in ranks.items() if rank == wanted_rank]
        if matches:
            # Several lineage members can share a rank (e.g. 'clade'); keep the last
            # one in iteration order, exactly as the original un-broken loops did.
            tax_class = matches[-1]
            break

    if tax_class == -1:
        print('No taxonomy for ', index)

    classes[index] = tax_class

#final stragglers for eukprot
# classes[1246795] = 2686027
# classes[652834] = 759891
# classes[232287] = 589438
# classes[1764928] = 589438
# classes[1155914] = 589438

In [None]:
# Remove records with bad headers from the eukprot FASTA and write the rest to
# '<fastafile>.cleaned'.
from Bio import SeqIO
import numpy as np

cleaned_tsv = '/home/tobiassonva/data/eukgen/processing/search/clean_eukprot/clean_eukprot3.tsv'
fastafile = '/home/tobiassonva/data/eukgen/core_data/euk72_ep/euk72_ep.fasta' 

# Column 1 of the header-less TSV holds the accessions to drop.
badDB = pd.read_csv(cleaned_tsv, sep='\t', header=None)
# A set gives constant-time membership tests; the previous numpy array was scanned
# linearly for every record and re-allocated by np.delete on every hit.
bad_accs = set(badDB[1].unique())
# Bad accessions not yet encountered in the FASTA (only used for progress reporting).
unseen_bad = set(bad_accs)

seqs = SeqIO.parse(fastafile, 'fasta')

print('Writing')
with open(fastafile+'.cleaned', 'w') as out:
    bads = 0
    for seq in seqs:
        if seq.id not in bad_accs:
            SeqIO.write(seq, out, 'fasta')
        else:
            # Every occurrence of a bad id is dropped. (The old code removed the id from
            # the lookup array after the first hit, so a repeated bad id was written.)
            bads += 1
            unseen_bad.discard(seq.id)
            print(f'found {bads} bad {seq.id}, {len(unseen_bad)}')
print('Done')

In [None]:
import subprocess








            

In [None]:
# Read assembly accessions and taxids from the assembly XML, then build the
# assembly -> taxonomy table used for the per-genome protein files.

def _tag_text(line):
    """Return the text between the first '>' and the next '<' of a one-line XML element."""
    return line.split('>', 1)[1].split('<', 1)[0]


all_assembly_data = []

print('Compiling tax for proteins')
with open('/home/tobiassonva/data/eukgen/core_data/asgard2023/wolf2024/proteins/assembly_result.xml', 'r') as infile:
    for line in infile:
        line = line.strip()
        # str.strip('<Tag>') removes any of the listed *characters* from both ends rather
        # than the tag itself, so the old extraction only worked while the value did not
        # begin with one of the tag's letters. Extract the element text explicitly.
        if 'LastMajorRelease' in line:
            new_data = [_tag_text(line)]
        elif '<Taxid>' in line:
            # Relies on the accession line preceding the taxid line of the same record.
            new_data.append(_tag_text(line))
            all_assembly_data.append(new_data)

# format the tax mapping for genome files
all_assembly_data = pd.DataFrame(all_assembly_data, columns = ['Assembly', 'orgid'])
# orgid values are strings taken from the XML, hence the int() when looking up names.
orgids = ete3.NCBITaxa().get_taxid_translator(all_assembly_data.orgid.unique())
all_assembly_data['species'] = [orgids[int(acc)] for acc in all_assembly_data.orgid]
# Every row gets the same fixed labels.
all_assembly_data['superkingdom'] = 'Archaea'
all_assembly_data['class'] = 'Asgard'

print('Expansing to seqs')

# iterate over genome files
files = subprocess.run("find core_data/asgard2023/wolf2024/proteins/ -name '*.fasta'", shell=True, text=True, capture_output=True)
# The output ends with a newline, so the final split element is empty and is dropped.
files = [file for file in files.stdout.split('\n')][:-1]
# Assembly name = file name without the '.fasta' suffix (guaranteed by the find pattern).
# str.strip('.fasta') would instead remove any of the characters '.', 'f', 'a', 's', 't'
# from BOTH ends of the name.
names = [name.rsplit('/',1)[-1][:-len('.fasta')] for name in files]

tax_columns = ['orgid', 'species', 'superkingdom', 'class']
all_seq_data = []
for n, file in enumerate(files):
    print(names[n])
    # The taxonomy row depends only on the file, so filter the table once per file
    # instead of once per sequence.
    assembly_rows = all_assembly_data.loc[all_assembly_data.Assembly == names[n], tax_columns].values
    seqs = SeqIO.parse(file, 'fasta')
    for seq in seqs:
        # Accession = header text up to the first space.
        acc = seq.description.split(' ', 1)[0]
        new_data = [acc]
        # Raises IndexError when the assembly name has no row in all_assembly_data.
        new_data.extend(assembly_rows[0])
        all_seq_data.append(new_data)

all_protein_data = pd.DataFrame(all_seq_data, columns = ['acc', 'orgid', 'species', 'superkingdom', 'class'])


# iterate over assembly files
files = subprocess.run("find core_data/asgard2023/wolf2024/genomic/ -name '*.fasta'", shell=True, text=True, capture_output=True)
# The output ends with a newline, so the final split element is empty and is dropped.
files = [file for file in files.stdout.split('\n')][:-1]
# File name with its last two '_'-separated parts removed.
names = [name.rsplit('/',1)[-1].rsplit('_', 2)[0] for name in files]

print('Compiling tax')
all_seq_data = []
for n, file in enumerate(files):
    seqs = SeqIO.parse(file, 'fasta')
    for seq in seqs:
        acc = seq.id
        # Species label: cut the description at its last comma, then drop the final two
        # space-separated words of what remains.
        if 'MAG:' in seq.description:
            species = seq.description.split('MAG: ', 1)[1].rsplit(',', 1)[0].rsplit(' ', 1)[0].rsplit(' ', 1)[0]
        elif 'complete genome' in seq.description:
            species = seq.description.split(' ',1)[1].rsplit(',', 1)[0].rsplit(' ', 1)[0].rsplit(' ', 1)[0]
        else:
            print('ERROR: ', seq.description)
            # Skip the record. Previously it was appended with the species left over
            # from the preceding record (or raised NameError on the very first one).
            continue
        all_seq_data.append([acc, species])

all_assembly_data = pd.DataFrame(all_seq_data, columns=['acc', 'species'])
orgids = ete3.NCBITaxa().get_name_translator(all_assembly_data.species.unique())

#manual addendums
# These labels are assigned by hand, overriding or supplementing the name lookup above.
orgids['Asgard group archaeon isolate'] = [2591838]
orgids['Candidatus Asgardarchaeum californiense isolate'] = [3067292]
orgids['Candidatus Baldrarchaeota archaeon isolate'] = [2798909]
orgids['Candidatus Baldrarchaeum yapensis isolate'] = [2798909]
orgids['Candidatus Gerdarchaeota archaeon isolate'] = [2795490]
orgids['Candidatus Heimdallarchaeota archaeon isolate'] = [1936272]
orgids['Candidatus Heimdallarchaeum aukensis isolate'] = [1936272]
orgids['Candidatus Heimdallarchaeum endolithica isolate'] = [1936272]
orgids['Candidatus Helarchaeota ASGARD archaeon'] = [2572044]
orgids['Candidatus Helarchaeota archaeon isolate'] = [2572044]
orgids['Candidatus Hermodarchaeota archaeon isolate'] = [2798910]
orgids['Candidatus Hermodarchaeum yapensis isolate'] = [2798910]
orgids['Candidatus Hodarchaeota archaeon isolate'] = [2798906]
orgids['Candidatus Hodarchaeum mangrovi isolate'] = [2798906]
orgids['Candidatus Jordarchaeia archaeon isolate'] = [2823877]
orgids['Candidatus Lokiarchaeia archaeon isolate'] = [3049162]
orgids['Candidatus Lokiarchaeota archaeon isolate'] = [3049162]
orgids['Candidatus Lokiarchaeum ossiferum isolate'] = [3049162]
orgids['Candidatus Lokiarchaeum sp.'] = [3049162]
orgids['Candidatus Njordarchaeota archaeon isolate'] = [3036961]
orgids['Candidatus Njordarchaeum logiae isolate'] = [3036961]
orgids['Candidatus Odinarchaeia archaeon isolate'] = [3067281]
orgids['Candidatus Odinarchaeum yellowstonii isolate'] = [3067281]
orgids['Candidatus Prometheoarchaeum syntrophicum strain'] = [2732160]
orgids['Candidatus Sifarchaeia archaeon isolate'] = [2949321]
orgids['Candidatus Thorarchaeota archaeon isolate'] = [1706441]
orgids['Candidatus Wukongarchaeota archaeon isolate'] = [2798905]
orgids['Candidatus Wukongarchaeum yapensis isolate'] = [2798905]

# Each translator value is a list of taxids; the first one is used.
all_assembly_data['orgid'] = [orgids[name][0] for name in all_assembly_data.species]
all_assembly_data['superkingdom'] = ['Archaea']*all_assembly_data.shape[0]
all_assembly_data['class'] = ['Asgard']*all_assembly_data.shape[0]
all_assembly_data.set_index('acc', inplace=True)


print('Expanding to seqs')
# Collect every header from the header-stripped prodigal protein FASTA.
file = '/home/tobiassonva/data/eukgen/core_data/asgard2023/wolf2024/genomic/prodrigal.fasta.no_header'
seqs = SeqIO.parse(file, 'fasta')
accs = []
for seq in seqs:
    accs.append(seq.description)

# Each protein takes the taxonomy of the entry named by its accession with the final
# '_'-separated suffix removed (presumably the parent contig — TODO confirm).
tax_columns = ['orgid', 'species' ,'superkingdom', 'class']
all_seq_data = []
for acc in accs:
    parent_acc = acc.rsplit('_', 1)[0]
    data = [acc, *all_assembly_data.loc[parent_acc][tax_columns]]
    all_seq_data.append(data)

all_assembly_data = pd.DataFrame(all_seq_data, columns=['acc', 'orgid', 'species', 'superkingdom', 'class'])
print('Done')


# Stack the per-genome protein table on top of the prodigal-derived table.
all_data = pd.concat([all_protein_data, all_assembly_data])

In [None]:
# Load the accession mapping table, translate its Taxonomy names to NCBI taxids and
# attach the fixed superkingdom/class labels.
acc_mapping = pd.read_csv('/home/tobiassonva/data/eukgen/core_data/asgard2023/wolf2024/accession_mapping.tsv', sep ='\t')
orgids = ete3.NCBITaxa().get_name_translator(acc_mapping.Taxonomy.unique())
acc_mapping = acc_mapping.assign(
    superkingdom='Archaea',
    **{'class': 'Asgard'},
    # Each translator value is a list of taxids; the first one is used.
    orgid=[orgids[name][0] for name in acc_mapping.Taxonomy],
)

acc_mapping


In [None]:
# NOTE(review): scratch cell. `data`, `names` and `n` are not defined here; they must
# already exist in the kernel (the Asgard taxonomy cell assigns all three), so the result
# depends on what was run before — confirm whether this cell is still needed.
file = '/home/tobiassonva/data/eukgen/core_data/asgard2023/wolf2024/prodrigal.fasta.no_header'
seqs = SeqIO.parse(file, 'fasta')
for seq in seqs:
    acc = seq.description
    # `n` does not change inside this loop, so every record is paired with the same name.
    data.append([names[n], acc])
    print([names[n], acc])

In [None]:
# Build the protein taxonomy table for the ettema2023 Asgard proteomes (later merged
# with the existing prok2111 table) from the manually curated header/species files.

# Existing prok protein taxonomy (a pickle written by this project).
with open('core_data/taxonomy/prok2111_protein_taxonomy.pkl', 'rb') as infile:
    prot_data = pickle.load(infile)

# Manually curated Asgard header files.
asgard_seqs = pd.read_csv('core_data/ettema2023/asgard_proteomes2023_header_mapping.tsv', sep ='\t', names=['acc','header'])
asgard_acc_species_map = pd.read_csv('core_data/ettema2023/asgard_acc_species_mapping.tsv', sep='\t', names=['acc', 'species', 'orgid', 'class'], index_col=0)

# One taxonomy row per protein; the lookup key is the accession text before the first '..'.
taxonomy_rows = []
for acc in asgard_seqs.acc:
    genome_record = asgard_acc_species_map.loc[acc.split('..')[0]]
    taxonomy_rows.append([genome_record['orgid'], genome_record.species, 'Archaea', 'Asgard'])
asgard_seqs[['orgid', 'species', 'superkingdom', 'class']] = taxonomy_rows

# Index by accession and discard the raw header column.
asgard_seqs = asgard_seqs.set_index('acc').drop(columns='header')

# Persist as both TSV and pickle.
asgard_seqs.to_csv('core_data/ettema2023/asgard2023_protein_taxonomy.tsv', sep='\t')
with open('core_data/ettema2023/asgard2023_protein_taxonomy.pkl', 'wb') as pkl:
    pickle.dump(asgard_seqs, pkl)


In [None]:
# Combine the prok and Asgard protein taxonomy tables and write two header-less TSVs:
# the full table, and an accession -> orgid file.
merged = pd.concat([prot_data, asgard_seqs])
taxdb_cols = merged.loc[:, ['orgid']]

merged.to_csv('core_data/prok2111_as/prok2111_as_protein_taxonomy.tsv', sep='\t', header=None)
taxdb_cols.to_csv('core_data/prok2111_as/prok2111.tax', header=None, sep='\t')

In [None]:
# Format taxonomy for the final prok2111 and asgard classes with revision.
# NOTE: this cell reads `orgid_addendum` and `class_addendum_ids`, which are defined in
# the lookup-table cell further down the notebook; that cell must be run first.
data = pd.read_csv('core_data/prok2111/prok2111.tax', sep='\t', names=['acc', 'orgid'])

# Open the taxonomy database once; it was previously re-opened for every lineage.
ncbi = ete3.NCBITaxa()

# get lineages
translator = ncbi.get_lineage_translator(data.orgid)

# Bacteria is 2, only other option is Archaea
translator_superkingdom = {key: 'Bacteria' if 2 in value else 'Archaea' for key, value in translator.items()}

# add final missing orgids (addendum values are [superkingdom, class name, class taxid])
for orgid, values in orgid_addendum.items():
    translator_superkingdom[orgid] = values[0]

# add data to dataframe (column name was previously misspelled 'superkindom')
data['superkingdom'] = [translator_superkingdom[org] for org in data.orgid]


# attempt to construct the orgid -> class taxid mapping
translator_class = {}
for orgid, lineage in translator.items():
    # take the first lineage member ranked 'class', if any
    class_ids = [taxid for taxid, rank in ncbi.get_rank(lineage).items() if rank == 'class']
    if class_ids:
        translator_class[orgid] = class_ids[0]
    # otherwise see if the last lineage member is in the addendum
    elif lineage[-1] in class_addendum_ids:
        translator_class[orgid] = class_addendum_ids[lineage[-1]]
    # if not, report the orgid for review (it stays absent from translator_class)
    else:
        print(orgid, lineage)

# add final missing orgids
for orgid, values in orgid_addendum.items():
    translator_class[orgid] = values[2]


# convert to class names (keyed by class taxid)
translator_class_name = ncbi.get_taxid_translator(translator_class.values())

# Add the addendum class names. The name table is indexed by class taxid when the
# 'class' column is built from data.class_id, so key by the class taxid (values[2]);
# keying by orgid, as before, added entries that lookup never reaches.
for orgid, values in orgid_addendum.items():
    translator_class_name[values[2]] = values[1]

In [None]:
# Attach the class taxid for every row, then translate it to a class name;
# the sentinel taxid -1 is labelled 'incertae sedis'.
class_ids = []
for taxid in data.orgid:
    class_ids.append(translator_class[taxid])
data['class_id'] = class_ids

class_names = []
for class_id in data.class_id:
    if class_id == -1:
        class_names.append('incertae sedis')
    else:
        class_names.append(translator_class_name[class_id])
data['class'] = class_names

In [None]:
#save
# 'class_revision' is only added by the cell that follows this one, so tolerate its
# absence; without errors='ignore' this cell raises KeyError on a top-to-bottom run.
data.drop(['class_id', 'class_revision'], axis = 1, errors='ignore').to_csv('core_data/tmp_tax/prok2111.tax2', header=None, index=None, sep='\t')

In [None]:
# Map every class name to its revised group; raises KeyError for a class name that is
# missing from `class_revision` (defined in the lookup-table cell of this notebook).
data['class_revision'] = [class_revision[orgid] for orgid in data['class']]

#save
# Tab-separated, like the other .tax outputs written by this notebook (the separator
# was missing here, which produced a comma-separated file).
data.drop(['class', 'class_id'], axis = 1).to_csv('core_data/tmp_tax/prok2111.tax.edited', header=None, index=None, sep='\t')

In [None]:
# Format euk72_ep with the final revised taxonomy.
# Reads `class_addendum` and `class_revision` from the lookup-table cell of this notebook.
euk_data = pd.read_csv('core_data/euk72_ep/euk72_ep.tax', sep='\t', names=['acc', 'orgid', 'superkingdom', 'class'])

# Reassign unknowns: every row whose orgid has at least one 'unknown' class entry gets
# the class name listed for that orgid in class_addendum.
is_unknown = euk_data['class'] == 'unknown'
unk_ids = euk_data.loc[is_unknown, 'orgid'].unique()
needs_reassignment = euk_data.orgid.isin(unk_ids)
unklist = euk_data.loc[needs_reassignment, 'orgid']
names = [class_addendum[unk] for unk in unklist]
euk_data.loc[unklist.index, 'class'] = names

# Revise classes: map each class name to its revised group.
euk_data['class_revision'] = [class_revision[name] for name in euk_data['class']]

# Write without the original class column, tab-separated and header-less.
euk_data.drop(columns=['class']).to_csv('core_data/tmp_tax/euk.tax.edited', header=None, index=None, sep = '\t')

In [None]:

# Manual taxid -> class taxid overrides, looked up with the last taxid of a lineage when
# that lineage has no member ranked 'class'. -1 is the sentinel later labelled
# 'incertae sedis'.
# Seven keys used to appear twice in this literal (374847, 1410606, 1603555, 1920749,
# 1930593, 2259672, 2715735). Python keeps the last value for a repeated key, so the
# shadowed earlier entries were removed; the resulting mapping is unchanged.
class_addendum_ids = {
    1238993: 544448,
    1637999: 203494,
    1779382: 1853220,
    1540872: -1,
    1911684: 544448,
    1911683: 544448,
    2488809: 203494,
    2572087: -1,
    713059: -1,
    2572089: -1,
    1930275: 203683,
    2527975: 203683,
    2528011: 203683,
    2528008: 203683,
    2528012: 203683,
    2528018: 203683,
    2527962: 203683,
    1848255: 1760,
    1848756: 1760,
    2725268: 544448,
    2026780: 203683,
    2026724: 32061,
    1932692: 1313211,
    2755338: 186801,
    2268202: 649775,
    713051: -1,
    2231116: 544448,
    2886196: 1313211,
    1871025: 91061,
    2098: 544448,
    45363: 544448,
    92401: 544448,
    2116: 544448,
    1048830: 544448,
    2259673: 1643678,
    2795383: 1643678,
    439481: 183967,
    1229908: 1643678,
    1959104: 1643678,
    2508726: 1643678,
    1470067: 1643678,
    1229909: 1643678,
    1470066: 1643678,
    1580092: 1643678,
    662947: 544448,
    662945: 544448,
    662946: 544448,
    243273: 544448,
    663918: 544448,
    272634: 544448,
    2104: 544448,
    1112856: 544448,
    272633: 544448,
    710127: 544448,
    2096: 544448,
    1159202: 544448,
    1159204: 544448,
    1006581: 544448,
    146919: 1853220,
    309807: 1853220,
    761659: 1853220,
    1159203: 544448,
    436308: 1643678,
    1582439: 1643678,
    1898749: -1,
    448385: 3031711,
    56: 3031711,
    1254432: 3031711,
    2111: 544448,
    243272: 544448,
    1763363: 1117,
    502025: 3031711,
    518766: 1853220,
    762570: 1853220,
    29549: 1853220,
    347256: 544448,
    1267000: 544448,
    525904: 32061,
    1263835: 544448,
    710128: 544448,
    708616: 544448,
    1159197: 544448,
    1159201: 544448,
    1159199: 544448,
    1159198: 544448,
    1159200: 544448,
    459349: -1,
    1233262: 544448,
    1236904: 544448,
    1213463: 544448,
    2121: 544448,
    2124: 544448,
    2730359: 203494,
    2796139: 203494,
    92400: 544448,
    29559: 544448,
    1441379: 544448,
    722438: 544448,
    1228987: 1117,
    52: 3031711,
    29554: 544448,
    142649: 544448,
    1846278: -1,
    927083: 3031711,
    1263761: 544448,
    1263760: 544448,
    1263757: 544448,
    1263758: 544448,
    1263756: 544448,
    1263763: 544448,
    1263762: 544448,
    1280940: 544448,
    243899: 91061,
    142651: 544448,
    2026885: -1,
    888845: 3031711,
    1391654: 3031711,
    2594042: -1,
    281847: 1031332,
    5722: 5719,
    1169539: 2605696,
    5755: 555280,
    4792: 4762,
    691883: 2316402,
    5762: 5752,
    222440: 2611341,
    529818: 172820,
    2903: 2830,
    31276: 2497438,
    10228: 3043701,
    5759: 2605435,
    5741: 207245,
    6035: 6029,
    33067: -1,
    315: -1,
    295358: 544448,
    262719: 544448,
    262722: 544448,
    262723: 544448,
    754503: 544448,
    33923: 544448,
    66693: -1,
    1410606: 1643678,
    1246955: 544448,
    67593: 4762,
    2110: 544448,
    2113: 544448,
    2112: 544448,
    2120: 544448,
    2951803: 1935183,
    2725: -1,
    69011: -1,
    331677: -1,
    561169: 2830,
    1643353: -1,
    496833: 544448,
    267748: 544448,
    38654: 7711,
    1316930: 544448,
    2529394: 203494,
    1579370: -1,
    1579378: -1,
    72359: 6029,
    72586: -1,
    498716: 544448,
    171279: 544448,
    171281: 544448,
    171282: 544448,
    171285: 544448,
    695850: 4762,
    1318617: 544448,
    7897: 7711,
    663362: -1,
    8467: 7711,
    8469: 7711,
    8478: 7711,
    8479: 7711,
    8496: 7711,
    8502: 7711,
    565575: 544448,
    42094: 544448,
    1222016: 91061,
    632292: 91061,
    2795018: 2885733,
    632335: 91061,
    632348: 91061,
    370354: 2605435,
    370355: 2605435,
    632516: 91061,
    632518: 91061,
    403677: 4762,
    272635: 544448,
    2665156: -1,
    273119: 544448,
    76629: 544448,
    142650: 544448,
    77133: -1,
    1257118: 555280,
    798570: -1,
    700510: -1,
    1618337: -1,
    1094619: 4762,
    1619007: -1,
    1619070: -1,
    1619077: -1,
    13735: 7711,
    79676: -1,
    505682: 544448,
    374847: -1,
    1620412: -1,
    1882918: 3031711,
    572263: 544448,
    48003: 544448,
    671143: -1,
    311458: 1643678,
    114880: 544448,
    114881: 544448,
    114885: 544448,
    1394709: -1,
    1394711: -1,
    673860: 183967,
    608506: 91061,
    412133: 2611341,
    1690483: 200643,
    347257: 544448,
    1920749: 2885733,
    2608984: 1042312,
    414004: 1643678,
    53558: 544448,
    184922: 2611341,
    86660: 544448,
    578460: 6029,
    1267001: 544448,
    907287: 544448,
    284813: 6029,
    743966: 544448,
    743971: 544448,
    2841263: -1,
    2841264: -1,
    907965: 6029,
    351627: 91061,
    876142: 6029,
    289397: 544448,
    60694: -1,
    2715735: -1,
    94835: 7711,
    29553: 544448,
    29555: 544448,
    29556: 544448,
    1700835: -1,
    1700836: -1,
    521460: 91061,
    1930593: -1,
    1078905: 1643678,
    1898108: -1,
    1603555: 1643678,
    1178016: 6029,
    1538547: 1935183,
    1899017: -1,
    2259672: 1643678,
    294381: 2605435,
    2094: 544448,
    2099: 544448,
    2100: 544448,
    2107: 544448,
    2109: 544448,
    2130: 544448,
    28903: 544448,
    29561: 544448,
    38504: 544448,
    45361: 544448,
    134821: 544448,
    171284: 544448,
    634997: 544448,
    637387: 544448,
    872331: 544448,
    936139: 544448,
    943945: 544448,
    956483: 544448,
    1116211: 544448,
    1117644: 544448,
    1118964: 544448,
    1129369: 544448,
    1397850: 544448,
}

# Manual entries for orgids handled outside the NCBI lineage lookup.
# Layout: orgid -> [superkingdom, class name, class taxid].
orgid_addendum = {
    85643:   ['Bacteria', 'Betaproteobacteria', 28216],
    263820:  ['Archaea', 'Thermoplasmata', 183967],
    228908:  ['Archaea', 'Candidatus Nanoarchaeia', 2885752],
    342610:  ['Bacteria', 'Betaproteobacteria', 28216],
    866775:  ['Bacteria', 'Gammaproteobacteria', 1236],
    1740162: ['Bacteria', 'Betaproteobacteria', 28216],
    1827146: ['Bacteria', 'Bacilli', 91061],
    1963032: ['Bacteria', 'Bacilli', 91061],
    2478912: ['Bacteria', 'Alphaproteobacteria', 28211],
    2679994: ['Bacteria', 'Gammaproteobacteria', 1236],
    2795384: ['Bacteria', 'Verrucomicrobiae', 203494],
}

# Manual taxid -> class/group name overrides, used to replace 'unknown' class labels.
# Seven keys used to appear twice in this literal (374847, 1410606, 1603555, 1920749,
# 1930593, 2259672, 2715735). Python keeps the last value for a repeated key, so the
# shadowed earlier entries were removed; the resulting mapping is unchanged.
class_addendum = {
    1238993: 'Mycoplasmatota',
    1637999: 'Verrucomicrobiae',
    1779382: 'Rhodothermota',
    1540872: 'incertae sedis',
    1911684: 'Mycoplasmatota',
    1911683: 'Mycoplasmatota',
    2488809: 'Verrucomicrobiae',
    2572087: 'incertae sedis',
    713059: 'incertae sedis',
    2572089: 'incertae sedis',
    1930275: 'Planctomycetia',
    2527975: 'Planctomycetia',
    2528011: 'Planctomycetia',
    2528008: 'Planctomycetia',
    2528012: 'Planctomycetia',
    2528018: 'Planctomycetia',
    2527962: 'Planctomycetia',
    1848255: 'Actinomycetes',
    1848756: 'Actinomycetes',
    2725268: 'Mycoplasmatota',
    2026780: 'Planctomycetia',
    2026724: 'Chloroflexia',
    1932692: 'Lentisphaeria',
    2755338: 'Clostridia',
    2268202: 'Synergistia',
    713051: 'incertae sedis',
    2231116: 'Mycoplasmatota',
    2886196: 'Lentisphaeria',
    1871025: 'Bacilli',
    2098: 'Mycoplasmatota',
    45363: 'Mycoplasmatota',
    92401: 'Mycoplasmatota',
    2116: 'Mycoplasmatota',
    1048830: 'Mycoplasmatota',
    2259673: 'Nitrososphaeria',
    2795383: 'Nitrososphaeria',
    439481: 'Thermoplasmata',
    1229908: 'Nitrososphaeria',
    1959104: 'Nitrososphaeria',
    2508726: 'Nitrososphaeria',
    1470067: 'Nitrososphaeria',
    1229909: 'Nitrososphaeria',
    1470066: 'Nitrososphaeria',
    1580092: 'Nitrososphaeria',
    662947: 'Mycoplasmatota',
    662945: 'Mycoplasmatota',
    662946: 'Mycoplasmatota',
    243273: 'Mycoplasmatota',
    663918: 'Mycoplasmatota',
    272634: 'Mycoplasmatota',
    2104: 'Mycoplasmatota',
    1112856: 'Mycoplasmatota',
    272633: 'Mycoplasmatota',
    710127: 'Mycoplasmatota',
    2096: 'Mycoplasmatota',
    1159202: 'Mycoplasmatota',
    1159204: 'Mycoplasmatota',
    1006581: 'Mycoplasmatota',
    146919: 'Rhodothermota',
    309807: 'Rhodothermota',
    761659: 'Rhodothermota',
    1159203: 'Mycoplasmatota',
    436308: 'Nitrososphaeria',
    1582439: 'Nitrososphaeria',
    1898749: 'incertae sedis',
    448385: 'Polyangia',
    56: 'Polyangia',
    1254432: 'Polyangia',
    2111: 'Mycoplasmatota',
    243272: 'Mycoplasmatota',
    1763363: 'Cyanobacteriota',
    502025: 'Polyangia',
    518766: 'Rhodothermota',
    762570: 'Rhodothermota',
    29549: 'Rhodothermota',
    347256: 'Mycoplasmatota',
    1267000: 'Mycoplasmatota',
    525904: 'Chloroflexia',
    1263835: 'Mycoplasmatota',
    710128: 'Mycoplasmatota',
    708616: 'Mycoplasmatota',
    1159197: 'Mycoplasmatota',
    1159201: 'Mycoplasmatota',
    1159199: 'Mycoplasmatota',
    1159198: 'Mycoplasmatota',
    1159200: 'Mycoplasmatota',
    459349: 'incertae sedis',
    1233262: 'Mycoplasmatota',
    1236904: 'Mycoplasmatota',
    1213463: 'Mycoplasmatota',
    2121: 'Mycoplasmatota',
    2124: 'Mycoplasmatota',
    2730359: 'Verrucomicrobiae',
    2796139: 'Verrucomicrobiae',
    92400: 'Mycoplasmatota',
    29559: 'Mycoplasmatota',
    1441379: 'Mycoplasmatota',
    722438: 'Mycoplasmatota',
    1228987: 'Cyanobacteriota',
    52: 'Polyangia',
    29554: 'Mycoplasmatota',
    142649: 'Mycoplasmatota',
    1846278: 'incertae sedis',
    927083: 'Polyangia',
    1263761: 'Mycoplasmatota',
    1263760: 'Mycoplasmatota',
    1263757: 'Mycoplasmatota',
    1263758: 'Mycoplasmatota',
    1263756: 'Mycoplasmatota',
    1263763: 'Mycoplasmatota',
    1263762: 'Mycoplasmatota',
    1280940: 'Mycoplasmatota',
    243899: 'Bacilli',
    142651: 'Mycoplasmatota',
    2026885: 'incertae sedis',
    888845: 'Polyangia',
    1391654: 'Polyangia',
    2594042: 'incertae sedis',
    281847: 'Cryptomycota',
    5722: 'Parabasalia',
    1169539: 'Vitrellaceae',
    5755: 'Discosea',
    4792: 'Oomycota',
    691883: 'Fonticulaceae',
    5762: 'Heterolobosea',
    222440: 'Metamonada',
    529818: 'Apusomonadidae',
    2903: 'Haptophyta',
    31276: 'Perkinsozoa',
    10228: 'Uniplacotomia',
    5759: 'Evosea',
    5741: 'Fornicata',
    6035: 'Microsporidia',
    33067: 'incertae sedis',
    315: 'incertae sedis',
    295358: 'Mycoplasmatota',
    262719: 'Mycoplasmatota',
    262722: 'Mycoplasmatota',
    262723: 'Mycoplasmatota',
    754503: 'Mycoplasmatota',
    33923: 'Mycoplasmatota',
    66693: 'incertae sedis',
    1410606: 'Nitrososphaeria',
    1246955: 'Mycoplasmatota',
    67593: 'Oomycota',
    2110: 'Mycoplasmatota',
    2113: 'Mycoplasmatota',
    2112: 'Mycoplasmatota',
    2120: 'Mycoplasmatota',
    2951803: 'Asgard',
    2725: 'incertae sedis',
    69011: 'incertae sedis',
    331677: 'incertae sedis',
    561169: 'Haptophyta',
    1643353: 'incertae sedis',
    496833: 'Mycoplasmatota',
    267748: 'Mycoplasmatota',
    38654: 'Chordata',
    1316930: 'Mycoplasmatota',
    2529394: 'Verrucomicrobiae',
    1579370: 'incertae sedis',
    1579378: 'incertae sedis',
    72359: 'Microsporidia',
    72586: 'incertae sedis',
    498716: 'Mycoplasmatota',
    171279: 'Mycoplasmatota',
    171281: 'Mycoplasmatota',
    171282: 'Mycoplasmatota',
    171285: 'Mycoplasmatota',
    695850: 'Oomycota',
    1318617: 'Mycoplasmatota',
    7897: 'Chordata',
    663362: 'incertae sedis',
    8467: 'Chordata',
    8469: 'Chordata',
    8478: 'Chordata',
    8479: 'Chordata',
    8496: 'Chordata',
    8502: 'Chordata',
    565575: 'Mycoplasmatota',
    42094: 'Mycoplasmatota',
    1222016: 'Bacilli',
    632292: 'Bacilli',
    2795018: 'Candidatus Micrarchaeia',
    632335: 'Bacilli',
    632348: 'Bacilli',
    370354: 'Evosea',
    370355: 'Evosea',
    632516: 'Bacilli',
    632518: 'Bacilli',
    403677: 'Oomycota',
    272635: 'Mycoplasmatota',
    2665156: 'incertae sedis',
    273119: 'Mycoplasmatota',
    76629: 'Mycoplasmatota',
    142650: 'Mycoplasmatota',
    77133: 'incertae sedis',
    1257118: 'Discosea',
    798570: 'incertae sedis',
    700510: 'incertae sedis',
    1618337: 'incertae sedis',
    1094619: 'Oomycota',
    1619007: 'incertae sedis',
    1619070: 'incertae sedis',
    1619077: 'incertae sedis',
    13735: 'Chordata',
    79676: 'incertae sedis',
    505682: 'Mycoplasmatota',
    374847: 'incertae sedis',
    1620412: 'incertae sedis',
    1882918: 'Polyangia',
    572263: 'Mycoplasmatota',
    48003: 'Mycoplasmatota',
    671143: 'incertae sedis',
    311458: 'Nitrososphaeria',
    114880: 'Mycoplasmatota',
    114881: 'Mycoplasmatota',
    114885: 'Mycoplasmatota',
    1394709: 'incertae sedis',
    1394711: 'incertae sedis',
    673860: 'Thermoplasmata',
    608506: 'Bacilli',
    412133: 'Metamonada',
    1690483: 'Bacteroidia',
    347257: 'Mycoplasmatota',
    1920749: 'Candidatus Micrarchaeia',
    2608984: 'Armatimonadia',
    414004: 'Nitrososphaeria',
    53558: 'Mycoplasmatota',
    184922: 'Metamonada',
    86660: 'Mycoplasmatota',
    578460: 'Microsporidia',
    1267001: 'Mycoplasmatota',
    907287: 'Mycoplasmatota',
    284813: 'Microsporidia',
    743966: 'Mycoplasmatota',
    743971: 'Mycoplasmatota',
    2841263: 'incertae sedis',
    2841264: 'incertae sedis',
    907965: 'Microsporidia',
    351627: 'Bacilli',
    876142: 'Microsporidia',
    289397: 'Mycoplasmatota',
    60694: 'incertae sedis',
    2715735: 'incertae sedis',
    94835: 'Chordata',
    29553: 'Mycoplasmatota',
    29555: 'Mycoplasmatota',
    29556: 'Mycoplasmatota',
    1700835: 'incertae sedis',
    1700836: 'incertae sedis',
    521460: 'Bacilli',
    1930593: 'incertae sedis',
    1078905: 'Nitrososphaeria',
    1898108: 'incertae sedis',
    1603555: 'Nitrososphaeria',
    1178016: 'Microsporidia',
    1538547: 'Asgard',
    1899017: 'incertae sedis',
    2259672: 'Nitrososphaeria',
    294381: 'Evosea',
    2094: 'Mycoplasmatota',
    2099: 'Mycoplasmatota',
    2100: 'Mycoplasmatota',
    2107: 'Mycoplasmatota',
    2109: 'Mycoplasmatota',
    2130: 'Mycoplasmatota',
    28903: 'Mycoplasmatota',
    29561: 'Mycoplasmatota',
    38504: 'Mycoplasmatota',
    45361: 'Mycoplasmatota',
    134821: 'Mycoplasmatota',
    171284: 'Mycoplasmatota',
    634997: 'Mycoplasmatota',
    637387: 'Mycoplasmatota',
    872331: 'Mycoplasmatota',
    936139: 'Mycoplasmatota',
    943945: 'Mycoplasmatota',
    956483: 'Mycoplasmatota',
    1116211: 'Mycoplasmatota',
    1117644: 'Mycoplasmatota',
    1118964: 'Mycoplasmatota',
    1129369: 'Mycoplasmatota',
    1397850: 'Mycoplasmatota',
}

class_revision = {'Gammaproteobacteria': 'Gammaproteobacteria',
'Actinomycetes': 'Actinomycetota',
'Alphaproteobacteria': 'Alphaproteobacteria',
'Betaproteobacteria': 'Betaproteobacteria',
'Deltaproteobacteria': 'Deltaproteobacteria',
'Mollicutes': 'Mycoplasmatota',
'Mycoplasmatota':'Mycoplasmatota',
'incertae sedis': 'DELETE',
'Korarchaeia': 'TACK group',
'Myxococcia': 'Myxococcota',
'Bipolaricaulia': 'DELETE',
'Chloroflexia': 'Chloroflexota',
'Thermodesulfobacteria': 'Thermodesulfobacteriota',
'Deferribacteres': 'DELETE',
'Acidimicrobiia': 'Actinomycetota',
'Rubrobacteria': 'Actinomycetota',
'Coriobacteriia': 'Actinomycetota',
'Bacilli': 'Bacillota',
'Flavobacteriia': 'FCB group',
'Sphingobacteriia': 'FCB group',
'Chrysiogenetes': 'DELETE',
'Spartobacteria': 'PVC group',
'Thermoprotei': 'TACK group',
'Methanobacteria': 'Euryarchaeota',
'Methanococci': 'Euryarchaeota',
'Halobacteria': 'Euryarchaeota',
'Thermoplasmata': 'Thermoplasmata',
'Thermococci': 'Euryarchaeota',
'Archaeoglobi': 'Euryarchaeota',
'Methanopyri': 'Euryarchaeota',
'Clostridia': 'Bacillota',
'Aquificae': 'DELETE',
'Thermotogae': 'Thermotogae',
'Rhodothermia': 'FCB group',
'Deinococci': 'DELETE',
'Thermomicrobia': 'DELETE',
'Chlorobiia': 'FCB group',
'Bacteroidia': 'FCB group',
'Dictyoglomia': 'DELETE',
'Fusobacteriia': 'DELETE',
'Verrucomicrobiae': 'PVC group',
'Planctomycetia': 'PVC group',
'Spirochaetia': 'DELETE',
'Nitrospiria': 'DELETE',
'Chlamydiia': 'Chlamydiia',
'Fibrobacteria': 'FCB group',
'Terriglobia': 'Acidobacteriota',
'Gemmatimonadetes': 'FCB group',
'Methanomicrobia': 'Euryarchaeota',
'Anaerolineae': 'Chloroflexota',
'Dehalococcoidia': 'Chloroflexota',
'Gracilibacteria': 'DELETE',
'Ktedonobacteria': 'Chloroflexota',
'Opitutae': 'PVC group',
'Endomicrobiia': 'DELETE',
'Caldilineae': 'Chloroflexota',
'Erysipelotrichia': 'Bacillota',
'Holophagae': 'Acidobacteriota',
'Zetaproteobacteria': 'DELETE',
'Elusimicrobia': 'DELETE',
'Synergistia': 'DELETE',
'Phycisphaerae': 'PVC group',
'Caldisericia': 'DELETE',
'Cytophagia': 'FCB group',
'Ignavibacteria': 'FCB group',
'Nitriliruptoria': 'Actinomycetota',
'Negativicutes': 'Bacillota',
'Nanohaloarchaea': 'Euryarchaeota',
'Lentisphaeria': 'PVC group',
'Ardenticatenia': 'Chloroflexota',
'Thermoleophilia': 'Actinomycetota',
'Oligoflexia': 'DELETE',
'Blastocatellia': 'Acidobacteriota',
'Nitrososphaeria': 'TACK group',
'Fimbriimonadia': 'DELETE',
'Limnochordia': 'Bacillota',
'Tissierellia': 'Bacillota',
'Acidithiobacillia': 'DELETE',
'Vicinamibacteria': 'Acidobacteriota',
'Rhodothermota': 'FCB group',
'Balneolia': 'FCB group',
'Chitinophagia': 'FCB group',
'Candidatus Izimaplasma': 'Mycoplasmatota',
'Kiritimatiellia': 'PVC group',
'Asgard': 'Asgard',
'Saprospiria': 'FCB group',
'Methylacidiphilae': 'PVC group',
'Calditrichia': 'DELETE',
'Hydrogenophilia': 'DELETE',
'Candidatus Saccharimonadia': 'DELETE',
'Saccharimonadia': 'DELETE',
'Coprothermobacteria': 'DELETE',
'Candidatus Thermofonsia': 'Chloroflexota',
'Candidatus Babeliae': 'DELETE',
'Candidatus Brocadiia': 'PVC group',
'Tepidiformia': 'Chloroflexota',
'Conexivisphaeria': 'TACK group',
'Thermodesulfovibrionia': 'DELETE',
'Atribacteria': 'DELETE',
'Candidatus Nanohalobia': 'DPANN group',
'Candidatus Micrarchaeia': 'DPANN group',
'Micrarchaeia': 'DPANN group',
'Candidatus Nanoarchaeia': 'DPANN group',
'Tichowtungiia': 'PVC group',
'Syntrophobacteria': 'Thermodesulfobacteriota',
'Desulfobacteria': 'Thermodesulfobacteriota',
'Cyanophyceae': 'Cyanobacteriota',
'Bdellovibrionia': 'DELETE',
'Bacteriovoracia': 'Bdellovibrionota',
'Desulfovibrionia': 'Thermodesulfobacteriota',
'Desulfobulbia': 'Thermodesulfobacteriota',
'Desulfarculia': 'Thermodesulfobacteriota',
'Desulfobaccia': 'Thermodesulfobacteriota',
'Syntrophia': 'Thermodesulfobacteriota',
'Desulfomonilia': 'Thermodesulfobacteriota',
'Desulfuromonadia': 'Thermodesulfobacteriota',
'Polyangia': 'Myxococcota',
'Epsilonproteobacteria': 'Campylobacterota',
'Desulfurellia': 'Campylobacterota',
'Bangiophyceae': 'Rhodophyta',
'Florideophyceae': 'Rhodophyta',
'Chrysophyceae': 'Ochrophyta',
'Haptophyta': 'Haptista',
'Xanthophyceae': 'Ochrophyta',
'Dinophyceae': 'Dinophyceae',
'Phaeophyceae': 'Ochrophyta',
'Cryptophyceae': 'Cryptophyceae',
'Euglenida': 'Euglenida',
'Chlorophyta': 'Chlorophyta',
'Chlorophyceae': 'Chlorophyta',
'Bryopsida': 'Streptophyta',
'Gnetopsida': 'Streptophyta',
'Magnoliopsida': 'Streptophyta',
'Oomycota': 'Oomycota',
'Saccharomycetes': 'Ascomycota',
'Ustilaginomycetes': 'Basidiomycota',
'Kinetoplastea': 'Euglenida',
'Parabasalia': 'Metamonada',
'Eustigmatophyceae': 'Ochrophyta',
'Heterolobosea': 'Heterolobosea',
'Colpodea': 'Ciliophora',
'Litostomatea': 'Ciliophora',
'Prostomatea': 'Ciliophora',
'Nassophorea': 'Ciliophora',
'Oligohymenophorea': 'Ciliophora',
'Microsporidia': 'Microsporidia',
'Demospongiae': 'Metazoa',
'Hydrozoa': 'Metazoa',
'Anthozoa': 'Metazoa',
'Scyphozoa': 'Metazoa',
'Trematoda': 'Metazoa',
'Cestoda': 'Metazoa',
'Polychaeta': 'Metazoa',
'Gastropoda': 'Metazoa',
'Bivalvia': 'Metazoa',
'Cephalopoda': 'Metazoa',
'Branchiopoda': 'Metazoa',
'Arachnida': 'Metazoa',
'Chilopoda': 'Metazoa',
'Echinoidea': 'Metazoa',
'Chordata': 'Metazoa',
'Ascidiacea': 'Metazoa',
'Chondrichthyes': 'Metazoa',
'Amphibia': 'Metazoa',
'Lepidosauria': 'Metazoa',
'Aves': 'Metazoa',
'Calcarea': 'Metazoa',
'Choanoflagellata': 'Choanoflagellata',
'Foraminifera': 'Rhizaria',
'Chlorarachniophyceae': 'Rhizaria',
'Ginkgoopsida': 'Streptophyta',
'Ulvophyceae': 'Chlorophyta',
'Opisthokonta': 'Opisthokonta',
'Spirotrichea': 'Ciliophora',
'Coscinodiscophyceae': 'Ochrophyta',
'Bacillariophyceae': 'Ochrophyta',
'Fragilariophyceae': 'Ochrophyta',
'Synurophyceae': 'Ochrophyta',
'Pelagophyceae': 'Ochrophyta',
'Phyllopharyngea': 'Ciliophora',
'Glaucocystophyceae': 'Glaucocystophyceae',
'Raphidophyceae': 'Ochrophyta',
'Dictyochophyceae': 'Ochrophyta',
'Mammalia': 'Metazoa',
'Clitellata': 'Metazoa',
'Insecta': 'Metazoa',
'Pinopsida': 'Streptophyta',
'Hexactinellida': 'Metazoa',
'Acantharea': 'Rhizaria',
'Polycystinea': 'Rhizaria',
'Hexanauplia': 'Metazoa',
'Trebouxiophyceae': 'Chlorophyta',
'Homoscleromorpha': 'Metazoa',
'Ancyromonadidae': 'Ancyromonadidae',
'Bolidophyceae': 'Ochrophyta',
'Mesostigmatophyceae': 'Streptophyta',
'Lingulata': 'Metazoa',
'Hyperoartia': 'Metazoa',
'Chromadorea': 'Metazoa',
'Ichthyosporea': 'Ichthyosporea',
'Zygnemophyceae': 'Streptophyta',
'Chlorokybophyceae': 'Streptophyta',
'Klebsormidiophyceae': 'Streptophyta',
'Malawimonadidae': 'Malawimonadidae',
'Cercozoa': 'Rhizaria',
'Tentaculata': 'Metazoa',
'Eumycetozoa': 'Amoebozoa',
'Jakobidae': 'Jakobida',
'Dothideomycetes': 'Ascomycota',
'Eurotiomycetes': 'Ascomycota',
'Lecanoromycetes': 'Ascomycota',
'Pezizomycetes': 'Ascomycota',
'Sordariomycetes': 'Ascomycota',
'Schizosaccharomycetes': 'Ascomycota',
'Taphrinomycetes': 'Ascomycota',
'Neolectomycetes': 'Ascomycota',
'Tremellomycetes': 'Basidiomycota',
'Agaricomycetes': 'Basidiomycota',
'Pinguiophyceae': 'Ochrophyta',
'Microbotryomycetes': 'Basidiomycota',
'Apusomonadidae': 'Apusozoa',
'Colpodellaceae': 'Colpodellida',
'Actinopteri': 'Metazoa',
'Marchantiopsida': 'Streptophyta',
'Collodictyonidae': 'CRuMs',
'Diplonemea': 'Euglenida',
'Centroplasthelida': 'Haptista',
'Heterotrichea': 'Ciliophora',
'Fornicata': 'Metamonada',
'Glomeromycetes': 'Fungi incertae sedis',
'Polypodiopsida': 'Streptophyta',
'Coleochaetophyceae': 'Streptophyta',
'Charophyceae': 'Streptophyta',
'Anthocerotopsida': 'Streptophyta',
'Picozoa': 'Eukaryota incertae sedis',
'Aconoidasida': 'Apicomplexa',
'Wallemiomycetes': 'Basidiomycota',
'Cystobasidiomycetes': 'Basidiomycota',
'Stylonematophyceae': 'Rhodophyta',
'Chytridiomycetes': 'Fungi incertae sedis',
'Monoblepharidomycetes': 'Fungi incertae sedis',
'Neocallimastigomycetes': 'Fungi incertae sedis',
'Blastocladiomycetes': 'Fungi incertae sedis',
'Amoebozoa': 'Amoebozoa',
'Discosea': 'Amoebozoa',
'Histionidae': 'Jakobida',
'Synchromophyceae': 'Ochrophyta',
'Telonemida': 'Telonemida',
'Mediophyceae': 'Ochrophyta',
'Armophorea': 'Ciliophora',
'Rhodellophyceae': 'Rhodophyta',
'Palpitomonas': 'Eukaryota incertae sedis',
'Compsopogonophyceae': 'Rhodophyta',
'Thecofilosea': 'Rhizaria',
'Cryptomycota': 'Fungi incertae sedis',
'Mamiellophyceae': 'Chlorophyta',
'Tsukubamonadidae': 'Tsukubamonadidae',
'Xylonomycetes': 'Ascomycota',
'Mantamonadidae': 'CRuMs',
'Nephroselmidophyceae': 'Chlorophyta',
'Plagiopylea': 'Ciliophora',
'Conoidasida': 'Apicomplexa',
'Planomonadidae': 'Ancyromonadidae',
'Picomonadea': 'Eukaryota incertae sedis',
'Basidiobolomycetes': 'Fungi incertae sedis',
'Entomophthoromycetes': 'Fungi incertae sedis',
'Breviatea': 'Breviatea',
'Flabellinia': 'Amoebozoa',
'Colponemidia': 'Colponemidia',
'Aphelidea': 'Aphelidea',
'Lycopodiopsida': 'Streptophyta',
'Chlorodendrophyceae': 'Chlorophyta',
'Malasseziomycetes': 'Basidiomycota',
'Imbricatea': 'Rhizaria',
'Ascetosporea': 'Rhizaria',
'Palmophyllophyceae': 'Chlorophyta',
'Mucoromycetes': 'Fungi incertae sedis',
'Mortierellomycetes': 'Fungi incertae sedis',
'Kickxellomycetes': 'Fungi incertae sedis',
'Dimargaritomycetes': 'Fungi incertae sedis',
'Harpellomycetes': 'Fungi incertae sedis',
'Zoopagomycetes': 'Fungi incertae sedis',
'Chloropicophyceae': 'Chlorophyta',
'Fonticulaceae': 'Fonticulaceae',
'Hemimastigophora': 'Hemimastigophora',
'Perkinsozoa': 'Perkinsozoa',
'Rhodelphea': 'Rhodelphea',
'Endomyxa': 'Rhizaria',
'Variosea': 'Amoebozoa',
'Evosea': 'Amoebozoa',
'Vitrellaceae': 'Colpodellida',
'Chromeraceae': 'Colpodellida',
'Alphamonaceae': 'Colpodellida',
'CRuMs': 'CRuMs',
'Metamonada': 'Metamonada',
'Preaxostyla': 'Metamonada',
'Echinamoebida': 'Amoebozoa',
'Elardia': 'Amoebozoa',
'Leptocardii': 'Metazoa',
'Andaluciidae': 'Jakobida',
'Stygiellidae': 'Jakobida',
'Bigyra': 'Bigyra',
'Ancyromonadida': 'Ancyromonadidae',
'Filasterea': 'Filasterea',
'Picocystophyceae': 'Chlorophyta',
'Sar': 'Sar',
'Pyramimonadophyceae': 'Chlorophyta',
'Muranotrichea': 'Ciliophora',
'Phytomyxea': 'Rhizaria',
'Prasinodermophyceae': 'Prasinodermophyceae',
'Sanchytriomycetes': 'Fungi incertae sedis',
'Marosporida': 'Apicomplexa',
'Nebulidea': 'Nebulidea',
'Uniplacotomia': 'Metazoa',
'Candidatus Cloacimonadia': 'FCB group',
'Candidatus Lokiarchaeia': 'Asgard',
'Candidatus Nanohaloarchaea': 'Euryarchaeota',
'Cyanobacteriota': 'Cyanobacteriota'}

In [None]:
def parse_annot(file):
    """Split a tab-separated annotation table into Pfam, EC and GO subsets.

    The table is read with its first column as the index and must contain a
    ``predicted_label`` column whose values look like ``<type>:<label>``
    (e.g. ``EC:2.7.11.1``).  The prefix is moved to a new ``label_type``
    column and ``predicted_label`` keeps only the text after the first colon.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(data_pfam, data_EC, data_GO)``: the rows whose ``label_type`` is
        ``'Pfam'``, ``'EC'`` and ``'GO'`` respectively.  ``data_EC`` keeps
        only the last row per index value and has an extra boolean
        ``defined_EC`` column that is False when the label ends in ``'-'``.
    """
    data = pd.read_csv(file, sep='\t', index_col=0)

    # Split on the first ':' only so a label that itself contains a colon is
    # kept intact; the .str accessor also copes with a table that has no rows.
    label_parts = data.predicted_label.str.split(':', n=1)
    data['label_type'] = label_parts.str[0]
    data['predicted_label'] = label_parts.str[1]

    data_pfam = data[data.label_type == 'Pfam']
    data_EC = data[data.label_type == 'EC']
    data_GO = data[data.label_type == 'GO']

    # Keep the last EC row per index value.  Filtering on the index avoids
    # hard-coding the index name, and .copy() yields an independent frame so
    # adding a column below cannot raise SettingWithCopyWarning.
    data_EC = data_EC[~data_EC.index.duplicated(keep='last')].copy()
    # A label ending in '-' is flagged as not fully defined.
    data_EC['defined_EC'] = data_EC.predicted_label.str[-1] != '-'

    return data_pfam, data_EC, data_GO

In [None]:
# Header lookup tables: tab-separated files without a header row, first column
# used as the index (presumably the accession -> description mappings written
# by the header-reformatting cells earlier in the notebook; confirm).
prok_labels = pd.read_csv('core_data/prok2111_as/prok2111_as.fasta.header_mapping', sep='\t', index_col=0, header=None)
# Plain assignment: the former `euk_labels = labels = ...` also bound `labels`
# to the eukaryote-only frame, which was then overwritten right below.
euk_labels = pd.read_csv('core_data/euk72_ep/euk72_ep.fasta.header_mapping', sep='\t', index_col=0, header=None)

# Combined lookup covering both sets.
labels = pd.concat([prok_labels, euk_labels])

In [None]:
# prok_labels was read with index_col=0, so the first file column is the index
# and only one data column is left to name.  The former chained assignment
# rebound prok_labels to the list of names and then raised AttributeError.
prok_labels.index.name = 'acc'
prok_labels.columns = ['desc']

In [None]:
# Name the single data column of the combined lookup table.
labels.columns = ['desc']

In [None]:
# Display the combined header lookup table.
labels

In [None]:
# euk_labels was read with index_col=0, so the first file column is the index
# and only one data column is left to name.  (The former `euk_labels. = ...`
# was a syntax error.)
euk_labels.index.name = 'acc'
euk_labels.columns = ['desc']

In [None]:
# Parse the glycolysis annotation table into its Pfam / EC / GO subsets.
data_pfam, data_EC, data_GO = parse_annot('/data/luojaa/proteinfer/annot_microcosms/glycolysis.tsv')

In [None]:
# EC rows flagged as defined (label does not end in '-').
data_EC.loc[data_EC['defined_EC']]

In [None]:
# Raw, unsplit annotation table (same file that parse_annot reads above);
# note it has no label_type / defined_EC columns.
data = pd.read_csv('/data/luojaa/proteinfer/annot_microcosms/glycolysis.tsv', sep='\t', index_col=0)

In [None]:
# Distinct index values (one per annotated sequence) in the raw table.
data.index.unique()

In [None]:
# The 50 most frequent GO descriptions.
data_GO['description'].value_counts().head(50)

In [None]:
# Distinct descriptions among defined EC labels with confidence above 0.9.
data_EC.loc[data_EC['defined_EC'] & data_EC['confidence'].gt(0.9), 'description'].unique()

In [None]:
# How often each description occurs among defined EC labels with confidence above 0.9.
data_EC.loc[data_EC['defined_EC'] & data_EC['confidence'].gt(0.9), 'description'].value_counts()

In [None]:
# Index values of the sequences annotated as ferredoxin--NADP(+) reductase ...
i = data_EC.loc[data_EC['description'].eq('Ferredoxin--NADP(+) reductase.')].index
# ... and their entries in the header lookup table.
labels.loc[i]

In [None]:
 # EC rows annotated as ferredoxin--NADP(+) reductase.
 data_EC[data_EC.description == 'Ferredoxin--NADP(+) reductase.']

In [None]:
# Index of the defined, confidence > 0.9 EC rows, minus the first ten.
# NOTE(review): the [10:] offset presumably skips non-prokaryotic hits - confirm.
prok_i = data_EC.loc[data_EC['defined_EC'] & data_EC['confidence'].gt(0.9)].index[10:]

In [None]:
# EC annotations for the selected index values.
data_EC.loc[prok_i]

In [None]:
# Print the selected index values as one comma-separated line.
print(','.join(map(str, data_EC.loc[prok_i].index)))

In [None]:
# Header lookup entries for the selected index values, as a NumPy array.
labels.loc[prok_i].values

In [None]:
# First 31 EC rows with the index moved back into an ordinary column.
data_EC.head(31).reset_index()

In [None]:
# First 31 EC rows, keeping only the last row per sequence_name.
data_EC.head(31).reset_index().drop_duplicates(subset='sequence_name', keep='last')

In [None]:
# Defined EC labels with confidence above 0.95.
# The raw `data` frame has neither `defined_EC` nor `label_type` (parse_annot
# adds them to its own frame only), so filter the parsed EC table instead; the
# dangling trailing '.' of the original line was a syntax error.
# NOTE(review): confirm data_EC (last EC row per sequence) is the intended table.
data_EC[(data_EC.defined_EC) & (data_EC.label_type == 'EC') & (data_EC.confidence > 0.95)]