In [16]:
import os
import os.path
import urllib.request

from ete3 import Tree

# Extract Arabidopsis reference KO annotations

In [17]:
# Path where the raw Arabidopsis gene-to-KO table fetched from KEGG is saved.
athkeggout = '../../../her7/02_functional_annotations/Arabidopsis_KEGG_reference_annotations.txt'
# Minimum support value for internal gene-tree nodes: nodes whose support is
# strictly below this value are deleted before the tree is searched.
# NOTE(review): with 0 nothing is removed as long as supports are non-negative — confirm.
bs_threshold = 0

In [18]:
# Ath gene ids to ko KEGG terms.
# The table is downloaded with urllib rather than a shell `curl` call:
# HTTP/network failures raise an exception, so a failed download can no
# longer silently overwrite the reference file with an empty response.
with urllib.request.urlopen("https://rest.kegg.jp/link/ko/ath") as response:
    result = response.read().decode("utf-8")

# Keep a local copy of the raw response.
with open(athkeggout, 'w') as fo:
    fo.write(result)

# koDict: gene id -> KO id, both with the text up to the first ':' removed.
# If a gene occurs on several lines, the last KO seen is the one kept.
koDict = {}
lines = result.split('\n')

for line in lines:
    if line == '':
        continue

    # Each line is parsed as two tab-separated "<prefix>:<id>" fields.
    fields = line.rstrip().split('\t')
    gene = fields[0].split(':')[1]
    ko = fields[1].split(':')[1]

    # KO entries containing '+' are reported and not stored.
    if '+' in ko:
        print('Warning:', ko)
        continue

    koDict[gene] = ko

In [19]:
# koFriends: KO id -> set of KO ids listed as comma-separated alternatives on
# the same ORTHOLOGY line of an Arabidopsis KEGG module (used to pair
# orthologous ko terms together, eg K12638 and K12637 in M00371).
# Entries containing '+' are reported and left out.
koFriends = {}

# List of all ath KEGG modules. urllib is used instead of a shell `curl`
# command so that text taken from a remote response is never handed to a
# shell, and so that HTTP/network failures raise instead of yielding ''.
with urllib.request.urlopen("https://rest.kegg.jp/link/module/ath") as response:
    result = response.read().decode("utf-8")
lines = result.split('\n')

# The module id is whatever follows the last '_' on each line.
modSet = set()
for line in lines:
    mod = line.rstrip().split('_')[-1]
    modSet.add(mod)

# sorted() makes the request order and the printed messages reproducible.
for mod in sorted(modSet):
    if mod == '':
        # Produced by blank lines in the module listing.
        continue

    with urllib.request.urlopen("https://rest.kegg.jp/get/" + mod) as response:
        modres = response.read().decode("utf-8")

    if 'ORTHOLOGY  ' not in modres:
        print('Warning: no ORTHOLOGY section for', mod)
        continue

    # Text between the ORTHOLOGY and CLASS field labels of the module entry.
    orthologs = modres.split('ORTHOLOGY  ')[1].split('CLASS  ')[0]
    for line in orthologs.split('\n'):
        # Skip blank and whitespace-only lines; split()[0] would fail on them.
        if not line.strip():
            continue

        # The first whitespace-delimited token holds the comma-separated KOs.
        kolist = line.split()[0].split(',')

        if len(kolist) > 1:
            koset = set()

            for ko in kolist:
                if '+' in ko:
                    print('Skipping: ', ko)
                else:
                    koset.add(ko)

            # Every retained KO becomes a friend of all KOs on this line,
            # itself included; friends from earlier lines are preserved.
            for ko in koset:
                koFriends.setdefault(ko, set()).update(koset)

Skipping:  K00955+K00957
Skipping:  K05859+K05860+K05861
Skipping:  K01677+K01678
Skipping:  K00087+K13479+K13480
Skipping:  K13481+K13482
Skipping:  K11177+K11178+K13483
Skipping:  K00087+K13479+K13480
Skipping:  K13481+K13482
Skipping:  K11177+K11178+K13483
Skipping:  K00087+K13479+K13480
Skipping:  K13481+K13482
Skipping:  K11177+K11178+K13483
Skipping:  K11204+K11205
Skipping:  K00282+K00283
Skipping:  K01955+K01956
Skipping:  K00609+K00610
Skipping:  K00166+K00167


In [20]:
# Number of KO terms that received an entry in koFriends.
len(koFriends)

592

# Extract orthogroups with 1+ Arabidopsis and 1+ Ceratopteris gene(s)

In [21]:
# ogDict: orthogroup id -> [Arabidopsis genes, Ceratopteris genes].
# Only orthogroups with at least one gene of each species are kept.
ogDict = {}

orthofile = '../../../fern_phylogenetics/2_orthofinder/Orthogroups/Orthogroups.txt'

# `with` guarantees the file is closed even if a line fails to parse.
with open(orthofile) as fi:
    for line in fi:
        # A blank line would otherwise fail the two-value unpacking below.
        if not line.strip():
            continue

        # Each line is "<orthogroup>: <gene> <gene> ..."; split only on the
        # first ': ' so the gene list is never cut short.
        og, genes = line.rstrip().split(': ', 1)
        genelist = genes.split(' ')

        # The species tag is the text after the last '_' of a gene name.
        athlist = [gene for gene in genelist if gene.split('_')[-1] == 'ath']
        crilist = [gene for gene in genelist if gene.split('_')[-1] == 'cri']

        if athlist and crilist:
            ogDict[og] = [athlist, crilist]

# Parse ath-cri relationships from phylogenies

In [22]:
# koSuperSet: set of (Ceratopteris gene id, KO id) pairs.
#
# For every Arabidopsis gene that has a KO in koDict, walk up its gene tree
# one ancestor at a time. An ancestor is accepted as long as none of the
# other Arabidopsis leaves below it carries a KO outside the target KO set
# (Arabidopsis leaves without a KO in koDict are ignored). Ceratopteris
# leaves under accepted ancestors receive every KO of the target set; the
# walk stops at the first rejected ancestor.
koSuperSet = set()

for og in ogDict:
    ogfile = '../../../fern_phylogenetics/2_orthofinder/Gene_Trees_IQTREE/' + og + '.ntr.tree'

    # Orthogroups without a tree file are skipped.
    if not os.path.exists(ogfile):
        continue

    t = Tree(ogfile, format=1)

    # Delete internal nodes whose name, read as a number, is below
    # bs_threshold. Names starting with 'n' are left alone, and unnamed
    # internal nodes are skipped instead of raising IndexError.
    for node in t.get_descendants():
        if node.is_leaf() or not node.name or node.name.startswith('n'):
            continue
        if float(node.name) < bs_threshold:
            node.delete()

    for athgene in ogDict[og][0]:
        # koDict keys are the gene name up to the first '.'.
        athid = athgene.split('.')[0]
        if athid not in koDict:
            continue

        # Target KO set: the gene's own KO plus its koFriends, if any.
        athko = koDict[athid]
        if athko in koFriends:
            targetko = koFriends[athko]
        else:
            targetko = set([athko])

        # A gene listed in the orthogroup may be absent from the tree;
        # report it rather than failing on an empty result list.
        matches = t.get_leaves_by_name(name=athgene)
        if not matches:
            print('Warning:', athgene, 'not found in', ogfile)
            continue
        Q = matches[0]

        crigenes_fin = set()

        for A in Q.iter_ancestors():
            toofar = False
            crigenes = set()

            for leaf in A.iter_leaves():
                species = leaf.name.split('_')[-1]

                if species == 'ath' and leaf.name != athgene:
                    ko = koDict.get(leaf.name.split('.')[0])
                    if ko is not None and ko not in targetko:
                        # This clade is rejected, so the remaining leaves
                        # cannot change the outcome.
                        toofar = True
                        break

                if species == 'cri':
                    # Output id is 'Ceric.' + the second '.'-separated field.
                    crigenes.add('Ceric.' + leaf.name.split('.')[1])

            if toofar:
                break

            crigenes_fin |= crigenes

        for gene in crigenes_fin:
            for ko in targetko:
                koSuperSet.add((gene, ko))
            

In [23]:
# Number of unique (Ceratopteris gene, KO) pairs collected.
len(koSuperSet)

8223

In [24]:
outfile = '../../../her7/02_functional_annotations/Ceratopteris_KEGG_annotations_by_phylogeny.txt'

# Write one "<gene>\t<KO>" line per pair. The pairs are sorted because set
# iteration order is not stable between runs; sorting keeps the file
# identical for identical input. `with` closes the file even on error.
with open(outfile, 'w') as fo:
    for gene, ko in sorted(koSuperSet):
        fo.write(gene + '\t' + ko + '\n')

Example KEGG module entry with alternative KO terms (K12638 and K12637): https://rest.kegg.jp/get/M00371