In [1]:
from ete3 import PhyloTree, Tree, NCBITaxa, TreeStyle, AttrFace, NodeStyle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from os.path import join
import urllib3
import shutil
from Bio import SeqIO
import subprocess
import time

# Shared taxonomy handle; later cells call ncbi.get_lineage() and
# ncbi.translate_to_names() on it.
# NOTE(review): presumably backed by ete3's local NCBI taxonomy database —
# confirm it is installed/up to date before running the notebook.
ncbi = NCBITaxa()

The multiple sequence alignments were all made in bash: the unaligned FASTA sequences from EggNOG 5 were downloaded separately for bacteria and archaea and merged into one file, and the multiple sequence alignment of the corresponding orthologous group was retrieved from the OrthoFinder (OF) results. The OF MSA was combined with the EggNOG sequences using the MAFFT `--add` option. The FASTA headers of the Asgard archaea were changed to start with their taxonomic identifier "1935183.". The resulting file was then used as input for FastTree.


Example code:

mafft --auto --add /folder/EggNOG_COG.fasta --reorder /folder/OF_OG.fa > /folder/COG_msa.fasta

fasttree /folder/COG_msa.fasta > /folder/COG.nwk



# Manually add the archaeal COGs, use root level of EggNOG 6

In [2]:
# Gene names label the rows; the bacterial COG identifier of each gene is
# stored in the (integer-labelled) column 0.
gene_names = ["AckA", "FolD", "MetV", "AcsE", "MtrH", "MtaA", "MtaC"]
cog_ids = ["COG0282", "COG0190", "COG4656", "COG1410",
           "COG1962", "COG0407", "COG5012"]

# Manually curated archaeal orthologous groups per gene, in the same order as
# gene_names. An empty string marks a gene without archaeal groups.
archaeal_ogs = [
    ["arCOG05260"],
    "",
    "",
    "",
    ["arCOG03220", "arCOG04336"],
    ["arCOG02028", "arCOG03323", "arCOG03324", "arCOG03325"],
    ["arCOG02028", "arCOG02030", "arCOG02032", "arCOG03323",
     "arCOG03402", "arCOG03473", "arCOG03932"],
]

COGs = pd.DataFrame(cog_ids, index=gene_names)
COGs["Archaeal"] = pd.Series(archaeal_ogs, index=gene_names)
COGs

Unnamed: 0,0,Archaeal
AckA,COG0282,[arCOG05260]
FolD,COG0190,
MetV,COG4656,
AcsE,COG1410,
MtrH,COG1962,"[arCOG03220, arCOG04336]"
MtaA,COG0407,"[arCOG02028, arCOG03323, arCOG03324, arCOG03325]"
MtaC,COG5012,"[arCOG02028, arCOG02030, arCOG02032, arCOG0332..."


Load the mapping file

In [3]:
# Column labels for the header-less annotation table (lines starting with "#"
# are skipped while reading).
emapper_columns = [
    "query", "seed_ortholog", "evalue", "score", "eggNOG_OGs",
    "max_annot_lvl", "COG_category", "Description", "Preferred_name",
    "GOs", "EC", "KEGG_ko", "KEGG_Pathway", "KEGG_Module", "KEGG_Reaction",
    "KEGG_rclass", "BRITE", "KEGG_TC", "CAZy", "BiGG_Reaction", "PFAMs",
]
emapper = pd.read_csv(
    "../Data/input/MM_jbrjl6fg.emapper.annotations.tsv",
    sep="\t",
    comment="#",
    header=None,
)
emapper.columns = emapper_columns

# OFOG: the part of the query name before the first "."
# eOG:  the part of the eggNOG_OGs string before the first "@"
query_pieces = emapper["query"].str.split(".", expand=True)
eggnog_pieces = emapper["eggNOG_OGs"].str.split("@", expand=True)
OFOG_Annot = pd.DataFrame({"OFOG": query_pieces[0], "eOG": eggnog_pieces[0]})

# Collect, per eOG, the list of OFOG identifiers annotated with it, then attach
# that list to every row of COGs whose column 0 equals the eOG (left join, so
# genes without a match keep a missing value).
ofogs_per_eog = OFOG_Annot.pivot_table(index="eOG", values="OFOG", aggfunc=list)
LinkedCOGs = COGs.merge(ofogs_per_eog, how="left", left_on=0, right_on="eOG")

# Restore the gene-name row labels of COGs on the merged table.
LinkedCOGs.index = COGs.index
LinkedCOGs

Unnamed: 0,0,Archaeal,OFOG
AckA,COG0282,[arCOG05260],
FolD,COG0190,,[OG0001033]
MetV,COG4656,,"[OG0000883, OG0004585]"
AcsE,COG1410,,[OG0003553]
MtrH,COG1962,"[arCOG03220, arCOG04336]",[OG0000339]
MtaA,COG0407,"[arCOG02028, arCOG03323, arCOG03324, arCOG03325]",
MtaC,COG5012,"[arCOG02028, arCOG02030, arCOG02032, arCOG0332...","[OG0000098, OG0002348, OG0004668]"


# Download and gather sequences, align and build tree

### Download the bacterial level MSA from EggNOG

In [None]:
# Fetch, for every gene in COGs, the EggNOG "raw_alg" entry of its COG
# (column 0) and store it as ../Data/LGTAnalysis/<gene>/<COG>_msa.fa.

# A single connection pool is reused for all requests instead of creating a
# new one per gene.
http = urllib3.PoolManager()

for OG in COGs.index:
    # exist_ok=True keeps the cell re-runnable: without it a second run stops
    # with FileExistsError on the first gene.
    os.makedirs(f"../Data/LGTAnalysis/{OG}", exist_ok=True)
    url = f"http://eggnogapi5.embl.de/nog_data/text/raw_alg/{COGs.loc[OG,0]}"
    filename = f"../Data/LGTAnalysis/{OG}/{COGs.loc[OG,0]}_msa.fa"

    # preload_content=False streams the body instead of loading it in memory.
    with http.request('GET', url, preload_content=False) as res:
        # Fail loudly rather than saving an HTTP error page as a FASTA file.
        if res.status != 200:
            raise RuntimeError(
                f"Download of {url} failed with HTTP status {res.status}")
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(res, out_file)

### Download the corresponding Archaeal level MSAs from EggNOG

In [None]:
# Fetch the EggNOG "raw_alg" entry of every archaeal group listed in
# COGs.Archaeal and store it as ../Data/LGTAnalysis/<gene>/<arCOG>.fasta.
# Genes whose Archaeal entry is the empty string are skipped.

# A single connection pool is reused for all requests.
http = urllib3.PoolManager()

for OG in COGs[COGs.Archaeal != ""].index:
    # Make sure the target folder exists even if this cell is run on its own.
    os.makedirs(f"../Data/LGTAnalysis/{OG}", exist_ok=True)

    for arCOG in COGs.Archaeal[OG]:
        url = f"http://eggnogapi5.embl.de/nog_data/text/raw_alg/{arCOG}"
        filename = f"../Data/LGTAnalysis/{OG}/{arCOG}.fasta"

        # preload_content=False streams the body instead of loading it in memory.
        with http.request('GET', url, preload_content=False) as res:
            # Fail loudly rather than saving an HTTP error page as a FASTA file.
            if res.status != 200:
                raise RuntimeError(
                    f"Download of {url} failed with HTTP status {res.status}")
            with open(filename, 'wb') as out_file:
                shutil.copyfileobj(res, out_file)

### Copy the OrthoFinder groups and add Taxonomic ID for Asgard

In [None]:
# Copy every linked OrthoFinder alignment into the gene's analysis folder and
# prefix the ">UP..." FASTA headers with the taxonomic identifier "1935183.".
#
# The rewrite is done in Python instead of `sed -i ''`: that sed invocation is
# BSD/macOS-specific and fails with GNU sed, and its exit status was ignored.
# Because the destination is regenerated from the source file each time, the
# prefix is never applied twice when the cell is re-run.
for OG in LinkedCOGs[LinkedCOGs.OFOG.notna()].index:
    for OFOG in LinkedCOGs.OFOG[OG]:
        source_path = f"../Data/OFResults/MultipleSequenceAlignments/{OFOG}.fa"
        target_path = f"../Data/LGTAnalysis/{OG}/{OFOG}.fasta"

        with open(source_path) as source, open(target_path, "w") as target:
            for line in source:
                # Same substitution as sed 's/>UP/>1935183.UP/':
                # only the first occurrence on each line is replaced.
                target.write(line.replace(">UP", ">1935183.UP", 1))

### Concatenate all sequences to add into a single FASTA

In [None]:
# Build ../Data/LGTAnalysis/<gene>/SeqstoAdd.fasta from all other *.fasta
# files in the gene's folder.
#
# Compared with `cat *.fasta >> SeqstoAdd.fasta` this is idempotent:
#   * the output is rewritten instead of appended to, and
#   * the output itself and the combined alignment written by the MAFFT step
#     (<gene>_combined.fasta) are excluded, so re-running the cell cannot feed
#     previous results back into the input.
for OG in COGs.index:
    og_dir = f"../Data/LGTAnalysis/{OG}"
    output_name = "SeqstoAdd.fasta"
    excluded = {output_name, f"{OG}_combined.fasta"}

    # Hidden files are skipped, as a shell glob would do; sorting gives a
    # reproducible concatenation order.
    parts = sorted(
        name for name in os.listdir(og_dir)
        if name.endswith(".fasta")
        and not name.startswith(".")
        and name not in excluded
    )

    with open(join(og_dir, output_name), "wb") as combined:
        for name in parts:
            with open(join(og_dir, name), "rb") as part:
                shutil.copyfileobj(part, combined)

### Combine in a single MSA using MAFFT

In [None]:
# For every gene: add the collected sequences to the downloaded alignment with
# MAFFT, then build a tree from the combined alignment with FastTree.
#
# subprocess.run(..., check=True) raises CalledProcessError when a tool exits
# with a non-zero status, so the progress messages are only printed after a
# successful run and FastTree is never started on a failed alignment.
# Arguments are passed as a list (no shell); stdout is redirected to the
# output file from Python.
for OG in COGs.index:
    seqs_old = f"../Data/LGTAnalysis/{OG}/{COGs.loc[OG,0]}_msa.fa"
    seqs_add = f"../Data/LGTAnalysis/{OG}/SeqstoAdd.fasta"
    aln_file = f"../Data/LGTAnalysis/{OG}/{OG}_combined.fasta"
    tree = f"../Data/LGTAnalysis/{OG}/{OG}.nwk"

    with open(aln_file, "w") as aln_out:
        subprocess.run(
            ["mafft", "--auto", "--add", seqs_add, "--reorder", seqs_old],
            stdout=aln_out, check=True)
    print(f'alignment of additional sequences to {OG} is completed')

    with open(tree, "w") as tree_out:
        subprocess.run(["fasttree", aln_file], stdout=tree_out, check=True)
    print(f'The phylogenetic tree for {OG} has been built')

In [None]:
# Dry run: print the FastTree command line for every gene without executing it.
for OG in COGs.index:
    og_dir = f"../Data/LGTAnalysis/{OG}"
    aln_file = f"{og_dir}/{OG}_combined.fasta"
    tree = f"{og_dir}/{OG}.nwk"
    cmd_FastTree = " ".join(["fasttree", aln_file, ">", tree])

    print(cmd_FastTree)

# Load the tree and color it by phylogeny

In [None]:
# Gene whose tree is loaded, coloured and rendered by the cells below; it is
# used as both folder name and file stem under ../Data/LGTAnalysis/.
COG_name = "AckA"

In [None]:

def _clade_from_leaf_name(name):
    """Return the 4th entry of the NCBI lineage of the taxid prefixing `name`.

    The taxid is the part of the leaf name before the first ".".
    NOTE(review): index 3 presumably targets a phylum-like rank and raises
    IndexError for lineages with fewer than four entries — confirm.
    """
    taxid = name.split('.')[0]
    return ncbi.get_lineage(taxid)[3]


newick_path = f"../Data/LGTAnalysis/{COG_name}/{COG_name}.nwk"
COG = PhyloTree(newick_path, format=1,
                sp_naming_function=_clade_from_leaf_name)

# Root the tree on its midpoint outgroup, then ladderize it.
R = COG.get_midpoint_outgroup()
COG.set_outgroup(R)
COG.ladderize()

# Circular layout without leaf labels.
ts = TreeStyle()
ts.mode = "c"
ts.show_leaf_name = False

In [None]:
# Tag every leaf with a "clade" feature: the 4th entry of the NCBI lineage of
# the taxid that prefixes the leaf name (text before the first "."). The clade
# of each leaf is also recorded in leaf order.
clades = []
for leaf in COG.iter_leaves():
    leaf_taxid = leaf.name.split('.')[0]
    leaf.add_feature("clade", ncbi.get_lineage(leaf_taxid)[3])
    clades.append(leaf.clade)

# One row per distinct clade, filled with the name ncbi reports for its taxid.
Attr_CladeNames = pd.DataFrame(index=list(set(clades)))
Attr_CladeNames["Name"] = ""
for clade_taxid in Attr_CladeNames.index:
    Attr_CladeNames.loc[clade_taxid] = ncbi.translate_to_names([clade_taxid])

# Colour per clade taxid. Key 1 is also the value given to internal nodes whose
# leaves belong to more than one clade.
colors = {
    1: "Black",
    49928: "RoyalBlue",
    2323: "RoyalBlue",
    1783275: "Coral",
    1935183: "OrangeRed",
    28890: "DarkOrange",
    2138240: "Aqua",
    2498710: "PaleTurquoise",
    508458: "Aquamarine",
    203691: "CadetBlue",
    3018035: "SteelBlue",
    40117: "MediumBlue",
    32066: "Navy",
    1224: "Indigo",
    2818505: "Lime",
    68297: "LimeGreen",
    200783: "ForestGreen",
    200918: "DarkOliveGreen",
    1783257: "Olive",
    200930: "Purple",
    1802340: "SlateBlue",
    1783270: "DarkViolet",
    1783272: "Plum",
    200938: "Fuchsia",
    200940: "DarkMagenta",
    29547: "RoyalBlue",
    1930617: "DarkKhaki",
    57723: "BlueViolet",
}

# Aligned on the clade taxid; clades absent from the palette get a missing value.
Attr_CladeNames["color"] = pd.Series(colors)

Attr_CladeNames

In [None]:
# Give every internal node a "clade" feature: the clade shared by all leaves
# below it, or 1 when those leaves belong to more than one clade.
for inner in COG.traverse():
    if inner.is_leaf():
        continue
    clades_below = {leaf.clade for leaf in inner.iter_leaves()}
    if len(clades_below) == 1:
        inner.add_feature("clade", next(iter(clades_below)))
    else:
        inner.add_feature("clade", 1)

#COG.write(format=1, outfile="COG0190_AnnotIntNodes.nw")

In [None]:
# Colour the branch lines of every node by its clade and render the tree as SVG.
#
# Clades that have no entry in `colors` used to abort the cell with a KeyError;
# they are now drawn in the colour of key 1 (the one used for mixed internal
# nodes) and reported once, so the palette can be extended.
fallback_color = colors[1]
clades_without_color = set()

for node in COG.traverse():
    branch_color = colors.get(node.clade)
    if branch_color is None:
        clades_without_color.add(node.clade)
        branch_color = fallback_color
    node.img_style['hz_line_color'] = branch_color
    node.img_style['vt_line_color'] = branch_color

if clades_without_color:
    print(f"No colour defined for clade(s) {sorted(clades_without_color)}; "
          f"drawn in {fallback_color}")

COG.render(f"../Data/LGTAnalysis/{COG_name}/{COG_name}.svg", tree_style=ts)
# COG.show(tree_style=ts)

In [None]:
# Stand-alone colour legend: one invisible (NaN) line per clade, labelled with
# the clade name.
plt.axis('off')
for clade in Attr_CladeNames.index:
    clade_color = Attr_CladeNames.color[clade]
    # Clades missing from the palette have no colour (NaN) in the table;
    # draw them in black instead of failing.
    if pd.isna(clade_color):
        clade_color = "Black"
    # np.nan replaces np.NaN, which was removed in NumPy 2.0. The colour is
    # passed explicitly rather than as a format string.
    plt.plot(np.nan, np.nan, color=clade_color,
             label=str(Attr_CladeNames.Name[clade]))

plt.legend()  # , bbox_to_anchor=(1, 1))