In [1]:
import numpy as np
import pandas as pd

# Retrieve the gene counts from the Orthofinder results

In [2]:
emapper = pd.read_csv("../Data/input/MM_jbrjl6fg.emapper.annotations.tsv", sep ="\t", comment = "#", header = None)
emapper.columns = ["query","seed_ortholog","evalue","score","eggNOG_OGs","max_annot_lvl","COG_category","Description","Preferred_name","GOs","EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","PFAMs"]

OFOGs = pd.read_csv("../Data/OFResults/Orthogroups.GeneCount.tsv", sep ="\t", comment = "#", index_col = 0, header = 0)

In [3]:
OFOG_Annot = pd.DataFrame()
OFOG_Annot["OFOG"] = emapper["query"].str.split(".",expand=True)[0]
OFOG_Annot["COG"] = emapper["eggNOG_OGs"].str.split("@",expand=True)[0]
OFOG_Annot

Unnamed: 0,OFOG,COG
0,OG0000000,KOG0084
1,OG0000001,COG1136
2,OG0000003,COG1131
3,OG0000005,COG1028
4,OG0000006,COG2414
...,...,...
2280,OG0010442,COG2006
2281,OG0010449,COG1250
2282,OG0010450,COG0451
2283,OG0010452,arCOG02290


In [4]:
OFOGs = OFOGs.loc[OFOG_Annot.OFOG]
OFOGs = OFOGs.merge(OFOG_Annot, how= "left", left_on=None, right_on="OFOG", left_index=True, right_index=False)
OFOGs = OFOGs.set_index("COG")
OFOGs = OFOGs.drop(axis = 1, labels = ["Total", "OFOG"])
OFOGs = OFOGs.groupby(by=OFOGs.index, axis=0).sum() #some OFOGs correspond to the same COG, and need to be summed up
OFOGs

Unnamed: 0_level_0,UP000070043,UP000070149,UP000070599,UP000185561,UP000186063,UP000186209,UP000186239,UP000186851,UP000228988,UP000245584,...,UP000751408,UP000760201,UP000771954,UP000775429,UP000777023,UP000825566,UP000825906,UP000826397,UP000826686,UP000826833
COG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28H67,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28H8D,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28H95,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
28HGQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28HKD,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
arCOG12322,0,0,0,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
arCOG12705,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
arCOG12964,0,0,0,0,0,0,1,0,0,0,...,0,1,1,1,1,1,0,1,1,1
arCOG14015,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Retrieve the gene counts from eggNOG

In [5]:
Root_OGs = pd.read_csv("../Data/input/1_members.tsv", sep ="\t", comment = "#", header = None, index_col = 1)

#Restructure to a count matrix, takes around 5 minutes
Root_OGs["list"] = Root_OGs[4].str.split(",", expand = False)
Root_OGs = Root_OGs["list"].explode().str.split(".", expand = True, n=1)
Root_OGs.columns = ["species","gene"]
Root_OGs = Root_OGs.reset_index()
Root_OGs.columns = ["COG","species","gene"]
Root_OGs = Root_OGs.groupby(["COG","species"]).count().unstack().fillna(0)
Root_OGs.columns = Root_OGs.columns.droplevel(0)

# Combine both dataframes

In [6]:
OFOGs = OFOGs.reindex(Root_OGs.index, fill_value=0) #fill the dataframe with 0s for all other COGs not occuring in the OrthoFinder analysis

In [7]:
Root_OGs = pd.concat([Root_OGs, OFOGs], axis=1)#.fillna(0)

# By Genus

In [11]:
from ete3 import NCBITaxa
ncbi = NCBITaxa()

In [12]:
Taxonomy = pd.DataFrame(index = Root_OGs.columns)
Taxonomy["Kingdom"] = "NaN"
Taxonomy["Order"] = "NaN"

for ID in Taxonomy.index:
    if ID.startswith("UP"):
        Taxonomy.Kingdom[ID] = 2157
        Taxonomy.Order[ID] = 1935183
    else: 
        Taxonomy.Kingdom[ID] = ncbi.get_lineage(ID)[2]
        Taxonomy.Order[ID] = ncbi.get_lineage(ID)[3]
        
#TaxCount = pd.DataFrame(Taxonomy.groupby("Order").size())
#TaxCount["Name"] = ncbi.get_taxid_translator(TaxCount.index)
#TaxCount

TaxCount = pd.DataFrame(Taxonomy[Taxonomy.Kingdom != 2759].groupby("Order").size())
TaxCount["Name"] = ncbi.get_taxid_translator(TaxCount.index)
TaxCount



Unnamed: 0_level_0,0,Name
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
1224,1555,Pseudomonadota
2323,5,Bacteria incertae sedis
28890,120,Euryarchaeota
29547,80,Campylobacterota
32066,30,Fusobacteriota
40117,4,Nitrospirota
49928,3,unclassified Bacteria
57723,16,Acidobacteriota
68297,2,Dictyoglomota
200783,16,Aquificota


## Discard Eukaryotes

In [14]:
Root_OGs = Root_OGs.loc[:,Taxonomy.Kingdom != 2759]
Root_OGs


Unnamed: 0_level_0,1000565,1000569,1000570,1000588,1001240,1001530,1001585,100226,1002339,1002340,...,UP000751408,UP000760201,UP000771954,UP000775429,UP000777023,UP000825566,UP000825906,UP000826397,UP000826686,UP000826833
COG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28H50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
28H51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
28H52,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
28H53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
28H54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
arCOG15264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
arCOG15268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
arCOG15271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
arCOG15273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
#Root_OGs.to_csv("../data/output/AllOFOG&eOGCounts.csv") #save to file
Root_OGs.to_pickle("../data/output/AllOFOG&eOGCounts.pkl") #alternatively save to pickle format, larger file but quicker to load.