In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ete3 import Tree, ProfileFace, TreeStyle, BarChartFace

# Load the orthofinder orthologous groups

# Load the list of BFEB proteins and their COGs

In [51]:
emapper = pd.read_csv("../Data/input/BFEB/MM_1m43dwrn.emapper.annotations.tsv", sep ="\t", comment = "#", header = None)
emapper.columns = ["query","seed_ortholog","evalue","score","eggNOG_OGs","max_annot_lvl","COG_category","Description","Preferred_name","GOs","EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","PFAMs"]

BFEB_Annot = pd.DataFrame()
BFEB_Annot["ProteinID"] = emapper["query"].str.split(".",expand=True)[0]
BFEB_Annot["eOG"] = emapper["eggNOG_OGs"].str.split("@",expand=True)[0]
BFEB_Annot.set_index("ProteinID")

BFEBs = pd.read_csv("../Data/input/BFEB/BFEB_Table.csv", sep =";", comment = "#", index_col = 3, header = 0).iloc[:,0:2]
BFEBs = BFEBs.merge(BFEB_Annot, how = "left", left_index=True, right_on="ProteinID").reset_index(drop = True)
BFEBs#.head()

Unnamed: 0,BFEB,Subunits,ProteinID,eOG
0,A,HydA,AAD36496,COG1905
1,A,HydB,AAD36495,COG1894
2,A,HydC,AAD36494,COG1905
3,B,MvhA,CAF30379,COG0374
4,B,MvhG,CAF30378,COG1941
5,B,MvhD,CAF30377,COG1908
6,B,HdrA,CAF30381,COG1148
7,B,HdrB,CAF30711,COG2048
8,B,HdrC,CAF30710,COG2048
9,C,FdhA,CAF30854,COG0243


# Retrieve the gene counts from the Orthofinder results

In [3]:
emapper = pd.read_csv("../Data/input/MM_jbrjl6fg.emapper.annotations.tsv", sep ="\t", comment = "#", header = None)
emapper.columns = ["query","seed_ortholog","evalue","score","eggNOG_OGs","max_annot_lvl","COG_category","Description","Preferred_name","GOs","EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","PFAMs"]

OFOGs = pd.read_csv("../Data/input/Proteomes/OrthoFinder/Results_Dec03/Orthogroups/Orthogroups.GeneCount.tsv", sep ="\t", comment = "#", index_col = 0, header = 0)

OFOG_Annot = pd.DataFrame()
OFOG_Annot["OFOG"] = emapper["query"].str.split(".",expand=True)[0]
OFOG_Annot["eOG"] = emapper["eggNOG_OGs"].str.split("@",expand=True)[0]
OFOG_Annot.head()

Unnamed: 0,OFOG,eOG
0,OG0000000,KOG0084
1,OG0000001,COG1136
2,OG0000003,COG1131
3,OG0000005,COG1028
4,OG0000006,COG2414


In [4]:
BFEBs.eOG.loc[BFEBs.eOG.isin(OFOG_Annot.eOG)]

0     COG1905
1     COG1894
2     COG1905
3     COG0374
4     COG1941
5     COG1908
6     COG1148
7     COG2048
8     COG2048
10    COG1035
11    COG1148
12    COG2048
13    COG2048
14    COG3383
15    COG1142
17    COG1142
18    COG3383
19    COG1894
20    COG1905
21    COG0543
22    COG0493
23    COG2086
24    COG2025
25    COG0644
26    COG2440
27    COG1960
28    COG2025
29    COG2086
30    COG1960
31    COG2086
32    COG2025
33    COG3383
34    COG3383
35    COG1894
36    COG1905
37    COG0277
38    COG1145
39    COG2086
40    COG1148
41    COG2048
42    COG2048
43    COG0685
44    COG4656
45    COG1908
46    COG0493
47    COG2048
Name: eOG, dtype: object

In [5]:
#WLP_genes_all = pd.Series(["COG1229","COG1029","COG2218","COG1153","COG1146","COG1145","COG2191","COG2037","COG3252","COG4074","COG1927","COG2141","COG4063","COG4062","COG4061","COG4060","COG4059","COG4218","COG4064","COG1962","COG4058","COG4054","COG4057","COG1148","COG2048","COG1150","COG1962","COG3383","COG1894","COG3383","COG4624","COG1145","COG1142","COG2759","COG3404","COG0190","COG2759","COG0373","COG0685","COG0646","COG0685","COG1410","COG1152","COG1880","COG1142","COG1151","COG0369","COG2069","COG1456","COG1614","COG0280","COG0282"])#.unique()
BFEB_genes = BFEBs.eOG.loc[BFEBs.eOG.isin(OFOG_Annot.eOG)]

BFEB_genes_OFOGs = OFOG_Annot.set_index("eOG").loc[BFEB_genes.values]
BFEB_genes_OFOGs

Unnamed: 0_level_0,OFOG
eOG,Unnamed: 1_level_1
COG1905,OG0001794
COG1894,OG0001673
COG1905,OG0001794
COG0374,OG0000139
COG1941,OG0000136
...,...
COG4656,OG0004585
COG1908,OG0000025
COG0493,OG0000171
COG2048,OG0000029


In [6]:
BFEB_OFOGs_Count = OFOGs.loc[BFEB_genes_OFOGs.OFOG.values].reset_index().merge(BFEB_genes_OFOGs.reset_index(), how= "left", left_on="Orthogroup", right_on="OFOG").set_index("Orthogroup").drop(["Total","OFOG"], axis=1)
BFEB_OFOGs_Count = BFEB_OFOGs_Count.drop_duplicates()#[BFEB_OFOGs_Count.duplicated() == False]

COGsSpOI = BFEB_OFOGs_Count.groupby(['eOG']).sum()#.loc[BFEB_genes.values]
COGsSpOI

Unnamed: 0_level_0,UP000070043,UP000070149,UP000070599,UP000185561,UP000186063,UP000186209,UP000186239,UP000186851,UP000228988,UP000245584,...,UP000751408,UP000760201,UP000771954,UP000775429,UP000777023,UP000825566,UP000825906,UP000826397,UP000826686,UP000826833
eOG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
COG0277,0,0,0,0,0,0,0,0,0,1,...,2,3,1,0,1,0,0,1,0,1
COG0374,2,2,1,2,2,1,0,1,0,0,...,2,2,3,3,2,2,1,2,1,3
COG0493,4,1,0,2,1,1,0,0,0,3,...,3,3,3,1,5,3,1,3,1,2
COG0543,5,3,2,3,1,1,3,2,1,4,...,4,3,5,4,3,3,1,3,2,3
COG0644,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
COG0685,1,0,0,1,1,0,0,0,0,1,...,2,1,2,0,2,0,1,1,0,1
COG1035,0,1,0,1,0,0,1,0,0,0,...,1,2,3,1,2,0,1,0,0,0
COG1142,3,4,2,4,6,5,1,1,0,1,...,2,4,6,1,4,3,0,3,1,5
COG1145,1,3,0,2,0,0,0,0,0,2,...,5,9,5,0,5,6,1,5,2,1
COG1148,13,11,6,12,2,1,1,1,0,1,...,12,21,20,4,17,8,4,12,5,7


## Calculate median count per taxonomic sub group

In [7]:
ProposedClass = pd.read_csv("../Data/input/ProposedClassification.csv", sep =",", comment = "#", header = 0, index_col = "UPID")#["Organism"]
data = COGsSpOI.T.merge(ProposedClass, how = "left", left_index=True, right_index=True)#.groupby(["Group"])
COGs_ByGenus = data.groupby(data.iloc[:,-1]).median().T
COGs_ByGenus

  COGs_ByGenus = data.groupby(data.iloc[:,-1]).median().T


Group,Heimdall 1,Heimdall 2,Heimdall 3,Heimdall 4,Heimdall 5,Loki 1,Loki 2,Loki 3,Loki 4,Loki 5,Odin,Thor
COG0277,0.0,0.0,1.0,0.0,0.0,3.0,0.5,0.0,0.0,1.0,0.0,0.0
COG0374,0.5,2.0,1.0,1.5,2.0,2.0,2.0,1.5,1.0,2.0,1.0,2.0
COG0493,0.5,1.0,2.0,0.5,4.0,3.0,0.5,0.0,1.5,1.5,0.0,2.0
COG0543,1.0,1.0,3.0,3.5,6.0,3.0,2.5,0.0,2.5,3.0,2.0,3.0
COG0644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
COG0685,0.0,1.0,1.0,0.0,0.0,1.0,0.5,0.0,1.0,1.5,0.0,1.0
COG1035,0.0,0.0,0.0,1.0,0.0,2.0,1.5,0.5,1.0,1.0,0.0,0.0
COG1142,2.5,6.0,1.0,1.0,3.5,4.0,1.5,1.5,1.0,2.0,1.0,3.0
COG1145,0.0,0.0,1.0,0.0,3.0,9.0,7.0,1.0,2.0,4.0,0.0,2.0
COG1148,0.5,2.0,4.0,2.5,2.5,21.0,10.0,3.5,6.5,10.0,1.0,6.5


# Retrieve the gene counts from eggNOG

In [8]:
Root_OGMembers = pd.read_csv("../Data/input/EggNOG/1_members.tsv", sep ="\t", comment = "#", header = None, index_col = 1)

# Keep only the relevant rows, with COG related to BFEB
Root_OGs = Root_OGMembers[Root_OGMembers.index.isin(BFEBs.eOG)]

#Restructure to a count matrix
Root_OGs["list"] = Root_OGs[4].str.split(",", expand = False)
Root_OGs = Root_OGs["list"].explode().str.split(".", expand = True, n=1)#.reset_index(name = "COG")
Root_OGs.columns = ["species","gene"]
Root_OGs = Root_OGs.reset_index()
Root_OGs.columns = ["COG","species","gene"]
Root_OG_Counts = Root_OGs.groupby(["COG","species"]).count().unstack().fillna(0)
Root_OG_Counts.columns = Root_OG_Counts.columns.droplevel(0)

#Subset to the chosen reference species
NCBIRefIDs = pd.read_csv("../Data/input/EggNOG/NCBIRefIDs.tsv", sep ="\t", comment = "#", header = None)
NCBIRefIDs.columns = ["TaxID", "TaxName"]
NCBIRefIDs = NCBIRefIDs.astype(str).set_index("TaxID")

COGsRefSp = Root_OG_Counts[NCBIRefIDs.index]
COGsRefSp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Root_OGs["list"] = Root_OGs[4].str.split(",", expand = False)


species,574087,931626,264732,33035,1123288,903814,748727,545694,903818,243232,...,192952,323259,304371,410358,456320,419665,339860,186497,155864,224308
COG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
COG0243,1.0,1.0,6.0,1.0,9.0,0.0,3.0,2.0,5.0,1.0,...,0.0,6.0,2.0,2.0,2.0,1.0,1.0,2.0,11.0,3.0
COG0277,3.0,1.0,2.0,0.0,6.0,6.0,5.0,0.0,9.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0
COG0374,1.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,2.0,...,4.0,1.0,5.0,1.0,4.0,3.0,2.0,2.0,2.0,0.0
COG0493,1.0,8.0,4.0,6.0,3.0,2.0,9.0,2.0,5.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,5.0,1.0
COG0543,3.0,2.0,4.0,4.0,3.0,3.0,5.0,2.0,1.0,1.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,5.0,1.0,1.0
COG0644,3.0,1.0,5.0,0.0,2.0,0.0,2.0,0.0,1.0,3.0,...,6.0,2.0,3.0,2.0,2.0,2.0,5.0,2.0,4.0,0.0
COG0685,1.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,3.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
COG1035,0.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,4.0,...,4.0,7.0,5.0,3.0,5.0,3.0,3.0,0.0,0.0,0.0
COG1142,3.0,5.0,5.0,2.0,1.0,2.0,6.0,2.0,1.0,3.0,...,1.0,0.0,1.0,0.0,2.0,3.0,0.0,2.0,7.0,0.0
COG1145,3.0,10.0,5.0,5.0,4.0,3.0,6.0,3.0,2.0,10.0,...,10.0,10.0,13.0,7.0,9.0,11.0,8.0,4.0,4.0,1.0


# Combine OrthoFinder and EggNOG matrices

In [25]:
COGs = pd.merge(COGs_ByGenus, COGsRefSp, how = "outer", left_index=True, right_index=True).fillna(0)#.set_index("OFOG").drop("Total", axis = 1)
COGs = COGs.drop_duplicates()
COGs# = COGs.loc[BFEB_genes[BFEB_genes.isin(COGs.index)].unique()]
#COGs

Unnamed: 0,Heimdall 1,Heimdall 2,Heimdall 3,Heimdall 4,Heimdall 5,Loki 1,Loki 2,Loki 3,Loki 4,Loki 5,...,192952,323259,304371,410358,456320,419665,339860,186497,155864,224308
COG0243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,2.0,2.0,2.0,1.0,1.0,2.0,11.0,3.0
COG0277,0.0,0.0,1.0,0.0,0.0,3.0,0.5,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0
COG0374,0.5,2.0,1.0,1.5,2.0,2.0,2.0,1.5,1.0,2.0,...,4.0,1.0,5.0,1.0,4.0,3.0,2.0,2.0,2.0,0.0
COG0493,0.5,1.0,2.0,0.5,4.0,3.0,0.5,0.0,1.5,1.5,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,5.0,1.0
COG0543,1.0,1.0,3.0,3.5,6.0,3.0,2.5,0.0,2.5,3.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,5.0,1.0,1.0
COG0644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,6.0,2.0,3.0,2.0,2.0,2.0,5.0,2.0,4.0,0.0
COG0685,0.0,1.0,1.0,0.0,0.0,1.0,0.5,0.0,1.0,1.5,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
COG1035,0.0,0.0,0.0,1.0,0.0,2.0,1.5,0.5,1.0,1.0,...,4.0,7.0,5.0,3.0,5.0,3.0,3.0,0.0,0.0,0.0
COG1142,2.5,6.0,1.0,1.0,3.5,4.0,1.5,1.5,1.0,2.0,...,1.0,0.0,1.0,0.0,2.0,3.0,0.0,2.0,7.0,0.0
COG1145,0.0,0.0,1.0,0.0,3.0,9.0,7.0,1.0,2.0,4.0,...,10.0,10.0,13.0,7.0,9.0,11.0,8.0,4.0,4.0,1.0


# Reformat by genes as rows

In [43]:
BFEBs_Genes = BFEBs.merge(COGs, how = "left", left_on="eOG", right_index= True).set_index("Subunits").drop(["BFEB","ProteinID","eOG"],axis=1)

In [26]:
SpeciesIndexDF = pd.read_csv("../Data/input/SpeciesIndexDF.tsv", sep ="\t", comment = "#", header = 0, index_col = "Proteome Id")["Organism"]
SpNamesID = pd.concat([SpeciesIndexDF,NCBIRefIDs["TaxName"]])
SpNamesID

UP000321408       Ca. Prometheoarchaeum syntrophicum
UP000070043      Ca. Thorarchaeota archaeon SMTZ1-83
UP000070149      Ca. Thorarchaeota archaeon SMTZ1-45
UP000070599       Ca. Thorarchaeota archaeon SMTZ-45
UP000185561    Thorarchaeota archaeon (strain AB_25)
                               ...                  
419665                        Methanococcus aeolicus
339860                     Methanosphaera stadtmanae
186497                          Pyrococcus furiosus 
155864                             Escherichia coli 
224308                            Bacillus subtilis 
Length: 89, dtype: object

In [50]:
Combined = Tree("../Data/input/CombninedSpeciesTreeETE.nw", format=1)# made using orthofinder, NCBI ID's are slightly different to the ones from the strains in EggNOG so that's corrected in the .nw file.
Combined.ladderize()

LeavesToKeep = ProposedClass[~ProposedClass.iloc[:,0].duplicated()]
LeavesToKeep.columns = ["TaxName"]
Combined.prune(NCBIRefIDs.index.union(LeavesToKeep.index))

#Add columns to link to the count matrix
LeavesToKeep["Feature"] = LeavesToKeep["TaxName"]
NCBIRefIDs["Feature"] = NCBIRefIDs.index

# Combine into single indexing dataframe
SpNamesID = pd.concat([LeavesToKeep,NCBIRefIDs])

# Rename the Asgard leaves so that they match the median counts instead of the chosen species
for lf in Combined.iter_leaves():
    if lf.name in list(LeavesToKeep.index):
        lf.name = LeavesToKeep.loc[lf.name][0]

# Add the counts to the specific leaves of the tree and change the names from TaxID to actual names
for lf in Combined.iter_leaves():
    lf.add_features(profile = BFEBs_Genes[lf.name].values)
    lf.add_features(deviation = [0 for x in range(len(BFEBs_Genes))])
    lf.add_face(ProfileFace(max_v=36.0, min_v=0.0, center_v=10.0, style='heatmap', colorscheme=4, width=len(BFEBs_Genes)*10, height=10), column=0, position="aligned")
    lf.name = SpNamesID.set_index("Feature")["TaxName"][lf.name]

description = list(BFEBs_Genes.index)
axisface = BarChartFace([0]*len(list(BFEBs_Genes.index)), width=len(BFEBs_Genes)*10, height=0, labels=list(BFEBs_Genes.index), max_value=1, scale_fsize=1)

ts = TreeStyle()
ts.draw_guiding_lines = True
ts.show_leaf_name = True
ts.aligned_foot.add_face(axisface, 0)

#Combined.render('../Data/output/BFEBheatmap.png', tree_style=ts)
Combined.show(tree_style=ts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LeavesToKeep["Feature"] = LeavesToKeep["TaxName"]
