In [1]:
import pandas as pd
from ete3 import NCBITaxa
from ete3 import Tree

In [2]:
# Load the per-ASV taxonomic classification table provided by François.
# In the preview shown further down, "Unnamed: 0" holds the ASV identifiers;
# "Unnamed: 0.1" looks like a leftover row index from an earlier save -- TODO confirm.
df = pd.read_csv("Scion16_tax_Primer.csv")

In [3]:
# Preview the classification table (pandas elides the middle rows in the display).
df

Unnamed: 0.1,Unnamed: 0,Genus,Family,Order,Class,Phylum,Kingdom
0,ASV100,Hyphomonadaceae,Hyphomonadaceae,Caulobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
1,ASV1000,Rhodobacteraceae,Rhodobacteraceae,Rhodobacterales,Alphaproteobacteria,Proteobacteria,Bacteria
2,ASV10000,Halioglobus,Halieaceae,Cellvibrionales,Gammaproteobacteria,Proteobacteria,Bacteria
3,ASV100000,OM190,OM190,OM190,OM190,Planctomycetes,Bacteria
4,ASV100001,Candidatus_Kaiserbacteria,Candidatus_Kaiserbacteria,Candidatus_Kaiserbacteria,Parcubacteria,Patescibacteria,Bacteria
...,...,...,...,...,...,...,...
123773,ASV99994,WCHB1-41,WCHB1-41,WCHB1-41,Kiritimatiellae,Kiritimatiellaeota,Bacteria
123774,ASV99996,OM190,OM190,OM190,OM190,Planctomycetes,Bacteria
123775,ASV99997,OM190,OM190,OM190,OM190,Planctomycetes,Bacteria
123776,ASV99998,Cellvibrionales,Cellvibrionales,Cellvibrionales,Gammaproteobacteria,Proteobacteria,Bacteria


In [4]:
# Collect every distinct label found in the Genus column.
# Note that this column also carries higher-rank placeholders for ASVs shown
# without a genus-level name (e.g. "Hyphomonadaceae", "OM190" in the preview).
genus_list = pd.unique(df['Genus'])

In [5]:
# Number of distinct Genus labels that will be looked up in NCBI
len(genus_list)

1776

In [6]:
# Create the ETE3 NCBITaxa handle; the following cells use it for
# name -> taxid lookups and for building the taxonomy topology.
ncbi = NCBITaxa()

In [7]:
# Translate each genus label into an NCBI taxid, one lookup per label.
# Labels that NCBI does not know are skipped, so the final count can be
# smaller than the number of labels.
# NOTE(review): when a label resolves to several taxids only the first one
# returned is kept -- confirm this is acceptable for homonymous names.
tax_id_list = []
for label in genus_list:
    hits = list(ncbi.get_name_translator([str(label)]).values())
    if not hits:
        continue
    tax_id_list.append(hits[0][0])
len(tax_id_list)

1186

In [8]:
# Build the tree: request the NCBI taxonomy topology that connects all
# resolved taxids and save it in newick form to tree.nw.
# NOTE(review): intermediate_nodes=True presumably keeps the intermediate
# taxonomy nodes instead of collapsing them, and format=8 should be ETE's
# "all names" newick flavour -- confirm both against the ETE documentation.
t = ncbi.get_topology(tax_id_list, intermediate_nodes=True)
t.write(format=8, outfile="tree.nw")

In [9]:
# Open the PlasticDB results file and build a dictionary keyed by every genus
# name that had a match in the PlasticDB database.
# Values start out as "" and are replaced by the NCBI taxid in a later cell.
with open("j4369676_genus.tsv") as tsv_file:
    genus_dict = {}
    # Skip the header row (the default keeps an empty file from raising).
    next(tsv_file, None)
    for line in tsv_file:
        # Split once, after dropping the line terminator, so a genus sitting
        # in the last column does not end up with a trailing "\n" in its name.
        fields = line.rstrip("\n").split("\t")
        if len(fields) > 1:
            # Column 1 holds the genus name; keep the first occurrence only.
            genus_dict.setdefault(fields[1], "")

In [11]:
# Look up the NCBI taxid of every PlasticDB genus and store it as the dict value.
# Genera that NCBI does not recognise keep the value they already had.
for genus in list(genus_dict):
    hits = list(ncbi.get_name_translator([str(genus)]).values())
    if hits:
        # First taxid of the first matched name
        genus_dict[genus] = hits[0][0]

In [12]:
# Show each PlasticDB genus next to the taxid stored for it
for name in genus_dict:
    print(name, genus_dict[name])

Vibrio 662
Bdellovibrio 958
Moritella 58050
Shewanella 22
Flavobacterium 237
Pseudoalteromonas 53246
Bacillus 1386
Aliiglaciecola 1406885
Psychrobacter 497
Pseudomonas 286
Cobetia 204286
Ralstonia 48736
Aestuariibacter 249523
Desulfovibrio 872
Acinetobacter 469
Alcanivorax 59753
Massilia 149698
Mycobacterium 1763
Streptococcus 1301
Lysinibacillus 400634
Paenibacillus 44249
Rhodococcus 1827
Thermoactinomyces 2023
Oleispira 188907
Bacteroides 816
Variovorax 34072
Pelosinus 365348
Marinobacter 2742
Anaerobacter 1485
Micromonospora 1873
Sphingomonas 13687
Arthrobacter 1663
Stenotrophomonas 40323
Sphingobacterium 28453
Paracoccus 265
Staphylococcus 1279
Leucobacter 55968
Kocuria 57493
Acidovorax 12916
Alteromonas 226
Brevundimonas 41275
Exiguobacterium 33986
Streptomyces 1883
Halomonas 2745
Pelobacter 18
Achromobacter 222
Brevibacillus 55080
Acetobacterium 33951
Comamonas 283
Pseudonocardia 1847
Delftia 80865
Rhodospirillum 1081
Undibacterium 401469
Rahnella 34037
Leptothrix 88
Pseudoxantho

In [15]:
# Create the DATASET_SYMBOL annotation file (iTOL-style) that adds a red star
# to every plastic-degrading genus. Each data row has the same six
# comma-separated fields as the commented example row in the header, with the
# NCBI taxid as the node ID.
# The `with` block guarantees the file is closed even if a write fails.
with open("degraders_symbols.txt", "w") as labels_txt:
    labels_txt.write("""DATASET_SYMBOL
SEPARATOR COMMA
DATASET_LABEL,All plastics
COLOR,#00ff00
MAXIMUM_SIZE,5
DATA
#100379,3,5,#0000ff,0,0
""")

    for genus, tax_id in genus_dict.items():
        # Genera that could not be resolved to a taxid still hold the ""
        # placeholder; writing them would produce a row with an empty node ID.
        if tax_id == "":
            continue
        labels_txt.write(str(tax_id)+",3,50,#ff0000,1,1\n")

In [16]:
# Create another genus dict with only the genera that were reported to degrade PBS.
# NOTE: this rebinds genus_dict, replacing the all-plastics mapping built
# earlier, so cells that need the full mapping must run before this one.
with open("j4369676_genus.tsv") as tsv_file:
    genus_dict = {}
    # Skip the header row (the default keeps an empty file from raising).
    next(tsv_file, None)
    for line in tsv_file:
        # Drop the line terminator before splitting; otherwise a plastic name
        # in the last column would read "PBS\n" and never compare equal.
        fields = line.rstrip("\n").split("\t")
        # Column 3 (plastic) is read here, so at least four fields are needed;
        # shorter rows cannot be PBS records and are skipped instead of
        # raising IndexError.
        if len(fields) > 3 and fields[3] == "PBS":
            # Column 1 holds the genus name; keep the first occurrence only.
            genus_dict.setdefault(fields[1], "")

In [17]:
# Resolve each PBS-degrading genus to its NCBI taxid and store it in the dict.
# Entries without an NCBI match are left unchanged.
for genus in list(genus_dict):
    matches = list(ncbi.get_name_translator([str(genus)]).values())
    if not matches:
        continue
    # First taxid of the first matched name
    genus_dict[genus] = matches[0][0]

In [18]:
# Create the DATASET_SYMBOL annotation file (iTOL-style) that adds a blue star
# to every PBS-degrading genus. Each data row has the same six
# comma-separated fields as the commented example row in the header, with the
# NCBI taxid as the node ID.
# The `with` block guarantees the file is closed even if a write fails.
with open("PBS_symbols.txt", "w") as labels_txt:
    # The label is "PBS" (it was copy-pasted as "All plastics") so this
    # dataset can be told apart from the all-plastics one.
    labels_txt.write("""DATASET_SYMBOL
SEPARATOR COMMA
DATASET_LABEL,PBS
COLOR,#00ff00
MAXIMUM_SIZE,5
DATA
#100379,3,5,#0000ff,0,0
""")

    for genus, tax_id in genus_dict.items():
        # Genera that could not be resolved to a taxid still hold the ""
        # placeholder; writing them would produce a row with an empty node ID.
        if tax_id == "":
            continue
        labels_txt.write(str(tax_id)+",3,50,#0000ff,1,1\n")