In [1]:
import pandas as pd
import psycopg2
from psycopg.errors import ForeignKeyViolation
import os
import tabulate
import numpy as np

## 1. Creating Dataframes for phyla, classes, subclasses, orders, families, genera, species and specie-synonyms from the WFO csv

In [3]:
columns_to_keep = ['taxonID',
                   'scientificName',
                   'taxonRank',
                   'parentNameUsageID',
                   'family',
                   'genus',
                   'nomenclaturalStatus',
                   'taxonomicStatus',
                   'acceptedNameUsageID'
                   ]

# WFO Database saved as csv file
input_csv = 'classification.csv'

# transforms taxonID to an integer ID (e.g wfo-0001302018 --> 1302018)
def taxon_number(taxon_id):
    if taxon_id[:3] == 'wfo':
        return int(taxon_id[4:])
    else:
        return 0

In [52]:
# list of english phyla names corresponding to the scientific names
phyla_english_names = [('Lycopodiophyta', 'Lycopods'),
                       ('Marchantiophyta', 'Liverworts'),
                       ('Angiosperms', 'Flowering plants'),
                       ('Anthocerotophyta', 'Hornworts'),
                       ('Bryophyta', 'Mosses'),
                       ('Polypodiophyta', 'Ferns'),
                       ('Pinophyta', 'Conifers and allies'),
                       ('Ginkgophyta', 'Ginkgo'),
                       ('Cycadophyta', 'Cycadophyta')
                       ]
phyla_english_names_df = pd.DataFrame(phyla_english_names, columns=['scientificName', 'phylum_englishName'])

In [5]:
wfo_df = pd.read_csv(input_csv,
                     sep='\t',
                     encoding='ANSI',
                     usecols=columns_to_keep,
                     converters={0: taxon_number, 5: taxon_number, 19: taxon_number}
                     )

### Possible values for nomenclaturalStatus and taxonomicStatus:

In [6]:
display(wfo_df['nomenclaturalStatus'].value_counts())
display(wfo_df['taxonomicStatus'].value_counts())
# print(wfo_df['taxonRank'].value_counts())

nomenclaturalStatus
Valid           840842
Illegitimate     28364
Invalid          18175
Superfluous       1561
Conserved          521
Rejected           432
Name: count, dtype: int64

taxonomicStatus
Synonym      943255
Accepted     444907
Unchecked    187975
Name: count, dtype: int64

#### We will keep only Conserved and Valid nomenclaturalStatus, and Accepted taxonomicStatus. But first we save all the species which are synonyms of accepted species in a separate dataframe:

In [762]:
# species_synonyms_df = wfo_df.loc[(wfo_df['nomenclaturalStatus'] == 'Valid') & (wfo_df['taxonomicStatus'] == 'Synonym') & (wfo_df['taxonRank'] == 'species'), ['taxonID', 'scientificName', 'acceptedNameUsageID']]
# display(species_synonyms_df)

Unnamed: 0,taxonID,scientificName,acceptedNameUsageID
15,1302026,Kobresia bucharica,345171
27,1302039,Carex graciliformis,344146
30,1302042,Carex kattegatensis,352303
35,1302048,Carex musartiana,350145
74,1302089,Pycreus nilagiricus,373599
...,...,...,...
1575565,1000057514,Erica gordoniae,673433
1575566,1000057515,Erica eweriana,671867
1575995,1000057833,Azalea punicea,1229780
1576093,1000057862,Pilostyles pringlei,1129097


In [7]:
wfo_df = wfo_df[(wfo_df['nomenclaturalStatus'] == 'Valid') | (wfo_df['nomenclaturalStatus'] == 'Conserved')]
wfo_df = wfo_df[wfo_df['taxonomicStatus'] == 'Accepted']
wfo_df = wfo_df.sort_values('scientificName')
display(wfo_df)

Unnamed: 0,taxonID,scientificName,taxonRank,parentNameUsageID,family,genus,nomenclaturalStatus,taxonomicStatus,acceptedNameUsageID
1405743,4000000001,Aa,genus,7000000429,Orchidaceae,Aa,Valid,Accepted,0
316954,319089,Aa achalensis,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0
756946,760991,Aa argyrolepis,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0
918422,922666,Aa aurantiaca,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0
923805,928062,Aa calceata,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0
...,...,...,...,...,...,...,...,...,...
1442733,4000046085,Ã— Zygosepescalum,genus,7000000429,Orchidaceae,Zygosepescalum,Valid,Accepted,0
1441514,4000044858,Ã— Zygostylis,genus,7000000429,Orchidaceae,Zygostylis,Valid,Accepted,0
1443289,4000046644,Ã— Zygotoria,genus,7000000429,Orchidaceae,Zygotoria,Valid,Accepted,0
1443330,4000046685,Ã— Zygowarrea,genus,7000000429,Orchidaceae,Zygowarrea,Valid,Accepted,0


In [8]:
print(wfo_df['nomenclaturalStatus'].value_counts())
print(wfo_df['family'].value_counts())
print(wfo_df['genus'].value_counts())


nomenclaturalStatus
Valid        444500
Conserved       393
Name: count, dtype: int64
family
Asteraceae          42662
Orchidaceae         35644
Fabaceae            30661
Rubiaceae           16761
Poaceae             14120
                    ...  
Culcitaceae             3
Saelaniaceae            3
Agdestidaceae           3
Tetracarpaeaceae        3
Chenopodiaceae          1
Name: count, Length: 717, dtype: int64
genus
Hieracium          4465
Astragalus         3741
Carex              2826
Euphorbia          2562
Begonia            2522
                   ... 
Zygomatophyllum       1
Zygolum               1
Zygogardmannia        1
Zygodisanthus         1
Zygocella             1
Name: count, Length: 17874, dtype: int64


Now that we reduced the dataframe to valid or conserved entries, we list the possible taxonRank in taxonomic order:
<br/><br/>

|             | Any   | Valid  | Conserved|
|-------------|-------|--------| ----------- |
| kingdom     | 1     | 1      | 0 |
| subkingdom  | 2     | 2      | 0 |
| phylum      | 9     | 9      | 0 |
| class       | 16    | 16     | 0 |
| subclass    | 26    | 26     | 0 |
| superorder  | 2     | 2      | 0 |
| order       | 142   | 141    | 1 |
| family      | 717   | 698    | 19 |
| subfamily   | 90    | 89     | 1 |
| tribe       | 84    | 84     | 0 |
| subtribe    | 170   | 170    | 0 |
| genus       | 17910 | 17598  | 312 |
| subgenus    | 71    | 71     | 0 |
| section     | 267   | 267    | 0 |
| subsection  | 86    | 86     | 0 |
| series      | 22    | 22     | 0 |
| subseries   | 2     | 2      | 0 |
| species     | 377223 | 377164 | 59 |
| prole       | 1     |        |   |
| subspecies  | 23371 |        |   |
| variety     | 23821 |        |   |
| subvariety  | 3     |        |   |
| form        | 857   |        |   |

In [10]:
wfo_df_grouped_by_taxonRank = wfo_df.groupby('taxonRank')

In [13]:
parent_df = wfo_df.filter(['taxonID', 'scientificName', 'taxonRank', 'nomenclaturalStatus', 'taxonomicStatus', 'acceptedNameUsageID']).rename(columns={'taxonID': 'parentNameUsageID', 'scientificName': 'parentScientificName', 'taxonRank': 'parentTaxonRank', 'nomenclaturalStatus': 'parentNomenclaturalStatus', 'taxonomicStatus': 'parentTaxonomicStatus', 'acceptedNameUsageID': 'parentAcceptedNameUsageID'}).set_index(['parentNameUsageID'])
species_df = wfo_df_grouped_by_taxonRank.get_group('species').join(parent_df, on=['parentNameUsageID'], how='left')
display(species_df)

Unnamed: 0,taxonID,scientificName,taxonRank,parentNameUsageID,family,genus,nomenclaturalStatus,taxonomicStatus,acceptedNameUsageID,parentScientificName,parentTaxonRank,parentNomenclaturalStatus,parentTaxonomicStatus,parentAcceptedNameUsageID
316954,319089,Aa achalensis,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0,Aa,genus,Valid,Accepted,0.0
756946,760991,Aa argyrolepis,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0,Aa,genus,Valid,Accepted,0.0
918422,922666,Aa aurantiaca,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0,Aa,genus,Valid,Accepted,0.0
923805,928062,Aa calceata,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0,Aa,genus,Valid,Accepted,0.0
947494,951869,Aa colombiana,species,4000000001,Orchidaceae,Aa,Valid,Accepted,0,Aa,genus,Valid,Accepted,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404334,406838,Ã— Trichoechinopsis Ã— imperialis,species,4000038988,Cactaceae,Trichoechinopsis,Valid,Accepted,0,Ã— Trichoechinopsis,genus,Valid,Accepted,0.0
404336,406840,Ã— Trichomoza Ã— roseiflora,species,4000039013,Cactaceae,Trichomoza,Valid,Accepted,0,Ã— Trichomoza,genus,Valid,Accepted,0.0
412580,415123,Ã— Tridentapelia Ã— bijliae,species,4001303442,Apocynaceae,Tridentapelia,Valid,Accepted,0,Ã— Tridentapelia,genus,Valid,Accepted,0.0
404365,406869,Ã— Turbiniphora Ã— panarottoi,species,4000039594,Cactaceae,Turbiniphora,Valid,Accepted,0,Ã— Turbiniphora,genus,Valid,Accepted,0.0


In [17]:
print(species_df['parentTaxonRank'].value_counts(),'\n')
print(f'We have {len(species_df)} valid and accepted species.')
print(f'and {species_df['parentTaxonRank'].value_counts().sum()} of those have valid and accepted parents.')
# --- 
# save species with parent=(genus, Valid, Accepted) in a separate frame to insert later
# for the rest of the species, check if they have a valid genus listed.
# if yes, add them to the df with the genus listed as parent

parentTaxonRank
genus         372410
section         3331
subsection      1035
subgenus         279
subseries         78
series            71
Name: count, dtype: int64 

We have 377223 valid and accepted species.
and 377204 of those have valid and accepted parents.


In [25]:
# it looks like all species have genus and family listed in the columns genus and family (a.k.a. no empty entry):
print(pd.isnull(species_df['genus']).value_counts())
print(pd.isnull(species_df['family']).value_counts())

genus
False    377223
Name: count, dtype: int64
family
False    377223
Name: count, dtype: int64


In [53]:
# ---- phyla dataframe ----
phyla_df = wfo_df_grouped_by_taxonRank.get_group('phylum').filter(['taxonID', 'scientificName']).merge(phyla_english_names_df, how='left', on='scientificName')
display(phyla_df)

Unnamed: 0,taxonID,scientificName,phylum_englishName
0,9949999999,Angiosperms,Flowering plants
1,4100002134,Anthocerotophyta,Hornworts
2,9949999996,Bryophyta,Mosses
3,4100003333,Cycadophyta,Cycadophyta
4,4100003332,Ginkgophyta,Ginkgo
5,4100003337,Lycopodiophyta,Lycopods
6,4100002135,Marchantiophyta,Liverworts
7,4100003331,Pinophyta,Conifers and allies
8,4100003336,Polypodiophyta,Ferns


In [69]:
# ---- classes dataframe ----
classes_df = wfo_df_grouped_by_taxonRank.get_group('class').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(phyla_df.filter(['taxonID', 'scientificName']), how='left', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_phylum")).filter(['taxonID', 'scientificName', 'taxonID_parent_phylum', 'scientificName_parent_phylum'])
display(classes_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_phylum,scientificName_parent_phylum
0,9850000001,Andreaeobryopsida,9949999996,Bryophyta
1,9850000002,Andreaeopsida,9949999996,Bryophyta
2,4100000009,Anthocerotopsida,4100002134,Anthocerotophyta
3,4100002058,Bryopsida,9949999996,Bryophyta
4,4100002548,Cycadopsida,4100003333,Cycadophyta
5,4100002550,Ginkgoopsida,4100003332,Ginkgophyta
6,4100000011,Haplomitriopsida,4100002135,Marchantiophyta
7,4100000012,Jungermanniopsida,4100002135,Marchantiophyta
8,4100000010,Leiosporocerotopsida,4100002134,Anthocerotophyta
9,4100003338,Lycopodiopsida,4100003337,Lycopodiophyta


In [70]:
# ---- subclasses dataframe ----
subclasses_df = wfo_df_grouped_by_taxonRank.get_group('subclass').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(classes_df.filter(['taxonID', 'scientificName']), how='left', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_class")).filter(['taxonID', 'scientificName', 'taxonID_parent_class', 'scientificName_parent_class'])
display(subclasses_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_class,scientificName_parent_class
0,4100002070,Anthocerotidae,4100000009,Anthocerotopsida
1,4100002073,Blasiidae,4100000013,Marchantiopsida
2,9800000001,Bryidae,4100002058,Bryopsida
3,9800000002,Buxbaumiidae,4100002058,Bryopsida
4,4100002553,Cupressidae,4100002552,Pinopsida
5,4100002549,Cycadidae,4100002548,Cycadopsida
6,1000044985,Dendrocerotidae,4100000009,Anthocerotopsida
7,9800000006,Dicranidae,4100002058,Bryopsida
8,9800000003,Diphysciidae,4100002058,Bryopsida
9,4100003340,Equisetidae,4100003341,Polypodiopsida


In [71]:
# ---- superorders dataframe ----
superorders_df = wfo_df_grouped_by_taxonRank.get_group('superorder').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(subclasses_df.filter(['taxonID', 'scientificName']), how='left', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_subclass")).filter(['taxonID', 'scientificName', 'taxonID_parent_subclass', 'scientificName_parent_subclass'])
display(superorders_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_subclass,scientificName_parent_subclass
0,9500000001,Bryanae,9800000001,Bryidae
1,9500000002,Hypnanae,9800000001,Bryidae


In [91]:
# ---- orders dataframe ----
order_parents = {'phylum': phyla_df, 'class': classes_df, 'subclass': subclasses_df, 'superorder': superorders_df}
# ---- 4 orders_dfs, one for each parent category
orders_dfs = [wfo_df_grouped_by_taxonRank.get_group('order').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(order_parents[key].filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, f"_parent_{key}")) for key in order_parents]
all_columns = set().union(*[df.columns for df in orders_dfs])

# display(orders_dfs)
orders_df = pd.concat([df.filter(['taxonID', 'scientificName', 'parentNameUsageID']) for df in orders_dfs], axis=0)
display(orders_df)

Unnamed: 0,taxonID,scientificName,parentNameUsageID
0,9000000003,Acorales,9949999999
1,9000000013,Alismatales,9949999999
2,9000000014,Amborellales,9949999999
3,9000000022,Apiales,9949999999
4,9000000023,Aquifoliales,9949999999
...,...,...,...
9,4100000612,Orthodontiales,9500000001
10,9000000366,Orthotrichales,9500000001
11,9000000439,Ptychomniales,9500000002
12,9000000457,Rhizogoniales,9500000001


In [93]:
# ---- families dataframe ----
families_df = wfo_df_grouped_by_taxonRank.get_group('family').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(orders_df.filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_order")).filter(['taxonID', 'scientificName', 'taxonID_parent_order', 'scientificName_parent_order'])
display(families_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_order,scientificName_parent_order
0,7000000001,Acanthaceae,9000000283,Lamiales
1,7000000002,Achariaceae,9000000311,Malpighiales
2,7000000003,Achatocarpaceae,9000000088,Caryophyllales
3,7000000004,Acoraceae,9000000003,Acorales
4,7000000005,Acrobolbaceae,9000000278,Jungermanniales
...,...,...,...,...
712,7000000649,Xyridaceae,9000000415,Poales
713,7000000650,Zamiaceae,9000000149,Cycadales
714,7000000651,Zingiberaceae,9000000577,Zingiberales
715,7000000652,Zosteraceae,9000000013,Alismatales


In [94]:
# ---- subfamilies dataframe ----
subfamilies_df = wfo_df_grouped_by_taxonRank.get_group('subfamily').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(families_df.filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_family")).filter(['taxonID', 'scientificName', 'taxonID_parent_family', 'scientificName_parent_family'])
display(subfamilies_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_family,scientificName_parent_family
0,6500000985,Acrosanthoideae,7000000010,Aizoaceae
1,6500000986,Aizooideae,7000000010,Aizoaceae
2,6500000984,Alpinioideae,7000000651,Zingiberaceae
3,1000002334,Arbutoideae,7000000218,Ericaceae
4,4100002170,Asphodeloideae,7000000659,Asphodelaceae
...,...,...,...,...
85,6500000529,Woodwardioideae,7000000077,Blechnaceae
86,4100000025,Wunderlichioideae,7000000146,Asteraceae
87,1000000028,Xanthoceratoideae,7000000544,Sapindaceae
88,6500000775,Xanthorrhoeoideae,7000000659,Asphodelaceae


In [95]:
# ---- tribes dataframe ----
tribes_df = wfo_df_grouped_by_taxonRank.get_group('tribe').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(subfamilies_df.filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_subfamily")).filter(['taxonID', 'scientificName', 'taxonID_parent_subfamily', 'scientificName_parent_subfamily'])
display(tribes_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_subfamily,scientificName_parent_subfamily
0,4100002171,Aloeae,4100002170,Asphodeloideae
1,5000001280,Alpinieae,6500000984,Alpinioideae
2,5000001283,Anisostigmateae,6500000990,Sesuvioideae
3,4100000064,Anthemideae,4100000023,Asteroideae
4,4100002037,Anthocercideae,4100002036,Nicotianoideae
...,...,...,...,...
79,5000000588,Tamijieae,6500000617,Tamijioideae
80,4100000084,Tarchonantheae,4100000027,Carduoideae
81,4000040168,Vernonieae,4100000036,Vernonioideae
82,5000000850,Wunderlichieae,4100000025,Wunderlichioideae


In [96]:
# ---- subtribes dataframe ----
subtribes_df = wfo_df_grouped_by_taxonRank.get_group('subtribe').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(tribes_df.filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_tribe")).filter(['taxonID', 'scientificName', 'taxonID_parent_tribe', 'scientificName_parent_tribe'])
display(subtribes_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_tribe,scientificName_parent_tribe
0,4100001320,Adenostemmatinae,4100000042,Eupatorieae
1,4100002022,Afroasterinae,4100000054,Astereae
2,4500000250,Ageratinae,4100000042,Eupatorieae
3,4500000251,Alomiinae,4100000042,Eupatorieae
4,4100001352,Ambrosiinae,5000000795,Heliantheae
...,...,...,...,...
165,4100001275,Vernoniinae,4000040168,Vernonieae
166,4500000394,Warioniinae,4100000044,Cichorieae
167,4100001932,Xerantheminae,4100000062,Cardueae
168,4500000047,Zaluzaniinae,5000000795,Heliantheae


In [106]:
# ---- genera dataframe ---- columns = ['taxonID', 'scientificName', 'family', 'familyID']
genus_parents = {'phylum': phyla_df, 'class': classes_df, 'subclass': subclasses_df, 'superorder': superorders_df, 'order': orders_df, 'family': families_df, 'subfamily': subfamilies_df, 'tribe': tribes_df, 'subtribe': subtribes_df}
# ---- 9 orders_dfs, one for each parent category
genera_dfs = [wfo_df_grouped_by_taxonRank.get_group('genus').filter(['taxonID', 'scientificName', 'family', 'genus', 'parentNameUsageID']).merge(genus_parents[key].filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_genus")) for key in genus_parents]
# display(genera_dfs)
genera_df = pd.concat([df.filter(['taxonID', 'scientificName', 'parentNameUsageID']) for df in genera_dfs], axis=0)
display(genera_df)

Unnamed: 0,taxonID,scientificName,parentNameUsageID
0,4000000001,Aa,7000000429
1,4000046264,Aakia,7000000483
2,4000000016,Abatia,7000000540
3,4000000024,Abdra,7000000082
4,4000000030,Abeliophyllum,7000000422
...,...,...,...
852,4000041233,Zinnia,4500000230
853,4000041254,Zoegea,4100001338
854,4000041357,Zyrphelis,4100001316
855,4000041358,Zyzyxia,4100001302


In [61]:
print(genera_df.loc[genera_df['scientificName'] == 'Athyrium'])

            taxonID scientificName        family    familyID
1408638  4000003575       Athyrium  Aspleniaceae  7000000051
1408637  4000003569       Athyrium   Athyriaceae  7000000055


In [107]:
# ---- subgenus dataframe ----
subgenera_df = wfo_df_grouped_by_taxonRank.get_group('subgenus').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(genera_df.filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_genus")).filter(['taxonID', 'scientificName', 'taxonID_parent_genus', 'scientificName_parent_genus'])
display(subgenera_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_genus,scientificName_parent_genus
0,3500004052,Aizoon subgen. Aizoon,4000001055,Aizoon
1,3500004053,Aizoon subgen. Capensia,4000001055,Aizoon
2,3500004054,Aizoon subgen. Galenia,4000001055,Aizoon
3,3500004056,Aizoon subgen. Karooica,4000001055,Aizoon
4,3500004057,Aizoon subgen. Kolleria,4000001055,Aizoon
...,...,...,...,...
66,3500004144,Tetragonia subgen. Tetragonella,4000037907,Tetragonia
67,3500004145,Tetragonia subgen. Tetragonia,4000037907,Tetragonia
68,3500004148,Tetragonia subgen. Tetragonoides,4000037907,Tetragonia
69,3500004150,Trianthema subgen. Papularia,4000038870,Trianthema


In [176]:
# ---- sections dataframe ---- 
section_parents = {'genus': genera_df, 'subgenus': subgenera_df}
# ---- 2 section_dfs, one for each parent category
sections_dfs = [wfo_df_grouped_by_taxonRank.get_group('section').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(section_parents[key].filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, f"_parent_{key}")) for key in section_parents]
# display(sections_df)
sections_df = pd.concat([df.filter(['taxonID', 'scientificName', 'parentNameUsageID']) for df in sections_dfs], axis=0)
display(sections_df)

Unnamed: 0,taxonID,scientificName,parentNameUsageID
0,1000041355,Acmella sect. Acmella,4000000312
1,4100000113,Anisoptera sect. Anisoptera,4000002200
2,1000040263,Anisoptera sect. Glabrae,4000002200
3,1000039816,Begonia sect. Alicida,4000004308
4,1000039471,Begonia sect. Apterobegonia,4000004308
...,...,...,...
57,3400004945,Rhododendron sect. Tsutsutsi,1000040525
58,3400012108,Riedelia sect. Coralliophyta,3500004009
59,3400012109,Riedelia sect. Cornuta,3500004009
60,3400012110,Riedelia sect. Geocharides,3500004009


In [123]:
# ---- subsections dataframe ----
subsection_parents = {'genus': genera_df, 'section': sections_df}
# ---- 2 subsection_dfs, one for each parent category
subsections_dfs = [wfo_df_grouped_by_taxonRank.get_group('subsection').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(subsection_parents[key].filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, f"_parent_{key}")) for key in subsection_parents]
# display(subsections_dfs)
subsections_df = pd.concat([df.filter(['taxonID', 'scientificName', 'parentNameUsageID']) for df in subsections_dfs], axis=0)
display(subsections_df)

Unnamed: 0,taxonID,scientificName,parentNameUsageID
0,1000029004,Solidago subsect. Brachychaeta,4000035732
1,1000029005,Solidago subsect. Drummondiani,4000035732
2,3000000102,Solidago subsect. Glomeruliflorae,4000035732
3,3000001047,Solidago subsect. Humiles,4000035732
4,3000002927,Solidago subsect. Junceae,4000035732
...,...,...,...
68,3000003222,Riedelia subsect. Subulocalyces,3400012109
69,1000050961,Rubroshorea subsect. Auriculatae,1000056268
70,1000056269,Rubroshorea subsect. Rubroshorea,1000056268
71,1000000059,Shorea subsect. Mutica,1000000057


In [126]:
# ---- series dataframe ----
series_parents = {'genus': genera_df, 'section': sections_df}
# ---- 2 series_df, one for each parent category
series_dfs = [wfo_df_grouped_by_taxonRank.get_group('series').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(series_parents[key].filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, f"_parent_{key}")) for key in series_parents]
# display(subsections_dfs)
series_df = pd.concat([df.filter(['taxonID', 'scientificName', 'parentNameUsageID']) for df in series_dfs], axis=0)
display(series_df)

Unnamed: 0,taxonID,scientificName,parentNameUsageID
0,1000002456,Gaultheria ser. Trichophyllae,4000015402
1,1000041375,Senecio ser. Nemorenses,4000035060
2,2800002943,Senna ser. Aculeatae,4000035076
3,2800003003,Senna ser. Aphyllae,4000035076
4,2800003659,Solidago ser. Argutae,4000035732
5,2800004439,Solidago ser. Brachychaetae,4000035732
6,1000029000,Solidago ser. Canadenses,4000035732
7,1000024771,Solidago ser. Macrophyllae,4000035732
8,2800004440,Solidago ser. Spectabiles,4000035732
9,1000029002,Solidago ser. Tortifoliae,4000035732


In [127]:
# ---- subseries dataframe ----
subseries_df = wfo_df_grouped_by_taxonRank.get_group('subseries').filter(['taxonID', 'scientificName', 'parentNameUsageID']).merge(series_df.filter(['taxonID', 'scientificName']), how='inner', left_on='parentNameUsageID', right_on='taxonID', suffixes=(None, "_parent_series")).filter(['taxonID', 'scientificName', 'taxonID_parent_series', 'scientificName_parent_series'])
display(subseries_df)

Unnamed: 0,taxonID,scientificName,taxonID_parent_series,scientificName_parent_series
0,2700000233,Styrax subser. Foveolaria,2800000358,Styrax ser. Valvatae
1,2700000232,Styrax subser. Latifoli,2800000358,Styrax ser. Valvatae


In [178]:
# ---- species dataframe ---- columns = ['taxonID', 'scientificName', 'genus', 'family', 'genusID']
genera_taxonID = genera_df.filter(['taxonID', 'scientificName']).rename(columns={'taxonID': 'parentNameUsageID', 'scientificName': 'genus'}).set_index(['parentNameUsageID', 'genus'])
species_df = wfo_df_grouped_by_taxonRank.get_group('species').filter(['taxonID', 'scientificName', 'parentNameUsageID', 'genus', 'family']).join(genera_taxonID, on=['parentNameUsageID', 'genus'], how='inner')
#.join(genera_taxonID, on='genus', how='inner')
# genera_taxonID2 = genera_df.filter(['taxonID', 'scientificName']).rename(columns={'taxonID': 'parentNameUsageID', 'scientificName': 'genus'}).set_index(['parentNameUsageID', 'genus'])
#display(species_df.join(genera_taxonID2, on=['parentNameUsageID', 'genus'], how='inner'))
#display(species_df.join(genera_taxonID, on='genus', how='inner'))

display(species_df)

Unnamed: 0,taxonID,scientificName,parentNameUsageID,genus,family
316954,319089,Aa achalensis,4000000001,Aa,Orchidaceae
756946,760991,Aa argyrolepis,4000000001,Aa,Orchidaceae
918422,922666,Aa aurantiaca,4000000001,Aa,Orchidaceae
923805,928062,Aa calceata,4000000001,Aa,Orchidaceae
947494,951869,Aa colombiana,4000000001,Aa,Orchidaceae
...,...,...,...,...,...
26751,26166,Zyrphelis montana,4000041357,Zyrphelis,Asteraceae
81214,80864,Zyrphelis perezioides,4000041357,Zyrphelis,Asteraceae
92607,92281,Zyrphelis taxifolia,4000041357,Zyrphelis,Asteraceae
1312510,1334611,Zyzyura mayana,4001303479,Zyzyura,Asteraceae


### 2. Inserting data from df into postgres

In [168]:
db_params = {
    'dbname': 'WhatPlant_DB',
    'user': 'postgres',
    'password': 'codersbay',
    'host': 'localhost',
    'port': '5432'
}

conn = psycopg2.connect(
        dbname=db_params['dbname'],
        user=db_params['user'],
        password=db_params['password'],
        host=db_params['host'],
        port=db_params['port']
    )
cur = conn.cursor()

In [169]:
insert_into_phyla_query = '''   INSERT INTO WhatPlant.phyla (phylum_id, phylum_name, phylum_english_name) 
                                VALUES (%s, %s, %s); '''

insert_into_classes_query = ''' INSERT INTO WhatPlant.classes (class_id, class_name, phylum_id) 
                                VALUES (%s, %s, %s); '''

insert_into_subclasses_query = ''' INSERT INTO WhatPlant.subclasses (subclass_id, subclass_name, class_id) 
                                   VALUES (%s, %s, %s); '''

insert_into_orders_query = '''   INSERT INTO WhatPlant.orders (order_id, order_name) 
                                            VALUES (%s, %s); '''
insert_into_orders_query_w_subclass = '''   INSERT INTO WhatPlant.orders (order_id, order_name, subclass_id) 
                                            VALUES (%s, %s, %s); '''
insert_into_orders_query_w_class = '''  INSERT INTO WhatPlant.orders (order_id, order_name, class_id) 
                                        VALUES (%s, %s, %s); '''
insert_into_orders_query_w_phylum = '''     INSERT INTO WhatPlant.orders (order_id, order_name, phylum_id) 
                                            VALUES (%s, %s, %s); '''

insert_into_families_query = '''  INSERT INTO WhatPlant.families (family_id, family_name, order_id)
                                VALUES (%s, %s, %s); '''

insert_into_genera_query = '''  INSERT INTO WhatPlant.genera (genus_id, genus_name, parent_id)
                                VALUES (%s, %s, %s); '''

insert_into_species_query = ''' INSERT INTO WhatPlant.species (species_id, species_name, genus_name, family_name)
                                    VALUES (%s, %s, %s, %s); '''

insert_into_species_synonyms_query = '''    INSERT INTO WhatPlant.species_synonyms 
                                            (species_synonym_id, species_synonym_name, synonym_for)
                                            VALUES (%s, %s, %s); '''

In [170]:
for _, row in phyla_df.iterrows():
    phylum_data = (row['taxonID'], row['scientificName'], row['phylum_englishName'])
    cur.execute(insert_into_phyla_query, phylum_data)

for _, row in classes_df.iterrows():
    class_data = (row['taxonID'], row['scientificName'], row['taxonID_parent_phylum'])
    cur.execute(insert_into_classes_query, class_data)

for _, row in subclasses_df.iterrows():
    subclass_data = (row['taxonID'], row['scientificName'], row['taxonID_parent_class'])
    cur.execute(insert_into_subclasses_query, subclass_data)

In [171]:
for _, row in orders_df.iterrows():
    # order_data = (row['taxonID'], row['scientificName'], row['parentNameUsageID'])
    order_data = (row['taxonID'], row['scientificName'])
    cur.execute(insert_into_orders_query, order_data)

In [172]:
for _, row in families_df.iterrows():
    family_data = (row['taxonID'], row['scientificName'], row['taxonID_parent_order'])
    cur.execute(insert_into_families_query, family_data)

In [173]:
for _, row in genera_df.iterrows():
    genus_data = (row['taxonID'], row['scientificName'], row['parentNameUsageID'])
    cur.execute(insert_into_genera_query, genus_data)

In [174]:
for _, row in species_df.iterrows():
    species_data = (row['taxonID'], row['scientificName'], row['genus'], row['family'])
    cur.execute(insert_into_species_query, species_data)

In [175]:
conn.commit()
cur.close()
conn.close()