In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Load the dataset
df = pd.read_csv('taxonomic_dataset.csv')

In [9]:
df.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species,infraspecificEpithet,taxonRank,scientificName,verbatimScientificName,verbatimScientificNameAuthorship,countryCode,stateProvince
0,Animalia,Chordata,Squamata,,Teiidae,Aspidoscelis,Aspidoscelis lineattissimus,,SPECIES,"Aspidoscelis lineattissimus (Cope, 1878)",Aspidoscelis lineattissimus,,MX,Michoacán
1,Plantae,Tracheophyta,Magnoliopsida,Apiales,Apiaceae,Cicuta,Cicuta maculata,,SPECIES,Cicuta maculata L.,Cicuta maculata,,US,New Jersey
2,Plantae,Tracheophyta,Magnoliopsida,Ericales,Ericaceae,Calluna,Calluna vulgaris,,SPECIES,Calluna vulgaris (L.) Hull,Calluna vulgaris,,FI,Oulu
3,Animalia,Arthropoda,Insecta,Coleoptera,Chrysomelidae,Zygogramma,Zygogramma signatipennis,,SPECIES,"Zygogramma signatipennis (Stål, 1859)",Zygogramma signatipennis,,MX,Distrito Federal
4,Animalia,Chordata,Squamata,,Gekkonidae,Hemidactylus,Hemidactylus turcicus,,SPECIES,"Hemidactylus turcicus (Linnaeus, 1758)",Hemidactylus turcicus,,US,Texas


In [10]:
df.dtypes

kingdom                              object
phylum                               object
class                                object
order                                object
family                               object
genus                                object
species                              object
infraspecificEpithet                 object
taxonRank                            object
scientificName                       object
verbatimScientificName               object
verbatimScientificNameAuthorship    float64
countryCode                          object
stateProvince                        object
dtype: object

In [11]:
# Get the unique values of each variable
for column in df.columns:
    print(f"Unique values of {column}: {df[column].unique()}")

Unique values of kingdom: ['Animalia' 'Plantae' 'Fungi' 'Chromista' 'Bacteria' 'Protozoa' 'Viruses']
Unique values of phylum: ['Chordata' 'Tracheophyta' 'Arthropoda' nan 'Mollusca' 'Ascomycota'
 'Basidiomycota' 'Bryophyta' 'Echinodermata' 'Chlorophyta' 'Rhodophyta'
 'Cnidaria' 'Oomycota' 'Cyanobacteria' 'Mycetozoa' 'Marchantiophyta'
 'Ochrophyta' 'Annelida' 'Platyhelminthes' 'Charophyta' 'Porifera'
 'Actinobacteriota' 'Proteobacteria' 'Bryozoa' 'Onychophora'
 'Pisuviricota' 'Mucoromycota' 'Sipuncula' 'Ctenophora' 'Nematoda'
 'Euglenozoa' 'Firmicutes' 'Blastocladiomycota' 'Anthocerotophyta'
 'Entomophthoromycota' 'Brachiopoda' 'Chytridiomycota'
 'Nucleocytoviricota']
Unique values of class: ['Squamata' 'Magnoliopsida' 'Insecta' 'Aves' nan 'Gastropoda' 'Mammalia'
 'Pezizomycetes' 'Testudines' 'Liliopsida' 'Arachnida' 'Amphibia'
 'Polypodiopsida' 'Agaricomycetes' 'Pinopsida' 'Malacostraca' 'Bryopsida'
 'Lecanoromycetes' 'Bivalvia' 'Lycopodiopsida' 'Asteroidea'
 'Sordariomycetes' 'Ulvophyc

In [12]:
# Get the number of missing values in each variable
print(df.isnull().sum())

kingdom                                  0
phylum                                  17
class                                 1305
order                                 3019
family                                  50
genus                                   89
species                               1260
infraspecificEpithet                 96847
taxonRank                                0
scientificName                           0
verbatimScientificName                   0
verbatimScientificNameAuthorship    100000
countryCode                            306
stateProvince                          496
dtype: int64


In [13]:
#since infraspecificEpithet and verbatimScientificNameAuthorship contains no data we can drop them
df.drop(columns=['infraspecificEpithet', 'verbatimScientificNameAuthorship'], inplace=True)


In [14]:
df.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species,taxonRank,scientificName,verbatimScientificName,countryCode,stateProvince
0,Animalia,Chordata,Squamata,,Teiidae,Aspidoscelis,Aspidoscelis lineattissimus,SPECIES,"Aspidoscelis lineattissimus (Cope, 1878)",Aspidoscelis lineattissimus,MX,Michoacán
1,Plantae,Tracheophyta,Magnoliopsida,Apiales,Apiaceae,Cicuta,Cicuta maculata,SPECIES,Cicuta maculata L.,Cicuta maculata,US,New Jersey
2,Plantae,Tracheophyta,Magnoliopsida,Ericales,Ericaceae,Calluna,Calluna vulgaris,SPECIES,Calluna vulgaris (L.) Hull,Calluna vulgaris,FI,Oulu
3,Animalia,Arthropoda,Insecta,Coleoptera,Chrysomelidae,Zygogramma,Zygogramma signatipennis,SPECIES,"Zygogramma signatipennis (Stål, 1859)",Zygogramma signatipennis,MX,Distrito Federal
4,Animalia,Chordata,Squamata,,Gekkonidae,Hemidactylus,Hemidactylus turcicus,SPECIES,"Hemidactylus turcicus (Linnaeus, 1758)",Hemidactylus turcicus,US,Texas


In [15]:
len(df.columns)

12

In [16]:
# df.describe()

In [21]:
import folium

# Convert non-numeric data to numeric
df['species'] = pd.to_numeric(df['species'], errors='coerce')

# Handle missing or null values
df.fillna(0, inplace=True)  # Replace NaN values with 0
# Create a choropleth map
m = folium.Map(location=[40, -100], zoom_start=4)

# Add a choropleth layer
folium.Choropleth(
    geo_data=open('countries.geojson').read(),
    data=df,
    columns=['countryCode', 'species'],
    key_on='feature.properties.name',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Number of Species'
).add_to(m)

# Display the map
m

ValueError: key_on `'properties.name'` not found in GeoJSON.