In [19]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data cleaning

### Load data

In [16]:
# Load the raw GBIF occurrence table for Germany (tab-separated file).
# low_memory=False makes pandas read the file in one pass instead of chunked
# dtype inference, avoiding mixed-dtype columns.
# NOTE(review): GBIF tab-delimited downloads are reportedly unquoted; if row counts
# look off, try quoting=csv.QUOTE_NONE — confirm against the GBIF download summary.
data = pd.read_csv(
    '../raw_data/gbif/germany.csv',
    sep='\t',
    low_memory=False,
)

In [21]:
# Preview the first three rows of the loaded table.
data.head(3)

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,infraspecificEpithet,taxonRank,scientificName,verbatimScientificName,verbatimScientificNameAuthorship,countryCode,locality,stateProvince,occurrenceStatus,individualCount,publishingOrgKey,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,coordinatePrecision,elevation,elevationAccuracy,depth,depthAccuracy,eventDate,day,month,year,taxonKey,speciesKey,basisOfRecord,institutionCode,collectionCode,catalogNumber,recordNumber,identifiedBy,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue
0,3746955493,50c9509d-22c7-4a22-a47d-8c48425ef4a7,https://www.inaturalist.org/observations/10536...,Plantae,Tracheophyta,Magnoliopsida,Oxalidales,Oxalidaceae,Oxalis,Oxalis corniculata,,SPECIES,Oxalis corniculata L.,Oxalis corniculata,,DE,,Berlin,PRESENT,,28eb1a3f-1c15-4a95-931a-4af90ecb574d,52.490407,13.321256,13.0,,,,,,2022-01-22T15:02:56,22.0,1.0,2022.0,8427624,8427624.0,HUMAN_OBSERVATION,iNaturalist,Observations,105368931,,Alexis,2022-01-22T16:35:16,CC_BY_4_0,Alexis,Alexis,,,2022-05-19T00:35:30.942Z,StillImage,COORDINATE_ROUNDED
1,1913684610,50c9509d-22c7-4a22-a47d-8c48425ef4a7,https://www.inaturalist.org/observations/16601473,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Erigeron,Erigeron annuus,,SPECIES,Erigeron annuus (L.) Pers.,Erigeron annuus,,DE,,Bayern,PRESENT,,28eb1a3f-1c15-4a95-931a-4af90ecb574d,49.30856,11.357783,4.0,,,,,,2018-09-16T12:08:00,16.0,9.0,2018.0,3117449,3117449.0,HUMAN_OBSERVATION,iNaturalist,Observations,16601473,,Werner Lauckner,2018-09-16T20:16:46,CC_BY_NC_4_0,Werner Lauckner,Werner Lauckner,,,2022-05-19T00:31:18.631Z,StillImage,COORDINATE_ROUNDED
2,2238782236,50c9509d-22c7-4a22-a47d-8c48425ef4a7,https://www.inaturalist.org/observations/20383075,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Aspleniaceae,Asplenium,Asplenium ruta-muraria,,SPECIES,Asplenium ruta-muraria L.,Asplenium ruta-muraria,,DE,,Berlin,PRESENT,,28eb1a3f-1c15-4a95-931a-4af90ecb574d,52.506927,13.267875,31.0,,,,,,2019-02-14T13:15:00,14.0,2.0,2019.0,2650625,2650625.0,HUMAN_OBSERVATION,iNaturalist,Observations,20383075,,Alexis,2019-02-16T08:32:07,CC_BY_4_0,Alexis,Alexis,,,2022-05-19T00:08:16.238Z,StillImage,COORDINATE_ROUNDED


- **gbifID** is the unique identifier for an occurrence record in GBIF

- **taxonKey**: A taxonKey is the primary ID number used in GBIF to identify a species (or some higher group). These are the ID numbers found in the GBIF backbone taxonomy. Often you will see them in the URL of an occurrence search: https://www.gbif.org/occurrence/search?taxon_key=7412043. These are the most important keys and usually what other keys map back to (rule of thumb: **“all keys lead to taxonKeys”**).

- **usageKey**: Using rgbif or the GBIF API, you might encounter a usageKey. This key can be treated as equivalent to a GBIF taxonKey in the context of the GBIF Backbone. I have never found a usageKey in the wild that was not equivalent to a GBIF taxonKey, but it might exist.

- **speciesKey**: Sometimes you might find a speciesKey in a download or somewhere else. This key is the key for the species and is often equivalent to the GBIF taxonKey in the context of the GBIF Backbone. If the record is not of rank SPECIES, it won’t have a speciesKey. The same goes for the other GBIF ranked keys: genusKey, familyKey, classKey, orderKey, phylumKey, kingdomKey. These are also equivalent to their corresponding GBIF taxonKeys when the taxon is of that rank.

In [22]:
# List every column name present in the loaded table.
data.columns

Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'verbatimScientificName',
       'verbatimScientificNameAuthorship', 'countryCode', 'locality',
       'stateProvince', 'occurrenceStatus', 'individualCount',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue'],
      dtype='object')

In [27]:
# Count distinct values per column (missing values are not counted by default).
data.nunique()

gbifID                              2633206
datasetKey                               32
occurrenceID                        1303541
kingdom                                   1
phylum                                    7
class                                    25
order                                   127
family                                  369
genus                                  1887
species                                6395
infraspecificEpithet                    874
taxonRank                                 8
scientificName                         8824
verbatimScientificName                12535
verbatimScientificNameAuthorship        490
countryCode                               1
locality                              37623
stateProvince                            39
occurrenceStatus                          1
individualCount                         159
publishingOrgKey                         22
decimalLatitude                     1175275
decimalLongitude                

In [65]:
# Count missing values per column.
data.isna().sum()

gbifID                                    0
datasetKey                                0
occurrenceID                        1329665
kingdom                                   0
phylum                                  439
class                                   561
order                                   630
family                                  630
genus                                  1264
species                               69742
infraspecificEpithet                2595598
taxonRank                                 0
scientificName                            0
verbatimScientificName                    0
verbatimScientificNameAuthorship    1666538
countryCode                               0
locality                            1169383
stateProvince                       2430579
occurrenceStatus                          0
individualCount                     1537743
publishingOrgKey                          0
decimalLatitude                           4
decimalLongitude                

In [63]:
# Columns carried forward from the raw table, grouped by role.
selected_columns = [
    # record / dataset identifiers
    'gbifID', 'datasetKey',
    # taxonomic hierarchy
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'scientificName',
    # coordinates
    'decimalLatitude', 'decimalLongitude',
    # observation date parts
    'day', 'month', 'year',
    # backbone taxon key and licence
    'taxonKey', 'license',
]

### Select columns

In [66]:
# Keep only the chosen columns (all rows) and preview the result.
data_selected = data.loc[:, selected_columns]
data_selected.head(3)

Unnamed: 0,gbifID,datasetKey,kingdom,phylum,class,order,family,genus,species,scientificName,decimalLatitude,decimalLongitude,day,month,year,taxonKey,license
0,3746955493,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Magnoliopsida,Oxalidales,Oxalidaceae,Oxalis,Oxalis corniculata,Oxalis corniculata L.,52.490407,13.321256,22.0,1.0,2022.0,8427624,CC_BY_4_0
1,1913684610,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Erigeron,Erigeron annuus,Erigeron annuus (L.) Pers.,49.30856,11.357783,16.0,9.0,2018.0,3117449,CC_BY_NC_4_0
2,2238782236,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Aspleniaceae,Asplenium,Asplenium ruta-muraria,Asplenium ruta-muraria L.,52.506927,13.267875,14.0,2.0,2019.0,2650625,CC_BY_4_0


### Drop NaNs and Duplicates

In [67]:
# Row count before any cleaning, used as the baseline for the retention percentage.
len(data_selected)

2633206

In [69]:
# Count rows that repeat an earlier row across all selected columns.
data_selected.duplicated().sum()

0

In [70]:
# Drop every row that has a missing value in any of the selected columns.
data_cleaned = data_selected.dropna(axis='index', how='any')

In [71]:
# Row count after dropping rows with missing values.
len(data_cleaned)

2563358

In [73]:
# Percentage of rows retained after dropping rows with missing values.
100 * len(data_cleaned) / len(data_selected) 

97.34741603961103

In [75]:
# Confirm that no missing values remain in the cleaned table.
data_cleaned.isna().sum()

gbifID              0
datasetKey          0
kingdom             0
phylum              0
class               0
order               0
family              0
genus               0
species             0
scientificName      0
decimalLatitude     0
decimalLongitude    0
day                 0
month               0
year                0
taxonKey            0
license             0
dtype: int64

In [80]:
# Verify that taxonKey and scientificName identify each other one-to-one.
# Equal numbers of distinct keys and distinct names do not prove this (one key
# could map to two names while two other keys share a name), so look at the
# distinct (taxonKey, scientificName) pairs instead: the mapping is one-to-one
# exactly when every key and every name occurs in only one pair.
key_name_pairs = data_cleaned[['taxonKey', 'scientificName']].drop_duplicates()
key_name_pairs['taxonKey'].is_unique and key_name_pairs['scientificName'].is_unique

True

### Rename columns

In [88]:
# Shorten the coordinate column names; all other columns keep their names.
data_renamed = data_cleaned.rename(
    columns={
        'decimalLatitude': 'latitude',
        'decimalLongitude': 'longitude',
    }
)

In [95]:
# Preview the table after renaming the coordinate columns.
data_renamed.head(3)

Unnamed: 0,gbifID,datasetKey,kingdom,phylum,class,order,family,genus,species,scientificName,latitude,longitude,day,month,year,taxonKey,license
0,3746955493,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Magnoliopsida,Oxalidales,Oxalidaceae,Oxalis,Oxalis corniculata,Oxalis corniculata L.,52.490407,13.321256,22.0,1.0,2022.0,8427624,CC_BY_4_0
1,1913684610,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Erigeron,Erigeron annuus,Erigeron annuus (L.) Pers.,49.30856,11.357783,16.0,9.0,2018.0,3117449,CC_BY_NC_4_0
2,2238782236,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Aspleniaceae,Asplenium,Asplenium ruta-muraria,Asplenium ruta-muraria L.,52.506927,13.267875,14.0,2.0,2019.0,2650625,CC_BY_4_0


In [96]:
# Split the cleaned table in two, both keyed by gbifID:
#   data_final -> gbifID plus coordinates and scientific name
#   metadata   -> gbifID plus every remaining column
gbifID = ['gbifID']
col = ['latitude', 'longitude', 'scientificName']

data_final = data_renamed[[*gbifID, *col]]
metadata = data_renamed.drop(col, axis=1)

In [97]:
# Preview the occurrence table (gbifID, latitude, longitude, scientificName).
data_final.head(3)

Unnamed: 0,gbifID,latitude,longitude,scientificName
0,3746955493,52.490407,13.321256,Oxalis corniculata L.
1,1913684610,49.30856,11.357783,Erigeron annuus (L.) Pers.
2,2238782236,52.506927,13.267875,Asplenium ruta-muraria L.


In [98]:
# Preview the metadata table (everything except coordinates and scientificName).
metadata.head(3)

Unnamed: 0,gbifID,datasetKey,kingdom,phylum,class,order,family,genus,species,day,month,year,taxonKey,license
0,3746955493,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Magnoliopsida,Oxalidales,Oxalidaceae,Oxalis,Oxalis corniculata,22.0,1.0,2022.0,8427624,CC_BY_4_0
1,1913684610,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Erigeron,Erigeron annuus,16.0,9.0,2018.0,3117449,CC_BY_NC_4_0
2,2238782236,50c9509d-22c7-4a22-a47d-8c48425ef4a7,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Aspleniaceae,Asplenium,Asplenium ruta-muraria,14.0,2.0,2019.0,2650625,CC_BY_4_0


### Writing CSV files

occurences.csv

In [155]:
# Write the full occurrence table to disk; index=False omits the DataFrame index column.
data_final.to_csv('../raw_data/gbif/occurences.csv', index=False)

metadata.csv

In [156]:
# Write the metadata table to disk; index=False omits the DataFrame index column.
metadata.to_csv('../raw_data/gbif/metadata.csv', index=False)

10, 100, 1k and 10k subsets

In [148]:
# Write small random subsets of the occurrence table (10, 100, 1k and 10k rows).
# A fixed random_state makes the subsets reproducible: without it, every re-run
# of this cell silently overwrote the files with a different random sample.
SUBSET_SEED = 42
for label, n_rows in [('10', 10), ('100', 100), ('1k', 1000), ('10k', 10000)]:
    subset = data_final.sample(n=n_rows, random_state=SUBSET_SEED)
    subset.to_csv(f'../biodiversipy/data/gbif/occurences_{label}.csv', index=False)

#### Check for correct csv writing

In [157]:
# Read the occurrence file back from disk to check that it was written correctly.
occurences = pd.read_csv('../raw_data/gbif/occurences.csv')

In [158]:
# (rows, columns) of the file read back from disk.
occurences.shape

(2563358, 4)

In [159]:
# Read the metadata file back from disk to check that it was written correctly.
# NOTE: this rebinds `metadata`, replacing the in-memory frame derived from
# data_renamed with the copy parsed from CSV.
metadata = pd.read_csv('../raw_data/gbif/metadata.csv')

In [160]:
# (rows, columns) of the metadata file read back from disk.
metadata.shape

(2563358, 14)

In [150]:
# Read the four subset files back from disk, one frame per file, in size order.
occurences_10, occurences_100, occurences_1k, occurences_10k = map(
    pd.read_csv,
    (
        '../biodiversipy/data/gbif/occurences_10.csv',
        '../biodiversipy/data/gbif/occurences_100.csv',
        '../biodiversipy/data/gbif/occurences_1k.csv',
        '../biodiversipy/data/gbif/occurences_10k.csv',
    ),
)

In [151]:
# Shapes of the four subsets read back from disk, in size order.
occurences_10.shape, occurences_100.shape, occurences_1k.shape, occurences_10k.shape

((10, 4), (100, 4), (1000, 4), (10000, 4))

# Check for coordinates outside of Germany with reverse Nominatim [takes too long]

Install geopy-2.2.0

In [103]:
# One-off install step, left commented out so re-running the notebook does not reinstall.
# The captured output below comes from a run with this line enabled (geopy 2.2.0).
#!pip install geopy

Collecting geopy
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0


In [104]:
from geopy.geocoders import Nominatim

In [105]:
# Initialize the Nominatim geocoder client.
# NOTE(review): "geoapiExercises" looks like a tutorial placeholder; Nominatim's
# usage policy presumably expects a user agent identifying this application —
# confirm and replace.
geolocator = Nominatim(user_agent="geoapiExercises")

In [137]:
# Latitude & Longitude input: a single test coordinate to try out reverse geocoding.
Latitude = '52.49925782450705'
Longitude = '13.401844897821519'

# Reverse-geocode the point; the query is passed as a "lat,lon" string.
location = geolocator.reverse(Latitude+","+Longitude)

# Display the country.
# Read it from the structured address in the raw Nominatim response instead of
# slicing the last 11 characters of the display string: that slice only works
# because 'Deutschland' happens to be 11 characters long and would return
# garbage for any other country. reverse() may also find nothing, in which case
# None is shown instead of raising.
location.raw.get('address', {}).get('country') if location is not None else None

'Deutschland'

In [131]:
# One-off install step, left commented out so re-running the notebook does not reinstall.
# The captured output below comes from a run with this line enabled (tqdm 4.64.0).
#!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m443.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.64.0
