# Plant dataset
Information

# Clean dataset

In [20]:
import numpy as np
import pandas as pd

## Load data

In [21]:
plant = pd.read_csv('../Data/raw/plantanet_subsample15.csv', sep='\t')
display(plant.shape)
plant.head()

(150000, 51)

Unnamed: 0.1,Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,...,identifiedBy,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue
0,6913275,3956314347,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10050789731,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,...,,,CC_BY_4_0,,,,,2023-02-08T17:06:19.959Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
1,11131049,3952045610,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10103707122,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,...,,,CC_BY_4_0,,,,,2023-02-08T17:30:41.400Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
2,6797613,3999022620,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10593121245,Plantae,Tracheophyta,Liliopsida,Asparagales,Orchidaceae,Anacamptis,...,,,CC_BY_4_0,,,,,2023-02-08T17:39:19.787Z,,COUNTRY_DERIVED_FROM_COORDINATES;CONTINENT_DER...
3,9056894,3951723097,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10067365606,Plantae,Tracheophyta,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,...,,,CC_BY_4_0,,,,,2023-02-08T17:29:23.909Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
4,4757360,3952394627,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10536959012,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,...,,,CC_BY_4_0,,,,,2023-02-08T17:38:06.790Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...


Explore data

In [22]:
plant.dtypes

Unnamed: 0                            int64
gbifID                                int64
datasetKey                           object
occurrenceID                         object
kingdom                              object
phylum                               object
class                                object
order                                object
family                               object
genus                                object
species                              object
infraspecificEpithet                float64
taxonRank                            object
scientificName                       object
verbatimScientificName               object
verbatimScientificNameAuthorship     object
countryCode                          object
locality                            float64
stateProvince                       float64
occurrenceStatus                     object
individualCount                       int64
publishingOrgKey                     object
decimalLatitude                 

In [23]:
plant.isnull().sum()/len(plant) # check for missing values

Unnamed: 0                          0.000000
gbifID                              0.000000
datasetKey                          0.000000
occurrenceID                        0.000000
kingdom                             0.000000
phylum                              0.000000
class                               0.000000
order                               0.000000
family                              0.000000
genus                               0.000027
species                             0.000027
infraspecificEpithet                1.000000
taxonRank                           0.000000
scientificName                      0.000000
verbatimScientificName              0.000000
verbatimScientificNameAuthorship    0.000000
countryCode                         0.000027
locality                            1.000000
stateProvince                       1.000000
occurrenceStatus                    0.000000
individualCount                     0.000000
publishingOrgKey                    0.000000
decimalLat

## Clean and standardize dataset

Standirdize columns
- names to lowercase
- remove "unnamed:_0" column
- correct format of dates so it can be transformed
- transform date columns to datetime format

In [24]:
def standardize_columns(df):
    '''
This functions replaces the names of the columns by the lower case version,
blank spaces for underscores '_' and drops the column "Unnamed: 0" with index values
input: data frame
    '''
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    return df.drop(['unnamed:_0'], axis=1) # I had to change this part because in this version of the csv file there is no "unnamed: 0" column (before: return df.drop(['unnamed:_0'], axis=1))

def clean_columns (df):
    df = standardize_columns(df)
    df['eventdate'] = df['eventdate'].apply(lambda x: x + '.00' if x.find(".") == -1 else x)
    df['eventdate'] = pd.to_datetime(df['eventdate'], errors='coerce')
    #df['effective_month'] = df['effective_to_date'].dt.month 
    df['dateidentified'] = pd.to_datetime(df['dateidentified'], errors='coerce')
    return df
        
plant_copy = clean_columns(plant)
plant_copy.head()

Unnamed: 0,gbifid,datasetkey,occurrenceid,kingdom,phylum,class,order,family,genus,species,...,identifiedby,dateidentified,license,rightsholder,recordedby,typestatus,establishmentmeans,lastinterpreted,mediatype,issue
0,3956314347,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10050789731,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,Cymbalaria muralis,...,,NaT,CC_BY_4_0,,,,,2023-02-08T17:06:19.959Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
1,3952045610,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10103707122,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,Pistacia lentiscus,...,,NaT,CC_BY_4_0,,,,,2023-02-08T17:30:41.400Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
2,3999022620,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10593121245,Plantae,Tracheophyta,Liliopsida,Asparagales,Orchidaceae,Anacamptis,Anacamptis pyramidalis,...,,NaT,CC_BY_4_0,,,,,2023-02-08T17:39:19.787Z,,COUNTRY_DERIVED_FROM_COORDINATES;CONTINENT_DER...
3,3951723097,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10067365606,Plantae,Tracheophyta,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,Cynoglossum creticum,...,,NaT,CC_BY_4_0,,,,,2023-02-08T17:29:23.909Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
4,3952394627,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10536959012,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,Lathraea squamaria,...,,NaT,CC_BY_4_0,,,,,2023-02-08T17:38:06.790Z,,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...


### Remove columns with no data
Columns to drop: mediaType, establishmentMeans, stateProvince, occurrenceStatus, taxonRank, infraspecificEpithet, locality, coordinatePrecision, depth, depthAccuracy, institutionCode, collectionCode, catalogNumber, recordNumber, 
identifiedBy, dateIdentified, rightsHolder, recordedBy, typeStatus 

In [25]:
plant_copy.drop(['mediatype', 'establishmentmeans', 'stateprovince', 'taxonrank', 'infraspecificepithet', 
             'locality', 'coordinateprecision', 'depth', 'depthaccuracy', 'institutioncode', 'collectioncode', 
             'catalognumber', 'recordnumber', 'identifiedby', 'dateidentified', 'rightsholder', 'recordedby', 
             'typestatus'], axis=1, inplace=True)
plant_copy.head()

Unnamed: 0,gbifid,datasetkey,occurrenceid,kingdom,phylum,class,order,family,genus,species,...,eventdate,day,month,year,taxonkey,specieskey,basisofrecord,license,lastinterpreted,issue
0,3956314347,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10050789731,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,Cymbalaria muralis,...,2016-05-28 11:32:01.482,28,5,2016,8200663,8200663.0,HUMAN_OBSERVATION,CC_BY_4_0,2023-02-08T17:06:19.959Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
1,3952045610,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10103707122,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,Pistacia lentiscus,...,2019-08-18 12:02:28.413,18,8,2019,3190583,3190583.0,HUMAN_OBSERVATION,CC_BY_4_0,2023-02-08T17:30:41.400Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
2,3999022620,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10593121245,Plantae,Tracheophyta,Liliopsida,Asparagales,Orchidaceae,Anacamptis,Anacamptis pyramidalis,...,2022-05-20 21:10:09.158,20,5,2022,2808330,2808330.0,HUMAN_OBSERVATION,CC_BY_4_0,2023-02-08T17:39:19.787Z,COUNTRY_DERIVED_FROM_COORDINATES;CONTINENT_DER...
3,3951723097,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10067365606,Plantae,Tracheophyta,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,Cynoglossum creticum,...,2019-04-14 10:50:38.537,14,4,2019,4064467,4064467.0,HUMAN_OBSERVATION,CC_BY_4_0,2023-02-08T17:29:23.909Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
4,3952394627,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10536959012,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,Lathraea squamaria,...,2022-03-14 14:36:23.989,14,3,2022,3738478,3738478.0,HUMAN_OBSERVATION,CC_BY_4_0,2023-02-08T17:38:06.790Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...


In [26]:
plant_copy.isnull().sum()/len(plant_copy)

gbifid                              0.000000
datasetkey                          0.000000
occurrenceid                        0.000000
kingdom                             0.000000
phylum                              0.000000
class                               0.000000
order                               0.000000
family                              0.000000
genus                               0.000027
species                             0.000027
scientificname                      0.000000
verbatimscientificname              0.000000
verbatimscientificnameauthorship    0.000000
countrycode                         0.000027
occurrencestatus                    0.000000
individualcount                     0.000000
publishingorgkey                    0.000000
decimallatitude                     0.000000
decimallongitude                    0.000000
coordinateuncertaintyinmeters       0.288507
elevation                           0.180520
elevationaccuracy                   0.180520
eventdate 

Explore specific columns and decided to remove them or not beased on their information and the aim of the project:
- basisofrecord: contains just one value for all entries (HUMAN_OBSERVATION)
- ocurrencestatus: contains just one value for all entries (PRESENT)
- kingdom: one value ('Plantae')
- phylum: one value ('Tracheophyta')
- datasetkey: one value ('14d5676a-2c54-4f94-9023-1e8dcd822aa0')
- elevation: not relevant for analysis
- elevationaccuracy: not relevant
- coordinate uncertainty in meters: not relevant
- specieskey

In [27]:
display(plant_copy['occurrencestatus'].unique())
display(plant_copy['basisofrecord'].unique())
display(plant_copy['kingdom'].unique())
display(plant_copy['phylum'].unique())
display(plant_copy['datasetkey'].unique())
display(plant_copy['individualcount'].unique())


array(['PRESENT'], dtype=object)

array(['HUMAN_OBSERVATION'], dtype=object)

array(['Plantae'], dtype=object)

array(['Tracheophyta'], dtype=object)

array(['14d5676a-2c54-4f94-9023-1e8dcd822aa0'], dtype=object)

array([1], dtype=int64)

In [28]:
display(plant_copy['occurrenceid'].duplicated().sum())
display(plant_copy['gbifid'].duplicated().sum())
# both columns contain unique identifiers, we can safely remove one

0

0

Drop the columns because they don't bring relevant information for the analysis of the dataset

In [29]:
plant_copy.drop(['gbifid', 'occurrencestatus', 'basisofrecord', 'coordinateuncertaintyinmeters',
                 'elevation', 'elevationaccuracy', 'specieskey', 'datasetkey', 'kingdom', 'phylum',
                 'verbatimscientificname', 'verbatimscientificnameauthorship', 'individualcount',
                 'publishingorgkey', 'taxonkey', 'license', 'lastinterpreted', 'issue'], axis = 1, inplace = True)
plant_copy.columns

Index(['occurrenceid', 'class', 'order', 'family', 'genus', 'species',
       'scientificname', 'countrycode', 'decimallatitude', 'decimallongitude',
       'eventdate', 'day', 'month', 'year'],
      dtype='object')

In [30]:
plant_copy.isnull().sum()/len(plant_copy)

occurrenceid        0.000000
class               0.000000
order               0.000000
family              0.000000
genus               0.000027
species             0.000027
scientificname      0.000000
countrycode         0.000027
decimallatitude     0.000000
decimallongitude    0.000000
eventdate           0.000000
day                 0.000000
month               0.000000
year                0.000000
dtype: float64

Remaining missing values:
- genus and species: might be the case that it was not possible to identify to genus or species level -> replace missing values for 'unknown'
- countrycode: replace for 'U' = unknown

In [31]:
plant_copy['genus'].fillna('unknown', inplace=True)
plant_copy['species'].fillna('unknown', inplace=True)

In [32]:
# here I want to see all the possible unique values so I can replace the missing ones for 'U'. 
# But first I want to check that no country has 'U' as code.
# countrycodes.sort() and sorted() return error: '<' not supported between instances of 'float' and 'str'
# There are only strings and missing values.
# Alternative chosen: fillna('unknown')
countrycodes= plant_copy['countrycode'].unique()
countrycodes = list(countrycodes)
#countrycodes.sort() 
#countrycodes = sorted(countrycodes, key=lambda x: (x is None, x))
countrycodes

['ES',
 'FR',
 'DE',
 'RU',
 'US',
 'GB',
 'IT',
 'AT',
 'NL',
 'LU',
 'AU',
 'GR',
 'PL',
 'CZ',
 'CH',
 'BE',
 'ME',
 'PE',
 'BR',
 'HU',
 'LT',
 'DK',
 'TR',
 'SE',
 'RS',
 'GP',
 'SI',
 'UA',
 'MA',
 'IE',
 'FI',
 'CA',
 'PT',
 'IL',
 'SK',
 'LV',
 'MX',
 'SG',
 'BO',
 'IM',
 'IN',
 'RO',
 'BG',
 'UY',
 'HR',
 'PH',
 'ID',
 'TW',
 'NC',
 'TH',
 'PK',
 'DZ',
 'AR',
 'RE',
 'CR',
 'NP',
 'KG',
 'NO',
 'EE',
 'CO',
 'JP',
 'NZ',
 'VN',
 'MQ',
 'BL',
 'IR',
 'SM',
 'JE',
 'GE',
 'BY',
 'CL',
 'SN',
 'CY',
 'TM',
 'RW',
 'AD',
 'MT',
 'GG',
 'BM',
 'VE',
 'CN',
 'CM',
 'MD',
 'IS',
 'UG',
 'MZ',
 'BD',
 'ZA',
 'GT',
 'KM',
 'DO',
 'GD',
 'TT',
 'PR',
 'VU',
 'PY',
 'BA',
 'SV',
 'AL',
 'EC',
 'LI',
 nan,
 'IQ',
 'NG',
 'SA',
 'SC',
 'GI',
 'TZ',
 'HT',
 'AX',
 'BT',
 'GF',
 'ZM',
 'AZ',
 'AM',
 'EG',
 'PF',
 'UZ',
 'ML',
 'CU',
 'KR',
 'KE',
 'XK',
 'LC',
 'MK',
 'VI',
 'CW',
 'HN',
 'HK',
 'MV',
 'GA',
 'MU',
 'SY',
 'MY',
 'CD',
 'LK',
 'TN',
 'KZ',
 'LR',
 'LY',
 'PA',
 'MC',
 'MF',


In [33]:
plant_copy['countrycode'].fillna('unknown', inplace=True)

In [34]:
plant_copy.isnull().sum()/len(plant_copy)

occurrenceid        0.0
class               0.0
order               0.0
family              0.0
genus               0.0
species             0.0
scientificname      0.0
countrycode         0.0
decimallatitude     0.0
decimallongitude    0.0
eventdate           0.0
day                 0.0
month               0.0
year                0.0
dtype: float64

## Save cleaned file to csv

In [35]:
from pathlib import Path
file_path = Path('../Data/clean/plantanet_clean.csv')
plant_copy.to_csv(file_path, sep='\t', index=False)

# Internet usage and GDP dataset

Variable Name & Description of Indicator:
- country: Unique Identifier
- incomeperperson: Gross Domestic Product per capita in constant 2000 US$. The inflation but not the differences in the cost of living between countries has been taken into account.
- internetuserate: Internet users (per 100 people). Internet users are people with access to the worldwide network.
- urbanrate: Urban population (% of total) Urban population refers to people living in urban areas as defined by national statistical offices (calculated using World Bank population estimates and urban ratios from the United Nations World Urbanization Prospects)

More information is available at [Gapminder](www.gapminder.org)


In [36]:
internet = pd.read_csv('../Data/raw/gapminder_internet_edit.csv', sep=';')
display(internet.shape)
internet.head()

(211, 4)

Unnamed: 0,country,incomeperperson,internetuserate,urbanrate
0,Afghanistan,,3.654122,24.04
1,Albania,1914.996551,44.989947,46.72
2,Algeria,2231.993335,12.500073,65.22
3,Andorra,21943.3399,81.0,88.92
4,Angola,1381.004268,9.999954,56.7


## Explore data

In [37]:
internet.dtypes

country             object
incomeperperson    float64
internetuserate    float64
urbanrate          float64
dtype: object

In [38]:
internet.isnull().sum()/len(internet) # proportion of missing values

country            0.000000
incomeperperson    0.099526
internetuserate    0.090047
urbanrate          0.042654
dtype: float64

In [39]:
print(internet[internet['incomeperperson'].isnull()]) # rows with missing values in column 'incomeperperson'

                   country  incomeperperson  internetuserate  urbanrate
0              Afghanistan              NaN         3.654122      24.04
8                    Aruba              NaN        41.800889      46.78
34          Cayman Islands              NaN        66.000000     100.00
43            Cook Islands              NaN              NaN        NaN
61           Faroe Islands              NaN        75.200000      41.42
65        French Polynesia              NaN        48.957328      51.64
71               Gibraltar              NaN        65.000000        NaN
75              Guadeloupe              NaN              NaN        NaN
76                    Guam              NaN              NaN      93.16
99         Korea Dem. Rep.              NaN              NaN      62.68
101                 Kuwait              NaN        38.260234      98.36
121             Martinique              NaN              NaN        NaN
132                Myanmar              NaN              NaN    

In order to merge this dataframe with the plants one, we need another dataframe to act as a bridge.
- Internet: country info is stored by name
- Plants: country infor is stored by coountry code

# Country codes

In [40]:
codes = pd.read_csv('../Data/raw/country_list_edit.csv', sep=';') # this dataframe will work as bridge between plants and internet dataframes
display(codes.shape)
codes.head()

(250, 2)

Unnamed: 0,Name,Code
0,Afghanistan,AF
1,Aland Islands (Finland),AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


## Explore data and clean

In [41]:
# rename the columns using a dictionary
codes.rename(columns=({'Name': 'country', 'Code': 'countrycode'}), inplace=True)
codes.columns

Index(['country', 'countrycode'], dtype='object')

In [42]:
codes.head()

Unnamed: 0,country,countrycode
0,Afghanistan,AF
1,Aland Islands (Finland),AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


## Missing values

In [43]:
# check if there are missing values
codes.isnull().sum()/len(codes)

country        0.000
countrycode    0.004
dtype: float64

In [44]:
# check which countries do not have a code
print(codes[codes['countrycode'].isnull()])

     country countrycode
154  Namibia         NaN


In [45]:
# Namibia's country code is NA - replace missing value for 'NA'
codes['countrycode'].fillna('NA', inplace=True)

In [46]:
codes.isnull().sum()/len(codes)

country        0.0
countrycode    0.0
dtype: float64

In [47]:
# verify that there are no duplicated values
codes['countrycode'].duplicated().sum()

0

In [48]:
# compare the common columns in the two dataframes before the merge
cc = plant_copy["countrycode"].isin(codes["countrycode"])
type(cc)
cc.value_counts()

countrycode
True     149996
False         4
Name: count, dtype: int64

In [49]:
dd = codes["countrycode"].isin(plant_copy["countrycode"])
type(dd)
dd.value_counts()

countrycode
True     186
False     64
Name: count, dtype: int64

# Merge dataframes: plant_copy and codes
- Codes dataframe will serve as bridge to connect to the internet dataframe

In [50]:
plant_codes = pd.merge(plant_copy, codes, how='left', on='countrycode')
plant_codes.head()

Unnamed: 0,occurrenceid,class,order,family,genus,species,scientificname,countrycode,decimallatitude,decimallongitude,eventdate,day,month,year,country
0,q-10050789731,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,Cymbalaria muralis,"Cymbalaria muralis P.Gaertn., B.Mey. & Scherb.",ES,39.737365,-0.828874,2016-05-28 11:32:01.482,28,5,2016,Spain
1,q-10103707122,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,Pistacia lentiscus,Pistacia lentiscus L.,FR,43.06862,5.817762,2019-08-18 12:02:28.413,18,8,2019,France
2,q-10593121245,Liliopsida,Asparagales,Orchidaceae,Anacamptis,Anacamptis pyramidalis,Anacamptis pyramidalis (L.) Rich.,FR,45.8525,-0.155,2022-05-20 21:10:09.158,20,5,2022,France
3,q-10067365606,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,Cynoglossum creticum,Cynoglossum creticum Mill.,FR,43.151389,5.736667,2019-04-14 10:50:38.537,14,4,2019,France
4,q-10536959012,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,Lathraea squamaria,Lathraea squamaria L.,DE,48.07944,7.587583,2022-03-14 14:36:23.989,14,3,2022,Germany


In [51]:
plant_codes.isnull().sum()/len(plant_codes)

occurrenceid        0.000000
class               0.000000
order               0.000000
family              0.000000
genus               0.000000
species             0.000000
scientificname      0.000000
countrycode         0.000000
decimallatitude     0.000000
decimallongitude    0.000000
eventdate           0.000000
day                 0.000000
month               0.000000
year                0.000000
country             0.000027
dtype: float64

In [52]:
indexes = [plant_codes[plant_codes['country'].isnull()].index]
type(indexes)
indexes

[Index([7001, 38054, 56843, 143602], dtype='int64')]

In [53]:
# iterate through the indexes where there's a missing value in countries (usefull when there are many) and returns the country corresponding country code
for i in indexes:
    print(f"The value of the row index {i} is {plant_codes.loc[i, 'countrycode']}")

The value of the row index Index([7001, 38054, 56843, 143602], dtype='int64') is 7001      unknown
38054     unknown
56843     unknown
143602    unknown
Name: countrycode, dtype: object


Based on this information, the missing values in country will be filled with 'unknown'

In [54]:
plant_codes.fillna('unknown', inplace=True) # fill missing values with 'unknown'
plant_codes.isnull().sum()/len(plant_codes) # double-check that there are no more missing values

occurrenceid        0.0
class               0.0
order               0.0
family              0.0
genus               0.0
species             0.0
scientificname      0.0
countrycode         0.0
decimallatitude     0.0
decimallongitude    0.0
eventdate           0.0
day                 0.0
month               0.0
year                0.0
country             0.0
dtype: float64

## Merge dataframes: plant_codes and internet

In [55]:
plant_internet = pd.merge(plant_codes, internet, how='left', on='country') 
plant_internet.head()

Unnamed: 0,occurrenceid,class,order,family,genus,species,scientificname,countrycode,decimallatitude,decimallongitude,eventdate,day,month,year,country,incomeperperson,internetuserate,urbanrate
0,q-10050789731,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,Cymbalaria muralis,"Cymbalaria muralis P.Gaertn., B.Mey. & Scherb.",ES,39.737365,-0.828874,2016-05-28 11:32:01.482,28,5,2016,Spain,15461.75837,65.808554,77.12
1,q-10103707122,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,Pistacia lentiscus,Pistacia lentiscus L.,FR,43.06862,5.817762,2019-08-18 12:02:28.413,18,8,2019,France,22878.46657,77.498619,77.36
2,q-10593121245,Liliopsida,Asparagales,Orchidaceae,Anacamptis,Anacamptis pyramidalis,Anacamptis pyramidalis (L.) Rich.,FR,45.8525,-0.155,2022-05-20 21:10:09.158,20,5,2022,France,22878.46657,77.498619,77.36
3,q-10067365606,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,Cynoglossum creticum,Cynoglossum creticum Mill.,FR,43.151389,5.736667,2019-04-14 10:50:38.537,14,4,2019,France,22878.46657,77.498619,77.36
4,q-10536959012,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,Lathraea squamaria,Lathraea squamaria L.,DE,48.07944,7.587583,2022-03-14 14:36:23.989,14,3,2022,Germany,25306.18719,82.526898,73.64


# Final dataframe
Contains information on:
- Plant species (at different levels of identification).
- Location and country were plant was identified using the app
- Date of identification of the plant
- GDP per country
- Income per person per country
- Internet user rate per country
- Urban rate per country

In [56]:
plant_internet.shape

(150000, 18)

In [57]:
plant_internet.dtypes

occurrenceid                object
class                       object
order                       object
family                      object
genus                       object
species                     object
scientificname              object
countrycode                 object
decimallatitude            float64
decimallongitude           float64
eventdate           datetime64[ns]
day                          int64
month                        int64
year                         int64
country                     object
incomeperperson            float64
internetuserate            float64
urbanrate                  float64
dtype: object

In [58]:
plant_internet.isnull().sum()/len(plant_internet)

occurrenceid        0.000000
class               0.000000
order               0.000000
family              0.000000
genus               0.000000
species             0.000000
scientificname      0.000000
countrycode         0.000000
decimallatitude     0.000000
decimallongitude    0.000000
eventdate           0.000000
day                 0.000000
month               0.000000
year                0.000000
country             0.000000
incomeperperson     0.004453
internetuserate     0.004340
urbanrate           0.004020
dtype: float64

## Missing values
- The missing information could be retrieved doing research online. Due to time constrains, the values will be replace by 0 to avoid the coulmn types to change to string, and later ignored during regression analysis to avoid bias.
- Another option would be to drop those rows qith missing information since the datframe is so big. The problem would be that the countries with missing information are small ones that could be the most interesting for the analysis. At least, for the analysis for which there is data.

In [56]:
# list of countris with no income information
countries_missing_income = list(plant_internet['country'][plant_internet['incomeperperson'].isnull()])
countries_missing_income
set(countries_missing_income)

{'Afghanistan',
 'Aland Islands (Finland)',
 'Bonaire',
 'Cook Islands',
 'Curacao',
 'Faroe Islands',
 'French Guiana',
 'French Polynesia',
 'Gibraltar',
 'Guadeloupe',
 'Guernsey',
 'Isle of Man',
 'Jersey',
 'Kosovo',
 'Kuwait',
 'Martinique',
 'Mayotte',
 'Myanmar',
 'New Caledonia',
 'Palestine',
 'Reunion',
 'Saint Bartholemy',
 'Saint Martin (French part)',
 'Saint Pierre and Miquelon',
 'Taiwan',
 'Virgin Islands',
 'Wallis and Futuna',
 'unknown'}

### Save dataset WITH missing values to plot
- In the plots, to be able to truly visualize the real outliers and not the artificially-filled zeros, it's better to keep it as NaN
- Having another dataset with the NaN filled with zeros is the best alternative to avoid dropping those rows.


In [59]:
file_path = Path('../Data/clean/plantanet_internet_gdp_nan.csv')
plant_internet.to_csv(file_path, sep=',', index=False)

Fill missing values with zeros

In [60]:
plant_internet['incomeperperson'].fillna(0, inplace=True) # fill missing values with zeros
plant_internet['internetuserate'].fillna(0, inplace=True)
plant_internet['urbanrate'].fillna(0, inplace=True)
plant_internet.isnull().sum()/len(plant_internet)

occurrenceid        0.0
class               0.0
order               0.0
family              0.0
genus               0.0
species             0.0
scientificname      0.0
countrycode         0.0
decimallatitude     0.0
decimallongitude    0.0
eventdate           0.0
day                 0.0
month               0.0
year                0.0
country             0.0
incomeperperson     0.0
internetuserate     0.0
urbanrate           0.0
dtype: float64

In [61]:
plant_internet.head()

Unnamed: 0,occurrenceid,class,order,family,genus,species,scientificname,countrycode,decimallatitude,decimallongitude,eventdate,day,month,year,country,incomeperperson,internetuserate,urbanrate
0,q-10050789731,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,Cymbalaria muralis,"Cymbalaria muralis P.Gaertn., B.Mey. & Scherb.",ES,39.737365,-0.828874,2016-05-28 11:32:01.482,28,5,2016,Spain,15461.75837,65.808554,77.12
1,q-10103707122,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,Pistacia lentiscus,Pistacia lentiscus L.,FR,43.06862,5.817762,2019-08-18 12:02:28.413,18,8,2019,France,22878.46657,77.498619,77.36
2,q-10593121245,Liliopsida,Asparagales,Orchidaceae,Anacamptis,Anacamptis pyramidalis,Anacamptis pyramidalis (L.) Rich.,FR,45.8525,-0.155,2022-05-20 21:10:09.158,20,5,2022,France,22878.46657,77.498619,77.36
3,q-10067365606,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,Cynoglossum creticum,Cynoglossum creticum Mill.,FR,43.151389,5.736667,2019-04-14 10:50:38.537,14,4,2019,France,22878.46657,77.498619,77.36
4,q-10536959012,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,Lathraea squamaria,Lathraea squamaria L.,DE,48.07944,7.587583,2022-03-14 14:36:23.989,14,3,2022,Germany,25306.18719,82.526898,73.64


## Save final cleaned dataframe

In [111]:
from pathlib import Path
file_path = Path('../Data/clean/plantanet_internet_gdp.csv')
plant_internet.to_csv(file_path, sep=',', index=False)

In [112]:
plant_clean = pd.read_csv('../Data/clean/plantanet_internet_gdp.csv', sep=',')
display(plant_clean.shape)
display(plant_clean.columns)
plant_clean.head()

(150000, 18)

Index(['occurrenceid', 'class', 'order', 'family', 'genus', 'species',
       'scientificname', 'countrycode', 'decimallatitude', 'decimallongitude',
       'eventdate', 'day', 'month', 'year', 'country', 'incomeperperson',
       'internetuserate', 'urbanrate'],
      dtype='object')

Unnamed: 0,occurrenceid,class,order,family,genus,species,scientificname,countrycode,decimallatitude,decimallongitude,eventdate,day,month,year,country,incomeperperson,internetuserate,urbanrate
0,q-10050789731,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,Cymbalaria muralis,"Cymbalaria muralis P.Gaertn., B.Mey. & Scherb.",ES,39.737365,-0.828874,2016-05-28 11:32:01.482,28,5,2016,Spain,15461.75837,65.808554,77.12
1,q-10103707122,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,Pistacia lentiscus,Pistacia lentiscus L.,FR,43.06862,5.817762,2019-08-18 12:02:28.413,18,8,2019,France,22878.46657,77.498619,77.36
2,q-10593121245,Liliopsida,Asparagales,Orchidaceae,Anacamptis,Anacamptis pyramidalis,Anacamptis pyramidalis (L.) Rich.,FR,45.8525,-0.155,2022-05-20 21:10:09.158,20,5,2022,France,22878.46657,77.498619,77.36
3,q-10067365606,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,Cynoglossum creticum,Cynoglossum creticum Mill.,FR,43.151389,5.736667,2019-04-14 10:50:38.537,14,4,2019,France,22878.46657,77.498619,77.36
4,q-10536959012,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,Lathraea squamaria,Lathraea squamaria L.,DE,48.07944,7.587583,2022-03-14 14:36:23.989,14,3,2022,Germany,25306.18719,82.526898,73.64
