# Exploratory Data Analysis (EDA)

In [1]:
import numpy as np
import pandas as pd

## Load data

In [4]:
plant = pd.read_csv('../Data/clean/plantanet_clean.csv', sep='\t')
display(plant.shape)
plant.head()

(150000, 26)

Unnamed: 0,gbifid,datasetkey,occurrenceid,kingdom,phylum,class,order,family,genus,species,...,decimallatitude,decimallongitude,eventdate,day,month,year,taxonkey,license,lastinterpreted,issue
0,3956314347,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10050789731,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Cymbalaria,Cymbalaria muralis,...,39.737365,-0.828874,2016-05-28 11:32:01.482,28,5,2016,8200663,CC_BY_4_0,2023-02-08T17:06:19.959Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
1,3952045610,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10103707122,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Anacardiaceae,Pistacia,Pistacia lentiscus,...,43.06862,5.817762,2019-08-18 12:02:28.413,18,8,2019,3190583,CC_BY_4_0,2023-02-08T17:30:41.400Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
2,3999022620,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10593121245,Plantae,Tracheophyta,Liliopsida,Asparagales,Orchidaceae,Anacamptis,Anacamptis pyramidalis,...,45.8525,-0.155,2022-05-20 21:10:09.158,20,5,2022,2808330,CC_BY_4_0,2023-02-08T17:39:19.787Z,COUNTRY_DERIVED_FROM_COORDINATES;CONTINENT_DER...
3,3951723097,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10067365606,Plantae,Tracheophyta,Magnoliopsida,Boraginales,Boraginaceae,Cynoglossum,Cynoglossum creticum,...,43.151389,5.736667,2019-04-14 10:50:38.537,14,4,2019,4064467,CC_BY_4_0,2023-02-08T17:29:23.909Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...
4,3952394627,14d5676a-2c54-4f94-9023-1e8dcd822aa0,q-10536959012,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Orobanchaceae,Lathraea,Lathraea squamaria,...,48.07944,7.587583,2022-03-14 14:36:23.989,14,3,2022,3738478,CC_BY_4_0,2023-02-08T17:38:06.790Z,COORDINATE_ROUNDED;COUNTRY_DERIVED_FROM_COORDI...


In [3]:
plant.isnull().sum()/len(plant) # double-check that there are no missing values

gbifid                              0.0
datasetkey                          0.0
occurrenceid                        0.0
kingdom                             0.0
phylum                              0.0
class                               0.0
order                               0.0
family                              0.0
genus                               0.0
species                             0.0
scientificname                      0.0
verbatimscientificname              0.0
verbatimscientificnameauthorship    0.0
countrycode                         0.0
individualcount                     0.0
publishingorgkey                    0.0
decimallatitude                     0.0
decimallongitude                    0.0
eventdate                           0.0
day                                 0.0
month                               0.0
year                                0.0
taxonkey                            0.0
license                             0.0
lastinterpreted                     0.0


## Split numerical-categorial

In [None]:
def var_types_split (df):
    numerical = df.select_dtypes(np.number)
    categorical = df.select_dtypes('object')
    return numerical, categorical

numerical_df, categorical_df = var_types_split(customer_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Numerical

In [None]:
# Correlation matrix
corr_matrix=numerical_df.corr() # default: Pearson
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.heatmap(corr_matrix, annot=True)
plt.show()

Discrete-continuous

In [None]:
def numerical_split(df, cat=36):
    num_continuous_df = pd.DataFrame()
    num_discrete_df = pd.DataFrame()

    for col in df.columns:
        if df[col].nunique() > cat:
            num_continuous_df[col] = df[col]

        else:
            num_discrete_df[col] = df[col]

    return num_continuous_df, num_discrete_df

continuous_df, discrete_df = numerical_split(numerical_df)

### Plot

In [None]:
# countplot for descrete variables
def plot_discrete2(df):
    plt.figure(figsize=(16, 16))
    num_plots = len(df.columns)
    for i, col in enumerate(df.columns):
        plt.subplot((num_plots+1)//2, 2, i+1)
        sns.countplot(x=col, data=df)
    plt.tight_layout()
    plt.show()

plot_discrete2(discrete_df)    

In [None]:
# histograms for continuous variables
def plot_continuous(df):
    plt.figure(figsize=(16, 16))
    num_plots = len(df.columns)
    for i, col in enumerate(df.columns):
        plt.subplot((num_plots+1)//2, 2, i+1)
        sns.histplot(x=col, data=df, bins=150)
    plt.tight_layout()
    plt.show()

plot_continuous(continuous_df)    

Outliers in numerical continuous variables

In [None]:
def boxplot_continuous(df):
    plt.figure(figsize=(16, 16))
    num_plots = len(df.columns)
    for i, col in enumerate(df.columns):
        plt.subplot((num_plots+1)//2, 2, i+1)
        sns.boxplot(y=col, data=df)
    plt.tight_layout()
    plt.show()

boxplot_continuous(continuous_df)    

## Categorical

### Plots

In [None]:
def plot_categorical(df, cat=6):
    plt.figure(figsize=(16, 24))
    num_plots = len(df.columns)
    for i, col in enumerate(df.columns):
        plt.subplot((num_plots+1)//2, 2, i+1)
        if df[col].nunique() < cat:
            sns.countplot(x=col, data=df, order=list(df[col].value_counts().index), palette="cubehelix")
        else:
            sns.countplot(y=col, data=df, order=list(df[col].value_counts().index), palette="cubehelix")   
    plt.tight_layout()
    plt.show()

plot_categorical(categorical_df)    

## Temporal analysis

In [None]:
plt.figure(figsize=(20, 10))
sns.lineplot(x= 'effective_to_date', y='total_claim_amount', data=customer_df, ci=None)

plt.xticks(rotation=20)
plt.show()