# 📋 Table of contents
* [Features](#features)
* [Targets](#targets)
* [Targets vs Features](#targets_vs_features)
* [Examples](#examples)
* [Pivot Table and Target Correlations](#pivot)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# images
from PIL import Image

In [None]:
# notebook-wide settings

# pandas display: no column limit, up to 100 rows, untruncated cell contents
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.max_colwidth': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# random seed, kept in one place
my_random_seed = 123

# default plot colours
default_color_1, default_color_2, default_color_3 = 'darkblue', 'darkgreen', 'darkred'

# silence every warning from here on
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load the train / test / sample-submission CSV files and report how long reading took
data_dir = '/kaggle/input/csiro-biomass/'

# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments (unlike time.time)
t1 = time.perf_counter()
df_train = pd.read_csv(data_dir + 'train.csv', low_memory=False)
df_test = pd.read_csv(data_dir + 'test.csv', low_memory=False)
df_sub = pd.read_csv(data_dir + 'sample_submission.csv', low_memory=False)
t2 = time.perf_counter()
print('Elapsed time [s]:', np.round(t2-t1,4))

In [None]:
# display the first 10 rows of the training data
df_train.head(n=10)

In [None]:
# training data: full column listing including non-null counts
df_train.info(verbose=True, show_counts=True)

In [None]:
# test data: full column listing including non-null counts
df_test.info(verbose=True, show_counts=True)

In [None]:
# parse the Sampling_Date column of the training data with pd.to_datetime
date_column = 'Sampling_Date'
df_train[date_column] = pd.to_datetime(df_train[date_column])

In [None]:
# column holding the target value
target = 'target'

# predictor columns, split by type: numerical ...
features_num = [
    'Pre_GSHH_NDVI',
    'Height_Ave_cm',
]

# ... and categorical
features_cat = [
    'State',
    'Species',
]

<a id='features'></a>
# Features

In [None]:
# plot one histogram per numerical feature (train only);
# note that each observation (=image) occurs 5 times (one for each target)!
n_bins = 20  # also reused by the Sampling_Date histogram further down
for f in features_num:
    plt.figure(figsize=(7,3))
    df_train[f].plot(kind='hist', bins=n_bins, color=default_color_1)
    plt.title(f + ' - Train')
    plt.grid()
    # render each figure explicitly, consistent with the other plotting loops
    # in this notebook (and required when run outside an inline backend)
    plt.show()

In [None]:
# scatter plot of the two numerical features against each other
x_name = 'Pre_GSHH_NDVI'
y_name = 'Height_Ave_cm'

plt.figure(figsize=(4,4))
plt.scatter(df_train[x_name], df_train[y_name], s=10, color=default_color_1)
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.grid()
plt.show()

In [None]:
# bar chart of level counts for every categorical feature (train only);
# note that each observation (=image) occurs 5 times (one for each target)!
for cat_feature in features_cat:
    # counts per level, ordered by level name rather than by frequency
    level_counts = df_train[cat_feature].value_counts().sort_index()
    plt.figure(figsize=(12,3))
    level_counts.plot(kind='bar', color=default_color_1)
    plt.title(cat_feature + ' - Train')
    plt.grid()
    plt.show()

In [None]:
# cross-tabulate Species (rows) against State (columns);
# note that each observation (=image) occurs 5 times (one for each target)!
pd.crosstab(index=df_train['Species'], columns=df_train['State'])

In [None]:
# summary statistics and histogram of the sampling dates
sampling_dates = df_train['Sampling_Date']
print(sampling_dates.describe())

plt.figure(figsize=(10,3))
plt.hist(sampling_dates, bins=n_bins, color=default_color_1)
plt.title('Sampling_Date')
plt.grid()
plt.show()

<a id='targets'></a>
# Targets

In [None]:
# target distributions, option 1: one histogram coloured by target_name
sns.histplot(df_train, x=target, hue='target_name')
plt.show()

In [None]:
# target distributions, option 2: bivariate histogram with target_name on the y axis
sns.histplot(df_train, x=target, y='target_name')
plt.show()

In [None]:
# number of rows per target name
target_freqs = df_train['target_name'].value_counts()
target_freqs

<a id='targets_vs_features'></a>
# Targets vs Features

In [None]:
# list of target names, in the order of the target_freqs index
targets = target_freqs.index.to_list()
targets

In [None]:
# for every target name, scatter the target value against each numerical feature
for target_label in targets:
    # rows belonging to the current target name only
    rows_for_target = df_train[df_train['target_name'] == target_label]
    for feature in features_num:
        plt.figure(figsize=(4,3))
        plt.scatter(rows_for_target[feature], rows_for_target['target'],
                    s=10, color=default_color_3)
        plt.title('Target ' + target_label + ' vs ' + feature)
        plt.grid()
        plt.show()

<a id='examples'></a>
# Examples

In [None]:
# choose one example by its id
my_id = 'ID1011485656'

# corresponding paths: relative (as compared against the image_path column) and absolute
image_path = f'train/{my_id}.jpg'
full_path = f'/kaggle/input/csiro-biomass/{image_path}'

# keep only the rows of the training data that belong to this image
is_example = df_train['image_path'] == image_path
df_ex = df_train[is_example]
df_ex

In [None]:
# open the image file of the selected example and display it
image = Image.open(fp=full_path)
image

In [None]:
# choose a second example - this case shows clover
my_id = 'ID1025234388'

# corresponding paths: relative (as compared against the image_path column) and absolute
image_path = f'train/{my_id}.jpg'
full_path = f'/kaggle/input/csiro-biomass/{image_path}'

# keep only the rows of the training data that belong to this image
is_example = df_train['image_path'] == image_path
df_ex = df_train[is_example]
df_ex

In [None]:
# open the image file of the selected example and display it
image = Image.open(fp=full_path)
image

<a id='pivot'></a>
# Pivot Table and Target Correlations

In [None]:
# reshape to wide format: one row per image_path, one column per target_name
df_pivot = df_train.pivot(
    index='image_path',
    columns='target_name',
    values=target,
)
df_pivot

In [None]:
# pairwise scatter plots of all target columns of the pivot table
scatter_style = {'color': default_color_3, 's': 15}
diagonal_style = {'color': default_color_3}
sns.pairplot(df_pivot, plot_kws=scatter_style, diag_kws=diagonal_style)
plt.show()

In [None]:
# Pearson and Spearman correlation matrices of the targets, shown side by side
target_values = df_pivot[targets]
corr_pearson = target_values.corr(method='pearson')
corr_spearman = target_values.corr(method='spearman')

# identical styling for both heatmaps; colour scale fixed to [-1, +1]
heatmap_style = dict(annot=True, cmap='RdYlGn', vmin=-1, vmax=+1,
                     fmt='.2g', linecolor='black', linewidths=0.5)

plt.figure(figsize=(15,5))

# left panel
ax1 = plt.subplot(1,2,1)
sns.heatmap(corr_pearson, **heatmap_style)
plt.title('Pearson Correlation')

# right panel, sharing its x axis with the left one
ax2 = plt.subplot(1,2,2, sharex=ax1)
sns.heatmap(corr_spearman, **heatmap_style)
plt.title('Spearman Correlation')
plt.show()