# Exploratory Data Analysis (EDA)

EDA is used to analyze and investigate a dataset and to summarize its main characteristics, often by means of data visualization.

This notebook loads the data and shows maps of different factors as well as aggregated statistics.

### Import libraries

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.colors as mcolors
%matplotlib inline 

In [None]:
%load_ext autoreload
%autoreload 2
from utils.utils import *

### Load datacube

In [None]:
# Open the zarr datacube (path is relative to the notebook's working directory)
cube = xr.open_zarr("..//data//clim_env_input_features_2012_2022.zarr")
# Show dimension names and lengths. `Dataset.sizes` is the supported name -> length mapping;
# using `Dataset.dims` as a mapping is deprecated in recent xarray releases.
cube.sizes

### Load Country boundaries for visualization

In [None]:
# Read the administrative-boundary polygons; they are only drawn as outlines on the maps below
bound_country = gpd.read_file("..//data//world-administrative-boundaries.geojson")

### Mapping data for one time slice

Select one year (2012-2022)

In [None]:
# Year used to build the timestamp for the single-time-slice maps
year = 2013

In [None]:
# Requested timestamp: 1 June of the selected year
set_time = f"{year}-06-01 00:00:00"

# The maps use nearest-neighbour selection in time, so label them with the
# time step that is actually picked rather than with the requested one
nearest_slice = cube.sel(time=set_time, method="nearest")
nearest_stamp = pd.to_datetime(nearest_slice.coords["time"].item())
my_date = nearest_stamp.strftime("%Y-%m-%d")

Combined Drought Index (CDI)

In [None]:
# Discrete color dictionary (class code -> colour)
col_dict = {
    0: "white",        # No drought
    1: "#F2E94E",      # Watch (yellow)
    2: "#F0A000",      # Warning (orange)
    3: "#D90000",      # Alert (red)
}

# Create colormap in ascending class order so that colour i belongs to class i
cdi_classes = sorted(col_dict)
cm = ListedColormap([col_dict[k] for k in cdi_classes])

# Pin every class to its own colour bin (edges at class - 0.5 / class + 0.5).
# Without an explicit norm the colours are stretched linearly over the value range of the
# selected slice, so a slice that lacks one of the classes would be drawn with wrong colours.
# NOTE(review): assumes CDI only holds the classes listed in col_dict - confirm; values
# outside that range fall into the colormap's under/over colours.
cdi_bounds = [k - 0.5 for k in cdi_classes] + [cdi_classes[-1] + 0.5]
cdi_norm = mcolors.BoundaryNorm(cdi_bounds, cm.N)

fig, ax = plt.subplots(figsize=(20, 7))
bound_country.plot(ax=ax, edgecolor='black', facecolor='none')
# plotting the raster (nearest available time step), one colorbar tick per class
cdinx = cube.CDI.sel(time=set_time, method="nearest")
cdinx.plot.imshow(cmap=cm, norm=cdi_norm, ax=ax, cbar_kwargs={"ticks": cdi_classes})
ax.set_aspect("equal", adjustable="box")
plt.title(f"CDI ({my_date})")
plt.show()


SPEI-1M

In [None]:
# Define class boundaries, one colour per class and the matching class names
bounds = [-100,-2,-1.5,-1,1,1.5,2,100]
colors = ['#b72738','#ec855f','#fcd8c4','#f6f6f5','#cfe4ef','#65a7ce','#2c6db0']
legnames = ["extremely dry", "severely dry", "moderately dry", "near normal", "moderately wet", "severely wet", "extremely wet"]

cmap = mcolors.ListedColormap(colors)
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# One colorbar tick in the middle of every class interval, labelled with the class name
tick_pos = [(lo + hi) / 2 for lo, hi in zip(bounds[:-1], bounds[1:])]

# Plot
fig, ax = plt.subplots(figsize=(20, 7))
# boundaries
bound_country.plot(ax=ax, edgecolor='black', facecolor='none')
# plotting the raster on the same axes as the boundaries (explicit ax instead of relying
# on the implicit "current axes")
SPEI1 = cube.SPEI1.sel(time=set_time, method="nearest")
spei1_img = SPEI1.plot.imshow(ax=ax, cmap=cmap, norm=norm, cbar_kwargs={"ticks": tick_pos})
spei1_img.colorbar.set_ticklabels(legnames)
ax.set_aspect("equal", adjustable="box")
plt.title(f"SPEI1 ({my_date})")
plt.show()

SPI-1M

In [None]:
# Define class boundaries and one colour per interval between consecutive boundaries
bounds = [-3, -2, -1.5, -1, 1, 1.5, 2, 3]
colors = [
    "#ff0000",  # red ≤ -2
    "#ffa500",  # orange (-2, -1.5)
    "#ffff00",  # yellow (-1.5, -1)
    "#ffffff",  # white (-1, 1)
    "#e6ccff",  # light purple (1, 1.5)
    "#9900cc",  # purple (1.5, 2)
    "#660066",  # dark violet ≥ 2
]

cmap = mcolors.ListedColormap(colors)
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# Plot
fig, ax = plt.subplots(figsize=(20, 7))
bound_country.plot(ax=ax, edgecolor='black', facecolor='none')
# plotting the raster on the same axes as the boundaries (explicit ax instead of relying
# on the implicit "current axes")
SPI1 = cube.SPI1.sel(time=set_time, method="nearest")
SPI1.plot.imshow(ax=ax, cmap=cmap, norm=norm)
ax.set_aspect("equal", adjustable="box")
plt.title(f"SPI1 ({my_date})")
plt.show()

Total precipitation anomaly

In [None]:
# Total-precipitation field at the time step closest to the requested date
tp = cube.tp_ds.sel(time=set_time, method="nearest")

fig, ax = plt.subplots(figsize=(20, 7))
# Country outlines and raster share the same axes
bound_country.plot(facecolor='none', edgecolor='black', ax=ax)
tp.plot.imshow(ax=ax, cmap="RdYlBu")
ax.set_title(f"TP ({my_date})")
ax.set_aspect("equal", adjustable="box")
plt.show()

Soil moisture anomaly (SMA)

In [None]:
# Discrete colour per SMA class (class code -> colour)
col_dict = {
    0: "white",          # Normal conditions or wet
    1: "#FBE3C0",      # Moderate low
    2: "#F4A742",      # Severe low
    3: "#8C3B00"      # Extreme (dark brown)
}

# Colormap + normalization
sma_classes = sorted(col_dict)
cmap = ListedColormap([col_dict[k] for k in sma_classes])
# Pin every class to its own colour bin (edges at class - 0.5 / class + 0.5); without a norm
# the colours are stretched over the value range of the slice and a slice that lacks one of
# the classes would be drawn with wrong colours.
# NOTE(review): assumes SMA only holds the classes listed in col_dict - confirm.
sma_bounds = [k - 0.5 for k in sma_classes] + [sma_classes[-1] + 0.5]
sma_norm = mcolors.BoundaryNorm(sma_bounds, cmap.N)

# Plot
fig, ax = plt.subplots(figsize=(20, 7))
bound_country.plot(ax=ax, edgecolor='black', facecolor='none')
# plotting the raster on the same axes as the boundaries, one colorbar tick per class
SMA = cube.SMA.sel(time=set_time, method="nearest")
SMA.plot.imshow(ax=ax, cmap=cmap, norm=sma_norm, cbar_kwargs={"ticks": sma_classes})
ax.set_aspect("equal", adjustable="box")
plt.title(f"SMA ({my_date})")
plt.show()

## Visualize the regions

**Koeppen-Geiger (2-character) climate classification (KG2)**:

|Id| Code | Name |
|--|--|--|
|1|Af | Equatorial rainforest |
|2| Am |Equatorial monsoon|
|3| As |Equatorial savannah, dry summer|
|4| Aw |Equatorial savannah, dry winter|
|5| BS |Steppe climate|
|6| BW |Desert climate|
|7| Cs |Warm temperate, dry summer|
|8| Cw |Warm temperate, dry winter|
|9| Cf |Warm temperate, fully humid|
|10| Ds| Snow climate, dry summer|
|11| Dw| Snow climate, dry winter|
|12| Df| Snow climate, fully humid|
|13| ET| Tundra climate|
|14| EF| Frost climate|

In [None]:
kg2_plot = cube.kg2

# 1. Get the unique class ids present in the data
# The array is loaded once; NaNs and zeros (background) are dropped.
# np.unique already returns its result sorted.
kg2_values = kg2_plot.values
unique_values = np.unique(kg2_values[~np.isnan(kg2_values)])
unique_values = unique_values[unique_values != 0] # Remove 0 if it's just background
n_classes = len(unique_values)

# 2. Define the discrete boundaries and colormap
# One bin per class id, each bin starting 0.5 below its id
boundaries = np.append(unique_values, unique_values[-1] + 1) - 0.5
norm = mcolors.BoundaryNorm(boundaries, n_classes)
# BoundaryNorm hands out colour indices 0..n_classes-1, so the colormap needs exactly
# n_classes entries. "Set1" only has 9: with more classes, every class past the 9th would be
# drawn in the same colour. Fall back to the larger "tab20" palette in that case.
kg2_cmap = plt.get_cmap("Set1" if n_classes <= 9 else "tab20", n_classes)

# 3. Plotting
fig, ax = plt.subplots(figsize=(15, 8))

# Plot your country borders
bound_country.plot(ax=ax, edgecolor='black', facecolor='none', zorder=2)

# Plot the raster with the discrete norm (background value 0 is masked out)
im = kg2_plot.where(kg2_plot != 0).plot.imshow(
    ax=ax,
    cmap=kg2_cmap,
    norm=norm,
    add_colorbar=False # We will handle the colorbar manually for better control
)

# 4. Create a clean, discrete colorbar with one tick per class id
cbar = plt.colorbar(im, ax=ax, ticks=unique_values, spacing='uniform')
cbar.set_label('Koeppen-Geiger Classification (KG2)')

plt.title("KG Climate Zones")
ax.set_aspect("equal")
plt.show()

**Thermal regime class (THZ)**:

|Id| Name |
|--|--|
|1 |TRC1: Tropics, lowland|
|2 |TRC2: Tropics, highland|
|3 |TRC3: Subtropics, warm|
|4 |TRC4: Subtropics, moderately cool|
|5 |TRC5: Subtropics, cool|
|6 |TRC6: Temperate, moderately cool|
|7 |TRC7: Temperate, cool|
|8 |TRC8: Boreal / Cold, no permafrost|
|9 |TRC9: Boreal / Cold, with permafrost|
|10 |TRC10: Arctic / Very cold|

In [None]:
thz_plot = cube.thz

# 1. Get the unique class ids present in the data
# The array is loaded once; NaNs and zeros (background) are dropped.
# np.unique already returns its result sorted.
thz_values = thz_plot.values
unique_values = np.unique(thz_values[~np.isnan(thz_values)])
unique_values = unique_values[unique_values != 0] # Remove 0 if it's just background
n_classes = len(unique_values)

# 2. Define the discrete boundaries and colormap
# One bin per class id, each bin starting 0.5 below its id
boundaries = np.append(unique_values, unique_values[-1] + 1) - 0.5
norm = mcolors.BoundaryNorm(boundaries, n_classes)
# BoundaryNorm hands out colour indices 0..n_classes-1, so the colormap needs exactly
# n_classes entries. "Set2" only has 8: with more classes, every class past the 8th would be
# drawn in the same colour. Fall back to the larger "tab20" palette in that case.
thz_cmap = plt.get_cmap("Set2" if n_classes <= 8 else "tab20", n_classes)

# 3. Plotting
fig, ax = plt.subplots(figsize=(15, 8))

# Plot your country borders
bound_country.plot(ax=ax, edgecolor='black', facecolor='none', zorder=2)

# Plot the raster with the discrete norm (background value 0 is masked out)
im = thz_plot.where(thz_plot != 0).plot.imshow(
    ax=ax,
    cmap=thz_cmap,
    norm=norm,
    add_colorbar=False # We will handle the colorbar manually for better control
)

# 4. Create a clean, discrete colorbar with one tick per class id
cbar = plt.colorbar(im, ax=ax, ticks=unique_values, spacing='uniform')
cbar.set_label('Thermal regime (Thz)')

plt.title("Thermal regimes")
ax.set_aspect("equal")
plt.show()

In [None]:
basin_plot = cube.basin_lv2

# 1. Get the unique basin ids present in the data
# The array is loaded once; NaNs and zeros (background) are dropped.
# np.unique already returns its result sorted.
basin_values = basin_plot.values
unique_values = np.unique(basin_values[~np.isnan(basin_values)])
unique_values = unique_values[unique_values != 0] # Remove 0 if it's just background
n_classes = len(unique_values)

# 2. Define the discrete boundaries and colormap
# One bin per basin id, each bin starting 0.5 below its id
boundaries = np.append(unique_values, unique_values[-1] + 1) - 0.5
norm = mcolors.BoundaryNorm(boundaries, n_classes)
# BoundaryNorm hands out colour indices 0..n_classes-1, so the colormap needs exactly
# n_classes entries. "tab20" only has 20: with more basins, every basin past the 20th would
# be drawn in the same colour. Resample a continuous map when the qualitative one is too small.
basin_cmap = plt.get_cmap("tab20" if n_classes <= 20 else "nipy_spectral", n_classes)

# 3. Plotting
fig, ax = plt.subplots(figsize=(15, 8))

# Plot your country borders
bound_country.plot(ax=ax, edgecolor='black', facecolor='none', zorder=2)

# Plot the raster with the discrete norm (background value 0 is masked out)
im = basin_plot.where(basin_plot != 0).plot.imshow(
    ax=ax,
    cmap=basin_cmap,
    norm=norm,
    add_colorbar=False # We will handle the colorbar manually for better control
)

# 4. Create a clean, discrete colorbar with one tick per basin id
cbar = plt.colorbar(im, ax=ax, ticks=unique_values, spacing='uniform')
cbar.set_label('Hydrobasins (level 2)')

plt.title("Hydrobasins (level 2)")
ax.set_aspect("equal")
plt.show()


Outcome / Target

In [None]:
# Target variable at the time step closest to the requested date
edid_plot = cube.DI_agri_extreme_M7.sel(time=set_time, method="nearest")

fig, ax = plt.subplots(figsize=(20, 7))
# Country outlines and raster share the same axes
bound_country.plot(facecolor='none', edgecolor='black', ax=ax)
# Colour scale fixed to the [0, 1] interval
edid_plot.plot.imshow(ax=ax, cmap="Reds", vmin=0, vmax=1)
ax.set_title(f"DI_agri_extreme_M7 ({my_date})")
ax.set_aspect("equal", adjustable="box")
plt.show()

In [None]:
# 1. SETUP: Choose your variable and define the figure
var_name = 'tp_basin_mean_ds'  # Replace with the attribute you want to plot

# Calculate min/max across all time steps so colors are comparable
vmin = cube[var_name].min().values
vmax = cube[var_name].max().values

# Size the grid from the number of time steps (4 maps per row) instead of hard-coding 3x4,
# so no time step is silently dropped when the cube holds more than 12 of them
n_steps = len(cube.time)
ncols = 4
nrows = max(1, int(np.ceil(n_steps / ncols)))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 4 * nrows), squeeze=False)
axes = axes.flatten()  # Flatten the 2D grid into a 1D list for easy looping

# 2. LOOP: Iterate over time steps
im = None  # last drawn map; stays None when the cube has no time steps
for i, ax in enumerate(axes):
    # Check if we still have time steps left to plot
    if i < n_steps:
        # Select the single time step
        data_slice = cube[var_name].isel(time=i)

        # Get a nice date string for the title
        date_str = pd.to_datetime(data_slice.time.values).strftime('%Y-%m-%d')

        # Plot the map on the specific axis 'ax'
        # add_colorbar=False avoids cluttering every single plot
        im = data_slice.plot(ax=ax, vmin=vmin, vmax=vmax, cmap="coolwarm_r", add_colorbar=False)
        bound_country.plot(ax=ax, edgecolor='black', facecolor='none')

        ax.set_title(date_str, fontsize=12, fontweight='bold')
        ax.set_xlabel('')
        ax.set_ylabel('')
    else:
        # 3. CLEANUP: Hide the unused axes of the last row
        ax.axis('off')

# 4. FINISH: Adjust layout and add a single shared colorbar
plt.suptitle(f"Time Series Mosaic for {var_name}", fontsize=16, y=1.02)
# Run tight_layout BEFORE adding the manually placed colorbar axes: axes created with
# fig.add_axes are not compatible with tight_layout and make it warn / mis-place the maps
plt.tight_layout(rect=[0, 0, 0.9, 1]) # Make room for the colorbar
if im is not None:
    # Add a colorbar on the right side of the figure
    cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7]) # [left, bottom, width, height]
    fig.colorbar(im, cax=cbar_ax, label=var_name)
plt.show()

---

### Load tabular data (samples)

In [None]:
# Read the tabular sample set from CSV (path is relative to the notebook's working directory)
samples = pd.read_csv("../data/dataset_clim_env_oci_norm.csv")

In [None]:
# Report the table dimensions as (rows, columns)
print("samples", samples.shape)

In [None]:
# Preview the first rows of the table
samples.head()

#### Visualize correlation between variables

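Keep only the covariates: drop the coordinates (`lat`, `lon`, `time`), the sample `id`, the target variables and the categorical region variables.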

In [None]:
# Columns left out of the covariate list
excluded_cols = ['lat','lon','time','id',
                 'DI_agri_extreme_M6','DI_agri_extreme_M7',
                 'kg2','thz','basin_lv2','basin_lv3']

# Every remaining column, in the original column order
covar = [name for name in samples.columns.values.tolist() if name not in excluded_cols]
nvar = len(covar)
print(nvar, covar)

In [None]:
# Correlation plot over the selected covariate columns, with cell annotations switched off.
# NOTE(review): plot_correlation_matrix is not defined in this notebook - presumably it comes
# from the `utils.utils` star import; confirm its exact behaviour there.
plot_correlation_matrix(samples, covar, annot=False)

Plot feature statistics by target group

In [None]:
# Violin-plot mosaic of the covariates, split by the DI_agri_extreme_M7 target, 5 panels per row.
# NOTE(review): plot_split_violin_mosaic is not defined in this notebook - presumably it comes
# from the `utils.utils` star import; confirm its exact behaviour there.
plot_split_violin_mosaic(samples, target_var='DI_agri_extreme_M7', 
                          define_features_list=covar, ncols=5)