In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# =============================================================================
# SCRIPT CONFIGURATION
# =============================================================================
# Notebook-wide constants; edit these rather than the cells that use them.

# CSV file loaded by the setup cell (relative to the working directory).
DATASET_PATH = "euro_final_dataset_v2.csv"

# Directory the analysis cells write their PNG plots into; the setup cell
# creates it if it does not exist.
PLOT_OUTPUT_FOLDER = "dataset_analysis_plots"

# NOTE(review): presumably caps how many value-count rows get displayed, but
# no cell in the reviewed portion of the notebook reads it -- confirm it is used.
VALUE_COUNTS_DISPLAY_LIMIT = 10

In [None]:
# =============================================================================
# SETUP
# =============================================================================

# Create the plot directory up front so later cells can save figures into it.
# exist_ok=True makes re-running this cell harmless.
os.makedirs(PLOT_OUTPUT_FOLDER, exist_ok=True)
print(f"--- Plots will be saved to the '{PLOT_OUTPUT_FOLDER}' folder. ---")

# Load the dataset, parsing the 'time' column into datetimes.
try:
    df = pd.read_csv(DATASET_PATH, parse_dates=['time'])
except FileNotFoundError:
    print(f"CRITICAL ERROR: The file '{DATASET_PATH}' was not found. Please ensure it is in the correct directory.")
    # Re-raise rather than calling exit(): exit() is a helper intended for the
    # interactive shell, and inside IPython/Jupyter it asks the kernel to shut
    # down instead of merely stopping this cell. Re-raising halts "Run All"
    # here, keeps the kernel alive and preserves the traceback.
    raise
else:
    # Only reached when read_csv succeeded, so `df` is guaranteed to exist.
    print(f"\n Successfully loaded '{DATASET_PATH}'.")

In [None]:
# =============================================================================
# 1. GENERAL DATASET HEALTH CHECK
# =============================================================================
# Prints a quick structural overview of `df`: shape, dtypes, missing values
# and a preview of the first rows.
print("\n" + "="*80)
print("1. GENERAL DATASET HEALTH CHECK")
print("="*80)

print("\n--- Shape (Rows, Columns) ---\n")
print(df.shape)

print("\n--- Data Types and Non-Null Counts ---\n")
# DataFrame.info() writes its report itself, so it is not wrapped in print().
df.info()

print("\n--- Missing Value Counts per Column ---\n")
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]
# Print either the offending columns or the all-clear message, never both:
# printing the empty selection would only emit a confusing "Series([], ...)".
if columns_with_missing.empty:
    print("No missing values found. Excellent!")
else:
    print(columns_with_missing)

print("\n--- First 5 Rows (Data Preview) ---\n")
print(df.head())

In [None]:
# =============================================================================
# 2. DISTRIBUTION OF KEY NUMERICAL FEATURES
# =============================================================================
# For every requested column that exists in `df`: print summary statistics and
# save a two-panel figure (histogram with KDE + box plot) to PLOT_OUTPUT_FOLDER.
print("\n" + "=" * 80)
print("2. DISTRIBUTION OF KEY NUMERICAL FEATURES")
print("=" * 80)


features_to_plot = [
    'TX',
    'era5_temp_max',
    'delta_temp',
    'ndvi_local',
    'ndvi_global',
    'delta_ndvi',
    'perc_water',
    'perc_urban',
    'perc_suburban',
    'perc_forest',
    'wind_speed',
    'era5_precip',
    'rain_7day_avg',
]

for column in features_to_plot:
    # Columns absent from the frame are skipped without comment.
    if column not in df.columns:
        continue

    values = df[column]

    print(f"\n--- Analyzing distribution of '{column}' ---")
    print(values.describe())

    # One row, two panels: histogram on the left, box plot on the right.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    hist_ax, box_ax = axes
    fig.suptitle(f"Distribution of '{column}'", fontsize=16)

    sns.histplot(values, kde=True, ax=hist_ax)
    hist_ax.set_title("Histogram")

    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title("Box Plot (for outlier detection)")

    plot_filename = os.path.join(PLOT_OUTPUT_FOLDER, f"dist_{column}.png")
    fig.savefig(plot_filename)
    # Close each figure after saving so figures do not pile up in memory.
    plt.close(fig)
    print(f"Saved distribution plot to: {plot_filename}")

In [None]:
# =============================================================================
# 3. CORRELATION ANALYSIS (CRITICAL FOR MODELING)
# =============================================================================
# Correlates the numeric columns of `df`, prints each column's correlation
# with 'delta_temp', and saves an annotated heatmap of the whole matrix.
print("\n" + "=" * 80)
print("3. CORRELATION ANALYSIS")
print("=" * 80)

# All numeric columns take part except 'STAID' and 'doy', dropped by name.
numeric_cols = [
    name
    for name in df.select_dtypes(include=np.number).columns.tolist()
    if name not in ('STAID', 'doy')
]

corr_matrix = df[numeric_cols].corr()

if 'delta_temp' not in corr_matrix:
    print("\nWarning: Target variable 'delta_temp' not found for correlation analysis.")
else:
    print("\n--- Correlation of Features with Target ('delta_temp') ---\n")
    # Sorted from the most positive correlation down to the most negative.
    corr_with_target = corr_matrix['delta_temp'].sort_values(ascending=False)
    print(corr_with_target)


heatmap_filename = os.path.join(PLOT_OUTPUT_FOLDER, "correlation_heatmap.png")

plt.figure(figsize=(20, 16))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=.5,
)
plt.title("Correlation Matrix of Numerical Features", fontsize=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

plt.savefig(heatmap_filename)
plt.close()
print(f"\nSaved correlation heatmap to: {heatmap_filename}")


In [None]:
# =============================================================================
# 4. KEY INSIGHTS & GROUPED ANALYSIS
# =============================================================================
print("\n" + "=" * 80)
print("4. KEY INSIGHTS & GROUPED ANALYSIS")
print("=" * 80)

# Rank stations by their mean 'delta_temp' and show both ends of the ranking.
if 'delta_temp' in df.columns:
    per_station_mean = df.groupby('STAID')['delta_temp'].mean()
    # Ascending sort: the smallest means come first, the largest last.
    station_bias = per_station_mean.sort_values()

    ranked_views = (
        (
            "\n--- Top 10 Stations with LOWEST Average Bias (ERA5 is much WARMER than station) ---\n",
            station_bias.iloc[:10],
        ),
        (
            "\n--- Top 10 Stations with HIGHEST Average Bias (ERA5 is much COLDER than station) ---\n",
            station_bias.iloc[-10:],
        ),
    )
    for heading, stations in ranked_views:
        print(heading)
        print(stations)

print("\n\n--- Analysis Complete ---")

--- Plots will be saved to the 'dataset_analysis_plots' folder. ---

✅ Successfully loaded 'euro_final_dataset_v2.csv'.

1. GENERAL DATASET HEALTH CHECK

--- Shape (Rows, Columns) ---

(3034327, 22)

--- Data Types and Non-Null Counts ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3034327 entries, 0 to 3034326
Data columns (total 22 columns):
 #   Column         Dtype         
---  ------         -----         
 0   time           datetime64[ns]
 1   STAID          int64         
 2   latitude       float64       
 3   longitude      float64       
 4   elevation      int64         
 5   TX             float64       
 6   era5_temp_max  float64       
 7   delta_temp     float64       
 8   ndvi_local     float64       
 9   ndvi_global    float64       
 10  delta_ndvi     float64       
 11  perc_water     float64       
 12  perc_urban     float64       
 13  perc_suburban  float64       
 14  perc_forest    float64       
 15  era5_u10       float64       
 16  era5_v10  

# DATA CLEANING

In [None]:
# =============================================================================
# 5. FILTERED ANALYSIS (REMOVE EXTREME TEMPERATURES AND LARGE BIASES)
# =============================================================================
# Builds `df_filt` (rows of `df` inside the thresholds below), reports how many
# rows were dropped, writes the filtered frame to CSV and prepares the output
# folder for the filtered plots.

# Thresholds are named once so the banner text and the mask cannot drift apart.
TX_MIN, TX_MAX = -45, 50      # inclusive bounds applied to 'TX'
DELTA_TEMP_ABS_MAX = 20       # inclusive bound applied to |delta_temp|

print("\n" + "=" * 80)
print(
    f"5. FILTERED ANALYSIS: |delta_temp| <= {DELTA_TEMP_ABS_MAX} "
    f"AND {TX_MIN} <= TX <= {TX_MAX}"
)
print("=" * 80)

n_before = len(df)

# Series.between is inclusive on both ends by default and evaluates to False
# for NaN, so rows with a missing 'TX' or 'delta_temp' are removed together
# with the out-of-range ones.
filter_mask = (
    df["TX"].between(TX_MIN, TX_MAX)
    & df["delta_temp"].between(-DELTA_TEMP_ABS_MAX, DELTA_TEMP_ABS_MAX)
)

# .copy() gives an independent frame rather than a view-like slice of `df`.
df_filt = df[filter_mask].copy()
n_after = len(df_filt)
n_removed = n_before - n_after

print("\n--- Filtering summary ---")
print(f"Rows before filter : {n_before:,}")
print(f"Rows after  filter : {n_after:,}")
# Label padded so the colon lines up with the two lines above.
print(f"Rows removed       : {n_removed:,}")

filtered_dataset_path = "euro_final_dataset_v2_filtered.csv"
df_filt.to_csv(filtered_dataset_path, index=False)
print(f"\n Filtered dataset saved to: '{filtered_dataset_path}'")

PLOT_OUTPUT_FOLDER_FILTERED = "dataset_analysis_plots_filtered"
os.makedirs(PLOT_OUTPUT_FOLDER_FILTERED, exist_ok=True)
print(f"\nFiltered plots will be saved to '{PLOT_OUTPUT_FOLDER_FILTERED}'.")

# Columns to profile on the filtered frame; any that are absent are skipped.
features_to_plot = [
    "TX",
    "era5_temp_max",
    "delta_temp",
    "ndvi_local",
    "ndvi_global",
    "delta_ndvi",
    "perc_water",
    "perc_urban",
    "perc_suburban",
    "perc_forest",
    "wind_speed",
    "era5_precip",
    "rain_7day_avg",
]

# Per column: print summary statistics, then save a histogram + box plot pair.
for column in features_to_plot:
    if column in df_filt.columns:
        values = df_filt[column]

        print(f"\n--- (FILTERED) Analyzing distribution of '{column}' ---")
        print(values.describe())

        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
        hist_ax, box_ax = axes
        fig.suptitle(f"Distribution of '{column}' (filtered)", fontsize=16)

        sns.histplot(values, kde=True, ax=hist_ax)
        hist_ax.set_title("Histogram")

        sns.boxplot(x=values, ax=box_ax)
        box_ax.set_title("Box Plot (for outlier detection)")

        plot_filename = os.path.join(
            PLOT_OUTPUT_FOLDER_FILTERED, f"dist_{column}_filtered.png"
        )
        fig.savefig(plot_filename, bbox_inches="tight")
        # Release the figure once it is on disk.
        plt.close(fig)
        print(f"Saved filtered distribution plot to: {plot_filename}")

# Correlation of the numeric columns of the filtered frame, followed by an
# annotated heatmap saved next to the filtered distribution plots.
print("\n--- Computing correlation matrix on filtered data ---")

# Keep every numeric column except 'STAID' and 'doy', dropped by name.
numeric_cols_filt = [
    name
    for name in df_filt.select_dtypes(include=np.number).columns.tolist()
    if name not in ("STAID", "doy")
]

corr_matrix_filt = df_filt[numeric_cols_filt].corr()

if "delta_temp" in corr_matrix_filt:
    print("\n--- (FILTERED) Correlation of Features with Target ('delta_temp') ---\n")
    # Sorted from the most positive correlation down to the most negative.
    target_column = corr_matrix_filt["delta_temp"]
    corr_with_target_filt = target_column.sort_values(ascending=False)
    print(corr_with_target_filt)

heatmap_filtered_filename = os.path.join(
    PLOT_OUTPUT_FOLDER_FILTERED, "correlation_heatmap_filtered.png"
)

plt.figure(figsize=(20, 16))
sns.heatmap(
    corr_matrix_filt,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=.5,
)
plt.title("Correlation Matrix of Numerical Features (Filtered)", fontsize=20)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()

plt.savefig(heatmap_filtered_filename)
plt.close()
print(f"\nSaved filtered correlation heatmap to: {heatmap_filtered_filename}")

print("\n--- Filtered analysis complete ---")



5. FILTERED ANALYSIS: |delta_temp| <= 20 AND -45 <= TX <= 50

--- Filtering summary ---
Rows before filter : 3,034,327
Rows after  filter : 3,033,739
Rows removed       : 588

💾 Filtered dataset saved to: 'euro_final_dataset_v2_filtered.csv'

Filtered plots will be saved to 'dataset_analysis_plots_filtered'.

--- (FILTERED) Analyzing distribution of 'TX' ---
count    3.033739e+06
mean     2.003247e+01
std      8.764066e+00
min     -1.500000e+01
25%      1.360000e+01
50%      1.950000e+01
75%      2.680000e+01
max      5.000000e+01
Name: TX, dtype: float64
Saved filtered distribution plot to: dataset_analysis_plots_filtered\dist_TX_filtered.png

--- (FILTERED) Analyzing distribution of 'era5_temp_max' ---
count    3.033739e+06
mean     1.846580e+01
std      8.419886e+00
min     -1.642056e+01
25%      1.233209e+01
50%      1.778232e+01
75%      2.465880e+01
max      4.457812e+01
Name: era5_temp_max, dtype: float64
Saved filtered distribution plot to: dataset_analysis_plots_filtered\d