# Quantile Mapping

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import QuantileTransformer

# --- 1. CONFIGURATION: Define All File Paths and Settings ---
print("--- Initializing Configuration for Quantile Mapping ---")

# Folder that holds every input and output CSV.
# IMPORTANT: point this at your own data directory.
ROOT_DATA_DIR = r"C:\Users\91788\Downloads\ERA5 Data\Extracted"


def _in_data_dir(file_name):
    """Return the path of *file_name* inside ROOT_DATA_DIR."""
    return os.path.join(ROOT_DATA_DIR, file_name)


# Grid cell used for Minneapolis (latitude, longitude)
TARGET_LAT, TARGET_LON = 45.125, 266.625

# Calibration inputs used to estimate the bias (2000-2020)
HISTORICAL_NASA_FILE_P1 = _in_data_dir("NASA_Standardized_Minnesota_2000-2014.csv")
HISTORICAL_NASA_FILE_P2 = _in_data_dir("NASA_Standardized_Minnesota_2015-2020.csv")
HISTORICAL_ERA5_FILE = _in_data_dir("ERA5_Train_2000-2020.csv")

# NASA data that will be bias-corrected (2021-2024)
VALIDATION_NASA_FILE = _in_data_dir("NASA_Standardized_Minnesota_2021-2024.csv")

# Where the quantile-mapped result is written
OUTPUT_CORRECTED_FILE = _in_data_dir("NASA_Corrected_Quantile_Minneapolis_2021-2024.csv")

# First and last day of the calibration period
CALIBRATION_START, CALIBRATION_END = '2000-01-01', '2020-12-31'


# --- 2. DATA LOADING AND FILTERING ---
print("\n--- Step 2: Loading and Filtering all necessary data files for Minneapolis ---")

def load_and_filter_nasa(file_path, lat, lon):
    """Load a NASA CSV and keep only the rows of a single grid cell.

    Args:
        file_path: Path (or file-like object) of a CSV containing at least
            the columns ``time``, ``lat`` and ``lon``.
        lat: Latitude of the grid cell to keep.
        lon: Longitude of the grid cell to keep, in the same convention as
            the file's ``lon`` column.

    Returns:
        A DataFrame indexed by the parsed ``time`` column, holding the
        matching rows with the ``lat`` and ``lon`` columns removed.

    Raises:
        ValueError: If no row matches the requested coordinates. Without
            this check an empty frame would be returned and the failure
            would only surface later, far from its cause.
    """
    df = pd.read_csv(file_path, parse_dates=['time'])
    # Tolerant float comparison: coordinates read back from CSV may not be
    # bit-identical to the literals passed in.
    at_target = np.isclose(df['lat'], lat) & np.isclose(df['lon'], lon)
    if not at_target.any():
        raise ValueError(
            f"No rows found for lat={lat}, lon={lon} in {file_path!r}; "
            "check the target coordinates and the file's longitude convention."
        )
    return df.loc[at_target].drop(columns=['lat', 'lon']).set_index('time')

# The historical NASA record is split across two CSV parts: load each one at
# the target grid cell, then stack them into a single frame.
nasa_hist_p1_df, nasa_hist_p2_df = (
    load_and_filter_nasa(part_path, TARGET_LAT, TARGET_LON)
    for part_path in (HISTORICAL_NASA_FILE_P1, HISTORICAL_NASA_FILE_P2)
)
nasa_hist_df = pd.concat([nasa_hist_p1_df, nasa_hist_p2_df])
print(f"  Successfully loaded and filtered historical NASA data for Minneapolis ({nasa_hist_df.index.min().year}-{nasa_hist_df.index.max().year}).")

# NASA data for the period that will be corrected
nasa_val_df = load_and_filter_nasa(VALIDATION_NASA_FILE, TARGET_LAT, TARGET_LON)
print("  Successfully loaded and filtered validation NASA data for Minneapolis.")

# Hourly ERA5 reference data, indexed by its parsed 'time' column
era5_hist_raw = pd.read_csv(HISTORICAL_ERA5_FILE, parse_dates=['time'])
era5_hist_df_hourly = era5_hist_raw.set_index('time')
print("  All data loading complete.")


# --- 3. PREPARE CALIBRATION DATA ---
print(f"\n--- Step 3: Preparing daily summaries for the calibration period ({CALIBRATION_START} to {CALIBRATION_END}) ---")

# Collapse the hourly ERA5 record into one row per calendar day.
era5_by_day = era5_hist_df_hourly.resample('D')
era5_daily_summary = era5_by_day.agg(
    air_temperature_k=('air_temperature_k', 'mean'),
    tasmin=('air_temperature_k', 'min'),
    tasmax=('air_temperature_k', 'max'),
    wind_speed_ms=('wind_speed_ms', 'mean'),
    relative_humidity_percent=('relative_humidity_percent', 'mean'),
    solar_radiation_w_m2=('solar_radiation_w_m2', 'mean'),
    thermal_radiation_w_m2=('thermal_radiation_w_m2', 'mean'),
)
# Daily precipitation total. min_count=1 makes a day without any valid hourly
# value NaN instead of 0, so gaps in the record are not counted as dry days
# in the target distribution (a plain 'sum' returns 0 for an empty day).
era5_daily_summary['precip_hourly_mm'] = era5_by_day['precip_hourly_mm'].sum(min_count=1)
era5_daily_summary = era5_daily_summary.loc[CALIBRATION_START:CALIBRATION_END]

# NASA column name -> ERA5 daily-summary column name. 'tasmin'/'tasmax' are
# absent because the summary above already uses those names. Note that the
# ERA5 'precip_hourly_mm' summary column holds a daily total despite its name.
COLUMN_MAP = {
    'tas': 'air_temperature_k',
    'sfcWind': 'wind_speed_ms',
    'hurs': 'relative_humidity_percent',
    'rsds': 'solar_radiation_w_m2',
    'rlds': 'thermal_radiation_w_m2',
    'precip_daily_mm': 'precip_hourly_mm'
}
# Restrict the NASA history to the same calibration window as ERA5 so both
# distributions are learned from the same period. sort_index() guarantees a
# monotonic index, which label slicing on a DatetimeIndex needs.
nasa_hist_df_renamed = (
    nasa_hist_df.rename(columns=COLUMN_MAP)
    .sort_index()
    .loc[CALIBRATION_START:CALIBRATION_END]
)
print("  Daily summaries created and columns aligned.")


# --- 4. PERFORM QUANTILE MAPPING ---
print("\n--- Step 4: Applying Quantile Mapping to validation data ---")
nasa_corrected_df = nasa_val_df.copy()

# List of original NASA column names to be corrected
VARS_TO_CORRECT = ['tas', 'tasmin', 'tasmax', 'sfcWind', 'hurs', 'rsds', 'rlds', 'precip_daily_mm']

# Upper bound on the number of quantile nodes used to describe each
# empirical distribution (the resolution of the mapping).
MAX_QUANTILES = 1000
# QuantileTransformer randomly subsamples inputs larger than its `subsample`
# limit; a fixed seed keeps reruns identical in that case.
QUANTILE_RANDOM_STATE = 0

for nasa_col in VARS_TO_CORRECT:
    # Find the corresponding column name in the ERA5/renamed NASA data.
    # For tasmin/tasmax the name is the same; for the others use the map.
    era5_col = COLUMN_MAP.get(nasa_col, nasa_col)

    print(f"  Processing variable: '{nasa_col}' -> '{era5_col}'")

    # Single-column selections are already 2-D (n_samples, 1), which is the
    # shape scikit-learn expects, so no reshape is needed.
    source_values = nasa_hist_df_renamed[[era5_col]].dropna().to_numpy()
    target_values = era5_daily_summary[[era5_col]].dropna().to_numpy()
    # Keep the frame (not just the array) so the index can be used to write
    # the corrected values back to the right rows.
    validation_data = nasa_val_df[[nasa_col]].dropna()

    # Fail with a message that names the variable instead of a generic
    # scikit-learn "0 samples" error.
    sample_counts = (
        ('historical NASA', len(source_values)),
        ('historical ERA5', len(target_values)),
        ('validation NASA', len(validation_data)),
    )
    for data_label, n_rows in sample_counts:
        if n_rows == 0:
            raise ValueError(
                f"No non-missing {data_label} values for '{nasa_col}' -> '{era5_col}'; "
                "cannot apply quantile mapping."
            )

    # Learn the empirical CDFs of the source (NASA) and target (ERA5) data.
    # n_quantiles is capped at the sample count; scikit-learn would do the
    # same, but with a warning.
    source_qt = QuantileTransformer(
        output_distribution='uniform',
        n_quantiles=min(MAX_QUANTILES, len(source_values)),
        random_state=QUANTILE_RANDOM_STATE,
    ).fit(source_values)
    target_qt = QuantileTransformer(
        output_distribution='uniform',
        n_quantiles=min(MAX_QUANTILES, len(target_values)),
        random_state=QUANTILE_RANDOM_STATE,
    ).fit(target_values)

    # Apply the mapping:
    # 1. validation value -> its quantile in the historical NASA distribution
    # 2. that quantile    -> the value at the same quantile in ERA5
    # NOTE: values outside the historical NASA range are mapped to quantile
    # 0 or 1, so the corrected output never leaves the historical ERA5 range.
    quantiles = source_qt.transform(validation_data.to_numpy())
    corrected_values = target_qt.inverse_transform(quantiles)

    # Write back only the rows that had a value; missing rows stay missing.
    nasa_corrected_df.loc[validation_data.index, nasa_col] = corrected_values.ravel()

print("  Quantile mapping applied successfully.")


# --- 5. SAVE THE CORRECTED FILE ---
print(f"\n--- Step 5: Saving bias-corrected data to {OUTPUT_CORRECTED_FILE} ---")
nasa_corrected_df.to_csv(OUTPUT_CORRECTED_FILE)
print(f"\nSave complete. The file '{os.path.basename(OUTPUT_CORRECTED_FILE)}' is ready for evaluation.")

--- Initializing Configuration for Quantile Mapping ---

--- Step 2: Loading and Filtering all necessary data files for Minneapolis ---
  Successfully loaded and filtered historical NASA data for Minneapolis (2000-2020).
  Successfully loaded and filtered validation NASA data for Minneapolis.
  All data loading complete.

--- Step 3: Preparing daily summaries for the calibration period (2000-01-01 to 2020-12-31) ---
  Daily summaries created and columns aligned.

--- Step 4: Applying Quantile Mapping to validation data ---
  Processing variable: 'tas' -> 'air_temperature_k'
  Processing variable: 'tasmin' -> 'tasmin'
  Processing variable: 'tasmax' -> 'tasmax'
  Processing variable: 'sfcWind' -> 'wind_speed_ms'
  Processing variable: 'hurs' -> 'relative_humidity_percent'
  Processing variable: 'rsds' -> 'solar_radiation_w_m2'
  Processing variable: 'rlds' -> 'thermal_radiation_w_m2'
  Processing variable: 'precip_daily_mm' -> 'precip_hourly_mm'
  Quantile mapping applied successfully.


In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# --- 1. CONFIGURATION ---
print("--- Step 1: Configuring file paths for QUANTILE MAPPING evaluation ---")

# Folder that holds the data files and the trained-model directory.
# IMPORTANT: point this at your own data directory.
ROOT_DATA_DIR = r"C:\Users\91788\Downloads\ERA5 Data\Extracted"


def _in_data_dir(entry_name):
    """Return the path of *entry_name* inside ROOT_DATA_DIR."""
    return os.path.join(ROOT_DATA_DIR, entry_name)


# Input: the quantile-corrected daily NASA file
NASA_DAILY_INPUT_FILE = _in_data_dir("NASA_Corrected_Quantile_Minneapolis_2021-2024.csv")

# Hourly ERA5 file used as ground truth, and the directory of pre-trained models
ERA5_HOURLY_GROUND_TRUTH_FILE = _in_data_dir("ERA5_Test_2021-2024.csv")
MODEL_SAVE_DIR = _in_data_dir("trained_models_temporal_holdout")

# Output: validation table written by this run
FINAL_VALIDATION_OUTPUT_FILE = _in_data_dir("QUANTILE_METHOD_VALIDATION_NASA_vs_ERA5_2021-2024.csv")


# --- 2. LOAD DATA AND TRAINED MODELS ---
print("\n--- Step 2: Loading data and pre-trained models ---")

# On a missing file each loader below prints a hint and stops with
# `raise SystemExit(1)`. The previous `exit()` is the interactive helper
# installed by the `site` module and reported success (status 0) on failure.

# Load the bias-corrected daily NASA data
try:
    nasa_df = pd.read_csv(NASA_DAILY_INPUT_FILE, parse_dates=['time']).set_index('time')
except FileNotFoundError:
    print(f"Error: NASA data file not found at {NASA_DAILY_INPUT_FILE}.")
    print("Please ensure you have run the Quantile Mapping script first.")
    raise SystemExit(1) from None
print("  Successfully loaded bias-corrected (Quantile Method) NASA daily data.")

# Load the hourly ERA5 ground truth data
try:
    era5_df = pd.read_csv(ERA5_HOURLY_GROUND_TRUTH_FILE, index_col='time', parse_dates=True)
except FileNotFoundError:
    print(f"Error: ERA5 ground truth file not found at {ERA5_HOURLY_GROUND_TRUTH_FILE}.")
    raise SystemExit(1) from None
print("  Successfully loaded ERA5 hourly ground truth data.")

# Load the library of trained models
print("  Loading pre-trained models (trained on 2000-2018 ERA5)...")
# Target variable -> daily predictor columns its hourly models are fed with
predictor_map = {
    'air_temperature_k': ['air_temperature_k_mean', 'air_temperature_k_min', 'air_temperature_k_max'],
    'wind_speed_ms': ['wind_speed_ms_mean', 'wind_speed_ms_max', 'wind_speed_ms_std'],
    'relative_humidity_percent': ['relative_humidity_percent_mean']
}
trained_models = {}
# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# that you produced yourself.
try:
    for var_name in predictor_map:
        model_path = os.path.join(MODEL_SAVE_DIR, f'models_{var_name}.pkl')
        trained_models[var_name] = joblib.load(model_path)

    # Stage-1 models that derive daily wind max/std from the daily wind mean
    trained_models['wind_max_from_mean'] = joblib.load(os.path.join(MODEL_SAVE_DIR, 'model_wind_max.pkl'))
    trained_models['wind_std_from_mean'] = joblib.load(os.path.join(MODEL_SAVE_DIR, 'model_wind_std.pkl'))
except FileNotFoundError as e:
    print(f"Error loading model file: {e.filename}. Please ensure training was successful.")
    raise SystemExit(1) from None
print("  All models loaded successfully.")


# --- 3. PREPARE NASA DATA FOR PREDICTION ---
print("\n--- Step 3: Preparing bias-corrected NASA data for prediction ---")

# NASA column name -> predictor name expected by the trained models
NASA_TO_MODEL_COLUMNS = {
    'tas': 'air_temperature_k_mean',
    'tasmin': 'air_temperature_k_min',
    'tasmax': 'air_temperature_k_max',
    'sfcWind': 'wind_speed_ms_mean',
    'hurs': 'relative_humidity_percent_mean'
}
nasa_predictors_df = nasa_df.rename(columns=NASA_TO_MODEL_COLUMNS)

# Stage 1: the daily wind max and std predictors are not in the NASA data,
# so they are predicted from the daily mean wind speed.
print("  Applying Stage-1 models to generate wind characteristics...")
X_wind_mean = nasa_predictors_df[['wind_speed_ms_mean']]
stage1_outputs = (
    ('wind_speed_ms_max', 'wind_max_from_mean'),
    ('wind_speed_ms_std', 'wind_std_from_mean'),
)
for derived_col, stage1_key in stage1_outputs:
    nasa_predictors_df[derived_col] = trained_models[stage1_key].predict(X_wind_mean)
print("  Predictor data prepared.")


# --- 4. GENERATE HOURLY PREDICTIONS ---
print("\n--- Step 4: Generating hourly predictions from corrected daily data ---")
# 'predicted_<variable>' -> Series of hourly predictions indexed by timestamp
final_predictions = {}
for var_name, predictors in predictor_map.items():
    print(f"  Predicting hourly values for: {var_name}...")
    if not all(p in nasa_predictors_df.columns for p in predictors):
        print(f"    Skipping {var_name}, missing one or more predictors: {predictors}")
        continue

    X_predict = nasa_predictors_df[predictors]
    hourly_models = trained_models[var_name]
    hourly_preds_list = []
    missing_hours = []
    for hour in range(24):
        model = hourly_models.get(hour)
        # Compare against None explicitly: the truth value of an estimator
        # object is not a reliable "model exists" test (estimators that
        # define __len__ can be falsy).
        if model is None:
            missing_hours.append(hour)
            continue
        preds = model.predict(X_predict)
        hourly_preds_list.append(pd.Series(preds, index=X_predict.index, name=hour))

    if missing_hours:
        # Report the gap instead of silently producing fewer than 24 hours per day.
        print(f"    Warning: no trained model for hour(s) {missing_hours}; those hours are left out.")
    if not hourly_preds_list:
        continue

    # Wide frame (one row per day, one column per hour) -> long Series
    # indexed by (day, hour).
    var_df_wide = pd.concat(hourly_preds_list, axis=1)
    var_stacked = var_df_wide.stack()
    # Build the hourly timestamps in one vectorised step: day at midnight
    # plus the hour offset. normalize() is a no-op for midnight stamps and
    # keeps any time-of-day in the daily index from shifting the hours.
    day_stamps = var_stacked.index.get_level_values(0)
    hour_offsets = pd.to_timedelta(var_stacked.index.get_level_values(1), unit='h')
    var_stacked.index = day_stamps.normalize() + hour_offsets
    final_predictions[f'predicted_{var_name}'] = var_stacked

predictions_df = pd.DataFrame(final_predictions)
print("--- Hourly predictions generated successfully. ---")


# --- 5. VALIDATE PREDICTIONS AGAINST ERA5 GROUND TRUTH ---
print("\n--- Step 5: Validating predictions against ERA5 ground truth ---")
# Prefix the ERA5 columns with 'actual_' and keep only the timestamps that
# appear in both the ground truth and the predictions.
era5_actuals_df = era5_df.rename(columns=lambda c: f"actual_{c}")
validation_df = pd.merge(
    era5_actuals_df,
    predictions_df,
    left_index=True,
    right_index=True,
    how="inner",
)

print("\n  Final Validation Results (QUANTILE MAPPING METHOD vs. ERA5 Actuals for 2021-2024):")
for var_name in predictor_map.keys():
    actual_col, predicted_col = f'actual_{var_name}', f'predicted_{var_name}'
    # Nothing to score unless both the actual and the predicted column exist
    if not {actual_col, predicted_col}.issubset(validation_df.columns):
        continue

    # Score only the timestamps where both values are present
    paired_df = validation_df[[actual_col, predicted_col]].dropna()
    if paired_df.empty:
        continue

    y_actual = paired_df[actual_col]
    y_predicted = paired_df[predicted_col]
    mae = mean_absolute_error(y_actual, y_predicted)
    rmse = np.sqrt(mean_squared_error(y_actual, y_predicted))
    r2 = r2_score(y_actual, y_predicted)

    print(f"    - {var_name}:")
    print(f"        Mean Absolute Error (MAE):    {mae:.4f}")
    print(f"        Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"        R-squared (R²):               {r2:.4f}")

# --- 6. SAVE THE FINAL VALIDATION RESULTS ---
print(f"\n--- Step 6: Saving final validation results to {FINAL_VALIDATION_OUTPUT_FILE} ---")
validation_df.to_csv(FINAL_VALIDATION_OUTPUT_FILE)
print("Save complete.")

--- Step 1: Configuring file paths for QUANTILE MAPPING evaluation ---

--- Step 2: Loading data and pre-trained models ---
  Successfully loaded bias-corrected (Quantile Method) NASA daily data.
  Successfully loaded ERA5 hourly ground truth data.
  Loading pre-trained models (trained on 2000-2018 ERA5)...
  All models loaded successfully.

--- Step 3: Preparing bias-corrected NASA data for prediction ---
  Applying Stage-1 models to generate wind characteristics...
  Predictor data prepared.

--- Step 4: Generating hourly predictions from corrected daily data ---
  Predicting hourly values for: air_temperature_k...
  Predicting hourly values for: wind_speed_ms...
  Predicting hourly values for: relative_humidity_percent...
--- Hourly predictions generated successfully. ---

--- Step 5: Validating predictions against ERA5 ground truth ---

  Final Validation Results (QUANTILE MAPPING METHOD vs. ERA5 Actuals for 2021-2024):
    - air_temperature_k:
        Mean Absolute Error (MAE):   