In [1]:
import openpyxl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import folium
from folium import plugins
from folium.plugins import HeatMap, MarkerCluster
from IPython.display import IFrame
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from math import radians, sin, cos, sqrt, atan2
import branca
import requests

In [2]:
from src.data import generate_df, generate_gdf, preprocess_df, preprocess_gdf
from src.util import haversine_distance, create_point

In [7]:
# Input CSVs, in the same order the resulting frames are unpacked below.
file_name = [
    'modified/gis_weather_station_with_elevation.csv',
    'raw/src_vri_snapshot_2024_03_20.csv',
    'raw/src_wings_meteorology_windspeed_snapshot_2023_08_02.csv',
    'modified/nam_with_elevation.csv',
]

# NOTE(review): the meaning of the second positional argument (False) is
# defined by src.data.generate_df — confirm.
elevation_dfs = generate_df(file_name, False)
(
    gis_weather_station,
    src_vri_snapshot,
    windspeed_snapshot,
    nam,
) = elevation_dfs

In [None]:
def plot_histogram(data, column, bins=30, title=None, xlabel=None, ylabel="Frequency", color="blue"):
    """
    Draw and show a histogram of a single DataFrame column.

    Parameters:
        data (pd.DataFrame): Frame holding the values to plot.
        column (str): Name of the column whose distribution is drawn.
        bins (int): Number of histogram bins. Default is 30.
        title (str): Figure title. Falls back to "Histogram of <column>".
        xlabel (str): X-axis label. Falls back to the column name.
        ylabel (str): Y-axis label. Default is "Frequency".
        color (str): Bar colour. Default is "blue".

    Raises:
        ValueError: If `column` is not one of `data.columns`.
    """
    if column not in data.columns:
        raise ValueError(f"Column '{column}' not found in the DataFrame.")

    # Any falsy title/xlabel (None or "") is replaced by a column-derived default.
    heading = title or f"Histogram of {column}"
    x_caption = xlabel or column

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data[column], bins=bins, color=color, edgecolor="black", alpha=0.7)
    ax.set_title(heading, fontsize=14)
    ax.set_xlabel(x_caption, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    fig.tight_layout()
    plt.show()

In [8]:
# Run the project's preprocess_df helper on the station and wind-speed frames.
# Both names are rebound to the helper's outputs, so re-running this cell feeds
# already-processed frames back in.
gis_weather_station, windspeed_snapshot = preprocess_df(gis_weather_station, windspeed_snapshot)

In [9]:
# Build the geo-enabled versions of the station, VRI polygon and NAM frames.
# The saved output shows a SettingWithCopyWarning raised inside the helper and
# a CRS line for each of the three layers.
elevation_gdfs = generate_gdf([gis_weather_station, src_vri_snapshot, nam])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gis_weather_station['geometry'] = gis_weather_station['shape'].apply(wkt.loads)


Weather Station CRS:    EPSG:4326
VRI Polygon CRS:        EPSG:4326
NAM CRS:                EPSG:4326


In [10]:
# Unpack in the same order the frames were passed to generate_gdf: stations, VRI polygons, NAM points.
gis_weather_station_gpd, src_vri_snapshot_gpd, nam_gpd = elevation_gdfs

In [11]:
# Combine the three geo layers with the wind-speed snapshot into the modelling frame.
# NOTE(review): the join logic lives in src.data.preprocess_gdf — see that module for details.
model_gdf = preprocess_gdf(gis_weather_station_gpd, src_vri_snapshot_gpd, nam_gpd, windspeed_snapshot)

In [12]:
# Inspect the modelling frame (the saved output includes the station, NAM and
# polygon geometries plus nam_distance_from_station_km and abs_wind_speed_error).
model_gdf

Unnamed: 0,objectid,weatherstationcode,weatherstationname,scadartuid,structureid_left,nwszone,district_left,thomasbrospagegrid,constructionstatus,creationuser,...,polygon_shape,shape_srid_right,snapshot_date,nam_geometry,station_geometry,polygon_geometry,nam_distance_from_station_km,month,day_of_year,abs_wind_speed_error
0,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.32819 33.14228),POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",0.555858,12,345,3.285754
1,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.31195 33.14219),POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",1.518626,12,345,3.148903
2,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.32809 33.15589),POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",2.058544,12,345,3.598310
3,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.31183 33.15581),POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",2.498237,12,345,3.535970
4,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.31174 33.16941),POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",3.836253,12,345,3.631954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102827,159,CLM,Creelman,1996.0,Z104188,Inland Valley-250,3.0,1172-G2,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-116.908853 33.012693 0, -11...",4326,2024-03-20,POINT (-116.8421 33.02989),POINT (-116.871 33.01565),"MULTIPOLYGON Z (((-116.908853 33.012693 0, -11...",3.127864,12,359,5.978363
102828,159,CLM,Creelman,1996.0,Z104188,Inland Valley-250,3.0,1172-G2,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-116.908853 33.012693 0, -11...",4326,2024-03-20,POINT (-116.85815 33.04365),POINT (-116.871 33.01565),"MULTIPOLYGON Z (((-116.908853 33.012693 0, -11...",3.328952,12,359,5.092426
102829,159,CLM,Creelman,1996.0,Z104188,Inland Valley-250,3.0,1172-G2,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-116.908853 33.012693 0, -11...",4326,2024-03-20,POINT (-116.84193 33.0435),POINT (-116.871 33.01565),"MULTIPOLYGON Z (((-116.908853 33.012693 0, -11...",4.112495,12,359,5.454389
102830,159,CLM,Creelman,1996.0,Z104188,Inland Valley-250,3.0,1172-G2,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-116.871922 33.01855 0, -116...",4326,2024-03-20,POINT (-116.98926 32.93591),POINT (-116.871 33.01565),"MULTIPOLYGON Z (((-116.871922 33.01855 0, -116...",14.156771,12,359,14.118581


In [13]:
from src.model import light_gbm

In [14]:
# Train through the project's light_gbm helper; the saved output reports a
# 50-candidate, 5-fold search followed by the test MAE and R².
model = light_gbm(model_gdf)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'subsample': 0.8, 'num_leaves': 40, 'n_estimators': 700, 'min_child_samples': 5, 'max_depth': 7, 'learning_rate': 0.2, 'colsample_bytree': 1.0}
Mean Absolute Error: 2.709
R² Score: 0.666


In [15]:
from src.data import filter_nam_outside_vri

In [16]:
# NOTE(review): judging by the helper's name, this keeps NAM points lying outside
# the VRI polygons — confirm against src.data.filter_nam_outside_vri.
filtered_NAM = filter_nam_outside_vri(nam_gpd, model_gdf)

In [17]:
from src.data import nam_outside_vri_model_data

In [20]:
# Assemble the frame for the filtered NAM points from the station layer.
# NOTE(review): presumably the inference input for `model` — confirm.
testing_data = nam_outside_vri_model_data(filtered_NAM, gis_weather_station_gpd)

In [21]:
# Preview the frame built in the previous cell. This cell used to hold the
# unfinished expression `model.`, which is a SyntaxError and stops a
# Restart & Run All; the saved output is a DataFrame header, so show the frame.
testing_data.head()

Unnamed: 0,latitude,longitude,date,average_wind_speed,geometry,station_elevation_m,nam_distance_from_station_km


In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

# Define features and target variable
features = [
    "nam_wind_speed", "nam_elevation_m", "station_elevation_m",
    "nam_distance_from_station_km", "month", "day_of_year"
]

# Preprocessing for the numerical columns: mean-impute, then standardize.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),  # Impute missing values
            ("scaler", StandardScaler())  # Standardize numerical features
        ]), features)
    ]
)

# Define LightGBM model
# NOTE(review): this rebinds `model`, which an earlier cell set to the result
# of light_gbm(model_gdf).
model = lgb.LGBMRegressor(random_state=42, verbose=-1)

# Hyperparameter search space, keyed by LightGBM parameter name.
param_dist = {
    "n_estimators": [100, 300, 500, 700, 1000, 1500],
    "learning_rate": [0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.2, 0.3],
    "max_depth": [3, 5, 7, 10, 12, 15, -1],
    "num_leaves": [20, 31, 40, 50, 60, 80, 100],
    "min_child_samples": [5, 10, 20, 30, 50, 100],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "reg_alpha": [0, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0],  # L1 regularization
    "reg_lambda": [0, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0],  # L2 regularization
    "min_split_gain": [0.0, 0.01, 0.05, 0.1, 0.2],  # Minimum gain to split
    "min_child_weight": [1e-3, 1e-2, 0.1, 1.0, 10.0, 50.0],  # Minimum child weight
    "boosting_type": ["gbdt", "dart"],
    "importance_type": ["split", "gain"]  # Feature importance calculation method
}

# Define pipeline with preprocessing + model
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# The pipeline (not the bare regressor) is what gets tuned, so the imputer and
# scaler are fitted inside every CV fold. Parameters of a pipeline step are
# addressed as "<step>__<param>".
MODEL_STEP_PREFIX = "model__"
pipeline_param_dist = {
    f"{MODEL_STEP_PREFIX}{name}": values for name, values in param_dist.items()
}

# Extract features (X) and target variable (y)
X = model_gdf[features]
y = model_gdf["abs_wind_speed_error"]

# Train/Test Split (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=pipeline_param_dist,
    n_iter=50,  # Number of random combinations to try
    scoring="neg_mean_absolute_error",
    cv=5,  # 5-fold cross-validation
    verbose=0,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best fitted pipeline (preprocessing + tuned regressor)
best_model = random_search.best_estimator_

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)

# Compute evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the winning parameters under their plain LightGBM names.
best_params = {
    name[len(MODEL_STEP_PREFIX):] if name.startswith(MODEL_STEP_PREFIX) else name: value
    for name, value in random_search.best_params_.items()
}

print(f"Best Hyperparameters: {best_params}")
print(f"Mean Absolute Error: {mae:.3f}")
print(f"R² Score: {r2:.3f}")


In [None]:
grouped_columns = ['NAM_geometry']
filtered_nam_vri_wind_speed_gpd_grouped = filtered_nam_vri_wind_speed_gpd.groupby(grouped_columns, sort=False)


# Mean Absolute Error
def calculate_mae(group, station_col='station_wind_speed'):
    """Mean of |NAM_wind_speed - station wind speed| over the rows of `group`."""
    return (group['NAM_wind_speed'] - group[station_col]).abs().mean()


# Mean Squared Error
def calculate_mse(group, station_col='station_wind_speed'):
    """Mean of (NAM_wind_speed - station wind speed)² over the rows of `group`."""
    return ((group['NAM_wind_speed'] - group[station_col]) ** 2).mean()


# Normalized Mean Absolute Error
def calculate_nmae(group, station_col='station_wind_speed'):
    """MAE divided by the station wind-speed range (max - min) of the group.

    Returns NaN when the range is zero. The group is not modified.
    """
    mae = (group['NAM_wind_speed'] - group[station_col]).abs().mean()
    actual_range = group[station_col].max() - group[station_col].min()
    return mae / actual_range if actual_range != 0 else np.nan


# Normalized Mean Squared Error
def calculate_nmse(group, station_col='station_wind_speed'):
    """MSE divided by the station wind-speed range (max - min) of the group.

    Returns NaN when the range is zero. The group is not modified.
    """
    mse = ((group['NAM_wind_speed'] - group[station_col]) ** 2).mean()
    actual_range = group[station_col].max() - group[station_col].min()
    return mse / actual_range if actual_range != 0 else np.nan


# Sigmoid-Damped Distance Error
def calculate_sdwe(group, station_col='station_wind_speed', distance_col='nam_distance_from_station_km'):
    """Mean absolute error scaled by a logistic weight of the group's distance.

    The weight is 1 / (1 + exp(-(distance - d0) / tau)), where `distance` is the
    first value of `distance_col` in the group and `d0`/`tau` are the
    module-level values assigned further down in this cell.
    """
    abs_error = (group['NAM_wind_speed'] - group[station_col]).abs()
    distance = group[distance_col].iloc[0]
    sigmoid_weight = 1 / (1 + np.exp(-(distance - d0) / tau))
    return (abs_error * sigmoid_weight).mean()


# Distance-Weighted Absolute Error (DWAE)
def calculate_dwae(group, station_col='station_wind_speed', distance_col='nam_distance_from_station_km'):
    """Mean absolute error multiplied by the group's distance (first row of `distance_col`)."""
    abs_error = (group['NAM_wind_speed'] - group[station_col]).abs()
    distance = group[distance_col].iloc[0]
    return (abs_error * distance).mean()


def calculate_distance(group, distance_col='nam_distance_from_station_km'):
    """Distance value the weighted metrics use for this group (first row of `distance_col`)."""
    return group[distance_col].iloc[0]


# Centre and scale of the sigmoid weight: median and standard deviation of all distances.
d0 = filtered_nam_vri_wind_speed_gpd['nam_distance_from_station_km'].median()
tau = filtered_nam_vri_wind_speed_gpd['nam_distance_from_station_km'].std()

mae = filtered_nam_vri_wind_speed_gpd_grouped.apply(calculate_mae, include_groups=False).reset_index(name='MAE')
mse = filtered_nam_vri_wind_speed_gpd_grouped.apply(calculate_mse, include_groups=False).reset_index(name='MSE')
nmae = filtered_nam_vri_wind_speed_gpd_grouped.apply(calculate_nmae, include_groups=False).reset_index(name='NMAE')
nmse = filtered_nam_vri_wind_speed_gpd_grouped.apply(calculate_nmse, include_groups=False).reset_index(name='NMSE')
dwae = filtered_nam_vri_wind_speed_gpd_grouped.apply(calculate_dwae, include_groups=False).reset_index(name='DWAE')
sdwe = filtered_nam_vri_wind_speed_gpd_grouped.apply(calculate_sdwe, include_groups=False).reset_index(name='SDWE')
# The map cells read `distance_from_station_km` from `errors`, so carry the
# per-group distance along instead of leaving the column out.
station_distance = (
    filtered_nam_vri_wind_speed_gpd_grouped
    .apply(calculate_distance, include_groups=False)
    .reset_index(name='distance_from_station_km')
)

errors = (
    mae
    .merge(mse, on=grouped_columns, how='inner')
    .merge(nmae, on=grouped_columns, how='inner')
    .merge(nmse, on=grouped_columns, how='inner')
    .merge(dwae, on=grouped_columns, how='inner')
    .merge(sdwe, on=grouped_columns, how='inner')
    .merge(station_distance, on=grouped_columns, how='inner')
)

errors.head()

In [None]:
# Summary statistics of the per-NAM-point error metrics.
errors.describe()

In [None]:
# Number of distinct NAM points in `errors`. `nunique` must be called;
# without the parentheses the cell only displays the bound method.
errors['NAM_geometry'].nunique()

In [None]:
# Plot every numeric column of `errors`. Selecting by dtype rather than by
# position (`[4:]`) keeps all metrics in the loop however many key columns
# the frame was grouped by.
columns = list(errors.select_dtypes(include="number").columns)

for i, col in enumerate(columns):
    # Alternate bar colours so consecutive figures are easy to tell apart.
    color = "blue" if i % 2 == 0 else "black"

    plot_histogram(
        data=errors,
        column=col,
        bins=100,
        title=f"Histogram of {col}",
        xlabel=f"{col} Values",
        color=color
    )

In [None]:
# Initialize the map centering at San Diego City
m = folium.Map(location=[32.7157, -117.1611], zoom_start=10, tiles="OpenStreetMap")

# NAM Coordinates
NAM_coordinates = folium.FeatureGroup(name='NAM_coordinates')

# The colour scale spans the observed MAE range
min_mae, max_mae = errors["MAE"].min(), errors["MAE"].max()

# Define colormap for yellow to red
colormap = branca.colormap.LinearColormap(['#FFFF00', '#FF0000'], vmin=min_mae, vmax=max_mae)

# Metrics listed in every marker tooltip. The distance line is only added when
# `errors` actually carries that column, so a frame without it no longer
# raises KeyError here.
tooltip_metrics = ["MAE", "MSE", "NMAE", "NMSE", "DWAE", "SDWE"]
has_distance = "distance_from_station_km" in errors.columns

# Plot each NAM point with constant opacity and a colour based on its MAE
for _, row in errors.iterrows():
    latitude, longitude = row["NAM_geometry"].y, row["NAM_geometry"].x

    color = colormap(row["MAE"])

    tooltip_html = "".join(f"{metric}: {row[metric]:.3f}<br>" for metric in tooltip_metrics)
    if has_distance:
        tooltip_html += f"Dist: {row['distance_from_station_km']:.3f}km<br>"

    folium.CircleMarker(
        location=(latitude, longitude),
        radius=3,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.9,
        opacity=0.9,
        tooltip=tooltip_html
    ).add_to(NAM_coordinates)

# Weather Station
weather_stations = folium.FeatureGroup(name='Weather Stations')

# NOTE(review): `weather_station_summary_gpd` is not created in any cell shown
# above this one — confirm where it comes from.
for idx, row in weather_station_summary_gpd.iterrows():
    folium.CircleMarker(
        location=(row["latitude"], row["longitude"]),
        radius=4,
        color="green",
        fill=True,
        fill_color="green",
        fill_opacity=1,
        opacity=1,
        tooltip=(f"Station: {row['weatherstationname']}<br>")
    ).add_to(weather_stations)

# VRI Snapshot
vri_snapshot = folium.FeatureGroup(name='VRI Snapshot')

# Tooltip shown when hovering a VRI polygon
vri_tooltip = folium.GeoJsonTooltip(
    fields=["name", "vri_risk", "shape_area"],
    aliases=["Name:", "VRI Risk:", "Shape Area:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
    max_width=800,
)

# Load VRI GeoJSON
vri_map = folium.GeoJson(
    src_vri_snapshot_gpd,
    style_function=lambda x: {
        "fillColor": "#0059b3",
        "color": "black",
        "weight": 0.3,
        "fillOpacity": 0.5
    },
    tooltip=vri_tooltip,
)
vri_map.add_to(vri_snapshot)

# Add feature groups to the map
vri_snapshot.add_to(m)
NAM_coordinates.add_to(m)
weather_stations.add_to(m)

# Add layer control to toggle feature groups
folium.LayerControl().add_to(m)

# Save the map
map_path = "san_diego_map_MAE.html"
m.save(map_path)

# Render the map in the notebook using IFrame
IFrame(map_path, width=700, height=500)


In [None]:
# Initialize the map centering at San Diego City
m = folium.Map(location=[32.7157, -117.1611], zoom_start=10, tiles="OpenStreetMap")

# NAM Coordinates
NAM_coordinates = folium.FeatureGroup(name='NAM_coordinates')

# The colour scale spans the observed SDWE range
min_sdwe, max_sdwe = errors["SDWE"].min(), errors["SDWE"].max()

# Define colormap for yellow to red
colormap = branca.colormap.LinearColormap(['#FFFF00', '#FF0000'], vmin=min_sdwe, vmax=max_sdwe)

# Metrics listed in every marker tooltip. The distance line is only added when
# `errors` actually carries that column, so a frame without it no longer
# raises KeyError here.
tooltip_metrics = ["MAE", "MSE", "NMAE", "NMSE", "DWAE", "SDWE"]
has_distance = "distance_from_station_km" in errors.columns

# Plot each NAM point with constant opacity and a colour based on its SDWE
for _, row in errors.iterrows():
    latitude, longitude = row["NAM_geometry"].y, row["NAM_geometry"].x

    color = colormap(row["SDWE"])

    tooltip_html = "".join(f"{metric}: {row[metric]:.3f}<br>" for metric in tooltip_metrics)
    if has_distance:
        tooltip_html += f"Dist: {row['distance_from_station_km']:.3f}km<br>"

    folium.CircleMarker(
        location=(latitude, longitude),
        radius=3,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.9,
        opacity=0.9,
        tooltip=tooltip_html
    ).add_to(NAM_coordinates)

# Weather Station
weather_stations = folium.FeatureGroup(name='Weather Stations')

# NOTE(review): `weather_station_summary_gpd` is not created in any cell shown
# above this one — confirm where it comes from.
for idx, row in weather_station_summary_gpd.iterrows():
    folium.CircleMarker(
        location=(row["latitude"], row["longitude"]),
        radius=4,
        color="green",
        fill=True,
        fill_color="green",
        fill_opacity=1,
        opacity=1,
        tooltip=(f"Station: {row['weatherstationname']}<br>")
    ).add_to(weather_stations)

# VRI Snapshot
vri_snapshot = folium.FeatureGroup(name='VRI Snapshot')

# Tooltip shown when hovering a VRI polygon
vri_tooltip = folium.GeoJsonTooltip(
    fields=["name", "vri_risk", "shape_area"],
    aliases=["Name:", "VRI Risk:", "Shape Area:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
    max_width=800,
)

# Load VRI GeoJSON
vri_map = folium.GeoJson(
    src_vri_snapshot_gpd,
    style_function=lambda x: {
        "fillColor": "#0059b3",
        "color": "black",
        "weight": 0.3,
        "fillOpacity": 0.5
    },
    tooltip=vri_tooltip,
)
vri_map.add_to(vri_snapshot)

# Add feature groups to the map
vri_snapshot.add_to(m)
NAM_coordinates.add_to(m)
weather_stations.add_to(m)

# Add layer control to toggle feature groups
folium.LayerControl().add_to(m)

# Save under its own name so this map does not overwrite the MAE map's HTML file
map_path = "san_diego_map_SDWE.html"
m.save(map_path)

# Render the map in the notebook using IFrame
IFrame(map_path, width=700, height=500)


In [None]:
# Give the joined columns self-describing names before filtering.
column_renames = {
    'date_left' : 'wind_speed_date',
    'date_right': 'nam_date',
    'wind_speed': 'station_wind_speed',
    'average_wind_speed' : 'NAM_wind_speed',
    'shape_right': 'polygon_shape',
}
nam_vri_wind_speed_gpd = nam_vri_wind_speed_gpd.rename(columns=column_renames)

# Re-render the NAM date as MM/DD/YYYY text so it can be compared with wind_speed_date.
# NOTE(review): assumes wind_speed_date already holds strings in that format — confirm.
nam_dates = pd.to_datetime(nam_vri_wind_speed_gpd['nam_date'])
nam_vri_wind_speed_gpd['nam_date'] = nam_dates.dt.strftime('%m/%d/%Y')

# Keep only rows where the station reading and the NAM value share a date.
same_day = nam_vri_wind_speed_gpd['wind_speed_date'] == nam_vri_wind_speed_gpd['nam_date']
filtered_nam_vri_wind_speed_gpd = nam_vri_wind_speed_gpd[same_day].copy()


def _station_to_nam_km(row):
    """Haversine distance between the row's station and its NAM point."""
    return haversine_distance(row['station_geometry'], row['NAM_geometry'])


filtered_nam_vri_wind_speed_gpd['station_geometry'] = filtered_nam_vri_wind_speed_gpd['geometry']
filtered_nam_vri_wind_speed_gpd['polygon_geometry'] = (
    filtered_nam_vri_wind_speed_gpd['polygon_shape'].apply(wkt.loads)
)
filtered_nam_vri_wind_speed_gpd['distance_from_station_km'] = (
    filtered_nam_vri_wind_speed_gpd.apply(_station_to_nam_km, axis=1)
)

pd.set_option('display.max_columns', None)

# Renumber rows from zero, discarding the pre-filter index.
filtered_nam_vri_wind_speed_gpd = (
    filtered_nam_vri_wind_speed_gpd
    .reset_index()
    .drop(columns=['index'])
)
filtered_nam_vri_wind_speed_gpd.crs

In [None]:
# Preview the date-matched station/NAM rows.
filtered_nam_vri_wind_speed_gpd.head()

In [None]:
# One row per distinct combination of NAM point, polygon and station.
nam_point_columns = ['NAM_geometry', 'polygon_geometry', 'weatherstationcode', 'station_geometry', 'geometry']
NAM_points = filtered_nam_vri_wind_speed_gpd[nam_point_columns].drop_duplicates()
NAM_points

In [None]:
# Spot check: distance between the NAM point and the station on the row labelled 0.
current_point = NAM_points.loc[0, 'NAM_geometry']
station_geometry = NAM_points.loc[0, 'station_geometry']
haversine_distance(current_point, station_geometry)

In [None]:
# For every NAM point, find the nearest station other than the one on its own
# row (stations are told apart by weatherstationcode).
#
# NAM_points repeats each station once per NAM point, so measure against the
# distinct (code, geometry) pairs instead of every row; the minimum, and the
# first-match tie-break, are unchanged.
station_lookup = NAM_points[["weatherstationcode", "station_geometry"]].drop_duplicates()

nearest_stations = []
nearest_station_codes = []
nearest_station_distances = []

for idx, row in NAM_points.iterrows():
    current_point = row["NAM_geometry"]
    current_station_code = row["weatherstationcode"]

    # Exclude the current row's station based on weatherstationcode
    other_stations = station_lookup[station_lookup["weatherstationcode"] != current_station_code]

    # Stays at (None, None, inf) when there is no other station to compare with.
    min_distance = float("inf")
    nearest_station = None
    nearest_station_code = None

    for station_code, station_geometry in zip(
        other_stations["weatherstationcode"], other_stations["station_geometry"]
    ):
        distance = haversine_distance(current_point, station_geometry)
        if distance < min_distance:
            min_distance = distance
            nearest_station = station_geometry
            nearest_station_code = station_code

    nearest_stations.append(nearest_station)
    nearest_station_codes.append(nearest_station_code)
    nearest_station_distances.append(min_distance)

# Add results to the DataFrame
NAM_points["nearest_station_geometry"] = nearest_stations
NAM_points["nearest_weather_station_code"] = nearest_station_codes
NAM_points["nearest_station_distance_km"] = nearest_station_distances

In [None]:
# Debug peek: `other_stations` is whatever the last iteration of the
# nearest-station loop left bound.
other_stations

In [None]:
# Check the three nearest-station columns added by the loop.
NAM_points.head()

In [None]:
# Keep only the merge key and the nearest-station result columns.
# NOTE(review): NAM_geometry is not de-duplicated here; if a NAM point appears
# on several rows, a later merge on NAM_geometry will multiply rows — confirm.
NAM_points = NAM_points[['NAM_geometry', 'nearest_weather_station_code', 'nearest_station_distance_km']]
NAM_points.head()

In [None]:


# Attach each NAM point's nearest-other-station code and distance to the
# date-matched rows (inner join on the shared NAM_geometry column).
merged_nearest = filtered_nam_vri_wind_speed_gpd.merge(NAM_points, how='inner', on='NAM_geometry')

merged_nearest.head()
                              
   

In [None]:
# One reading per station and date, reduced to the station-level columns.
station_day_rows = merged_nearest.drop_duplicates(subset=['wind_speed_date', 'weatherstationcode'])
station_columns = [
    'weatherstationcode',
    'wind_speed_date',
    'station_wind_speed',
    'station_geometry',
    'polygon_geometry',
    'name',
]
weather_station_poly = station_day_rows[station_columns]
weather_station_poly.head()

In [None]:
pd.set_option('display.max_columns', None)

# Pair each row with the same-day reading of its nearest other station.
# Columns present on both sides get the default suffixes: `_x` for the row's
# own station (left), `_y` for the nearest other station (right).
nearest_station_keys = ['nearest_weather_station_code', 'wind_speed_date']
station_day_keys = ['weatherstationcode', 'wind_speed_date']
merged_nearest_windspeed = merged_nearest.merge(
    weather_station_poly,
    how='inner',
    left_on=nearest_station_keys,
    right_on=station_day_keys,
)

merged_nearest_windspeed.columns

In [None]:
# Preview the rows that now carry both the own-station (_x) and nearest-station (_y) columns.
merged_nearest_windspeed.head()

In [None]:
# Error metrics of NAM wind speed against the row's own station (`_x` columns).
# Each function takes one group's rows; the column names are parameters so the
# same functions can be pointed at other columns.

# Mean Absolute Error
def calculate_mae(group, station_col='station_wind_speed_x'):
    """Mean of |NAM_wind_speed - station wind speed| over the rows of `group`."""
    return (group['NAM_wind_speed'] - group[station_col]).abs().mean()


# Mean Squared Error
def calculate_mse(group, station_col='station_wind_speed_x'):
    """Mean of (NAM_wind_speed - station wind speed)² over the rows of `group`."""
    return ((group['NAM_wind_speed'] - group[station_col]) ** 2).mean()


# Normalized Mean Absolute Error
def calculate_nmae(group, station_col='station_wind_speed_x'):
    """MAE divided by the station wind-speed range (max - min) of the group.

    Returns NaN when the range is zero. The group is not modified.
    """
    mae = (group['NAM_wind_speed'] - group[station_col]).abs().mean()
    actual_range = group[station_col].max() - group[station_col].min()
    return mae / actual_range if actual_range != 0 else np.nan


# Normalized Mean Squared Error
def calculate_nmse(group, station_col='station_wind_speed_x'):
    """MSE divided by the station wind-speed range (max - min) of the group.

    Returns NaN when the range is zero. The group is not modified.
    """
    mse = ((group['NAM_wind_speed'] - group[station_col]) ** 2).mean()
    actual_range = group[station_col].max() - group[station_col].min()
    return mse / actual_range if actual_range != 0 else np.nan


# Sigmoid-Damped Distance Error
def calculate_sdwe(group, station_col='station_wind_speed_x',
                   distance_col='distance_from_station_km', midpoint=None, scale=None):
    """Mean absolute error scaled by a logistic weight of the group's distance.

    The weight is 1 / (1 + exp(-(distance - midpoint) / scale)), with `distance`
    taken from the first row of `distance_col`. When `midpoint` / `scale` are
    not given, the module-level `d0` / `tau` are used.
    """
    midpoint = d0 if midpoint is None else midpoint
    scale = tau if scale is None else scale
    abs_error = (group['NAM_wind_speed'] - group[station_col]).abs()
    distance = group[distance_col].iloc[0]
    sigmoid_weight = 1 / (1 + np.exp(-(distance - midpoint) / scale))
    return (abs_error * sigmoid_weight).mean()


# Distance-Weighted Absolute Error (DWAE)
def calculate_dwae(group, station_col='station_wind_speed_x', distance_col='distance_from_station_km'):
    """Mean absolute error multiplied by the group's distance (first row of `distance_col`)."""
    abs_error = (group['NAM_wind_speed'] - group[station_col]).abs()
    distance = group[distance_col].iloc[0]
    return (abs_error * distance).mean()


In [None]:
# Group by NAM point plus the row's own station (the `_x` columns of the merge).
grouped_columns = ['NAM_geometry', 'station_geometry_x', 'polygon_geometry_x', 'name_x']
merged_nearest_windspeed_group = merged_nearest_windspeed.groupby(grouped_columns, sort=False)
# NOTE(review): GroupBy.head() returns the first rows of *every* group, so this
# preview can be very long — confirm it is intended.
merged_nearest_windspeed_group.head()

In [None]:
# Centre and scale of the sigmoid weight read by calculate_sdwe.
d0 = merged_nearest_windspeed['distance_from_station_km'].median()
tau = merged_nearest_windspeed['distance_from_station_km'].std()


mae = merged_nearest_windspeed_group.apply(calculate_mae, include_groups=False).reset_index(name='MAE')
mse = merged_nearest_windspeed_group.apply(calculate_mse, include_groups=False).reset_index(name='MSE')
nmae = merged_nearest_windspeed_group.apply(calculate_nmae, include_groups=False).reset_index(name='NMAE')
nmse = merged_nearest_windspeed_group.apply(calculate_nmse, include_groups=False).reset_index(name='NMSE')
dwae = merged_nearest_windspeed_group.apply(calculate_dwae, include_groups=False).reset_index(name='DWAE')
sdwe = merged_nearest_windspeed_group.apply(calculate_sdwe, include_groups=False).reset_index(name='SDWE')


errors_2 = (
    mae
    .merge(mse, on=grouped_columns, how='inner')
    .merge(nmae, on=grouped_columns, how='inner')
    .merge(nmse, on=grouped_columns, how='inner')
    .merge(dwae, on=grouped_columns, how='inner')
    .merge(sdwe, on=grouped_columns, how='inner')
)

# Distance from each row's own station to its NAM point. This must be computed
# from errors_2's own rows and its `station_geometry_x` key column; the earlier
# `errors` frame has no station geometry and a different row set.
errors_2['distance_from_station_km'] = errors_2.apply(
    lambda row: haversine_distance(row['station_geometry_x'], row['NAM_geometry']), axis=1
)

errors_2 = errors_2.drop_duplicates()
errors_2

In [None]:
# Summary statistics of the own-station error metrics.
errors_2.describe()

In [None]:
# Error metrics of NAM wind speed against the nearest other station (`_y`
# columns). Each function takes one group's rows; the column names are
# parameters so the same functions can be pointed at other columns.
# NOTE(review): the distance-weighted metrics still default to
# `distance_from_station_km` rather than `nearest_station_distance_km` —
# confirm which distance is intended here.

# Mean Absolute Error
def calculate_mae(group, station_col='station_wind_speed_y'):
    """Mean of |NAM_wind_speed - station wind speed| over the rows of `group`."""
    return (group['NAM_wind_speed'] - group[station_col]).abs().mean()


# Mean Squared Error
def calculate_mse(group, station_col='station_wind_speed_y'):
    """Mean of (NAM_wind_speed - station wind speed)² over the rows of `group`."""
    return ((group['NAM_wind_speed'] - group[station_col]) ** 2).mean()


# Normalized Mean Absolute Error
def calculate_nmae(group, station_col='station_wind_speed_y'):
    """MAE divided by the station wind-speed range (max - min) of the group.

    Returns NaN when the range is zero. The group is not modified.
    """
    mae = (group['NAM_wind_speed'] - group[station_col]).abs().mean()
    actual_range = group[station_col].max() - group[station_col].min()
    return mae / actual_range if actual_range != 0 else np.nan


# Normalized Mean Squared Error
def calculate_nmse(group, station_col='station_wind_speed_y'):
    """MSE divided by the station wind-speed range (max - min) of the group.

    Returns NaN when the range is zero. The group is not modified.
    """
    mse = ((group['NAM_wind_speed'] - group[station_col]) ** 2).mean()
    actual_range = group[station_col].max() - group[station_col].min()
    return mse / actual_range if actual_range != 0 else np.nan


# Sigmoid-Damped Distance Error
def calculate_sdwe(group, station_col='station_wind_speed_y',
                   distance_col='distance_from_station_km', midpoint=None, scale=None):
    """Mean absolute error scaled by a logistic weight of the group's distance.

    The weight is 1 / (1 + exp(-(distance - midpoint) / scale)), with `distance`
    taken from the first row of `distance_col`. When `midpoint` / `scale` are
    not given, the module-level `d0` / `tau` are used.
    """
    midpoint = d0 if midpoint is None else midpoint
    scale = tau if scale is None else scale
    abs_error = (group['NAM_wind_speed'] - group[station_col]).abs()
    distance = group[distance_col].iloc[0]
    sigmoid_weight = 1 / (1 + np.exp(-(distance - midpoint) / scale))
    return (abs_error * sigmoid_weight).mean()


# Distance-Weighted Absolute Error (DWAE)
def calculate_dwae(group, station_col='station_wind_speed_y', distance_col='distance_from_station_km'):
    """Mean absolute error multiplied by the group's distance (first row of `distance_col`)."""
    abs_error = (group['NAM_wind_speed'] - group[station_col]).abs()
    distance = group[distance_col].iloc[0]
    return (abs_error * distance).mean()


In [None]:
# Group by NAM point plus the nearest other station (the `_y` columns of the merge).
grouped_columns = ['NAM_geometry', 'station_geometry_y', 'polygon_geometry_y', 'name_y']
merged_nearest_windspeed_group = merged_nearest_windspeed.groupby(grouped_columns, sort=False)
# NOTE(review): GroupBy.head() returns the first rows of *every* group, so this
# preview can be very long — confirm it is intended.
merged_nearest_windspeed_group.head()

In [None]:
# Centre and scale of the sigmoid weight read by calculate_sdwe.
d0 = merged_nearest_windspeed['distance_from_station_km'].median()
tau = merged_nearest_windspeed['distance_from_station_km'].std()


mae = merged_nearest_windspeed_group.apply(calculate_mae, include_groups=False).reset_index(name='MAE')
mse = merged_nearest_windspeed_group.apply(calculate_mse, include_groups=False).reset_index(name='MSE')
nmae = merged_nearest_windspeed_group.apply(calculate_nmae, include_groups=False).reset_index(name='NMAE')
nmse = merged_nearest_windspeed_group.apply(calculate_nmse, include_groups=False).reset_index(name='NMSE')
dwae = merged_nearest_windspeed_group.apply(calculate_dwae, include_groups=False).reset_index(name='DWAE')
sdwe = merged_nearest_windspeed_group.apply(calculate_sdwe, include_groups=False).reset_index(name='SDWE')


errors_3 = (
    mae
    .merge(mse, on=grouped_columns, how='inner')
    .merge(nmae, on=grouped_columns, how='inner')
    .merge(nmse, on=grouped_columns, how='inner')
    .merge(dwae, on=grouped_columns, how='inner')
    .merge(sdwe, on=grouped_columns, how='inner')
)

# Distance from the nearest other station to the NAM point. This must be
# computed from errors_3's own rows and its `station_geometry_y` key column; the
# earlier `errors` frame has no station geometry and a different row set.
errors_3['distance_from_station_km'] = errors_3.apply(
    lambda row: haversine_distance(row['station_geometry_y'], row['NAM_geometry']), axis=1
)

errors_3 = errors_3.drop_duplicates()
errors_3

In [None]:
# Number of distinct NAM points in `errors_3`. `nunique` must be called;
# without the parentheses the cell only displays the bound method.
errors_3['NAM_geometry'].nunique()

In [None]:
import geopandas as gpd

# Promote both error tables to GeoDataFrames whose geometry is the NAM point (EPSG:4326).
errors_2_gdf, errors_3_gdf = (
    gpd.GeoDataFrame(table, geometry=table['NAM_geometry'], crs="EPSG:4326")
    for table in (errors_2, errors_3)
)

# Inner join on identical NAM geometries; columns present in both tables are
# suffixed `_2` (own station) and `_3` (nearest other station). The index is
# renumbered from zero afterwards.
merged_error_gdf = errors_2_gdf.merge(
    errors_3_gdf,
    how='inner',
    on='NAM_geometry',
    suffixes=('_2', '_3'),
).reset_index(drop=True)

# Display the resulting GeoDataFrame
merged_error_gdf
