In [1]:
import openpyxl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import folium
from folium import plugins
from folium.plugins import HeatMap, MarkerCluster
from IPython.display import IFrame
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from math import radians, sin, cos, sqrt, atan2
import branca
import requests

In [11]:
# Read every input table from the local data directory into pandas DataFrames.
DATA_DIR = './data'

# Station GIS table; contains the elevation & nearest coastline coordinate.
gis_weather_station = pd.read_csv(f'{DATA_DIR}/gis_weather_station.csv')
station_summary_snapshot = pd.read_csv(f'{DATA_DIR}/src_wings_meteorology_station_summary_snapshot_2023_08_02.csv')
windspeed_snapshot = pd.read_csv(f'{DATA_DIR}/src_wings_meteorology_windspeed_snapshot_2023_08_02.csv')
src_vri_snapshot = pd.read_csv(f'{DATA_DIR}/src_vri_snapshot_2024_03_20.csv')

nam = pd.read_csv(f'{DATA_DIR}/nam.csv')
# Extra NAM attributes; contains the elevation & nearest coastline coordinate.
nam_vri_additional_columns = pd.read_csv(f'{DATA_DIR}/NAM_vri_additional_columns.csv')


In [3]:
def create_point(row):
    """Build a shapely Point from a row's 'longitude' (x) and 'latitude' (y)."""
    lon, lat = row['longitude'], row['latitude']
    return Point(lon, lat)

def get_elevation(lat, lon, timeout=30):
    """Look up the elevation of one coordinate via the open-elevation API.

    Parameters:
        lat, lon: Coordinate interpolated into the `locations` query argument.
        timeout (float): Seconds to wait for the HTTP response. Without a
            timeout `requests.get` can block forever on a stalled connection.

    Returns:
        The 'elevation' value of the first result, or None when the payload
        has no 'results' key, an empty result list, or no 'elevation' field.
    """
    url = f"https://api.open-elevation.com/api/v1/lookup?locations={lat},{lon}"
    payload = requests.get(url, timeout=timeout).json()
    results = payload.get("results") if isinstance(payload, dict) else None
    if not results:
        # Covers both a missing key and an empty list (previously IndexError).
        return None
    return results[0].get("elevation")

def get_land_cover_openlandmap(lat, lon, timeout=30):
    """Fetch the 'properties' object of the classification endpoint for a point.

    Parameters:
        lat, lon: Coordinate interpolated into the request URL.
        timeout (float): Seconds to wait for the HTTP response. Without a
            timeout `requests.get` can block forever on a stalled connection.

    Returns:
        The value stored under "properties" in the JSON payload, or None when
        that key is absent.
    """
    # NOTE(review): the function name mentions OpenLandMap but the URL targets
    # rest.isric.org/soilgrids — confirm this is the intended service.
    url = f"https://rest.isric.org/soilgrids/v2.0/classification?lon={lon}&lat={lat}"
    payload = requests.get(url, timeout=timeout).json()
    if "properties" in payload:
        return payload["properties"]
    return None

def get_nearest_coastline(lat, lon, timeout=60):
    """Return the coastline vertex closest to (lat, lon) as (lat, lon).

    Queries Overpass for `natural=coastline` ways within 50 km of the point
    and scans every vertex of every returned way, keeping the one with the
    smallest haversine distance. The previous version returned the first
    vertex of the first way, which is not necessarily the nearest.

    Parameters:
        lat, lon: Query coordinate in degrees.
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        (coast_lat, coast_lon) of the closest vertex, or None when the
        response contains no coastline geometry.
    """
    query = f"""
    [out:json];
    (
      way[natural=coastline](around:50000,{lat},{lon});
    );
    out geom;
    """
    url = "https://overpass-api.de/api/interpreter"
    response = requests.get(url, params={"data": query}, timeout=timeout).json()

    origin = Point(lon, lat)  # shapely order is (x=lon, y=lat)
    nearest = None
    nearest_km = float("inf")
    for element in response.get("elements", []):
        for node in element.get("geometry", []):
            km = haversine_distance(origin, Point(node["lon"], node["lat"]))
            if km < nearest_km:
                nearest_km = km
                nearest = (node["lat"], node["lon"])
    return nearest

def haversine_distance(point1: "Point", point2: "Point") -> float:
    """Great-circle distance in kilometres between two lon/lat points.

    Parameters:
        point1, point2: Objects exposing `.x` (longitude) and `.y` (latitude)
            in decimal degrees.

    Returns:
        Distance along a sphere of radius 6371 km.
    """
    R = 6371.0  # sphere radius in km
    lon1, lat1 = point1.x, point1.y
    lon2, lat2 = point2.x, point2.y
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    a = sin(delta_lat / 2)**2 + cos(lat1) * cos(lat2) * sin(delta_lon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for near-antipodal
    # points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

def plot_histogram(data, column, bins=30, title=None, xlabel=None, ylabel="Frequency", color="blue"):
    """
    Draw and show a histogram of one DataFrame column.

    Parameters:
        data (pd.DataFrame): Frame holding the values.
        column (str): Name of the column to plot.
        bins (int): Number of histogram bins (default 30).
        title (str): Figure title; falls back to "Histogram of <column>".
        xlabel (str): X-axis label; falls back to the column name.
        ylabel (str): Y-axis label (default "Frequency").
        color (str): Bar colour (default "blue").

    Raises:
        ValueError: If `column` is not a column of `data`.
    """
    if column not in data.columns:
        raise ValueError(f"Column '{column}' not found in the DataFrame.")

    heading = title or f"Histogram of {column}"
    x_caption = xlabel or column

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data[column], bins=bins, color=color, edgecolor="black", alpha=0.7)
    ax.set_title(heading, fontsize=14)
    ax.set_xlabel(x_caption, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    fig.tight_layout()
    plt.show()

In [4]:
# Keep one row per station code / station id (first occurrence wins).
gis_weather_station = gis_weather_station.drop_duplicates(subset=['weatherstationcode'], keep='first')
station_summary_snapshot = station_summary_snapshot.drop_duplicates(subset=['station'], keep='first')

# Drop the readings equal to the largest observed wind speed. Series.max()
# skips NaN, whereas the builtin max() can return NaN and empty the frame.
# NOTE(review): re-running this cell removes the next-largest value each time.
max_wind_speed = windspeed_snapshot['wind_speed'].max()
windspeed_snapshot = windspeed_snapshot[windspeed_snapshot['wind_speed'] < max_wind_speed]

In [5]:
# Parse the WKT in `shape` into shapely geometries and wrap each table in a
# GeoDataFrame whose CRS is taken from the first row of its `shape_srid`.
# `.iloc[0]` reads that row by position; the previous `[0]` was a label lookup
# that raises KeyError once the row labelled 0 has been filtered out.
# NOTE(review): assumes every row of a table shares one SRID — confirm.
gis_weather_station['geometry'] = gis_weather_station['shape'].apply(wkt.loads)
station_srid = gis_weather_station['shape_srid'].iloc[0]
gis_weather_station_gpd = gpd.GeoDataFrame(gis_weather_station, geometry='geometry', crs=f"EPSG:{station_srid}")

src_vri_snapshot['geometry'] = src_vri_snapshot['shape'].apply(wkt.loads)
vri_srid = src_vri_snapshot['shape_srid'].iloc[0]
src_vri_snapshot_gpd = gpd.GeoDataFrame(src_vri_snapshot, geometry='geometry', crs=f"EPSG:{vri_srid}")

# NAM points are tagged with the same SRID as the VRI polygons.
nam_crs = vri_srid

nam['geometry'] = nam.apply(create_point, axis=1)
nam_gpd = gpd.GeoDataFrame(nam, geometry='geometry', crs=nam_crs)

print(f"Weather Station CRS:    {gis_weather_station_gpd.crs}")
print(f"VRI Polygon CRS:        {src_vri_snapshot_gpd.crs}")
print(f"NAM CRS:                {nam_gpd.crs}")

Weather Station CRS:    EPSG:4431
VRI Polygon CRS:        EPSG:4326
NAM CRS:                EPSG:4326


In [6]:
# Reproject the stations into the CRS used by the VRI polygons, then print all
# three CRSs so the alignment can be checked by eye.
target_crs = src_vri_snapshot_gpd.crs
gis_weather_station_gpd = gis_weather_station_gpd.to_crs(target_crs)

print(f"Weather Station CRS:    {gis_weather_station_gpd.crs}")
print(f"VRI Polygon CRS:        {src_vri_snapshot_gpd.crs}")
print(f"NAM CRS:                {nam_gpd.crs}")

Weather Station CRS:    EPSG:4326
VRI Polygon CRS:        EPSG:4326
NAM CRS:                EPSG:4326


In [7]:
# Prefix the station-side attributes so they stay distinguishable from the
# NAM-side columns of the same meaning.
station_renames = {
    'elevation_m': 'station_elevation_m',
    'nearest_coastline_geometry': 'station_nearest_coastline_geometry',
}
gis_weather_station_gpd = gis_weather_station_gpd.rename(columns=station_renames)


def _parse_station_coast_wkt(text):
    """Parse a WKT string, passing missing values through as None."""
    if pd.notnull(text):
        return wkt.loads(text)
    return None


def _station_to_coast_km(row):
    """Haversine km from a station to its coastline point, or None if absent."""
    coast = row['station_nearest_coastline_geometry']
    if pd.notnull(coast):
        return haversine_distance(row['geometry'], coast)
    return None


station_coast_wkt = gis_weather_station_gpd['station_nearest_coastline_geometry']
gis_weather_station_gpd['station_nearest_coastline_geometry'] = station_coast_wkt.apply(_parse_station_coast_wkt)

gis_weather_station_gpd['station_distance_from_coastline_km'] = gis_weather_station_gpd.apply(
    _station_to_coast_km, axis=1
)

In [12]:
def _parse_nam_coast_wkt(text):
    """Parse a WKT string, passing missing values through as None."""
    if pd.notnull(text):
        return wkt.loads(text)
    return None


def _nam_to_coast_km(row):
    """Haversine km from a NAM point to its coastline point, or None if absent."""
    coast = row['nam_nearest_coastline_geometry']
    if pd.notnull(coast):
        return haversine_distance(row['temp_geometry'], coast)
    return None


# Turn both WKT text columns into shapely objects.
nam_vri_additional_columns['geometry'] = nam_vri_additional_columns['geometry'].apply(wkt.loads)
nam_coast_wkt = nam_vri_additional_columns['nearest_coastline_geometry']
nam_vri_additional_columns['nearest_coastline_geometry'] = nam_coast_wkt.apply(_parse_nam_coast_wkt)

# Drop the CSV's saved index column and give the rest NAM-specific names;
# 'temp_geometry' is only a join key for a later merge.
nam_renames = {
    'elevation_m': 'nam_elevation_m',
    'nearest_coastline_geometry': 'nam_nearest_coastline_geometry',
    'geometry': 'temp_geometry',
}
nam_vri_additional_columns = (
    nam_vri_additional_columns
    .drop(columns='Unnamed: 0')
    .rename(columns=nam_renames)
)

nam_vri_additional_columns['nam_distance_from_coastline_km'] = nam_vri_additional_columns.apply(
    _nam_to_coast_km, axis=1
)

nam_vri_additional_columns.head()

Unnamed: 0,temp_geometry,nam_elevation_m,nam_nearest_coastline_geometry,nam_distance_from_coastline_km
0,POINT (-117.10672 32.542244),8.0,POINT (-117.1321894 32.6184705),8.805538
1,POINT (-117.09059 32.54212),94.0,POINT (-117.1321894 32.6184705),9.341781
2,POINT (-117.07446 32.542015),47.0,POINT (-117.1321894 32.6184705),10.076358
3,POINT (-116.89679 32.554165),156.0,POINT (-117.1321894 32.6184705),23.184914
4,POINT (-116.977325 32.56846),152.0,POINT (-117.1321894 32.6184705),15.537427


In [13]:
# Station geometry + per-station summary statistics (inner join on station code).
station_with_summary = gis_weather_station_gpd.merge(
    station_summary_snapshot, left_on='weatherstationcode', right_on='station', how='inner'
)
weather_station_summary_gpd = station_with_summary.drop(columns=['station'])

# ... + the wind-speed readings recorded for each station.
station_with_wind = weather_station_summary_gpd.merge(
    windspeed_snapshot, left_on='weatherstationcode', right_on='station', how='inner'
)
weather_station_wind_speed_gpd = station_with_wind.drop(columns=['station'])

In [14]:
# Spatially join NAM points to the VRI polygons they fall within (how='right').
nam_vri_gpd = gpd.sjoin(nam_gpd, src_vri_snapshot_gpd, how='right', predicate='within')
# Rebuild each NAM point from the row's 'longitude' / 'latitude' columns.
nam_vri_gpd['NAM_geometry'] = nam_vri_gpd.apply(create_point, axis=1)
# Attach the NAM elevation / coastline attributes by matching point objects.
# NOTE(review): the key is an exact geometry match, so this presumably relies on
# both tables carrying identical coordinates — confirm.
nam_vri_gpd = nam_vri_gpd.merge(nam_vri_additional_columns, left_on='NAM_geometry', right_on='temp_geometry', how='inner')
# Drop the bookkeeping columns left behind by the join and the merge.
nam_vri_gpd = nam_vri_gpd.reset_index().drop(columns=['index', 'index_left', 'temp_geometry'])

# Pair every station wind-speed row with the NAM/VRI rows whose geometry contains the station.
# NOTE(review): presumably the active geometry of nam_vri_gpd is the VRI polygon here — confirm.
nam_vri_wind_speed_gpd = gpd.sjoin(weather_station_wind_speed_gpd, nam_vri_gpd, how='inner', predicate='within')
nam_vri_wind_speed_gpd = nam_vri_wind_speed_gpd.reset_index().drop(columns=['index', 'index_right'])
# Display the resulting column names (suffixes _left/_right come from the join).
nam_vri_wind_speed_gpd.columns

Index(['objectid', 'weatherstationcode', 'weatherstationname', 'scadartuid',
       'structureid_left', 'nwszone', 'district_left', 'thomasbrospagegrid',
       'constructionstatus', 'creationuser', 'datecreated', 'datemodified',
       'lastuser', 'structureguid', 'symbolrotation', 'latitude_left',
       'longitude_left', 'elevation', 'twinguid', 'hftd_left', 'zone1idc_left',
       'hftdidc_left', 'gdb_geomattr_data', 'globalid_left', 'shape_left',
       'shape_srid_left', 'snapshot_date_x', 'geometry', 'station_elevation_m',
       'station_nearest_coastline_geometry',
       'station_distance_from_coastline_km', 'vri', 'alert', 'max_gust',
       '99th', '95th', 'snapshot_date_y', 'date_left', 'wind_speed',
       'snapshot_date_left', 'latitude_right', 'longitude_right', 'date_right',
       'average_wind_speed', 'name', 'tessellate', 'extrude', 'visibility',
       'globalid_right', 'anemometer', 'anemometercode', 'circuit',
       'district_right', 'secdevice', 'structureid_ri

In [15]:
# Give the joined columns descriptive names.
joined_renames = {
    'date_left': 'wind_speed_date',
    'date_right': 'nam_date',
    'wind_speed': 'station_wind_speed',
    'average_wind_speed': 'NAM_wind_speed',
    'shape_right': 'polygon_shape',
}
nam_vri_wind_speed_gpd = nam_vri_wind_speed_gpd.rename(columns=joined_renames)

# Render NAM dates as mm/dd/YYYY text.
# NOTE(review): the filter below is a string comparison, so wind_speed_date
# presumably uses the same text format — confirm.
nam_dates = pd.to_datetime(nam_vri_wind_speed_gpd['nam_date'])
nam_vri_wind_speed_gpd['nam_date'] = nam_dates.dt.strftime('%m/%d/%Y')

# Keep only station/NAM pairs observed on the same date.
same_day = nam_vri_wind_speed_gpd['wind_speed_date'] == nam_vri_wind_speed_gpd['nam_date']
filtered_nam_vri_wind_speed_gpd = nam_vri_wind_speed_gpd[same_day].copy()

filtered_nam_vri_wind_speed_gpd['station_geometry'] = filtered_nam_vri_wind_speed_gpd['geometry']
polygon_wkt = filtered_nam_vri_wind_speed_gpd['polygon_shape']
filtered_nam_vri_wind_speed_gpd['polygon_geometry'] = polygon_wkt.apply(wkt.loads)


def _nam_to_station_km(row):
    """Haversine km between the row's station and its NAM point."""
    return haversine_distance(row['station_geometry'], row['NAM_geometry'])


filtered_nam_vri_wind_speed_gpd['nam_distance_from_station_km'] = filtered_nam_vri_wind_speed_gpd.apply(
    _nam_to_station_km, axis=1
)

filtered_nam_vri_wind_speed_gpd = filtered_nam_vri_wind_speed_gpd.reset_index(drop=True)
filtered_nam_vri_wind_speed_gpd.head()

Unnamed: 0,objectid,weatherstationcode,weatherstationname,scadartuid,structureid_left,nwszone,district_left,thomasbrospagegrid,constructionstatus,creationuser,...,polygon_shape,shape_srid_right,snapshot_date_right,NAM_geometry,nam_elevation_m,nam_nearest_coastline_geometry,nam_distance_from_coastline_km,station_geometry,polygon_geometry,nam_distance_from_station_km
0,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.328186 33.142284),0.0,POINT (-117.2377552 32.7596851),43.371754,POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",0.557226
1,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.31195 33.142193),4.0,POINT (-117.2377552 32.7596851),43.092663,POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",1.516308
2,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.328094 33.15589),50.0,POINT (-117.2377552 32.7596851),44.854994,POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",2.06387
3,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.31183 33.155807),28.0,POINT (-117.2377552 32.7596851),44.585652,POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",2.500861
4,1,CBD,Carlsbad,5158.0,P124785,Coastal-243,6.0,1126-G1,A,seu_gis_elec,...,"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",4326,2024-03-20,POINT (-117.31174 33.169407),96.0,POINT (-117.2377552 32.7596851),46.078882,POINT (-117.32717 33.13735),"MULTIPOLYGON Z (((-117.328519 33.134906 0, -11...",3.843664


In [17]:
# Display (rows, columns) of the same-day station/NAM table.
filtered_nam_vri_wind_speed_gpd.shape

(102832, 78)

In [64]:
# Work on a copy so the same-day table itself is left untouched.
filtered_nam_vri_wind_speed_gpd_copy = filtered_nam_vri_wind_speed_gpd.copy()

# Parse the date text once and derive both calendar features from it.
wind_speed_dates = pd.to_datetime(filtered_nam_vri_wind_speed_gpd_copy['wind_speed_date'])
filtered_nam_vri_wind_speed_gpd_copy['month'] = wind_speed_dates.dt.month
filtered_nam_vri_wind_speed_gpd_copy['day_of_year'] = wind_speed_dates.dt.dayofyear

grouped_columns = ["NAM_geometry"]

# Absolute disagreement between the NAM value and the station reading.
filtered_nam_vri_wind_speed_gpd_copy["abs_wind_speed_diff"] = (
    filtered_nam_vri_wind_speed_gpd_copy["NAM_wind_speed"] - filtered_nam_vri_wind_speed_gpd_copy["station_wind_speed"]
).abs()

# Per-NAM-point mean and standard deviation of that disagreement.
hyperparameters = (
    filtered_nam_vri_wind_speed_gpd_copy
    .groupby(grouped_columns, sort=False)["abs_wind_speed_diff"]
    .agg(mean_diff="mean", std_diff="std")
    .reset_index()
)

filtered_nam_vri_wind_speed_gpd_copy = filtered_nam_vri_wind_speed_gpd_copy.merge(
    hyperparameters, on=grouped_columns, how="left"
)

# std is NaN for single-row groups and 0 for groups whose differences are all
# equal; both fall back to 1 so the division below cannot yield NaN or inf.
# (Previously only NaN was replaced, so a zero std gave 0/0 = NaN targets.)
std_diff = filtered_nam_vri_wind_speed_gpd_copy['std_diff']
filtered_nam_vri_wind_speed_gpd_copy['std_diff'] = std_diff.where(std_diff > 0, 1)

# Logistic squashing of the standardised error: 0.5 at the group mean,
# approaching 1 for small errors and 0 for large ones.
standardised_error = (
    filtered_nam_vri_wind_speed_gpd_copy["abs_wind_speed_diff"]
    - filtered_nam_vri_wind_speed_gpd_copy["mean_diff"]
) / filtered_nam_vri_wind_speed_gpd_copy["std_diff"]
filtered_nam_vri_wind_speed_gpd_copy["sigmoid_error"] = 1 / (1 + np.exp(standardised_error))

In [72]:
# NOTE(review): these imports sit mid-notebook; the other imports are in the first cell.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # OneHotEncoder is not used in this cell
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

# Define features and target variable
# NOTE(review): the target `sigmoid_error` is computed from NAM_wind_speed,
# station_wind_speed, mean_diff and std_diff, all of which are also listed as
# features below. The near-perfect score printed by this cell may therefore
# reflect the model re-deriving that formula — confirm the intended feature set.
features = [
    "NAM_wind_speed", "station_wind_speed", "nam_elevation_m", "station_elevation_m",
    "nam_distance_from_coastline_km", "station_distance_from_coastline_km",
    "nam_distance_from_station_km", "max_gust", "95th", "99th",
    "mean_diff", "std_diff", "month", "day_of_year"
]

# Define preprocessing: every listed feature goes through one numeric pipeline
# (no categorical transformer is configured here)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),  # Impute missing values in numerical features
            ("scaler", StandardScaler())  # Standardize the numerical features
        ]), features)
    ]
)

# Define pipeline with preprocessing + model
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Extract features (X) and target variable (y)
X = filtered_nam_vri_wind_speed_gpd_copy[features]
y = filtered_nam_vri_wind_speed_gpd_copy["sigmoid_error"]

# Train/Test Split (80% Train, 20% Test)
# NOTE(review): rows are split at random, and mean_diff/std_diff were computed
# over all rows before the split — confirm this is acceptable for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model with pipeline
pipeline.fit(X_train, y_train)

# Evaluate model
y_pred = pipeline.predict(X_test)

# Compute evaluation metrics
# NOTE(review): `mae` is re-bound to a DataFrame by a later cell.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.3f}")
print(f"R² Score: {r2:.3f}")


Mean Absolute Error: 0.008
R² Score: 0.996


In [75]:
# Group the same-day rows by NAM point, keeping first-seen group order (sort=False).
grouped_columns = ['NAM_geometry']
filtered_nam_vri_wind_speed_gpd_grouped = filtered_nam_vri_wind_speed_gpd.groupby(grouped_columns, sort=False)

# Mean Absolute Error
def calculate_mae(group):
    """Return the mean of |NAM_wind_speed - station_wind_speed| over `group`."""
    residual = group['NAM_wind_speed'] - group['station_wind_speed']
    return residual.abs().mean()

# Mean Squared Error
def calculate_mse(group):
    """Return the mean of (NAM_wind_speed - station_wind_speed)^2 over `group`."""
    residual = group['NAM_wind_speed'] - group['station_wind_speed']
    return residual.pow(2).mean()

# Normalized Mean Absolute Error
def calculate_nmae(group):
    """Mean absolute error divided by the range of the station readings.

    Returns None when max == min for `station_wind_speed`. The input frame is
    left unmodified (the previous version wrote an 'abs_error' column into it).
    """
    station = group['station_wind_speed']
    mae = (group['NAM_wind_speed'] - station).abs().mean()
    actual_range = station.max() - station.min()
    return mae / actual_range if actual_range != 0 else None

# Normalized Mean Squared Error
def calculate_nmse(group):
    """Mean squared error divided by the range of the station readings.

    Returns None when max == min for `station_wind_speed`. The input frame is
    left unmodified (the previous version wrote a 'squared_error' column into it).
    """
    # NOTE(review): the divisor is the range, not the squared range, so the
    # result keeps wind-speed units — confirm that is intended.
    station = group['station_wind_speed']
    mse = ((group['NAM_wind_speed'] - station) ** 2).mean()
    actual_range = station.max() - station.min()
    return mse / actual_range if actual_range != 0 else None

# Sigmoid-Damped Distance Error
def calculate_sdwe(group):
    """Mean absolute error with each row weighted by a sigmoid of its distance.

    The weight is 1 / (1 + exp(-(distance - d0) / tau)), where `d0` and `tau`
    are module-level values that must be defined before this is called.
    Every row uses its own 'nam_distance_from_station_km'; the previous
    version applied the first row's distance to the whole group. Results are
    unchanged for groups whose distance is constant.
    """
    abs_error = (group['NAM_wind_speed'] - group['station_wind_speed']).abs()
    distance = group['nam_distance_from_station_km']
    sigmoid_weight = 1 / (1 + np.exp(-(distance - d0) / tau))
    return (abs_error * sigmoid_weight).mean()

# Distance-Weighted Absolute Error (DWAE)
def calculate_dwae(group):
    """Mean of |NAM - station| multiplied by each row's station distance (km).

    Every row uses its own 'nam_distance_from_station_km'; the previous
    version applied the first row's distance to the whole group. Results are
    unchanged for groups whose distance is constant.
    """
    abs_error = (group['NAM_wind_speed'] - group['station_wind_speed']).abs()
    return (abs_error * group['nam_distance_from_station_km']).mean()

# Sigmoid parameters read by calculate_sdwe: centre = median distance,
# scale = standard deviation of the distance.
station_distances = filtered_nam_vri_wind_speed_gpd['nam_distance_from_station_km']
d0 = station_distances.median()
tau = station_distances.std()


def _metric_per_nam_point(metric_function, metric_name):
    """Apply one metric to every NAM-point group and name the value column."""
    return (
        filtered_nam_vri_wind_speed_gpd_grouped
        .apply(metric_function, include_groups=False)
        .reset_index(name=metric_name)
    )


mae = _metric_per_nam_point(calculate_mae, 'MAE')
mse = _metric_per_nam_point(calculate_mse, 'MSE')
nmae = _metric_per_nam_point(calculate_nmae, 'NMAE')
nmse = _metric_per_nam_point(calculate_nmse, 'NMSE')
dwae = _metric_per_nam_point(calculate_dwae, 'DWAE')
sdwe = _metric_per_nam_point(calculate_sdwe, 'SDWE')

# Combine the per-metric tables into one row per NAM point.
errors = mae
for metric_frame in (mse, nmae, nmse, dwae, sdwe):
    errors = errors.merge(metric_frame, on=grouped_columns, how='inner')

# errors['distance_from_station_km'] = errors.apply(
#     lambda row: haversine_distance(row['station_geometry'], row['NAM_geometry']), axis=1
# )

errors.head()

Unnamed: 0,NAM_geometry,MAE,MSE,NMAE,NMSE,DWAE,SDWE
0,POINT (-117.328186 33.142284),5.870465,48.89751,0.234819,1.9559,3.271176,2.379555
1,POINT (-117.31195 33.142193),5.858967,48.963066,0.234359,1.958523,8.884001,2.600711
2,POINT (-117.328094 33.15589),5.876088,48.885729,0.235044,1.955429,12.127479,2.739599
3,POINT (-117.31183 33.155807),5.906324,49.278501,0.236253,1.97114,14.770896,2.859616
4,POINT (-117.31174 33.169407),5.970476,49.542936,0.238819,1.981717,22.948506,3.220017


In [76]:
# Summary statistics of every numeric error metric across NAM points.
errors.describe()

Unnamed: 0,MAE,MSE,NMAE,NMSE,DWAE,SDWE
count,962.0,962.0,960.0,960.0,962.0,962.0
mean,10.035529,146.326588,0.339684,4.647264,45.479693,5.321506
std,3.823477,125.567398,0.180907,3.745645,119.098809,3.097962
min,4.681099,31.963627,0.164428,1.397648,0.355772,1.813195
25%,7.392293,73.292177,0.263235,2.879561,13.556178,3.763452
50%,9.273113,114.282994,0.312561,3.913015,24.15591,4.841081
75%,11.763722,175.025654,0.383349,5.269281,43.096477,5.911219
max,37.29056,1490.096003,3.309344,53.494454,1754.130646,37.247743


In [None]:
# Number of distinct NAM points. `nunique` must be called; without the
# parentheses the cell only displays the bound method.
errors['NAM_geometry'].nunique()

In [None]:
# Plot one histogram per numeric error metric. Selecting by dtype keeps the
# loop independent of how many grouping columns precede the metrics; the
# previous positional slice `[4:]` skipped MAE, MSE and NMAE now that `errors`
# has a single grouping column.
columns = list(errors.select_dtypes(include="number").columns)

for i, col in enumerate(columns):
    # Alternate bar colours so consecutive figures are easy to tell apart.
    color = "blue" if i % 2 == 0 else "black"

    plot_histogram(
        data=errors,
        column=col,
        bins=100,
        title=f"Histogram of {col}",
        xlabel=f"{col} Values",
        color=color
    )

In [None]:
# Initialize the map centering at San Diego City
m = folium.Map(location=[32.7157, -117.1611], zoom_start=10, tiles="OpenStreetMap")

# NAM Coordinates
NAM_coordinates = folium.FeatureGroup(name='NAM_coordinates')

# Normalize the MAE values to ensure colors are mapped to a range
min_mae, max_mae = errors["MAE"].min(), errors["MAE"].max()

# Define colormap for yellow to red
colormap = branca.colormap.LinearColormap(['#FFFF00', '#FF0000'], vmin=min_mae, vmax=max_mae)

# Plot each point on the map with constant opacity and color based on MAE
for _, row in errors.iterrows():
    latitude, longitude = row["NAM_geometry"].y, row["NAM_geometry"].x
    
    # Color based on the MAE value using the colormap
    color = colormap(row["MAE"])
    
    folium.CircleMarker(
        location=(latitude, longitude),
        radius=3,
        color=color,
        fill=True,
        fill_color=color, 
        fill_opacity=0.9,  
        opacity=0.9,    
        tooltip=(f"MAE: {row['MAE']:.3f}<br>"
                 f"MSE: {row['MSE']:.3f}<br>"
                 f"NMAE: {row['NMAE']:.3f}<br>"
                 f"NMSE: {row['NMSE']:.3f}<br>"
                 f"DWAE: {row['DWAE']:.3f}<br>"
                 f"SDWE: {row['SDWE']:.3f}<br>"
                 f"Dist: {row['distance_from_station_km']:.3f}km<br>"
    )
    ).add_to(NAM_coordinates)

# Weather Station
weather_stations = folium.FeatureGroup(name='Weather Stations')

for idx, row in weather_station_summary_gpd.iterrows():
    folium.CircleMarker(
        location=(row["latitude"], row["longitude"]),
        radius=4,
        color="green",
        fill=True,
        fill_color="green",
        fill_opacity=1,
        opacity=1,
        tooltip=(f"Station: {row['weatherstationname']}<br>")
    ).add_to(weather_stations)

# VRI Snapshot
vri_snapshot = folium.FeatureGroup(name='VRI Snapshot')

# Load simplified GeoJSON with tooltip
vri_tooltip = folium.GeoJsonTooltip(
    fields=["name", "vri_risk", "shape_area"],
    aliases=["Name:", "VRI Risk:", "Shape Area:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
    max_width=800,
)

# Load VRI GeoJSON
vri_map = folium.GeoJson(
    src_vri_snapshot_gpd,
    style_function=lambda x: {
        "fillColor": "#0059b3",
        "color": "black",
        "weight": 0.3,
        "fillOpacity": 0.5
    },
    tooltip=vri_tooltip,
)
vri_map.add_to(vri_snapshot)

# Add feature groups to the map
vri_snapshot.add_to(m)
NAM_coordinates.add_to(m)
weather_stations.add_to(m)

# Add layer control to toggle feature groups
folium.LayerControl().add_to(m)

# Save the map
map_path = "san_diego_map_MAE.html"
m.save(map_path)

# Render the map in the notebook using IFrame
IFrame(map_path, width=700, height=500)


In [None]:
# Initialize the map centering at San Diego City
m = folium.Map(location=[32.7157, -117.1611], zoom_start=10, tiles="OpenStreetMap")

# NAM Coordinates
NAM_coordinates = folium.FeatureGroup(name='NAM_coordinates')

# Normalize the MAE values to ensure colors are mapped to a range
min_mae, max_mae = errors["SDWE"].min(), errors["SDWE"].max()

# Define colormap for yellow to red
colormap = branca.colormap.LinearColormap(['#FFFF00', '#FF0000'], vmin=min_mae, vmax=max_mae)

# Plot each point on the map with constant opacity and color based on MAE
for _, row in errors.iterrows():
    latitude, longitude = row["NAM_geometry"].y, row["NAM_geometry"].x
    
    # Color based on the MAE value using the colormap
    color = colormap(row["SDWE"])
    
    folium.CircleMarker(
        location=(latitude, longitude),
        radius=3,
        color=color,
        fill=True,
        fill_color=color, 
        fill_opacity=0.9,  
        opacity=0.9,    
        tooltip=(f"MAE: {row['MAE']:.3f}<br>"
                 f"MSE: {row['MSE']:.3f}<br>"
                 f"NMAE: {row['NMAE']:.3f}<br>"
                 f"NMSE: {row['NMSE']:.3f}<br>"
                 f"DWAE: {row['DWAE']:.3f}<br>"
                 f"SDWE: {row['SDWE']:.3f}<br>"
                 f"Dist: {row['distance_from_station_km']:.3f}km<br>"
    )
    ).add_to(NAM_coordinates)

# Weather Station
weather_stations = folium.FeatureGroup(name='Weather Stations')

for idx, row in weather_station_summary_gpd.iterrows():
    folium.CircleMarker(
        location=(row["latitude"], row["longitude"]),
        radius=4,
        color="green",
        fill=True,
        fill_color="green",
        fill_opacity=1,
        opacity=1,
        tooltip=(f"Station: {row['weatherstationname']}<br>")
    ).add_to(weather_stations)

# VRI Snapshot
vri_snapshot = folium.FeatureGroup(name='VRI Snapshot')

# Load simplified GeoJSON with tooltip
vri_tooltip = folium.GeoJsonTooltip(
    fields=["name", "vri_risk", "shape_area"],
    aliases=["Name:", "VRI Risk:", "Shape Area:"],
    localize=True,
    sticky=False,
    labels=True,
    style="""
        background-color: #F0EFEF;
        border: 2px solid black;
        border-radius: 3px;
        box-shadow: 3px;
    """,
    max_width=800,
)

# Load VRI GeoJSON
vri_map = folium.GeoJson(
    src_vri_snapshot_gpd,
    style_function=lambda x: {
        "fillColor": "#0059b3",
        "color": "black",
        "weight": 0.3,
        "fillOpacity": 0.5
    },
    tooltip=vri_tooltip,
)
vri_map.add_to(vri_snapshot)

# Add feature groups to the map
vri_snapshot.add_to(m)
NAM_coordinates.add_to(m)
weather_stations.add_to(m)

# Add layer control to toggle feature groups
folium.LayerControl().add_to(m)

# Save the map
map_path = "san_diego_map_MAE.html"
m.save(map_path)

# Render the map in the notebook using IFrame
IFrame(map_path, width=700, height=500)


In [None]:
# NOTE(review): this repeats an earlier cell's rename/filter, but names the
# distance column 'distance_from_station_km', which the cells below read.
joined_renames = {
    'date_left': 'wind_speed_date',
    'date_right': 'nam_date',
    'wind_speed': 'station_wind_speed',
    'average_wind_speed': 'NAM_wind_speed',
    'shape_right': 'polygon_shape',
}
nam_vri_wind_speed_gpd = nam_vri_wind_speed_gpd.rename(columns=joined_renames)

nam_dates = pd.to_datetime(nam_vri_wind_speed_gpd['nam_date'])
nam_vri_wind_speed_gpd['nam_date'] = nam_dates.dt.strftime('%m/%d/%Y')

# Keep only station/NAM pairs whose date strings match.
same_day = nam_vri_wind_speed_gpd['wind_speed_date'] == nam_vri_wind_speed_gpd['nam_date']
filtered_nam_vri_wind_speed_gpd = nam_vri_wind_speed_gpd[same_day].copy()
filtered_nam_vri_wind_speed_gpd['station_geometry'] = filtered_nam_vri_wind_speed_gpd['geometry']
polygon_wkt = filtered_nam_vri_wind_speed_gpd['polygon_shape']
filtered_nam_vri_wind_speed_gpd['polygon_geometry'] = polygon_wkt.apply(wkt.loads)


def _station_to_nam_km(row):
    """Haversine km between the row's station and its NAM point."""
    return haversine_distance(row['station_geometry'], row['NAM_geometry'])


filtered_nam_vri_wind_speed_gpd['distance_from_station_km'] = filtered_nam_vri_wind_speed_gpd.apply(
    _station_to_nam_km, axis=1
)

pd.set_option('display.max_columns', None)
filtered_nam_vri_wind_speed_gpd = filtered_nam_vri_wind_speed_gpd.reset_index(drop=True)
filtered_nam_vri_wind_speed_gpd.crs

In [None]:
# Preview the first rows of the re-filtered same-day table.
filtered_nam_vri_wind_speed_gpd.head()

In [None]:
# One row per distinct combination of NAM point, polygon and station
# (the daily readings repeat each combination many times).
nam_point_columns = ['NAM_geometry', 'polygon_geometry', 'weatherstationcode', 'station_geometry', 'geometry']
NAM_points = filtered_nam_vri_wind_speed_gpd[nam_point_columns].drop_duplicates()
NAM_points

In [None]:
# Spot check: distance between the NAM point and the station stored under index label 0.
current_point = NAM_points['NAM_geometry'].loc[0]
station_geometry = NAM_points['station_geometry'].loc[0]
haversine_distance(current_point, station_geometry)

In [None]:
# Calculate the nearest station excluding the station in the same row based on weatherstationcode
nearest_stations = []
nearest_station_codes = []
nearest_station_distances = []

# NAM_points repeats each station once per NAM point it is paired with, so the
# candidates are reduced to one row per (code, geometry) pair up front. This
# finds the same nearest station while avoiding repeated distance calls and
# nested iterrows().
unique_stations = NAM_points.drop_duplicates(
    subset=["weatherstationcode", "station_geometry"]
)[["weatherstationcode", "station_geometry"]]

for current_point, current_station_code in zip(NAM_points["NAM_geometry"], NAM_points["weatherstationcode"]):
    # Exclude the current row's station based on weatherstationcode
    other_stations = unique_stations[unique_stations["weatherstationcode"] != current_station_code]

    # Compute distances to all other stations and keep the first minimum
    min_distance = float("inf")
    nearest_station = None
    nearest_station_code = None

    for station_code, station_geometry in zip(
        other_stations["weatherstationcode"], other_stations["station_geometry"]
    ):
        distance = haversine_distance(current_point, station_geometry)
        if distance < min_distance:
            min_distance = distance
            nearest_station = station_geometry
            nearest_station_code = station_code

    # With no other station these stay None / None / inf.
    nearest_stations.append(nearest_station)
    nearest_station_codes.append(nearest_station_code)
    nearest_station_distances.append(min_distance)

# Add results to the DataFrame
NAM_points["nearest_station_geometry"] = nearest_stations
NAM_points["nearest_weather_station_code"] = nearest_station_codes
NAM_points["nearest_station_distance_km"] = nearest_station_distances

In [None]:
# NOTE(review): displays a variable left over from the last iteration of the loop above; looks like a debugging leftover.
other_stations

In [None]:
# Preview NAM_points with the nearest-station columns added.
NAM_points.head()

In [None]:
# Keep only the join key and the nearest-station result columns.
nearest_columns = ['NAM_geometry', 'nearest_weather_station_code', 'nearest_station_distance_km']
NAM_points = NAM_points[nearest_columns]
NAM_points.head()

In [None]:


# Attach each NAM point's nearest-other-station info to the same-day rows.
nam_key = 'NAM_geometry'
merged_nearest = filtered_nam_vri_wind_speed_gpd.merge(
    NAM_points, how='inner', left_on=nam_key, right_on=nam_key
)

merged_nearest.head()
                              
   

In [None]:
# One row per (date, station): that station's reading, geometry and polygon.
station_day_columns = ['weatherstationcode', 'wind_speed_date',
                       'station_wind_speed', 'station_geometry', 'polygon_geometry', 'name']
station_days = merged_nearest.drop_duplicates(subset=['wind_speed_date', 'weatherstationcode'])
weather_station_poly = station_days[station_day_columns]
weather_station_poly.head()

In [None]:
pd.set_option('display.max_columns', None)

# Join every row to the same-day reading of its nearest other station.
# Columns present on both sides get the pandas default _x (own station) and
# _y (nearest station) suffixes.
left_keys = ['nearest_weather_station_code', 'wind_speed_date']
right_keys = ['weatherstationcode', 'wind_speed_date']
merged_nearest_windspeed = merged_nearest.merge(
    weather_station_poly, how='inner', left_on=left_keys, right_on=right_keys
)

merged_nearest_windspeed.columns

In [None]:
# Preview the table that pairs each row with its nearest other station's reading.
merged_nearest_windspeed.head()

In [None]:
# Mean Absolute Error
def calculate_mae(group):
    """Return the mean of |NAM_wind_speed - station_wind_speed_x| over `group`.

    NOTE(review): re-defines a function of the same name from an earlier cell.
    """
    residual = group['NAM_wind_speed'] - group['station_wind_speed_x']
    return residual.abs().mean()

# Mean Squared Error
def calculate_mse(group):
    """Return the mean of (NAM_wind_speed - station_wind_speed_x)^2 over `group`.

    NOTE(review): re-defines a function of the same name from an earlier cell.
    """
    residual = group['NAM_wind_speed'] - group['station_wind_speed_x']
    return residual.pow(2).mean()

# Normalized Mean Absolute Error
def calculate_nmae(group):
    """Mean absolute error divided by the range of `station_wind_speed_x`.

    Returns None when max == min for `station_wind_speed_x`. The input frame
    is left unmodified (the previous version wrote an 'abs_error' column into it).
    """
    station = group['station_wind_speed_x']
    mae = (group['NAM_wind_speed'] - station).abs().mean()
    actual_range = station.max() - station.min()
    return mae / actual_range if actual_range != 0 else None

# Normalized Mean Squared Error
def calculate_nmse(group):
    """Mean squared error divided by the range of `station_wind_speed_x`.

    Returns None when max == min for `station_wind_speed_x`. The input frame
    is left unmodified (the previous version wrote a 'squared_error' column into it).
    """
    station = group['station_wind_speed_x']
    mse = ((group['NAM_wind_speed'] - station) ** 2).mean()
    actual_range = station.max() - station.min()
    return mse / actual_range if actual_range != 0 else None

# Sigmoid-Damped Distance Error
def calculate_sdwe(group):
    """Mean absolute error with each row weighted by a sigmoid of its distance.

    The weight is 1 / (1 + exp(-(distance - d0) / tau)), where `d0` and `tau`
    are module-level values that must be defined before this is called.
    Every row uses its own 'distance_from_station_km'; the previous version
    applied the first row's distance to the whole group. Results are
    unchanged for groups whose distance is constant.
    """
    abs_error = (group['NAM_wind_speed'] - group['station_wind_speed_x']).abs()
    distance = group['distance_from_station_km']
    sigmoid_weight = 1 / (1 + np.exp(-(distance - d0) / tau))
    return (abs_error * sigmoid_weight).mean()

# Distance-Weighted Absolute Error (DWAE)
def calculate_dwae(group):
    """Return the mean absolute error multiplied by the station distance.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'NAM_wind_speed', 'station_wind_speed_x' and
        'distance_from_station_km'.

    Returns
    -------
    float
        Mean of |NAM_wind_speed - station_wind_speed_x| * distance, where
        ``distance`` is the first 'distance_from_station_km' value in ``group``.
    """
    residual_magnitude = (group['NAM_wind_speed'] - group['station_wind_speed_x']).abs()
    # Only the first row's distance is used for the whole group.
    first_distance = group['distance_from_station_km'].iloc[0]
    weighted = residual_magnitude * first_distance
    return weighted.mean()


In [None]:
# Group key: the NAM point plus the '_x' (left-side) station columns of the merge.
grouped_columns = [
    'NAM_geometry',
    'station_geometry_x',
    'polygon_geometry_x',
    'name_x',
]
# sort=False keeps the groups in order of first appearance.
merged_nearest_windspeed_group = merged_nearest_windspeed.groupby(by=grouped_columns, sort=False)
# GroupBy.head() returns the first five rows of every group, not of the frame.
merged_nearest_windspeed_group.head()

In [None]:
# Globals read by calculate_sdwe: the sigmoid is centred on the median distance
# and scaled by the standard deviation of the distances.
d0 = merged_nearest_windspeed['distance_from_station_km'].median()
tau = merged_nearest_windspeed['distance_from_station_km'].std()


# One frame per metric; each has the grouping columns plus a single metric column.
mae = merged_nearest_windspeed_group.apply(calculate_mae, include_groups=False).reset_index(name='MAE')
mse = merged_nearest_windspeed_group.apply(calculate_mse, include_groups=False).reset_index(name='MSE')
nmae = merged_nearest_windspeed_group.apply(calculate_nmae, include_groups=False).reset_index(name='NMAE')
nmse = merged_nearest_windspeed_group.apply(calculate_nmse, include_groups=False).reset_index(name='NMSE')
dwae = merged_nearest_windspeed_group.apply(calculate_dwae, include_groups=False).reset_index(name='DWAE')
sdwe = merged_nearest_windspeed_group.apply(calculate_sdwe, include_groups=False).reset_index(name='SDWE')


# Combine the per-metric frames into one row per group.
errors_2 = (
    mae
    .merge(mse, on=grouped_columns, how='inner')
    .merge(nmae, on=grouped_columns, how='inner')
    .merge(nmse, on=grouped_columns, how='inner')
    .merge(dwae, on=grouped_columns, how='inner')
    .merge(sdwe, on=grouped_columns, how='inner')
)

# Derive the distance from errors_2's own geometry columns. Applying the lambda
# to a different frame (`errors`) would align the results by index position
# only and make this cell depend on state left behind by another cell.
errors_2['distance_from_station_km'] = errors_2.apply(
    lambda row: haversine_distance(row['station_geometry_x'], row['NAM_geometry']), axis=1
)

errors_2 = errors_2.drop_duplicates()
errors_2

In [None]:
# Summary statistics of errors_2 using describe()'s default column selection.
errors_2.describe()

In [None]:
# Mean Absolute Error
def calculate_mae(group):
    """Return the mean absolute error between NAM and station wind speed.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain the columns 'NAM_wind_speed' and 'station_wind_speed_y'.

    Returns
    -------
    float
        Mean of |NAM_wind_speed - station_wind_speed_y| over the rows of ``group``.
    """
    residual = group['NAM_wind_speed'] - group['station_wind_speed_y']
    return residual.abs().mean()

# Mean Squared Error
def calculate_mse(group):
    """Return the mean squared error between NAM and station wind speed.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain the columns 'NAM_wind_speed' and 'station_wind_speed_y'.

    Returns
    -------
    float
        Mean of (NAM_wind_speed - station_wind_speed_y)**2 over the rows of ``group``.
    """
    residual = group['NAM_wind_speed'] - group['station_wind_speed_y']
    return residual.pow(2).mean()

# Normalized Mean Absolute Error
def calculate_nmae(group):
    """Return the mean absolute error divided by the observed station range.

    The error is computed on local Series only; ``group`` is not modified, so
    no helper column is written into the caller's frame.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain the columns 'NAM_wind_speed' and 'station_wind_speed_y'.

    Returns
    -------
    float or None
        MAE / (max - min of 'station_wind_speed_y'), or None when that range
        is exactly zero (the ratio would be undefined).
    """
    observed = group['station_wind_speed_y']
    mae = (group['NAM_wind_speed'] - observed).abs().mean()
    actual_range = observed.max() - observed.min()
    return mae / actual_range if actual_range != 0 else None

# Normalized Mean Squared Error
def calculate_nmse(group):
    """Return the mean squared error divided by the observed station range.

    The error is computed on local Series only; ``group`` is not modified, so
    no helper column is written into the caller's frame.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain the columns 'NAM_wind_speed' and 'station_wind_speed_y'.

    Returns
    -------
    float or None
        MSE / (max - min of 'station_wind_speed_y'), or None when that range
        is exactly zero (the ratio would be undefined).
    """
    observed = group['station_wind_speed_y']
    mse = (group['NAM_wind_speed'] - observed).pow(2).mean()
    # NOTE(review): the squared error is normalized by the range, not the
    # squared range, so the result is not unit-free — confirm this is intended.
    actual_range = observed.max() - observed.min()
    return mse / actual_range if actual_range != 0 else None

# Sigmoid-Damped Distance Error
def calculate_sdwe(group):
    """Return the mean absolute error scaled by a sigmoid of station distance.

    The weight is 1 / (1 + exp(-(distance - d0) / tau)), where ``distance`` is
    the first 'distance_from_station_km' value in ``group`` and ``d0`` / ``tau``
    are module-level globals that must be defined before this is called.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'NAM_wind_speed', 'station_wind_speed_y' and
        'distance_from_station_km'.

    Returns
    -------
    float
        Mean of the sigmoid-weighted absolute errors.
    """
    residual_magnitude = (group['NAM_wind_speed'] - group['station_wind_speed_y']).abs()
    # Only the first row's distance is used for the whole group.
    first_distance = group['distance_from_station_km'].iloc[0]
    scaled_offset = (first_distance - d0) / tau
    weight = 1 / (1 + np.exp(-scaled_offset))
    return (residual_magnitude * weight).mean()

# Distance-Weighted Absolute Error (DWAE)
def calculate_dwae(group):
    """Return the mean absolute error multiplied by the station distance.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'NAM_wind_speed', 'station_wind_speed_y' and
        'distance_from_station_km'.

    Returns
    -------
    float
        Mean of |NAM_wind_speed - station_wind_speed_y| * distance, where
        ``distance`` is the first 'distance_from_station_km' value in ``group``.
    """
    residual_magnitude = (group['NAM_wind_speed'] - group['station_wind_speed_y']).abs()
    # Only the first row's distance is used for the whole group.
    first_distance = group['distance_from_station_km'].iloc[0]
    weighted = residual_magnitude * first_distance
    return weighted.mean()


In [None]:
# Group key: the NAM point plus the '_y' (right-side) station columns of the merge.
grouped_columns = [
    'NAM_geometry',
    'station_geometry_y',
    'polygon_geometry_y',
    'name_y',
]
# sort=False keeps the groups in order of first appearance.
merged_nearest_windspeed_group = merged_nearest_windspeed.groupby(by=grouped_columns, sort=False)
# GroupBy.head() returns the first five rows of every group, not of the frame.
merged_nearest_windspeed_group.head()

In [None]:
# Globals read by calculate_sdwe: the sigmoid is centred on the median distance
# and scaled by the standard deviation of the distances.
d0 = merged_nearest_windspeed['distance_from_station_km'].median()
tau = merged_nearest_windspeed['distance_from_station_km'].std()


# One frame per metric; each has the grouping columns plus a single metric column.
mae = merged_nearest_windspeed_group.apply(calculate_mae, include_groups=False).reset_index(name='MAE')
mse = merged_nearest_windspeed_group.apply(calculate_mse, include_groups=False).reset_index(name='MSE')
nmae = merged_nearest_windspeed_group.apply(calculate_nmae, include_groups=False).reset_index(name='NMAE')
nmse = merged_nearest_windspeed_group.apply(calculate_nmse, include_groups=False).reset_index(name='NMSE')
dwae = merged_nearest_windspeed_group.apply(calculate_dwae, include_groups=False).reset_index(name='DWAE')
sdwe = merged_nearest_windspeed_group.apply(calculate_sdwe, include_groups=False).reset_index(name='SDWE')


# Combine the per-metric frames into one row per group.
errors_3 = (
    mae
    .merge(mse, on=grouped_columns, how='inner')
    .merge(nmae, on=grouped_columns, how='inner')
    .merge(nmse, on=grouped_columns, how='inner')
    .merge(dwae, on=grouped_columns, how='inner')
    .merge(sdwe, on=grouped_columns, how='inner')
)

# Derive the distance from errors_3's own geometry columns. Applying the lambda
# to a different frame (`errors`) would align the results by index position
# only and make this cell depend on state left behind by another cell.
errors_3['distance_from_station_km'] = errors_3.apply(
    lambda row: haversine_distance(row['station_geometry_y'], row['NAM_geometry']), axis=1
)

errors_3 = errors_3.drop_duplicates()
errors_3

In [None]:
# Count the distinct NAM_geometry values; nunique is a method, so it must be
# called — without the parentheses the cell only displays the bound method.
errors_3['NAM_geometry'].nunique()

In [None]:
import geopandas as gpd

# Wrap both error tables as GeoDataFrames, using each table's NAM_geometry
# values as the geometry and declaring the CRS as EPSG:4326.
errors_2_gdf = gpd.GeoDataFrame(errors_2, geometry=errors_2['NAM_geometry'], crs="EPSG:4326")
errors_3_gdf = gpd.GeoDataFrame(errors_3, geometry=errors_3['NAM_geometry'], crs="EPSG:4326")

# Pair rows whose NAM_geometry values match exactly; columns present in both
# tables are suffixed '_2' / '_3'. The index is renumbered from zero.
merged_error_gdf = errors_2_gdf.merge(
    errors_3_gdf,
    how='inner',
    on='NAM_geometry',
    suffixes=('_2', '_3'),
).reset_index(drop=True)

# Show the combined table
merged_error_gdf
