# NAM-VRI Data Analysis

In [None]:
import sys
import os
import json
import numpy as np
import pandas as pd
import geopandas as gpd
from IPython.display import IFrame

# Change dir to root dir
# Guarded so that re-running this cell is idempotent: the project root is the
# directory that contains the `src` package imported below. Without the guard,
# every re-run would climb one directory higher and break the imports and the
# relative data paths used later in the notebook.
if os.path.isdir("src"):
    parent_dir = os.getcwd()
else:
    parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
    os.chdir(parent_dir)
print("Current Working Directory:", os.getcwd())

from src.data import generate_df, convert_to_gdf, generate_gdf, preprocess_df
from src.plot import plot_data, plot_kde, plot_correlation_matrix, plot_map
from src.model import light_gbm, predict_light_gbm_model, cluster_kmeans_nam_data
from src.data import generate_df, convert_to_gdf, generate_gdf, preprocess_df, preprocess_gdf, filter_nam_outside_vri, get_nam_outside_vri_nearest_station
from src.scripts.generateNamCSV import generate_nam_csv
from src.scripts.generateElevationCSV import generate_elevation_csv
from src.analysis import custom_groupby, find_outliers_iqr, create_polygon_gdf, create_polygons_from_geometries

In [None]:
# Load the file-name configuration, then build one list of paths per dataset group
with open('config/data_params.json') as fh:
    data_params = json.load(fh)

# Raw inputs and the span data both live under ./data/raw
raw_data_path = [
    os.path.join('./data/raw', file_name)
    for file_name in data_params["raw_data"]
]
dev_wings_agg_span_path = [
    os.path.join('./data/raw', file_name)
    for file_name in data_params["data_analysis"]
]

# Derived data and model predictions both live under ./data/modified
modified_data_path = [
    os.path.join('./data/modified', file_name)
    for file_name in data_params["modified_data"]
]
output_model_path = [
    os.path.join('./data/modified', file_name)
    for file_name in data_params["model_prediction"]
]

In [None]:
# Read the raw data and preprocess the station / wind-speed frames
(gis_weather_station,
 src_vri_snapshot,
 nam,
 windspeed_snapshot) = generate_df(raw_data_path)
gis_weather_station, windspeed_snapshot = preprocess_df(gis_weather_station, windspeed_snapshot)
dev_wings_agg_span = generate_df(dev_wings_agg_span_path)[0]

# Read the filtered data with elevation from API and build the GeoDataFrames
gis_weather_station_with_elevation, nam_with_elevation = generate_df(modified_data_path)
(gis_weather_station_with_elevation_gpd,
 src_vri_snapshot_gpd,
 nam_with_elevation_gpd) = generate_gdf(
    gis_weather_station_with_elevation,
    src_vri_snapshot,
    nam_with_elevation,
)

# Read the LightGBM model output (predictions within and outside the VRI polygons)
nam_within_vri_prediction, nam_outside_vri_prediction = generate_df(output_model_path)
nam_within_vri_prediction_gpd = convert_to_gdf(nam_within_vri_prediction, col='nam_geometry')
nam_outside_vri_prediction_gpd = convert_to_gdf(nam_outside_vri_prediction)

# Convert dev_wings_agg_span to a GeoDataFrame (source CRS EPSG:2230), then
# reproject it to the CRS of the NAM prediction data
dev_wings_agg_span_gpd = convert_to_gdf(dev_wings_agg_span, col='shape', espg='EPSG:2230')
dev_wings_agg_span_gpd = dev_wings_agg_span_gpd.to_crs(nam_within_vri_prediction_gpd.crs)

## NAM Points Within VRI Polygon

In [None]:
# Every listed column is aggregated with its mean
agg_dict_within = dict.fromkeys(
    [
        'abs_wind_speed_error',
        'nam_distance_from_station_km',
        'station_elevation_m',
        'nam_elevation_m',
    ],
    'mean',
)

# Group the data by NAM point (geometry)
nam_mae_within = custom_groupby(nam_within_vri_prediction_gpd, ['geometry'], agg_dict_within)

# Derived columns: error scaled by distance to the station, and the absolute
# station-vs-NAM elevation gap
nam_mae_within['distance_weight_error'] = (
    nam_mae_within['abs_wind_speed_error'] * nam_mae_within['nam_distance_from_station_km']
)
nam_mae_within['abs_elevation_difference_m'] = (
    nam_mae_within['station_elevation_m'] - nam_mae_within['nam_elevation_m']
).abs()
nam_mae_within.head()

### Error Distribution & Correlation

In [None]:
# Distribution of the per-point MAE (no plot_type given, so plot_data's default is used)
plot_data(
    nam_mae_within,
    x='abs_wind_speed_error',
    title='Wind Speed Absolute Error Distribution',
    xlabel='Mean Absolute Error',
)

This histogram visualizes the distribution of the Mean Absolute Error (MAE) of each NAM point. The distribution is right-skewed and centered around 10, which suggests that some points have significantly higher errors, pulling the tail of the distribution to the right.

In [None]:
# Scatter of per-point MAE against distance from the station
plot_data(
    nam_mae_within,
    x='nam_distance_from_station_km',
    y='abs_wind_speed_error',
    plot_type='scatter',
    title='Scatter Plot: MAE vs Distance from Station',
    xlabel='NAM Points Distance from Station (km)',
    ylabel='Wind Speed MAE',
)

In [None]:
# Pearson correlation between the MAE and the distance / elevation features
plot_correlation_matrix(
    nam_mae_within[
        [
            'abs_wind_speed_error',
            'nam_distance_from_station_km',
            'station_elevation_m',
            'nam_elevation_m',
            'abs_elevation_difference_m',
        ]
    ],
    method="pearson",
    title="Correlation Matrix",
    cmap="coolwarm",
    annot=True,
)

We aim to explore the relationship between various factors—such as the distance of NAM points from their respective weather stations and elevation—with the Mean Absolute Error (MAE) of NAM predictions. Based on the correlation matrix, the correlation coefficient between the distance from the station and MAE is 0.31, indicating a weak positive correlation. Elevation appears to have a stronger impact on MAE, with a correlation of 0.59 for station elevation, 0.43 for NAM elevation, and 0.35 for the absolute elevation difference between NAM points and stations. These findings suggest that elevation differences may play a more significant role in influencing MAE than spatial distance alone.

### NAM Points Error Analysis

In [None]:
# Outlier bound of the wind speed error on the ungrouped NAM data
# (index 1 of the find_outliers_iqr result is the value reported here)
abs_wind_speed_error_outlier = find_outliers_iqr(
    nam_within_vri_prediction_gpd, 'abs_wind_speed_error'
)
print(f"Outlier of Ungrouped NAM data         : {abs_wind_speed_error_outlier[1]:.3f}")

# Same bound, computed on the data grouped by NAM point
MAE_outlier = find_outliers_iqr(nam_mae_within, 'abs_wind_speed_error')
print(f"NAM MAE Outlier                       : {MAE_outlier[1]:.3f}")

# Mean wind speed error over the ungrouped NAM / weather-station records
average_wind_speed_error = nam_within_vri_prediction_gpd['abs_wind_speed_error'].mean()
print(f"Average Ungrouped NAM Wind Speed Error: {average_wind_speed_error:.3f}")

# Mean of the per-point Mean Absolute Error
average_MAE = nam_mae_within['abs_wind_speed_error'].mean()
print(f"Average MAE                           : {average_MAE:.3f}")

The upper bound of the wind speed error for the ungrouped NAM data is 25.302, while the upper bound of the Mean Absolute Error (MAE) grouped by NAM point is 18.321. We will use the latter threshold, as it removes the temporal dependencies that may exist in the ungrouped dataset.

The average wind speed error of ungrouped NAM data is 9.937 while the average of the MAE of NAM points is 10.036.

In [None]:
# Render the MAE map for all NAM points within the VRI polygons, then embed it
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_mae_within,
    'abs_wind_speed_error',
    "nam_within_vri.html",
)
IFrame("../plots/nam_within_vri.html", width=700, height=500)

This map visualizes the Mean Absolute Error of NAM points within the VRI polygons. The color gradient ranges from yellow (indicating low error) to red (indicating high error).

In [None]:
# Keep only the NAM points whose MAE exceeds the grouped outlier threshold
nam_mae_within_outlier = nam_mae_within.loc[
    nam_mae_within['abs_wind_speed_error'] > MAE_outlier[1]
]
print(f"Number of NAM Points: {len(nam_mae_within_outlier)}")
print(f"Outlier NAM Points Mean MAE: {nam_mae_within_outlier['abs_wind_speed_error'].mean():.3f}")
print(f"Outlier NAM Points Highest MAE: {nam_mae_within_outlier['abs_wind_speed_error'].max():.3f}")
nam_mae_within_outlier.head()

In [None]:
# Render the map restricted to the outlier NAM points, then embed it
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_mae_within_outlier,
    'abs_wind_speed_error',
    "nam_within_vri_outlier.html",
)
IFrame("../plots/nam_within_vri_outlier.html", width=700, height=500)

This map highlights NAM points with outlier Mean Absolute Errors (MAE) in wind speed predictions. The outlier threshold for wind speed MAE is 18.321, and 31 NAM points exceed this value. The mean MAE for these outliers is 23.353, which is approximately 13 units higher than the overall mean MAE for all NAM points. The highest MAE is 37.291.

In [None]:
# The 20 NAM points with the highest MAE
nam_mae_within_top_20 = (
    nam_mae_within
    .sort_values(by='abs_wind_speed_error', ascending=False)
    .head(20)
)
nam_mae_within_top_20.head()

In [None]:
# Render the map of the 20 highest-MAE NAM points, then embed it
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_mae_within_top_20,
    'abs_wind_speed_error',
    "nam_within_vri_top_20.html",
)
IFrame("../plots/nam_within_vri_top_20.html", width=700, height=500)

This map highlights the NAM points with the 20 highest Mean Absolute Errors (MAE) in wind speed predictions.

In [None]:
# The 20 NAM points with the highest distance-weighted error
nam_mae_within_top_20_dwe = (
    nam_mae_within
    .sort_values(by='distance_weight_error', ascending=False)
    .head(20)
)
nam_mae_within_top_20_dwe.head()

In [None]:
# Render the map of the 20 highest distance-weighted errors, then embed it
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_mae_within_top_20_dwe,
    'distance_weight_error',
    "nam_within_vri_top_20_distance_weighted.html",
)
IFrame("../plots/nam_within_vri_top_20_distance_weighted.html", width=700, height=500)

This map highlights the NAM points with the 20 highest Distance-Weighted Mean Absolute Errors (MAE) in wind speed predictions.

In [None]:
# Count the top-20 MAE points whose geometry also appears in the top-20
# distance-weighted list
overlapping_top_20 = len(
    nam_mae_within_top_20.loc[
        nam_mae_within_top_20['geometry'].isin(nam_mae_within_top_20_dwe['geometry'])
    ]
)
overlapping_top_20

We compare the top 20 NAM points with the highest Mean Absolute Error (MAE) to the top 20 NAM points with the highest distance-weighted NAM error. The analysis reveals that 12 points appear in both lists, indicating a notable overlap. This suggests that while raw MAE and distance-weighted error highlight different aspects of prediction accuracy, a significant portion of the highest-error NAM points remain consistent across both metrics.

### VRI Polygon MAE Analysis

In [None]:
# Both columns are aggregated with their mean
agg_dict_within_polygon = dict.fromkeys(['abs_wind_speed_error', 'shape_area'], 'mean')

# Group the data by VRI polygon name
vri_mae_within = custom_groupby(
    nam_within_vri_prediction_gpd,
    ['name'],
    agg_dict_within_polygon,
)
vri_mae_within.head()

In [None]:
# Scatter of per-polygon MAE against polygon area
plot_data(
    vri_mae_within,
    x='shape_area',
    y='abs_wind_speed_error',
    plot_type='scatter',
    title='Scatter Plot: MAE vs Polygon Size',
    xlabel='Polygon Size',
    ylabel='Wind Speed MAE',
)

In [None]:
# Pearson correlation between per-polygon MAE and polygon area (all polygons)
plot_correlation_matrix(
    vri_mae_within[['abs_wind_speed_error', 'shape_area']],
    method="pearson",
    title="Correlation Matrix",
    cmap="coolwarm",
    annot=True,
)

We also wanted to explore whether the size of the VRI polygon has any impact on the Mean Absolute Error (MAE). Initially, we hypothesized that larger polygons would result in higher MAE values. However, the correlation matrix shows a correlation coefficient of -0.15, a weak negative correlation between polygon size and MAE. This suggests that, contrary to our initial assumption, larger polygons do not necessarily correspond to higher errors. In fact, the weak negative correlation indicates that as the size of the polygon increases, the MAE tends to decrease slightly. This might be skewed by the thin, elongated polygons, which have a relatively small area but a larger error.

In [None]:
# The 20 VRI polygons with the highest MAE
vri_top_20_error = (
    vri_mae_within
    .sort_values(by='abs_wind_speed_error', ascending=False)
    .head(20)
)
vri_top_20_error.head()

In [None]:
# Pearson correlation between MAE and polygon area, restricted to the 20
# highest-error polygons
plot_correlation_matrix(
    vri_top_20_error[['abs_wind_speed_error', 'shape_area']],
    method="pearson",
    title="Correlation Matrix",
    cmap="coolwarm",
    annot=True,
)

However, when we compute the correlation matrix using only the top 20 polygons with the highest error, the correlation between polygon size and MAE is 0.24, a weak positive correlation.

In [None]:
# VRI polygon geometries for the 20 highest-error polygons
src_vri_top_20_error = src_vri_snapshot_gpd.loc[
    src_vri_snapshot_gpd['name'].isin(vri_top_20_error['name'])
]

# NAM records that fall in those polygons, re-aggregated per NAM point
nam_within_vri_prediction_gpd_top_20_error = nam_within_vri_prediction_gpd.loc[
    nam_within_vri_prediction_gpd['name'].isin(vri_top_20_error['name'])
]
nam_mae_within_top_error = custom_groupby(
    nam_within_vri_prediction_gpd_top_20_error,
    ['geometry'],
    agg_dict_within_polygon,
)
nam_mae_within_top_error.head()

In [None]:
# Render the map of the NAM points inside the 20 highest-error polygons, then embed it
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_top_20_error,
    nam_mae_within_top_error,
    'abs_wind_speed_error',
    "nam_within_vri_top_polygon_error.html",
)
IFrame("../plots/nam_within_vri_top_polygon_error.html", width=700, height=500)

This map visualizes the top 20 VRI polygons with the highest Average Mean Absolute Error (MAE). From the visualization, we can observe that both thin, elongated polygons and smaller polygons tend to exhibit higher errors. 

## NAM Points Outside VRI Polygon

In [None]:
# Inspect the columns available in the outside-VRI prediction data
nam_outside_vri_prediction_gpd.columns

In [None]:
# Every listed column is aggregated with its mean
agg_dict_outside = dict.fromkeys(
    [
        'abs_wind_speed_error_pred',
        'nam_distance_from_station_km',
        'nam_elevation_m',
        'station_elevation_m',
        'nam_wind_speed',
    ],
    'mean',
)

# Group the outside-VRI predictions by NAM point (geometry)
nam_mae_outside = custom_groupby(
    nam_outside_vri_prediction_gpd,
    ['geometry'],
    agg_dict_outside,
)
nam_mae_outside.head()

### NAM Points Error Analysis

In [None]:
# Pearson correlation between the predicted error and the distance / elevation features
plot_correlation_matrix(
    nam_mae_outside[
        [
            'abs_wind_speed_error_pred',
            'nam_distance_from_station_km',
            'station_elevation_m',
            'nam_elevation_m',
        ]
    ],
    method="pearson",
    title="Correlation Matrix",
    cmap="coolwarm",
    annot=True,
)

Based on the correlation matrix, the predicted data error shows a weak correlation of 0.13 with the distance to the nearest station. In contrast, it has a strong correlation of 0.90 with station elevation and 0.55 with NAM elevation. These findings highlight the importance of elevation in NAM prediction errors, potentially outweighing the impact of spatial distance alone.

In [None]:
# Outlier bound of the predicted error on the ungrouped outside-VRI data
abs_wind_speed_pred_error_outlier = find_outliers_iqr(
    nam_outside_vri_prediction_gpd, 'abs_wind_speed_error_pred'
)
print(f"Wind Speed Error Outlier Threshold: {abs_wind_speed_pred_error_outlier[1]:.3f}")

# Same bound on the data grouped by NAM point. Note that this variable holds
# the find_outliers_iqr result, not a frame of outlier points.
nam_mae_outside_outlier = find_outliers_iqr(
    nam_mae_outside, 'abs_wind_speed_error_pred'
)
print(f"MAE Outside VRI Polygon Outlier: {nam_mae_outside_outlier[1]:.3f}")

The outlier threshold for the raw temporal data for NAM points outside the VRI polygon is 25.372, while the outlier threshold for the grouped NAM data is 20.096. Given that we are focusing on grouped NAM data, we will use the latter value of 20.096 as the outlier threshold.

In [None]:
# Render the predicted-MAE map for NAM points outside the VRI polygons, then embed it
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_mae_outside,
    'abs_wind_speed_error_pred',
    "nam_outside_vri_error.html",
)
IFrame("../plots/nam_outside_vri_error.html", width=700, height=500)

This map shows the predicted MAE of NAM points located outside the VRI polygon. We observe that several areas exhibit high predicted MAE, including the region around Sentenac Mountain, Rancho Vallecito Airstrip, and Sawtooth Mountains Wilderness. Another zone with elevated errors is the Otay Mountain Wilderness, along with Cleveland National Forest and the Marine Corps Base Camp Pendleton.

In [None]:
# Independent copy of the outside-VRI points whose predicted MAE exceeds the
# grouped outlier threshold
nam_mae_outside_outlier_points = nam_mae_outside.loc[
    nam_mae_outside['abs_wind_speed_error_pred'] > nam_mae_outside_outlier[1]
].copy()

# Render the map of those outlier points, then embed it
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_mae_outside_outlier_points,
    'abs_wind_speed_error_pred',
    "nam_outside_vri_error_outlier.html",
)
IFrame("../plots/nam_outside_vri_error_outlier.html", width=700, height=500)

In [None]:
# Manual Geometry Coordinates
otay_mountains_coordinates = [(-116.830960, 32.648796), (-116.879944, 32.608430), (-116.88013, 32.59483), (-116.848190, 32.567333), (-116.815000, 32.635033), 
                              (-116.79977, 32.56687), (-116.79959, 32.58048)]

cleveland_national_forest = [(-116.71349, 32.94698), (-116.71368, 32.93337), (-116.69827, 32.87879), (-116.68208, 32.87861), (-116.66505, 32.93285), (-116.66484, 32.94646)]

CA_78 = [(-116.54796, 33.14928), (-116.5484, 33.12206), (-116.53285, 33.08105), (-116.4364, 33.02544), (-116.40396, 33.02503), (-116.41794, 33.14768)]

sawtooth_mountains = [(-116.4691, 33.01224), (-116.46933, 32.99863), (-116.45384, 32.95762), (-116.34216, 32.86091), (-116.19922, 32.72284), (-116.1785, 32.94024), 
                      (-116.33957, 32.99696), (-116.4042, 33.01142),]

monkey_hill = [(-116.51079, 33.42104), (-116.47937, 33.3526),  (-116.46213, 33.40684),  (-116.46188, 33.42044)]

In [None]:
# Define a tuple of coordinates with associated names
geometries = (
    (otay_mountains_coordinates, "Otay Mountains"),
    (cleveland_national_forest, "Cleveland National Forest"),
    (CA_78, "CA 78"),
    (sawtooth_mountains, "Sawtooth Mountains"),
    (monkey_hill, "Monkey Hill")
)

outlier_boundary_gdf = create_polygons_from_geometries(geometries)
outlier_boundary_gdf

In [None]:
# Join every boundary polygon with the spans it intersects
spans_within_boundary = gpd.sjoin(
    outlier_boundary_gdf,
    dev_wings_agg_span_gpd,
    how="inner",
    predicate="intersects",
)

# Rebuild the geometry from the 'shape' column (EPSG:2230), then reproject to
# the CRS of the NAM prediction data
spans_within_boundary = convert_to_gdf(spans_within_boundary, col='shape', espg='EPSG:2230')
spans_within_boundary = spans_within_boundary.to_crs(nam_within_vri_prediction_gpd.crs)
spans_within_boundary.head()

In [None]:
# Render the outlier map again, this time with the manual boundary polygons and
# the spans that intersect them. It is written to its own HTML file: the earlier
# outlier cell saves "nam_outside_vri_error_outlier.html", and reusing that name
# here would overwrite it, so the earlier IFrame would show this map instead.
plot_map(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_mae_outside_outlier_points,
    'abs_wind_speed_error_pred',
    "nam_outside_vri_error_outlier_boundaries.html",
    error_boundary=outlier_boundary_gdf,
    spans=spans_within_boundary,
)
IFrame("../plots/nam_outside_vri_error_outlier_boundaries.html", width=700, height=500)

Only the points located within the boundaries of the Sawtooth Mountains and CA-78 contain electrical assets.

In [None]:
# Per boundary: count of 'globalid' values and total of 'cust_total'
boundary_groups = (
    spans_within_boundary
    .groupby(['boundary_name'])
    .agg(
        span_count=pd.NamedAgg(column='globalid', aggfunc='count'),
        sum_of_customers=pd.NamedAgg(column='cust_total', aggfunc='sum'),
    )
)
boundary_groups

We can disregard the span counts and customer counts within the Otay Mountains, as the points are already encompassed within a VRI Polygon.

For CA-78, there are 459 spans and 232 customers, while the Sawtooth Mountains have 401 spans and 118 customers. These two polygon boundaries highlight additional areas where further wind speed data can be collected.