In [None]:
import sys
import os
import json
import pandas as pd


# Change dir to root dir.
# Guarded so that re-running this cell (or starting the kernel from the root) does
# not climb one directory higher each time: the root is recognised by the `src`
# package that the project imports in this cell rely on.
if os.path.isdir("src"):
    parent_dir = os.getcwd()
else:
    parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
    os.chdir(parent_dir)
print("Current Working Directory:", os.getcwd())

from src.data import generate_df, convert_to_gdf, generate_gdf, preprocess_df
from src.plot import plot_data
from src.model import light_gbm, predict_light_gbm_model
from src.data import generate_df, generate_gdf, preprocess_df, preprocess_gdf, filter_nam_outside_vri, get_nam_outside_vri_nearest_station
from src.scripts.generateNamCSV import generate_nam_csv
from src.scripts.generateElevationCSV import generate_elevation_csv
from src.analysis import custom_groupby, find_outliers_iqr


In [None]:
# Load the JSON config that lists the data files used by this notebook
with open('config/data_params.json') as fh:
    data_params = json.load(fh)

# Build the three path lists by prefixing every configured file name with its
# base directory (model predictions live under ./data/modified as well)
raw_data_path, modified_data_path, output_model_path = (
    [os.path.join(base_dir, file_path) for file_path in data_params[config_key]]
    for base_dir, config_key in (
        ('./data/raw', "raw_data"),
        ('./data/modified', "modified_data"),
        ('./data/modified', "model_prediction"),
    )
)

In [None]:
# Raw data: load, then preprocess the station and wind-speed frames
(
    gis_weather_station,
    src_vri_snapshot,
    nam,
    windspeed_snapshot,
) = generate_df(raw_data_path)
gis_weather_station, windspeed_snapshot = preprocess_df(gis_weather_station, windspeed_snapshot)

# Filtered data with elevation from the API, plus the GeoDataFrame versions
gis_weather_station_with_elevation, nam_with_elevation = generate_df(modified_data_path)
(
    gis_weather_station_with_elevation_gpd,
    src_vri_snapshot_gpd,
    nam_with_elevation_gpd,
) = generate_gdf(gis_weather_station_with_elevation, src_vri_snapshot, nam_with_elevation)

# LightGBM model output, converted to GeoDataFrames one frame at a time
nam_within_vri_prediction, nam_outside_vri_prediction = generate_df(output_model_path)
nam_within_vri_prediction_gpd = convert_to_gdf(nam_within_vri_prediction, True)
nam_outside_vri_prediction_gpd = convert_to_gdf(nam_outside_vri_prediction)

In [None]:
# Preview the first rows of the within-VRI prediction frame
nam_within_vri_prediction_gpd.head()

In [None]:
# List the available columns; later cells select several of them by name
nam_within_vri_prediction_gpd.columns

In [None]:
# # Select only numeric columns
# numeric_columns = nam_within_vri_prediction.select_dtypes(include=['number']).columns

# # Group by polygon and compute the mean only for numeric columns
# nam_grouped_within_vri_prediction = nam_within_vri_prediction.groupby('polygon_geometry')[numeric_columns].mean()


# nam_grouped_within_vri_prediction = nam_within_vri_prediction.groupby('polygon_geometry').mean()

In [None]:
# Absolute gap between 'abs_wind_speed_error' and its predicted counterpart,
# used to evaluate the LightGBM model
nam_within_vri_prediction_gpd['wind_speed_error_diff'] = (
    nam_within_vri_prediction_gpd['abs_wind_speed_error']
    .sub(nam_within_vri_prediction_gpd['abs_wind_speed_error_pred'])
    .abs()
)

# Every metric uses the same aggregation: the mean
agg_dict_within = dict.fromkeys(
    ['abs_wind_speed_error', 'wind_speed_error_diff', 'nam_distance_from_station_km'],
    'mean',
)

# Group the rows by NAM point (same geometry) and weight the error by distance
nam_mae_within = custom_groupby(nam_within_vri_prediction_gpd, ['geometry'], agg_dict_within)
nam_mae_within['distance_weight_error'] = nam_mae_within['abs_wind_speed_error'].mul(
    nam_mae_within['nam_distance_from_station_km']
)
nam_mae_within.head()

In [None]:
# Geometries of the 20 rows with the highest abs_wind_speed_error (descending sort)
nam_mae_within.sort_values(
    by='abs_wind_speed_error', ascending=False
).head(20)['geometry']

In [None]:
# Geometries of the 20 NAM points with the highest aggregated abs_wind_speed_error,
# as a plain list; these are the points considered for re-assignment
nam_points_to_update = (
    nam_mae_within
    .sort_values(by='abs_wind_speed_error', ascending=False)['geometry']
    .head(20)
    .tolist()
)
nam_points_to_update

In [None]:
# Keep only the rows that belong to the selected NAM points.
# .copy() makes the result an independent frame: later cells add and overwrite
# columns on it, which on a boolean-filtered slice triggers pandas'
# SettingWithCopyWarning and may not write where intended.
nam_within_vri_prediction_gpd = nam_within_vri_prediction_gpd[
    nam_within_vri_prediction_gpd['geometry'].isin(nam_points_to_update)
].copy()
nam_within_vri_prediction_gpd

In [None]:
# Import necessary libraries
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from geopy.distance import geodesic
from shapely.ops import nearest_points
from sklearn.preprocessing import MinMaxScaler

# Load datasets (Assuming the data is already loaded as GeoDataFrames)
# Replace 'your_file_path' with the actual paths if needed
# gis_weather_station = gpd.read_file('your_file_path')
# windspeed_snapshot = gpd.read_file('your_file_path')
# src_vri_snapshot = gpd.read_file('your_file_path')
# nam_within_vri_prediction_gpd = gpd.read_file('your_file_path')

# nam_points_to_update = [
#     "POINT (-116.621796 32.592236)",
#     "POINT (-116.81482 32.64864)",
#     "POINT (-116.8631 32.662697)",
#     "POINT (-116.49263 32.590714)",
#     "POINT (-116.84729 32.63534)",
#     "POINT (-116.08725 32.666706)",
#     "POINT (-116.07048 32.693657)",
#     "POINT (-116.7005 32.729134)",
#     "POINT (-116.05371 32.720608)",
#     "POINT (-116.69971 32.783554)"
# ]
# # List of NAM points to update
# nam_points_to_update = [
#     "POINT (-116.08725 32.66671)",
#     "POINT (-116.05371 32.72061)",
#     "POINT (-116.07048 32.69366)",
#     "POINT (-116.69971 32.78355)",
#     "POINT (-116.66733 32.7832)",
#     "POINT (-116.81482 32.64864)",
#     "POINT (-116.8631 32.6627)",
#     "POINT (-116.88751 33.28887)",
#     "POINT (-116.45044 33.14809)",
#     "POINT (-116.84729 32.63534)",
#     "POINT (-116.68271 32.8378)",
#     "POINT (-116.92093 33.22111)",
#     "POINT (-116.4182 33.13408)",
#     "POINT (-116.36945 33.13344)",
#     "POINT (-116.33694 33.13301)",
#     "POINT (-116.32071 33.13279)",
#     "POINT (-116.7279 33.0696)",
#     "POINT (-116.7001 32.75634)",
#     "POINT (-116.30446 33.13256)",
#     "POINT (-116.93619 33.3029)"
# ]

# Convert NAM points to Shapely geometries
#nam_points_geom = nam_points_to_update

# Function to find the nearest alternative VRI polygon (excluding current)
def find_nearest_vri(nam_point, vri_polygons, current_vri_name):
    """Return the row of the nearest VRI polygon other than the current one.

    "Nearest" is measured from ``nam_point`` to each polygon's centroid using
    geopy's ``geodesic`` distance (an ellipsoidal model, not the haversine
    great-circle formula).

    Parameters
    ----------
    nam_point : point-like
        Object exposing ``x`` (longitude) and ``y`` (latitude).
    vri_polygons : pandas.DataFrame
        Must provide a ``name`` column and a ``geometry`` column whose values
        expose ``centroid`` and ``is_empty``.
    current_vri_name : str
        Name of the polygon to exclude from the search.

    Returns
    -------
    pandas.Series or None
        The row of the closest candidate polygon, or ``None`` when there is no
        candidate (empty input, or only the current polygon / rows without a
        usable geometry).
    """
    nam_coords = (nam_point.y, nam_point.x)  # geopy expects (lat, lon)
    nearest_vri = None
    min_distance = float("inf")

    for _, row in vri_polygons.iterrows():
        if row['name'] == current_vri_name:
            continue  # Skip the current VRI polygon

        polygon = row['geometry']
        # A missing or empty shape has no centroid coordinates; skip the row
        # instead of aborting the whole search with an AttributeError.
        if polygon is None or polygon.is_empty:
            continue

        vri_centroid = polygon.centroid
        vri_coords = (vri_centroid.y, vri_centroid.x)

        distance = geodesic(nam_coords, vri_coords).km  # geodesic distance in km
        if distance < min_distance:
            min_distance = distance
            nearest_vri = row

    return nearest_vri

In [None]:
# Create new columns with default value "no change"
nam_within_vri_prediction_gpd["New_VRI_Anemometer"] = "no change"
nam_within_vri_prediction_gpd["New_Polygon_Shape"] = "no change"
nam_within_vri_prediction_gpd

In [None]:
# Apply nearest VRI assignment for the specific NAM points
for point_str in nam_points_to_update:
    current_vri_info = nam_within_vri_prediction_gpd.loc[
        nam_within_vri_prediction_gpd['geometry'] == point_str,
        ['polygon_geometry', 'name']
    ]

    if current_vri_info.empty:
        continue  # Skip if no current VRI info found

    current_vri_name = current_vri_info.iloc[0]['name']

    # Find the nearest alternative VRI polygon (excluding current)
    nearest_vri = find_nearest_vri(point_str, src_vri_snapshot, current_vri_name)

    if nearest_vri is not None:
        nam_within_vri_prediction_gpd.loc[
            nam_within_vri_prediction_gpd['geometry'] == point_str,
            ["New_VRI_Anemometer", "New_Polygon_Shape"]
        ] = [nearest_vri['anemometer'], nearest_vri['shape']]
nam_within_vri_prediction_gpd.head()

In [None]:
# Show every column when frames are displayed (global pandas option)
pd.set_option('display.max_columns', None)

# Attach the wind-speed records to their weather station via the station code
merged_wind_data = gis_weather_station_with_elevation_gpd.merge(
    windspeed_snapshot,
    how="inner",
    left_on=["weatherstationcode"],
    right_on=["station"],
)

# Spatial join: keep the rows whose geometry lies within a VRI polygon and
# add that polygon's columns
merged_wind_data = gpd.sjoin(
    merged_wind_data,
    src_vri_snapshot_gpd,
    predicate="within",
)
merged_wind_data

In [None]:
# Distinct values assigned to New_VRI_Anemometer (includes "no change" for rows left untouched)
nam_within_vri_prediction_gpd['New_VRI_Anemometer'].unique()

In [None]:
# Mean abs_wind_speed_error per selected NAM point (the "old" error baseline)
df = nam_within_vri_prediction_gpd[['geometry', 'abs_wind_speed_error']]
df_filtered = df.loc[df['geometry'].isin(nam_points_to_update)]
df_filtered.groupby('geometry').mean()

In [None]:
# Rows of the prediction frame whose geometry is one of the selected NAM points
filtered_nam_within_vri_subset = nam_within_vri_prediction_gpd.loc[
    nam_within_vri_prediction_gpd['geometry'].isin(nam_points_to_update)
]
filtered_nam_within_vri_subset

In [None]:
# Left-join every selected NAM row to the station wind data of its newly assigned
# anemometer on the same date. Columns present in both frames receive pandas'
# default _x (left) / _y (right) suffixes; later cells read geometry_x and anemometer_x.
# NOTE(review): rows still marked "no change" presumably match no anemometer and
# therefore carry NaN in the right-hand columns — confirm.
merged_filtered_wind_data = filtered_nam_within_vri_subset.merge(
    merged_wind_data,
    left_on=["New_VRI_Anemometer", "nam_date"],  # Columns from filtered_nam_within_vri_subset
    right_on=["anemometer", "date"],            # Columns from merged_wind_data
    how="left"  # Keep all rows from filtered_nam_within_vri_subset
)

In [None]:
# Preview the joined frame to check the suffixed column names used in the next cell
merged_filtered_wind_data.head()

In [None]:
# Select relevant columns for calculation
important_columns = [
    "geometry_x",
    "nam_wind_speed",
    "New_VRI_Anemometer",
    "wind_speed",
    "anemometer_x"
]

filtered_merged_wind_data = merged_filtered_wind_data[important_columns]

# Filter to only include rows where New_VRI_Anemometer has changed.
# .copy() detaches the filtered rows so the column added below is written to an
# independent frame (avoids pandas' SettingWithCopyWarning).
filtered_merged_wind_data = filtered_merged_wind_data[
    filtered_merged_wind_data["New_VRI_Anemometer"] != "no change"
].copy()

# Calculate new absolute wind speed error: NAM wind speed vs. the wind speed
# joined in for the newly assigned anemometer
filtered_merged_wind_data["new_abs_wind_speed_error"] = (
    filtered_merged_wind_data["nam_wind_speed"] - filtered_merged_wind_data["wind_speed"]
).abs()

# anemometer_x is the left-hand (pre-reassignment) anemometer column of the merge
filtered_merged_wind_data = filtered_merged_wind_data.rename(columns={'anemometer_x': 'Old_VRI_Polygon'})
filtered_merged_wind_data

In [None]:
# Display the old-error rows (geometry, abs_wind_speed_error) used as the baseline below
df_filtered

In [None]:
# Mean error per NAM point: new assignment (keyed by 'geometry_x') and
# original assignment (keyed by 'geometry')
new_error_df = (
    filtered_merged_wind_data
    .groupby("geometry_x")[["new_abs_wind_speed_error"]]
    .mean()
    .reset_index()
)
old_error_df = (
    df_filtered
    .groupby("geometry")[["abs_wind_speed_error"]]
    .mean()
    .reset_index()
)

# Put old and new error side by side; the duplicate key column is dropped
comparison_df = old_error_df.merge(
    new_error_df,
    left_on='geometry',
    right_on='geometry_x',
).drop(columns='geometry_x')

# Positive difference means the new assignment has the lower error
comparison_df["error_difference"] = comparison_df["abs_wind_speed_error"].sub(
    comparison_df["new_abs_wind_speed_error"]
)

In [None]:
# Display the per-row frame containing new_abs_wind_speed_error
filtered_merged_wind_data

In [None]:
# Display old vs. new mean error per NAM point and their difference
comparison_df

In [None]:
# Extract new VRI polygon information.
# Rebinding (instead of inplace=True) leaves the frame with the same final column
# names while avoiding an in-place rename on a frame that was produced by filtering.
filtered_merged_wind_data = filtered_merged_wind_data.rename(columns={'geometry_x': 'geometry'})
vri_columns = ["New_VRI_Anemometer", "Old_VRI_Polygon"]
new_vri_info = filtered_merged_wind_data[["geometry", *vri_columns]].drop_duplicates()

# Merge to include the new VRI polygons.
# Columns (and the row duplicates they may have caused) left behind by a previous
# run of this cell are removed first, so re-running does not produce _x/_y
# suffixed copies that the rename below would silently miss.
comparison_df = (
    comparison_df
    .drop(columns=vri_columns, errors="ignore")
    .drop_duplicates()
    .merge(new_vri_info, on="geometry", how="left")
)

# Filter for optimal geometries where the error difference is positive (improvement)
# and rename the column for clarity; chaining rename() returns a new frame rather
# than mutating a filtered slice in place.
optimum_geometry = comparison_df[comparison_df["error_difference"] > 0].rename(
    columns={"New_VRI_Anemometer": "New_VRI_Polygon"}
)
optimum_geometry

In [None]:
# Keep one row per NAM point: the one with the largest error reduction
optimum_geometry = (
    optimum_geometry
    .sort_values('error_difference', ascending=False)
    .drop_duplicates(subset=['geometry'], keep='first')
    .reset_index(drop=True)
)
optimum_geometry

In [None]:
# Headline numbers over the improved NAM points
mean_old_mae = optimum_geometry['abs_wind_speed_error'].mean()
mean_new_mae = optimum_geometry['new_abs_wind_speed_error'].mean()
mean_mae_reduction = optimum_geometry['error_difference'].mean()

summary_rows = [
    ('Mean Old MAE', mean_old_mae),
    ('Mean New MAE', mean_new_mae),
    ('Mean MAE Reduction', mean_mae_reduction),
    # Reduction expressed relative to the old mean error
    ('Percentage MAE Reduction (%)', (mean_mae_reduction / mean_old_mae) * 100),
]
summary_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

# Two decimals are enough for display
summary_df['Value'] = summary_df['Value'].round(2)
summary_df