In [None]:
import sys
import os
import json
import pandas as pd

# Change dir to root dir.
# The step up to the parent directory is guarded so that this cell is
# idempotent: an unconditional chdir("..") climbs one level higher on every
# re-run, which breaks the `src` imports and the relative `config/` and `data/`
# paths used below. The project root is recognised by the `src` package directory.
if os.path.isdir(os.path.join(os.getcwd(), "src")):
    parent_dir = os.getcwd()
else:
    parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
os.chdir(parent_dir)
print("Current Working Directory:", os.getcwd())

from src.data import generate_df, convert_to_gdf, generate_gdf, preprocess_df
from src.plot import plot_data

In [None]:
# Load the data configuration; it maps dataset groups to lists of file names.
with open('config/data_params.json') as fh:
    data_params = json.load(fh)

# Resolve every configured file name against the directory its group lives in.
raw_data_path = [
    os.path.join('./data/raw', name)
    for name in data_params["raw_data"]
]
modified_data_path = [
    os.path.join('./data/modified', name)
    for name in data_params["modified_data"]
]
# Model predictions are stored next to the modified data.
output_model_path = [
    os.path.join('./data/modified', name)
    for name in data_params["model_prediction"]
]

In [None]:
# Reading raw data: generate_df receives the list of raw-file paths and its
# result is unpacked into four frames (weather stations, VRI polygons, NAM,
# station wind speeds).
gis_weather_station, src_vri_snapshot, nam, windspeed_snapshot = generate_df(raw_data_path) 
# NOTE: both names are rebound to the preprocessed frames, so the frames as
# originally loaded are no longer reachable after this line.
gis_weather_station, windspeed_snapshot = preprocess_df(gis_weather_station, windspeed_snapshot)

# Reading filtered data with elevation from API
gis_weather_station_with_elevation, nam_with_elevation = generate_df(modified_data_path)
# Build the `_gpd` counterparts of the three frames
# (presumably GeoDataFrames -- confirm against src.data.generate_gdf).
gis_weather_station_with_elevation_gpd, src_vri_snapshot_gpd, nam_with_elevation_gpd = generate_gdf(
    gis_weather_station_with_elevation, src_vri_snapshot, nam_with_elevation)

# Reading data from LightGBM model
nam_within_vri_prediction, nam_outside_vri_prediction = generate_df(output_model_path)

## Raw Data

In [None]:
# Summary statistics of the weather station table.
gis_weather_station.describe()

There are 221 unique weather stations in the gis_weather_station dataset.

In [None]:
# Summary statistics of the VRI polygon table.
src_vri_snapshot.describe()

There are 308 VRI polygons in the src_vri_snapshot dataset, with an average area of 1.444339e+07 km².

In [None]:
# Number of distinct dates covered by the NAM data, followed by summary statistics.
print(f"Unique NAM dates: {nam['date'].nunique()}")
nam.describe()

There are 15,696,970 unique rows of latitude, longitude, date, and wind speed, spanning 179 unique dates.

In [None]:
# Number of distinct dates covered by the station readings, followed by summary statistics.
print(f"Unique Weather Station dates: {windspeed_snapshot['date'].nunique()}")
windspeed_snapshot.describe()

There are 29,939 wind speed records spanning 179 unique dates.

### Raw Data - Missing Data

In [None]:
# Missing-value count per column of the weather station table.
gis_weather_station.isna().sum()

In [None]:
# Missing-value count per column of the VRI polygon table.
src_vri_snapshot.isna().sum()

There are missing values in the gis_weather_station and src_vri_snapshot datasets. However, since the columns with missing values are not used in the analysis, imputation of these columns can be omitted.

In [None]:
# Missing-value count per column of the NAM table.
nam.isna().sum()

In [None]:
# Missing-value count per column of the station wind speed table.
windspeed_snapshot.isna().sum()

There are no missing values in the nam and windspeed_snapshot datasets, since rows with missing values were already dropped when the data was loaded and preprocessed (see the preprocess_df call above).

### Raw Data - Duplicate Data

In [None]:
# Print the number of fully duplicated rows in each raw dataset, one per line,
# in the order: weather stations, VRI polygons, NAM, station wind speeds.
for raw_frame in (gis_weather_station, src_vri_snapshot, nam, windspeed_snapshot):
    print(raw_frame.duplicated().sum())

None of the raw datasets contain duplicate rows.

### Raw Data - Data Type

In [None]:
# For each dataset, show the first `date` entry and the Python type it is stored as.
# The first line of each pair prints the value itself, so it is labelled
# "sample value" rather than "data type".
print(f"NAM date sample value: {nam['date'].iloc[0]}")
print(f"NAM date data type: {type(nam['date'].iloc[0])}")
print()
print(f"Weather Station date sample value: {windspeed_snapshot['date'].iloc[0]}")
print(f"Weather Station date data type: {type(windspeed_snapshot['date'].iloc[0])}")

The date format is different between the NAM data and the Weather Station data. The date format will be standardized during data processing to enable accurate date comparison.

### Raw Data - Data Distribution

In [None]:
# Histogram of the station wind speed readings.
plot_data(
    windspeed_snapshot,
    x="wind_speed",
    plot_type="hist",
    title="Station Wind Speed Distribution",
    xlabel="Wind Speed (mph)",
    ylabel="Frequency",
    bins=100,
)

Based on the histogram, the weather station wind speed distribution is right-skewed and centered at about 24 mph.

In [None]:
# Histogram of the NAM average wind speed. Title and axis labels are given
# explicitly so the figure stands on its own, matching the station histogram.
plot_data(nam, x="average_wind_speed", plot_type="hist", title="NAM Wind Speed Distribution",
          xlabel="Wind Speed (mph)", ylabel="Frequency", bins=100)

Based on the histogram, the NAM wind speed distribution is right-skewed and centered at about 14 mph.

In [None]:
# Station wind speed plotted against date.
plot_data(
    windspeed_snapshot,
    x='date',
    y='wind_speed',
    plot_type="line",
    title="Station Wind Speed Across the Years",
    xlabel="Year",
    ylabel="Wind Speed (mph)",
)

In [None]:
# NAM average wind speed plotted against date.
plot_data(
    nam,
    x='date',
    y='average_wind_speed',
    plot_type="line",
    title="NAM Wind Speed Across the Years",
    xlabel="Year",
    ylabel="Wind Speed (mph)",
)

This line plot shows that the weather station wind speed tends to change throughout the year. This highlights the importance of capturing temporal features in the model.