In [1]:
import pandas as pd
import xarray as xr

# Load in Data

In [2]:
# PV inputs (CSV): the power time series and the per-system metadata table.
pv_timeseries_path = "../data/pv/pv_italy_pv_time_series.csv"
pv_metadata_path = "../data/pv/PV_PVOutput.org_Italy_PVOutput_Italy_systems_metadata.csv"

# NWP input: a zarr store. The file name suggests it covers 2022-01-01.
nwp_path = "../data/nwp/20220101.zarr"

In [3]:
# Read both PV CSV files into DataFrames.
pv_timeseries = pd.read_csv(pv_timeseries_path)
pv_metadata = pd.read_csv(pv_metadata_path)

# Open the NWP zarr store through xarray. chunks='auto' asks xarray to pick
# the chunking, so data variables are loaded lazily rather than all at once.
nwp = xr.open_dataset(
    nwp_path,
    engine='zarr',
    chunks='auto',
)

In [4]:
# Drop the first column of the metadata table.
# NOTE(review): presumably a leftover index column from the CSV export — confirm.
# Caution: the drop is positional, so re-running this cell without reloading
# the data removes a further column.
pv_metadata = pv_metadata.drop(columns=pv_metadata.columns[0])

# Rename first, then parse: rename() ignores a missing 'timestamp' label and
# to_datetime() leaves an already-parsed column unchanged, so these two lines
# can be re-run without raising a KeyError on 'timestamp'.
pv_timeseries = pv_timeseries.rename(columns={'timestamp': 'time'})
pv_timeseries['time'] = pd.to_datetime(pv_timeseries['time'])

# Selecting a Day of Data
Our PV data spans many years, but the NWP data is large, so it is split into chunks of single days. For each day, we will extract only that day of data from the PV data so we can align them in time.

In [5]:
# Time window of this NWP file: first and last entries of its `time` coordinate.
nwp_times = nwp['time']
start_time = nwp_times[0].values
end_time = nwp_times[-1].values

# Keep only the PV readings inside that window (both ends inclusive),
# then order them by time with a fresh 0..n-1 index.
not_before_start = pv_timeseries['time'] >= start_time
not_after_end = pv_timeseries['time'] <= end_time
pv_day_data = pv_timeseries[not_before_start & not_after_end]
pv_day_data = pv_day_data.sort_values(by='time', ignore_index=True)

# Mapping lat/lon
The NWP data has lat/lon available at 0.25 increments (e.g. 12.00, 12.25, 12.50, ...), while the PV data has precise lat/lon coordinates for each PV site. These coordinates could look like (12.141, 15.533).

We want to map each PV site's coordinates to the nearest NWP grid point. We do that like this:
```python
round(coordinate * 4) / 4
```

In [6]:
# Snap each site coordinate to the nearest multiple of 0.25:
# scale by 4, round to a whole number, scale back down.
for coord in ('latitude', 'longitude'):
    pv_metadata[coord] = (pv_metadata[coord] * 4).round() / 4

# Joining Data
Let's join the data. The desired end goal is to have a table where each row contains the `system_id` of a PV site, a `time`, the PV site's metrics (`instantaneous_power_W`, `temperature_C` and `voltage`) and the NWP data at that time, latitude and longitude.

In [7]:
# Flatten the NWP dataset into a table, then turn its index levels into
# ordinary columns so they can be used as merge keys.
nwp_df = nwp.to_dataframe()
nwp_df = nwp_df.reset_index()

In [8]:
# Step 1: attach each reading's site latitude/longitude, looked up by system_id.
site_coords = pv_metadata[['system_id', 'latitude', 'longitude']]
pv_with_coords = pd.merge(pv_day_data, site_coords, how='left', on='system_id')

# Step 2: attach the NWP columns for the matching time and coordinates.
# Both joins are left joins, so every PV reading is kept even without a match.
nwp_pv_data = pd.merge(
    pv_with_coords,
    nwp_df,
    how='left',
    on=['time', 'latitude', 'longitude'],
)

In [10]:
# Persist the joined table as CSV. With to_csv's defaults the row index is
# written as well, as the first (unnamed) column.
joined_csv_path = 'preprocessed_data/nwp_pv_joined_jan1_2022.csv'
nwp_pv_data.to_csv(path_or_buf=joined_csv_path)

# Validating the Data
We have preprocessed the same day of data with a different method. We will check if this new data is the same as the old one.

In [11]:
# Order both tables by system_id so they can be compared side by side.
# kind='mergesort' makes the sort stable: rows sharing a system_id keep their
# existing relative order. The default quicksort may shuffle such ties
# arbitrarily, which would make a row-wise comparison of the frames unreliable.
nwp_pv_data = nwp_pv_data.sort_values(
    ['system_id'], kind='mergesort', ignore_index=True
)

# NOTE(review): this file name says 2023 while the NWP input is 20220101 —
# confirm this is the intended reference file.
ground_truth = pd.read_csv("preprocessed_data/processed_pv_data_jan1_2023_new.csv")
ground_truth = ground_truth.sort_values(
    ['system_id'], kind='mergesort', ignore_index=True
)