In [45]:
import os
import torch
import pandas as pd
import xarray as xr
import json
from dataset import Dataset
from torch.utils.data import DataLoader

In [33]:
# Use the GPU when CUDA is available (e.g. on Google Colab); otherwise fall back to the local CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

#### **Download data**

The next cell downloads the data from Hugging Face. This is only required when working on Google Colab or when the data has not yet been downloaded locally. The download commands are commented out by default; uncomment them to run the download, and expect the cell to take up to 30 minutes.

In [5]:
# # Download site locations (indices.json) and data (pv and satellite-hrv)

# if not os.path.exists("submission"):
#      os.makedirs("submission", exist_ok=True)
#      #Installing locally means you do not need to rerun this each time you restart the notebook
#      !curl -L https://raw.githubusercontent.com/climatehackai/getting-started-2023/main/indices.json --output indices.json

# if not os.path.exists("data"):
#     os.makedirs("data/pv/2020", exist_ok=True)
#     os.makedirs("data/pv/2021", exist_ok=True)
#     os.makedirs("data/satellite-hrv/2020", exist_ok=True)
#     os.makedirs("data/satellite-hrv/2021", exist_ok=True)

#     !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/metadata.csv --output data/pv/metadata.csv

#      # Download data for June, July, August 2020 and 2021
#     for summer_months in range (6,9):
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/2020/{summer_months}.parquet --output data/pv/2020/{summer_months}.parquet
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/2021/{summer_months}.parquet --output data/pv/2021/{summer_months}.parquet     
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/satellite-hrv/2020/{summer_months}.zarr.zip --output data/satellite-hrv/2020/{summer_months}.zarr.zip
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/satellite-hrv/2021/{summer_months}.zarr.zip --output data/satellite-hrv/2021/{summer_months}.zarr.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1300k  100 1300k    0     0  4134k      0 --:--:-- --:--:-- --:--:-- 4153k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1126  100  1126    0     0   8046      0 --:--:-- --:--:-- --:--:--  8100

 20 73.6M   20 15.0M    0     0  13.2M      0  0:00:05  0:00:01  0:00:04 13.2M
 52 73.6M   52 38.3M    0     0  17.9M      0  0:00:04  0:00:02  0:00:02 23.2M
 83 73.6M   83 61.7M    0     0  19.6M      0  0:00:03  0:00:03 --:--:-- 23.3M
100 73.6M  100 73.6M    0     0  20.1M      0  0:

##### **Load pv, hrv and indices**

In [43]:
# (year, month) pairs to load: June, July and August of 2020 and 2021, in chronological order.
periods = [(year, month) for year in (2020, 2021) for month in (6, 7, 8)]

# --- PV -----------------------------------------------------------------------------------
# Read one parquet file per month, drop the "generation_wh" column and stack everything into
# a single dataframe. "power" (% of site capacity) is the remaining data column; the other
# fields are index levels, so each timestamp holds one sub-frame with a row per site.
pv_frames = [
    pd.read_parquet(f"data/pv/{year}/{month}.parquet").drop("generation_wh", axis=1)
    for year, month in periods
]
pv = pd.concat(pv_frames)

# Strip the timezone from the first index level (timestamps) and keep the second level as is.
timestamp_level = pv.index.levels[0].tz_localize(None)
site_level = pv.index.levels[1]
pv.index = pv.index.set_levels([timestamp_level, site_level])

# --- Satellite HRV ------------------------------------------------------------------------
# Open one zarr archive per month (chunks="auto") and concatenate them along "time".
# The images are stored as arrays indexed by timestamp; there is a single channel (hrv).
# Background on the xarray data model:
# https://tutorial.xarray.dev/fundamentals/01_datastructures.html
hrv_months = [
    xr.open_dataset(f"data/satellite-hrv/{year}/{month}.zarr.zip", engine="zarr", chunks="auto")
    for year, month in periods
]
hrv = xr.concat(hrv_months, dim="time")

# --- Site locations -----------------------------------------------------------------------
# indices.json maps data source -> site id -> location. Site ids are converted to int and
# each location is reduced to a tuple of its first two entries, also converted to int.
with open("indices.json") as f:
    raw_indices = json.load(f)

site_locations = {}
for data_source, locations in raw_indices.items():
    # Tip: to experiment with a single site, add a filter such as `if site == '2607'`
    # to the comprehension below.
    site_locations[data_source] = {
        int(site): (int(location[0]), int(location[1]))
        for site, location in locations.items()
    }

#### **Train, validation and test datasets**

In [50]:
BATCH_SIZE = 64  # samples per DataLoader batch
crop_size = 1    # number of pixels around each site
horizon = 1      # forecast horizon, in hours

In [51]:
# Training split: summer 2020 (1 June to 31 August), using the crop size and forecast
# horizon configured above. `sites=None` is passed explicitly, as in the original call.
train_dataset = Dataset(
    pv,
    hrv,
    site_locations,
    start_date="2020-06-01",
    end_date="2020-08-31",
    crop_size=crop_size,
    horizon=horizon,
    sites=None,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    pin_memory=True,
)

# Validation split: first week of June 2021.
# crop_size, horizon and sites are passed explicitly so that validation samples are built
# with the same settings as the train and test splits; previously they were omitted and
# silently fell back to whatever defaults `Dataset` defines.
# NOTE(review): this assumes validation is meant to mirror the train/test configuration.
validation_dataset = Dataset(pv, hrv, site_locations=site_locations,
                             start_date="2021-06-01", end_date="2021-06-08",
                             crop_size=crop_size, horizon=horizon, sites=None)
validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, pin_memory=True)

# Test split: 1 July to 31 August 2021.
# The end date previously read "2020-08-31", which lies before the start date and gives an
# inverted date range; it is corrected to 2021 here.
test_dataset = Dataset(pv, hrv, site_locations,
                       start_date="2021-07-01", end_date="2021-08-31",
                       crop_size=crop_size, horizon=horizon, sites=None)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, pin_memory=True)

In [None]:
# model 