In [1]:
import os
import torch
import pandas as pd
import xarray as xr
import json
from dataset import ChallengeDataset
from torch.utils.data import DataLoader, IterableDataset
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from ocf_blosc2 import Blosc2
import torch.nn.functional as F
from datetime import datetime, time, timedelta
import numpy as np


In [2]:
# selects the gpu if available (when running on Google Colab) otherwise on the local cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

#### **Download data**

The next block downloads the data from huggingface. This is only required if working on Google Colab OR data has not yet been downloaded locally. Expect this cell to run for up to 30 minutes.

In [3]:
# # Download site locations (indices.json) and data (pv and satellite-hrv)

# if not os.path.exists("submission"):
#      os.makedirs("submission", exist_ok=True)
#      #Installing locally means you do not need to rerun this each time you restart the notebook
#      !curl -L https://raw.githubusercontent.com/climatehackai/getting-started-2023/main/indices.json --output indices.json

# if not os.path.exists("data"):
#     os.makedirs("data/pv/2020", exist_ok=True)
#     os.makedirs("data/pv/2021", exist_ok=True)
#     os.makedirs("data/satellite-hrv/2020", exist_ok=True)
#     os.makedirs("data/satellite-hrv/2021", exist_ok=True)

#     !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/metadata.csv --output data/pv/metadata.csv

#      # Download data for June, July, August 2020 and 2021
#     for summer_months in range (6,9):
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/2020/{summer_months}.parquet --output data/pv/2020/{summer_months}.parquet
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/2021/{summer_months}.parquet --output data/pv/2021/{summer_months}.parquet     
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/satellite-hrv/2020/{summer_months}.zarr.zip --output data/satellite-hrv/2020/{summer_months}.zarr.zip
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/satellite-hrv/2021/{summer_months}.zarr.zip --output data/satellite-hrv/2021/{summer_months}.zarr.zip

##### **Load pv, hrv and indices**

In [4]:
# Load all pv data into a single dataframe
pv = []
for year in [2020, 2021]:
    for month in [6, 7, 8]:
        pv.append(pd.read_parquet(f"data/pv/{year}/{month}.parquet").drop("generation_wh", axis=1))  
pv = pd.concat(pv)
pv.index = pv.index.set_levels([pv.index.levels[0].tz_localize(None), pv.index.levels[1]])
 
# The parquet data here is similar to a dataframe. The "power" is the column with the other data types being indexes. The data is shaped with each timestamp being its own 
# subframe with the sites having their corresponding power (% of site capacity).  
hrv = []
for year in [2020, 2021]:
    for month in [6, 7, 8]:
        hrv.append(xr.open_dataset(f"data/satellite-hrv/{year}/{month}.zarr.zip", engine="zarr", chunks="auto"))

hrv = xr.concat(hrv, dim="time")

# Images are stored as vectors. The vectors are stored as an array of vectors. The arrays have a timestamp. Since there is only one channel (hrv)
# the array is a 1D set of vectors with the dimension being time. Read this to help you understand how this is being stored 
# https://tutorial.xarray.dev/fundamentals/01_datastructures.html
with open("indices.json") as f:
    site_locations = {
        data_source: {
            int(site): (int(location[0]), int(location[1]))
            for site, location in locations.items() #if site == '2607'#added this to run only 1 site location to understand how it works
        }
        for data_source, locations in json.load(f).items()
    }

#### **Create train, validation and test datasets**

In [5]:
hrv_buffer = 1 # number of pixels of hrv image to crop around each site location
BATCH_SIZE = 64 # number of samples per batch

train_start_date = "2020-06-01"
train_end_date = "2020-08-31"
validation_start_date = "2021-06-01"
validation_end_date = "2021-06-08"
test_start_date = "2021-08-01"
test_end_date = "2021-08-31"

In [16]:
# train 
train_dataset = ChallengeDataset(pv, hrv, site_locations, hrv_buffer = hrv_buffer, start_date=train_start_date, end_date=train_end_date)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, pin_memory = True)

# validation
validation_dataset = ChallengeDataset(pv, hrv, site_locations=site_locations, hrv_buffer = hrv_buffer, start_date=validation_start_date, end_date=validation_end_date) 
validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, pin_memory=True)

# # test
test_dataset = ChallengeDataset(pv, hrv, site_locations, hrv_buffer = hrv_buffer, start_date=test_start_date, end_date=test_end_date,)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, pin_memory = True)

In [7]:
def model_validation(model, criterion, validation_dataloader):
    model.eval() # This is used to set the model to evaluation mode
    with torch.no_grad(): # This is used to stop the model from storing gradients
        losses = []
        for time_ids, site_id, pv_features, hrv_features, pv_targets in validation_dataloader:
            pv_features, hrv_features, pv_targets = pv_features.to(device, dtype=torch.float), hrv_features.to(device, dtype=torch.float), pv_targets.to(device, dtype=torch.float)
            predictions = model(pv_features, hrv_features)
            loss = criterion(predictions, pv_targets)
            losses.append(loss.item())

    model.train() # This is used to set the model back to training mode
    
    return sum(losses) / len(losses)

# **Model Training**

In [8]:
class CNN(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=12, out_channels=24, kernel_size=1)
        self.conv2 = nn.Conv2d(in_channels=24, out_channels=48, kernel_size=1)
        self.conv3 = nn.Conv2d(in_channels=48, out_channels=96, kernel_size=1)
        self.conv4 = nn.Conv2d(in_channels=96, out_channels=192, kernel_size=1)

        self.pool = nn.MaxPool2d(kernel_size=1)
        self.flatten = nn.Flatten()

        self.linear1 = nn.Linear(780, 12)

    def forward(self, pv, hrv):
        x = torch.relu(self.pool(self.conv1(hrv)))
        x = torch.relu(self.pool(self.conv2(x)))
        x = torch.relu(self.pool(self.conv3(x)))
        x = torch.relu(self.pool(self.conv4(x)))

        x = self.flatten(x)
        x = torch.concat((x, pv), dim=-1)

        x = torch.sigmoid(self.linear1(x))

        return x
    
summary(CNN(), input_size=[(1, 12), (1, 12, 2, 2)])

Layer (type:depth-idx)                   Output Shape              Param #
CNN                                      [1, 12]                   --
├─Conv2d: 1-1                            [1, 24, 2, 2]             312
├─MaxPool2d: 1-2                         [1, 24, 2, 2]             --
├─Conv2d: 1-3                            [1, 48, 2, 2]             1,200
├─MaxPool2d: 1-4                         [1, 48, 2, 2]             --
├─Conv2d: 1-5                            [1, 96, 2, 2]             4,704
├─MaxPool2d: 1-6                         [1, 96, 2, 2]             --
├─Conv2d: 1-7                            [1, 192, 2, 2]            18,624
├─MaxPool2d: 1-8                         [1, 192, 2, 2]            --
├─Flatten: 1-9                           [1, 768]                  --
├─Linear: 1-10                           [1, 12]                   9,372
Total params: 34,212
Trainable params: 34,212
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.11
Input size (MB): 0.00
Forward/b

In [9]:
model = CNN().to(device)
criterion = nn.L1Loss()
optimiser = optim.Adam(model.parameters(), lr=1e-3)

In [10]:
# model 
EPOCHS = 3
training_losses = []
validation_losses = []
epoch_train_losses = []
epoch_validation_losses = []

for epoch in range(EPOCHS):
    model.train()

    running_loss = 0.0 ##sets the starting loss at zero
    count = 0 #is used to keep track of the number of batches passed through the training model
  
    for i, (time_ids, site_id, pv_features, hrv_features, pv_targets) in enumerate(train_dataloader): 
        
        optimiser.zero_grad()
        predictions = model(
            pv_features.to(device, dtype=torch.float),
            hrv_features.to(device, dtype=torch.float),
        )
        loss = criterion(predictions, pv_targets.to(device, dtype=torch.float))
        loss.backward()
        optimiser.step()

        running_loss += loss.item() * pv_targets.size(0)
        count += pv_targets.size(0)

        if i % 200 == 199:
            batch_loss = running_loss / count
            training_losses.append(batch_loss)
            print(f"     Training Loss: {batch_loss}")

            validation_loss = model_validation(model, criterion, validation_dataloader)
            validation_losses.append(validation_loss)
            # print(f"     Validation Loss: {validation_loss}\n")
                
    epoch_train_loss = running_loss / count 
    epoch_train_losses.append(epoch_train_loss)        
    print(f"Epoch {epoch + 1}: {running_loss / count}")
    

     Training Loss: 0.1622434850409627
     Training Loss: 0.1307074546907097
     Training Loss: 0.13681874041755995
     Training Loss: 0.13619325406849384
     Training Loss: 0.13269630539417268
     Training Loss: 0.12546556800603867
     Training Loss: 0.12266514617949724
     Training Loss: 0.12015252809273079
     Training Loss: 0.12113904813097583
     Training Loss: 0.12055784591846168
     Training Loss: 0.11754081377759576
     Training Loss: 0.11870237614804258
     Training Loss: 0.11959913544070262
     Training Loss: 0.11712279492856137
     Training Loss: 0.11426558097824455
     Training Loss: 0.11405525070964359
     Training Loss: 0.11435342425590052
     Training Loss: 0.11290099699050188
     Training Loss: 0.11270550652995313
     Training Loss: 0.11109498851932585
     Training Loss: 0.11114449027482243
     Training Loss: 0.11166590150953694
     Training Loss: 0.11021476298408664
     Training Loss: 0.11054046594188549
     Training Loss: 0.11153563129864633
  

In [11]:
model_name = "CNN Baseline"
os.makedirs(f"models/{model_name}", exist_ok=True)

# Save the variables used to make the dataset to a text file
with open(f"models/{model_name}/data_summary.txt", "w") as f:
    f.write("BATCH_SIZE = "+ str(BATCH_SIZE)+ "\n"+ 
            "hrv_buffer = "+ str(hrv_buffer)+ "\n"+
            "train_start_date = "+ train_start_date+ "\n"+
            "train_end_date = "+ train_end_date+ "\n"+
            "validation_start_date = "+ validation_start_date+ "\n"+
            "validation_end_date = "+ validation_end_date)
    
# Save the trained model for future predictions
torch.save(model.state_dict(), f"models/{model_name}/CNN Baseline.pt")

# Create a DataFrame from the training_losses and validation_losses lists
# df = pd.DataFrame({'Training Losses': training_losses})
df = pd.DataFrame({'Training Losses': training_losses, 'Validation Losses': validation_losses})

# Export the DataFrame to a CSV file
df.to_csv(f'models\{model_name}\losses.csv', index=True)

In [20]:
def model_test(model, criterion, validation_dataloader):
    model.eval() # This is used to set the model to evaluation mode
    with torch.no_grad(): # This is used to stop the model from storing gradients
        losses = []
        for time_ids, site_id, pv_features, hrv_features, pv_targets in validation_dataloader:
            pv_features, hrv_features, pv_targets = pv_features.to(device, dtype=torch.float), hrv_features.to(device, dtype=torch.float), pv_targets.to(device, dtype=torch.float)
            predictions = model(pv_features, hrv_features)
            loss = criterion(predictions, pv_targets)
            losses.append(loss.item())

    return losses

model = CNN()
model.load_state_dict(torch.load(f"models/CNN Baseline/CNN Baseline.pt"))
model.eval()

criterion = nn.MSELoss()
test_losses = model_test(model, criterion, test_dataloader)

print(f"CNN Test MAE: {test_losses}")

CNN Test MAE: [0.0300606656819582, 0.03147609904408455, 0.03203533962368965, 0.025898873805999756, 0.02077232114970684, 0.02095039002597332, 0.023507537320256233, 0.02429147996008396, 0.020521551370620728, 0.0288733821362257, 0.023067578673362732, 0.02060003951191902, 0.030404502525925636, 0.03673620522022247, 0.02895902656018734, 0.03803335130214691, 0.037316180765628815, 0.018355010077357292, 0.02641255408525467, 0.03494087979197502, 0.04227383807301521, 0.029422862455248833, 0.0306395310908556, 0.045140091329813004, 0.0318831242620945, 0.04439157247543335, 0.04633203148841858, 0.04663198068737984, 0.04471667483448982, 0.03497900441288948, 0.03321631997823715, 0.04258313775062561, 0.03308439999818802, 0.04098588600754738, 0.033233653753995895, 0.03542439639568329, 0.04526665806770325, 0.05313733220100403, 0.06345132738351822, 0.04031729698181152, 0.04642763361334801, 0.04856818541884422, 0.03604092076420784, 0.0368407778441906, 0.05426216125488281, 0.047583069652318954, 0.05635839700

In [25]:
test_losses

[0.0300606656819582,
 0.03147609904408455,
 0.03203533962368965,
 0.025898873805999756,
 0.02077232114970684,
 0.02095039002597332,
 0.023507537320256233,
 0.02429147996008396,
 0.020521551370620728,
 0.0288733821362257,
 0.023067578673362732,
 0.02060003951191902,
 0.030404502525925636,
 0.03673620522022247,
 0.02895902656018734,
 0.03803335130214691,
 0.037316180765628815,
 0.018355010077357292,
 0.02641255408525467,
 0.03494087979197502,
 0.04227383807301521,
 0.029422862455248833,
 0.0306395310908556,
 0.045140091329813004,
 0.0318831242620945,
 0.04439157247543335,
 0.04633203148841858,
 0.04663198068737984,
 0.04471667483448982,
 0.03497900441288948,
 0.03321631997823715,
 0.04258313775062561,
 0.03308439999818802,
 0.04098588600754738,
 0.033233653753995895,
 0.03542439639568329,
 0.04526665806770325,
 0.05313733220100403,
 0.06345132738351822,
 0.04031729698181152,
 0.04642763361334801,
 0.04856818541884422,
 0.03604092076420784,
 0.0368407778441906,
 0.05426216125488281,
 0.04

In [21]:
model

CNN(
  (conv1): Conv2d(12, 24, kernel_size=(1, 1), stride=(1, 1))
  (conv2): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1))
  (conv3): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1))
  (conv4): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=780, out_features=12, bias=True)
)

In [None]:
def _eval_visual(dataloader, model, device):
    model.eval()

    predictions_list = []
    timestamps_list = []
    pv_targets_list = []  # List to store pv_targets for each batch

    with torch.no_grad():
        for i, (time_ids, site_id, pv_features, hrv_features, pv_targets) in enumerate(dataloader):
            hrv_features = hrv_features.to(device, dtype=torch.float)
            pv_features = pv_features.to(device, dtype=torch.float)
            pv_targets = pv_targets.to(device, dtype=torch.float)
            
            batch_predictions = model(pv_features, hrv_features)
            batch_predictions = batch_predictions.cpu().numpy()
            batch_pv_targets = pv_targets.cpu().numpy()  # Convert pv_targets to numpy array

            # Timestamp processing as before
            if isinstance(time_ids[0], tuple) or isinstance(time_ids[0], list):
                single_timestamp = time_ids[0][0]
            else:
                single_timestamp = time_ids[0]
            if isinstance(single_timestamp, datetime):
                timestamp = single_timestamp.strftime('%Y-%m-%d %H:%M:%S')
            else:
                timestamp = str(single_timestamp)
            
            # Append each batch's data to the lists
            predictions_list.append(batch_predictions)
            pv_targets_list.append(batch_pv_targets)  # Append pv_targets to its list
            batch_timestamps = [timestamp] * batch_predictions.shape[0]
            timestamps_list.extend(batch_timestamps)

    # Concatenate all collected arrays into single numpy arrays
    predictions = np.concatenate(predictions_list, axis=0)
    pv_targets = np.concatenate(pv_targets_list, axis=0)  # Concatenate all pv_targets

    # Convert to DataFrame
    predictions_df = pd.DataFrame(predictions)
    pv_targets_df = pd.DataFrame(pv_targets, columns=[f'target_{i}' for i in range(pv_targets.shape[1])])
    timestamps_df = pd.DataFrame(timestamps_list, columns=['timestamp'])

    # Combine timestamps, predictions, and targets by using index alignment
    final_df = pd.concat([timestamps_df, predictions_df, pv_targets_df], axis=1)
    
    return final_df

# Usage
prediction_df = _eval_visual(validation_dataloader, model, device)
prediction_df[['date', 'time']] = prediction_df['timestamp'].str.split('T', expand=True)
timestamp_index = prediction_df.columns.get_loc('timestamp')
prediction_df.insert(timestamp_index, 'date', prediction_df.pop('date'))
prediction_df.insert(timestamp_index + 1, 'time', prediction_df.pop('time'))
prediction_df.drop('timestamp', axis=1, inplace=True)
predictions_RMSE = prediction_df[11]
targets_RMSE = prediction_df['target_11']
mse = (predictions_RMSE - targets_RMSE) ** 2
rmse = np.sqrt(mse)
prediction_df['RMSE_1 hr'] = rmse

prediction_df.to_csv('predictions.csv', index=False)
print(prediction_df)

In [13]:
model

CNN(
  (conv1): Conv2d(12, 24, kernel_size=(1, 1), stride=(1, 1))
  (conv2): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1))
  (conv3): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1))
  (conv4): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=780, out_features=12, bias=True)
)