In [29]:
import os
import torch
import pandas as pd
import xarray as xr
import json
from dataset import Dataset
from torch.utils.data import DataLoader, IterableDataset
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from ocf_blosc2 import Blosc2
import torch.nn.functional as F
from datetime import datetime, time, timedelta


In [30]:
# selects the gpu if available (when running on Google Colab) otherwise on the local cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

#### **Download data**

The next block downloads the data from huggingface. This is only required if working on Google Colab OR data has not yet been downloaded locally. Expect this cell to run for up to 30 minutes.

In [3]:
# # Download site locations (indices.json) and data (pv and satellite-hrv)

# if not os.path.exists("submission"):
#      os.makedirs("submission", exist_ok=True)
#      #Installing locally means you do not need to rerun this each time you restart the notebook
#      !curl -L https://raw.githubusercontent.com/climatehackai/getting-started-2023/main/indices.json --output indices.json

# if not os.path.exists("data"):
#     os.makedirs("data/pv/2020", exist_ok=True)
#     os.makedirs("data/pv/2021", exist_ok=True)
#     os.makedirs("data/satellite-hrv/2020", exist_ok=True)
#     os.makedirs("data/satellite-hrv/2021", exist_ok=True)

#     !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/metadata.csv --output data/pv/metadata.csv

#      # Download data for June, July, August 2020 and 2021
#     for summer_months in range (6,9):
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/2020/{summer_months}.parquet --output data/pv/2020/{summer_months}.parquet
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/pv/2021/{summer_months}.parquet --output data/pv/2021/{summer_months}.parquet     
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/satellite-hrv/2020/{summer_months}.zarr.zip --output data/satellite-hrv/2020/{summer_months}.zarr.zip
#           !curl -L https://huggingface.co/datasets/climatehackai/climatehackai-2023/resolve/main/satellite-hrv/2021/{summer_months}.zarr.zip --output data/satellite-hrv/2021/{summer_months}.zarr.zip

##### **Load pv, hrv and indices**

In [31]:
# Load all pv data into a single dataframe
pv = []
for year in [2020, 2021]:
    for month in [6, 7, 8]:
        pv.append(pd.read_parquet(f"data/pv/{year}/{month}.parquet").drop("generation_wh", axis=1))  
pv = pd.concat(pv)
pv.index = pv.index.set_levels([pv.index.levels[0].tz_localize(None), pv.index.levels[1]])
 
# The parquet data here is similar to a dataframe. The "power" is the column with the other data types being indexes. The data is shaped with each timestamp being its own 
# subframe with the sites having their corresponding power (% of site capacity).  
hrv = []
for year in [2020, 2021]:
    for month in [6, 7, 8]:
        hrv.append(xr.open_dataset(f"data/satellite-hrv/{year}/{month}.zarr.zip", engine="zarr", chunks="auto"))

hrv = xr.concat(hrv, dim="time")

# Images are stored as vectors. The vectors are stored as an array of vectors. The arrays have a timestamp. Since there is only one channel (hrv)
# the array is a 1D set of vectors with the dimension being time. Read this to help you understand how this is being stored 
# https://tutorial.xarray.dev/fundamentals/01_datastructures.html
with open("indices.json") as f:
    site_locations = {
        data_source: {
            int(site): (int(location[0]), int(location[1]))
            for site, location in locations.items() #if site == '2607'#added this to run only 1 site location to understand how it works
        }
        for data_source, locations in json.load(f).items()
    }

In [35]:
class ChallengeDataset(IterableDataset):
    def __init__(self, pv, hrv, site_locations, horizon = 1, start_date = "2020-7-1", end_date = "2020-7-30", sites=None):
        self.pv = pv
        self.hrv = hrv
        self._site_locations = site_locations
        self._sites = sites if sites else list(site_locations["hrv"].keys())#This gets the individual site ids which are stored as the dict's keys
        self.start_date = list(map(int, start_date.split("-")))
        self.end_date= list(map(int, end_date.split("-")))
        self.horizon = horizon

    def _get_image_times(self):#This function starts at the minimum date in the set and iterates up to the highest date, this is done as the data set is large and due to the nature of the parquette and xarray
        min_date = datetime(self.start_date[0], self.start_date[1], self.start_date[2])
        max_date = datetime(self.end_date[0], self.end_date[1], self.end_date[2])
        
        start_time = time(8)
        end_time = time(17)

        date = min_date 
        while date <= max_date: 
            current_time = datetime.combine(date, start_time)
            while current_time.time() < end_time:
                if current_time:
                    yield current_time

                current_time += timedelta(minutes=60)

            date += timedelta(days=1)

    def __iter__(self):
        for time in self._get_image_times():

            # 1 hour leading up to the predicton time        
            first_hour = slice(str(time), str(time + timedelta(minutes=55)))

            # PV power output in first hour
            pv_features = pv.xs(first_hour, drop_level=False)

            # PV power output in the next 48 hours
            pv_targets = pv.xs(
                slice(  # type: ignore
                    str(time + timedelta(hours=1)),
                    str(time + timedelta(hours= self.horizon, minutes=55)),
                ),
                drop_level=False,
            )

           # hrv satellite images on first hour timestamps setting them up as an input feature
            hrv_data = self.hrv["data"].sel(time=first_hour).to_numpy()

            for site in self._sites:
                try:
                    # Get solar PV features and targets, the site_targets is used to find the models loss
                    site_features = pv_features.xs(site, level=1).to_numpy().squeeze(-1) 
                    site_targets = pv_targets.xs(site, level=1).to_numpy().squeeze(-1)
                    assert site_features.shape == (12,) and site_targets.shape == (12,)
                 
                    # Get a HRV crop centred on the site over the previous hour
                    x, y = self._site_locations["hrv"][site]
                    hrv_features = hrv_data[:, y - 1  : y + 1 ,
                                             x - 1  : x + 1 , 0]
                    assert hrv_features.shape == (12, 2, 2)

                except:
                    continue

                yield site_features, hrv_features, site_targets

#### **Create train, validation and test datasets**

In [36]:
horizon = 1 # forecast horizon in hours
crop_size = 1 # number of pixels of hrv image to crop around each site location
BATCH_SIZE = 64 # number of samples per batch

train_start_date = "2020-06-01"
train_end_date = "2020-08-31"

In [38]:
# train 
train_dataset = ChallengeDataset(pv, hrv, site_locations, start_date=train_start_date, end_date="2020-08-31")
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, pin_memory = True)

# # validation
# validation_dataset = ChallengeDataset(pv, hrv, site_locations=site_locations,
#                                       start_date="2021-06-01", end_date="2021-06-08") 
# validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, pin_memory=True)

# # test
# test_dataset = ChallengeDataset(pv, hrv, site_locations,
#                         start_date="2021-07-01", end_date="2020-08-31",
#                         crop_size = crop_size, horizon = horizon)
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, pin_memory = True)

# **Model Training**

In [39]:
layers = [4, 4, 4, 4] #For a deeper resnet with 16 total conv layers

def conv_block(in_channels, out_channels, kernel_size=1, stride=1, padding=0):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True))

class BasicBlock(nn.Module):
    expansion = 1
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv_block(in_channels, out_channels, stride=stride)
        self.conv2 = conv_block(out_channels, out_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.conv2(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out = out + identity
        return F.relu(out, inplace=False)

class ResNet_light_deep_crop1_60m(nn.Module):
    
    def __init__(self, block, layers):
        
        super(ResNet_light_deep_crop1_60m, self).__init__()
        self.in_channels = 12 #reduce the stride
        self.initial = nn.Identity()
        #self.maxpool = nn.MaxPool2d(kernel_size=1, stride=1, padding=0)
        self.layer1 = self._make_layer(block, 12, layers[0])
        self.layer2 = self._make_layer(block, 24, layers[1], stride=1)
        self.layer3 = self._make_layer(block, 48, layers[2], stride=1)
        self.layer4 = self._make_layer(block, 96, layers[3], stride=1)
        self.avgpool = nn.AdaptiveMaxPool2d((1, 1))
        # Adjust this linear layer based on the concatenated size of HRV and PV features
        self.fc = nn.Linear(96  + 12, 12)  

    def _make_layer(self, block, out_channels, num_blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )
        layers = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels * block.expansion
        for _ in range(1, num_blocks):
            layers.append(block(self.in_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, pv, hrv ):
        #print("Initial HRV shape:", hrv.shape)  
        #print("Initial PV shape:", pv.shape) 
        #print(f"{pv[0]}")
        x = self.initial(hrv)
        #x = self.maxpool(x)
        #print("Shape after initial conv and maxpool:", x.shape)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        #print("Shape after ResNet_light blocks:", x.shape)

        x = self.avgpool(x)
        #print("Shape after avgpool:", x.shape)
        x = torch.flatten(x, 1)
        pv = torch.flatten(pv, start_dim=1)
        #print(f"Sshape of x = {x.shape} shape of pv = {pv.shape}")
        #x = torch.concat((x, pv), dim=-1)
        #print("Shape after avgpool and flatten:", x.shape)

        
        
        #pv = pv.view(pv.size(0), -1)
        if pv.dim() > 2:
            pv = torch.flatten(pv, start_dim=1)
        #print("Adjusted PV shape:", pv.shape)

        combined = torch.cat((x, pv), dim=1)

        if self.fc.in_features != combined.shape[1]:
            self.fc = nn.Linear(combined.shape[1], 12).to(combined.device)

        out = self.fc(combined)
        return out

In [40]:
model = ResNet_light_deep_crop1_60m(BasicBlock, layers).to(device)
criterion = nn.L1Loss()
optimiser = optim.Adam(model.parameters(), lr=1e-3)

In [41]:
# model 
EPOCHS = 1
batch_losses = []
val_losses = []
epoch_train_losses = []
epoch_val_losses = []
for epoch in range(EPOCHS):
    model.train()

    running_loss = 0.0 ##sets the starting loss at zero
    count = 0 #is used to keep track of the number of batches passed through the training model
  
    for i, (pv_features, hrv_features, pv_targets) in enumerate(train_dataloader): 
        
        optimiser.zero_grad()
        predictions = model(
            pv_features.to(device, dtype=torch.float),
            hrv_features.to(device, dtype=torch.float),
        )
        loss = criterion(predictions, pv_targets.to(device, dtype=torch.float))#calculates the loss between the models predictions and the actual pv
        loss.backward()
        optimiser.step()

        running_loss += loss.item() * pv_targets.size(0)
        count += pv_targets.size(0)

        size = int(pv_targets.size(0))
        running_loss += float(loss) * size
        count += size
   
        if i % 200 == 199:
            print(f"Epoch {epoch + 1}, {i + 1}: {running_loss / count}")
            
    epoch_train_loss = running_loss / count
    epoch_train_losses.append(epoch_train_loss)        
    print(f"Epoch {epoch + 1}: {running_loss / count}")
    

0
64
128
192
256
320
384
448
512
576
640
704
768
832
896
960
1024
1088
1152
1216
1280
1344
1408
1472
1536
1600
1664
1728
1792
1856
1920
1984
2048
2112
2176
2240
2304
2368
2432
2496
2560
2624
2688
2752
2816
2880
2944
3008
3072
3136
3200
3264
3328
3392
3456
3520
3584
3648
3712
3776
3840
3904
3968
4032
4096
4160
4224
4288
4352
4416
4480
4544
4608
4672
4736
4800
4864
4928
4992
5056
5120
5184
5248
5312
5376
5440
5504
5568
5632
5696
5760
5824
5888
5952
6016
6080
6144
6208
6272
6336
6400
6464
6528
6592
6656
6720
6784
6848
6912
6976
7040
7104
7168
7232
7296
7360
7424
7488
7552
7616
7680
7744
7808
7872
7936
8000
8064
8128
8192
8256
8320
8384
8448
8512
8576
8640
8704
8768
8832
8896
8960
9024
9088
9152
9216
9280
9344
9408
9472
9536
9600
9664
9728
9792
9856
9920
9984
10048
10112
10176
10240
10304
10368
10432
10496
10560
10624
10688
10752
10816
10880
10944
11008
11072
11136
11200
11264
11328
11392
11456
11520
11584
11648
11712
11776
11840
11904
11968
12032
12096
12160
12224
12288
12352
12416
12480


In [42]:
model_name = "ResNet_light_deep_crop1_60m_v1"
os.makedirs(f"models/{model_name}", exist_ok=True)

# Save the variables used to make the dataset to a text file
with open(f"models/{model_name}/data_summary.txt", "w") as f:
    f.write("BATCH_SIZE = "+ str(BATCH_SIZE)+ "\n"+ 
            "train_start_date = "+ "2020-06-01"+ "\n"+
            "train_end_date = "+ train_start_date+ "\n"+
            "validation_start_date = "+ validation_start_date+ "\n"+
            "validation_end_date = "+ validation_end_date)
    
# Save the trained model for future predictions
torch.save(model.state_dict(), f"models/{model_name}/trained_model.pt")

# Create a DataFrame from the training_losses and validation_losses lists
df = pd.DataFrame({'Training Losses': batch_losses})
# df = pd.DataFrame({'Training Losses': batch_losses, 'Validation Losses': validation_losses})

# Export the DataFrame to a CSV file
df.to_csv(f'models\{model_name}\losses.csv', index=False)

RuntimeError: Parent directory models/ResNet_light_deep_crop1_60m does not exist.