In [1]:
import glob
import json
import os
from datetime import datetime, time, timedelta

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import xarray as xr
from ocf_blosc2 import Blosc2
from torch.utils.data import DataLoader, IterableDataset
from torchinfo import summary
# Use a large default canvas (Matplotlib's "figure.figsize") for every figure in this notebook.
plt.rcParams.update({"figure.figsize": (20, 12)})

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [3]:
from dataset import HDF5Dataset

# Training samples come from one pre-processed HDF5 file; the two directories
# are passed through to HDF5Dataset unchanged.
# NOTE(review): the meaning of the four positional boolean flags is defined by
# HDF5Dataset and is not visible here — confirm against dataset.py.
dataset = HDF5Dataset(
    ['/data/processed_data/processed_train_dev.hdf5'],
    "/data/sat_np/",
    "/data/weather_np/",
    True,
    True,
    True,
    True,
)

# Batches of 16 are served in dataset order (no shuffling) by 8 worker
# processes, with pinned host memory.
data_loader = DataLoader(
    dataset,
    shuffle=False,
    batch_size=16,
    num_workers=8,
    pin_memory=True,
)

print(f"train dataset len: {len(dataset)}")

Opening file /data/processed_data/processed_train_dev.hdf5.
Warming up the dataloader!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 19949.13it/s]

train dataset len: 100





In [4]:
from submission.model import OurResnet2

# --- Training hyper-parameters -------------------------------------------
EPOCHS = 15       # number of passes the training loop makes over data_loader
START_EPOCH = 0   # offset added to epoch numbers in logs and checkpoint names
LR = 1e-3         # initial learning rate handed to AdamW

# --- Model, loss, optimiser, schedule ------------------------------------
model = OurResnet2(image_size=128, device=device)
model = model.to(device)

criterion = nn.L1Loss()
optimiser = optim.AdamW(
    model.parameters(),
    lr=LR,
    weight_decay=0.02,
)
# NOTE(review): T_max=10 is smaller than EPOCHS=15, and the scheduler is
# stepped once per epoch, so the cosine schedule reaches eta_min before the
# last epoch — confirm this is intended.
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimiser,
    T_max=10,
    eta_min=7e-5,
)

# Manual smoke test, kept disabled.
# NOTE(review): `y` below has 1 channel whereas the `summary` call uses 11 in
# that position — update the shape before re-enabling.
# x = torch.randn((1, 12)).to(device)
# y = torch.randn((1, 1, 12, 128, 128)).to(device)
# z = torch.randn((1, 10, 6, 128, 128)).to(device)
# a = torch.randn((1, 3)).to(device)
# model(x, y, z, a)

# Layer-by-layer summary for a batch of one. The four shapes are given in the
# order of the model's four positional inputs.
summary(
    model,
    input_size=[
        (1, 12),
        (1, 11, 12, 128, 128),
        (1, 10, 6, 128, 128),
        (1, 3),
    ],
)

Layer (type:depth-idx)                             Output Shape              Param #
OurResnet2                                         [1, 48]                   --
├─VideoResNet: 1-1                                 [1, 256]                  --
│    └─R2Plus1dStem: 2-1                           [1, 64, 12, 64, 64]       --
│    │    └─Conv3d: 3-1                            [1, 45, 12, 64, 64]       24,255
│    │    └─BatchNorm3d: 3-2                       [1, 45, 12, 64, 64]       90
│    │    └─ReLU: 3-3                              [1, 45, 12, 64, 64]       --
│    │    └─Conv3d: 3-4                            [1, 64, 12, 64, 64]       8,640
│    │    └─BatchNorm3d: 3-5                       [1, 64, 12, 64, 64]       128
│    │    └─ReLU: 3-6                              [1, 64, 12, 64, 64]       --
│    └─Sequential: 2-2                             [1, 64, 12, 64, 64]       --
│    │    └─BasicBlock: 3-7                        [1, 64, 12, 64, 64]       222,016
│    │    └─BasicBlock

In [5]:
# TensorBoard logger; the training loop writes its loss and learning-rate
# scalars through this object.
from torch.utils.tensorboard import SummaryWriter
# Constructed with no arguments, so the log location is SummaryWriter's default.
writer = SummaryWriter()
def hasNan(tensor):
    """Tell whether ``tensor`` holds at least one NaN element.

    Args:
        tensor: The ``torch.Tensor`` to inspect.

    Returns:
        The 0-dim boolean tensor produced by ``.any()``: true when any element
        is NaN, false otherwise (including for an empty tensor).
    """
    nan_mask = torch.isnan(tensor)
    return nan_mask.any()

In [6]:
MODEL_KEY = "ExtraEmbedding_TemporalResnet2+1Combo-PVResFCNet2"
CHECKPOINT_DIR = "./cpts"
print(f"Training model key {MODEL_KEY}")
from tqdm import tqdm

# torch.save does not create missing directories; make sure the checkpoint
# folder exists up front instead of failing after a full epoch of training.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Mixed precision is only requested when the run is actually on CUDA; on a
# CPU-only machine the autocast context is a no-op and the model runs in fp32.
use_amp = device == "cuda"

for epoch in range(EPOCHS):
    model.train()
    epoch_number = START_EPOCH + epoch + 1  # 1-based, resume-aware epoch label

    running_loss = 0.0  # sum of per-sample losses over the batches trained on
    count = 0           # number of samples those batches contained
    pbar = tqdm(data_loader, total=len(data_loader), ascii=True)
    # enumerate keeps `i` tied to the real batch index even when a batch is
    # skipped below, so log steps and the "Found nan" message stay accurate.
    for i, (pv_features, hrv_features, nwp, extra, pv_targets) in enumerate(pbar):
        # Skip batches containing NaNs before any gradient work is done.
        if hasNan(pv_features) or hasNan(hrv_features) or hasNan(nwp) or hasNan(extra) or hasNan(pv_targets):
            print(f"Found nan {i}")
            continue

        # Only the columns from index 2 onward of `extra` are fed to the model.
        real_extra = extra[:, 2:]

        optimiser.zero_grad()
        # Only the forward pass and the loss run under autocast.
        # NOTE(review): float16 autocast without a GradScaler can underflow
        # small gradients — consider adding one.
        with torch.autocast(device_type="cuda", enabled=use_amp):
            predictions = model(
                pv_features.to(device, dtype=torch.float),
                hrv_features.to(device, dtype=torch.float),
                nwp.to(device, dtype=torch.float),
                real_extra.to(device, dtype=torch.float),
            )
            loss = criterion(predictions, pv_targets.to(device, dtype=torch.float))
        # Backward and clipping happen outside the autocast context, as the
        # PyTorch AMP documentation recommends.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimiser.step()

        size = int(pv_targets.size(0))
        running_loss += float(loss) * size
        count += size

        if i % 10 == 9:
            # The global step includes START_EPOCH so a resumed run does not
            # overwrite the points logged by the earlier run.
            writer.add_scalar(
                "Loss/train_batch_level",
                running_loss / count,
                (START_EPOCH + epoch) * len(data_loader) + i,
            )
            pbar.set_description(f"Epoch {epoch_number}, {i + 1}: {running_loss / count}")
        if i % 100 == 99:
            print(f"Epoch {epoch_number}, {i + 1}: {running_loss / count}")
            writer.add_scalar("Loss/train_ep_level", running_loss / count, epoch_number)
        if i % 3000 == 2999:
            # Mid-epoch checkpoint; it shares its filename with the
            # end-of-epoch save below and is overwritten by it.
            torch.save(model.state_dict(), f"{CHECKPOINT_DIR}/{MODEL_KEY}-ep{epoch_number}.pt")

    lr_scheduler.step()
    current_lr = lr_scheduler.get_last_lr()[0]
    # Every batch may have been skipped; report NaN rather than dividing by zero.
    epoch_loss = running_loss / count if count else float("nan")
    print(f"Epoch {epoch_number}: {epoch_loss} (LR: {current_lr})")
    writer.add_scalar("LR", current_lr, epoch_number)
    torch.save(model.state_dict(), f"{CHECKPOINT_DIR}/{MODEL_KEY}-ep{epoch_number}.pt")
    print("Saved model!")

Training model key ExtraEmbedding_TemporalResnet2+1Combo-PVResFCNet2


  0%|                                                                                                                                        | 0/7 [00:01<?, ?it/s]


IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/dsingh/miniconda3/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/dsingh/miniconda3/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dsingh/miniconda3/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/home/dsingh/source/devksingh4/climatehack-2023/vit/dataset.py", line 128, in __getitem__
    data.append(torch.from_numpy(crop).permute(2, 1, 3, 4))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
IndexError: Dimension out of range (expected to be in range of [-4, 3], but got 4)


In [None]:
# Fetch every sample directly from the dataset (no DataLoader workers) and
# print the shapes of the satellite and weather tensors. The bound follows the
# dataset's own length rather than a hard-coded sample count.
for i in range(len(dataset)):
    pv_features, hrv_features, weather_features, extra, pv_targets = dataset[i]
    print(hrv_features.shape, weather_features.shape)