#Set up

In [5]:
!pip install torchinfo
!pip install torchviz



In [6]:
!pip install wandb



In [7]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33manon1859[0m (use `wandb login --relogin` to force relogin)


In [4]:
import os
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import urllib
import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

from sklearn import preprocessing
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchinfo import summary
from tqdm.notebook import tqdm

torch.__version__ # 1.10.0+cu111

'1.10.0+cu111'

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
%pip install -U git+https://github.com/szagoruyko/pytorchviz.git@master
from torchviz import make_dot, make_dot_from_trace

#Data loading


In [9]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [10]:
!tar -xvf '/content/gdrive/MyDrive/01_UnderGrad_3rd_year/Pattern Recog/HW 5/nowcastingHWdataset.tar.gz'

dataset/features-m10.pk
dataset/features-m6.pk
dataset/features-m7.pk
dataset/features-m8.pk
dataset/features-m9.pk
dataset/labels-m10.pk
dataset/labels-m6.pk
dataset/labels-m7.pk
dataset/labels-m8.pk
dataset/labels-m9.pk


In [11]:
def read_data(months, data_dir='dataset'):
    features = np.array([], dtype=np.float32).reshape(0,5,5,5,3)
    labels = np.array([], dtype=np.float32).reshape(0,5)
    for m in months:
        filename = 'features-m{}.pk'.format(m)
        with open(os.path.join(data_dir,filename), 'rb') as file:
            features_temp = pickle.load(file)
        features = np.concatenate((features, features_temp), axis=0)
        
        filename = 'labels-m{}.pk'.format(m)
        with open(os.path.join(data_dir,filename), 'rb') as file:
            labels_temp = pickle.load(file)
        labels = np.concatenate((labels, labels_temp), axis=0)

    return features, labels

In [12]:
# use data from month 6,7,8 as training set
x_train, y_train = read_data(months=[6,7,8])

# use data from month 9 as validation set
x_val, y_val = read_data(months=[9])

# use data from month 10 as test set
x_test, y_test = read_data(months=[10])

print('x_train shape:',x_train.shape)
print('y_train shape:', y_train.shape, '\n')
print('x_val shape:',x_val.shape)
print('y_val shape:', y_val.shape, '\n')
print('x_test shape:',x_test.shape)
print('y_test shape:', y_test.shape)

x_train shape: (229548, 5, 5, 5, 3)
y_train shape: (229548, 5) 

x_val shape: (92839, 5, 5, 5, 3)
y_val shape: (92839, 5) 

x_test shape: (111715, 5, 5, 5, 3)
y_test shape: (111715, 5)


#Data preparing

In [13]:
# Dataset need to be reshaped to make it suitable for feedforword model
def preprocess_for_CNNGRU(x_train, y_train, x_val, y_val, x_test, y_test):
    x_train = x_train.reshape((-1, 5, 5, 5, 3))
    x_train = np.moveaxis(x_train, -1, 2)
    y_train = y_train.reshape((-1, 5, 1))

    x_val = x_val.reshape((-1, 5, 5, 5, 3))
    x_val = np.moveaxis(x_val, -1, 2)
    y_val = y_val.reshape((-1, 5, 1))

    x_test = x_test.reshape((-1, 5, 5, 5, 3))
    x_test = np.moveaxis(x_test, -1, 2)
    y_test = y_test.reshape((-1, 5, 1))

    return x_train, y_train, x_val, y_val, x_test, y_test

x_train_CNNGRU, y_train_CNNGRU, x_val_CNNGRU, y_val_CNNGRU, x_test_CNNGRU, y_test_CNNGRU = preprocess_for_CNNGRU(x_train, y_train, x_val, y_val, x_test, y_test)
print(x_train_CNNGRU.shape, y_train_CNNGRU.shape)
print(x_val_CNNGRU.shape, y_val_CNNGRU.shape)
print(x_test_CNNGRU.shape, y_test_CNNGRU.shape)

(229548, 5, 3, 5, 5) (229548, 5, 1)
(92839, 5, 3, 5, 5) (92839, 5, 1)
(111715, 5, 3, 5, 5) (111715, 5, 1)


In [14]:
class RainfallDatasetCNNGRU(Dataset):
    def __init__(self, x, y):
        self.x = x.astype(np.float32)
        self.y = y.astype(np.float32)
        print(self.x.shape)
        print(self.y.shape)

    def __getitem__(self, index):
        x = self.x[index] # Retrieve data
        y = self.y[index]
        return x, y

    def __len__(self):
        return self.x.shape[0]

In [15]:
train_dataset_CNNGRU = RainfallDatasetCNNGRU(x_train_CNNGRU, y_train_CNNGRU)
val_dataset_CNNGRU = RainfallDatasetCNNGRU(x_val_CNNGRU, y_val_CNNGRU)
test_dataset_CNNGRU = RainfallDatasetCNNGRU(x_test_CNNGRU, y_test_CNNGRU)

(229548, 5, 3, 5, 5)
(229548, 5, 1)
(92839, 5, 3, 5, 5)
(92839, 5, 1)
(111715, 5, 3, 5, 5)
(111715, 5, 1)


In [16]:
train_loader_CNNGRU = DataLoader(train_dataset_CNNGRU, batch_size=1024, shuffle=True, pin_memory=True)
val_loader_CNNGRU = DataLoader(val_dataset_CNNGRU, batch_size=1024, shuffle=False, pin_memory=True)
test_loader_CNNGRU = DataLoader(test_dataset_CNNGRU, batch_size=1024, shuffle=False, pin_memory=True)

#Loss FN

In [17]:
loss_fn = nn.MSELoss()

#Evaluation

In [18]:
def evaluate(data_loader, model):
    """
    Evaluate model on validation data given by data_loader
    """
    model.eval()
    with torch.no_grad():
      losses = []
      for i, (inputs, y_true) in enumerate(tqdm(data_loader)):
        inputs = inputs.to(device)
        y_true = y_true.to(device)

        y_pred = model(inputs)
        loss = loss_fn(y_pred, y_true)

        losses.append(loss)
      mse = torch.stack(losses).mean()
    model.train()
    return mse

#Train

In [19]:
def train(config, organizer):
  train_losses = []
  val_losses = []
  learning_rates = []

  # Start wandb run
  wandb.init(
      project='precipitation-nowcasting',
      config=config,
  )

  model = organizer['model']
  optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
      optimizer, 
      'min', 
      factor=config['scheduler_factor'], 
      patience=config['scheduler_patience'], 
      min_lr=config['scheduler_min_lr']
  )
  train_loader = organizer['train_loader']
  val_loader = organizer['val_loader']

  # Log parameters and gradients
  wandb.watch(organizer['model'], log='all')

  for epoch in range(config['epochs']):  # loop over the dataset multiple times
      
      # Training
      train_loss = []
      current_lr = optimizer.param_groups[0]['lr']
      learning_rates.append(current_lr)

      # Flag model as training. Some layers behave differently in training and
      # inference modes, such as dropout, BN, etc.
      model.train()

      print(f"Training epoch {epoch+1}...")
      print(f"Current LR: {current_lr}")

      for i, (inputs, y_true) in enumerate(tqdm(train_loader)):
          # Transfer data from cpu to gpu
          inputs = inputs.to(device)
          y_true = y_true.to(device)

          # Reset the gradient
          optimizer.zero_grad()

          # Predict
          y_pred = model(inputs)
          # Calculate loss
          loss = loss_fn(y_pred, y_true)

          # Compute gradient
          loss.backward()
          
          # Update parameters
          optimizer.step()

          # Log stuff
          train_loss.append(loss)
          
      avg_train_loss = torch.stack(train_loss).mean().item()
      train_losses.append(avg_train_loss)

      print(f"Epoch {epoch+1} train loss: {avg_train_loss:.4f}")
      
      # Validation
      model.eval()
      with torch.no_grad(): # No gradient is required during validation
          print(f"Validating epoch {epoch+1}")
          val_loss = []
          for i, (inputs, y_true) in enumerate(tqdm(val_loader)):
              # Transfer data from cpu to gpu
              inputs = inputs.to(device)
              y_true = y_true.to(device)
              
              # Predict
              y_pred = model(inputs)

              # Calculate loss
              loss = loss_fn(y_pred, y_true)

              # Log stuff
              val_loss.append(loss)
          
          avg_val_loss = torch.stack(val_loss).mean().item()
          val_losses.append(avg_val_loss)
          print(f"Epoch {epoch+1} val loss: {avg_val_loss:.4f}")

          # LR adjustment with scheduler
          scheduler.step(avg_val_loss)

          # Save checkpoint if val_loss is the best we got
          best_val_loss = np.inf if epoch == 0 else min(val_losses[:-1])
          if avg_val_loss < best_val_loss:
              # Save whatever you want
              state = {
                  'epoch': epoch,
                  'model': model.state_dict(),
                  'optimizer': optimizer.state_dict(),
                  'scheduler': scheduler.state_dict(),
                  'train_loss': avg_train_loss,
                  'val_loss': avg_val_loss,
                  'best_val_loss': best_val_loss,
              }
              
              print(f"Saving new best model..")
              torch.save(state, config['architecture']+'.pth.tar')
      output = {
          'train_loss': avg_train_loss,
          'val_loss': avg_val_loss,
          'lr': current_lr,
      }
      wandb.log(output)

  wandb.finish()
  print('Finished Training')
  output['best_val_loss'] = best_val_loss
  output['state'] = state
  return output

#GRUCNN


In [32]:
class CNNGRU(nn.Module):
  def __init__(self, hidden_size = 200, kernel_size = 3, channel = 3, device = torch.device("cuda")):
    super(CNNGRU, self).__init__()
    self.device = device
    self.hidden_size = hidden_size

    self.conv2d = [nn.Conv2d(channel, hidden_size, kernel_size) for _ in range(5)]
    self.size_feature_map = 5-kernel_size+1
    self.ff1 = [nn.Linear(hidden_size*self.size_feature_map**2, 75) for _ in range(5)]
    self.gru = nn.GRU(75, hidden_size)
    self.ff2 = nn.Linear(hidden_size, hidden_size)
    self.out = nn.Linear(hidden_size, 1)

  def forward(self, x):
    t = torch.empty(x.size()[0], 5, 75).to(self.device)
    for i in range(x.size()[1]):
      temp = self.conv2d[i](x[:,i])
      temp = torch.flatten(temp, 1)
      t[:,i] = F.relu(self.ff1[i](temp))
    h0 = torch.zeros((1, 5, self.hidden_size), device = device).requires_grad_()
    t, _ = self.gru(t, h0.detach())
    t = F.relu(self.ff2(t))
    t = self.out(t)
    return t

In [33]:
model_CNNGRU = CNNGRU(device = device).to(device)
summary(model_CNNGRU, input_size=(1024, 5, 3, 5, 5))

Layer (type:depth-idx)                   Output Shape              Param #
CNNGRU                                   --                        --
├─GRU: 1-1                               [1024, 5, 200]            166,200
├─Linear: 1-2                            [1024, 5, 200]            40,200
├─Linear: 1-3                            [1024, 5, 1]              201
Total params: 206,601
Trainable params: 206,601
Non-trainable params: 0
Total mult-adds (M): 892.31
Input size (MB): 1.54
Forward/backward pass size (MB): 16.42
Params size (MB): 0.83
Estimated Total Size (MB): 18.79

In [39]:
config = {
    'architecture': 'cnn-gru-2',
    'lr': 0.1,
    'hidden_size': 50,
    'scheduler_factor': 0.3,
    'scheduler_patience': 0,
    'scheduler_min_lr': 5e-5,
    'epochs': 10
}

model_CNNGRU = CNNGRU(device = device)
model_CNNGRU = model_CNNGRU.to(device)

organizer = {
  'model' : model_CNNGRU,
  'train_loader' : train_loader_CNNGRU,
  'val_loader' : val_loader_CNNGRU
}

outcome_ff = train(config, organizer)

Training epoch 1...
Current LR: 0.1


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 1 train loss: 9.5386
Validating epoch 1


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 1 val loss: 1.6788
Saving new best model..
Training epoch 2...
Current LR: 0.1


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 2 train loss: 1.9268
Validating epoch 2


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 2 val loss: 1.6844
Training epoch 3...
Current LR: 0.03


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 3 train loss: 1.9263
Validating epoch 3


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 3 val loss: 1.6783
Saving new best model..
Training epoch 4...
Current LR: 0.03


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 4 train loss: 1.9206
Validating epoch 4


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 4 val loss: 1.6779
Saving new best model..
Training epoch 5...
Current LR: 0.03


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 5 train loss: 1.9208
Validating epoch 5


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 5 val loss: 1.6763
Saving new best model..
Training epoch 6...
Current LR: 0.03


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 6 train loss: 1.9213
Validating epoch 6


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 6 val loss: 1.6770
Training epoch 7...
Current LR: 0.009


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 7 train loss: 1.9181
Validating epoch 7


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 7 val loss: 1.6793
Training epoch 8...
Current LR: 0.0026999999999999997


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 8 train loss: 1.9229
Validating epoch 8


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 8 val loss: 1.6766
Training epoch 9...
Current LR: 0.0008099999999999998


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 9 train loss: 1.9236
Validating epoch 9


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 9 val loss: 1.6770
Training epoch 10...
Current LR: 0.00024299999999999994


  0%|          | 0/225 [00:00<?, ?it/s]

Epoch 10 train loss: 1.9225
Validating epoch 10


  0%|          | 0/91 [00:00<?, ?it/s]

Epoch 10 val loss: 1.6775



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
lr,██▃▃▃▃▂▁▁▁
train_loss,█▁▁▁▁▁▁▁▁▁
val_loss,▃█▃▂▁▂▄▁▂▂

0,1
lr,0.00024
train_loss,1.92254
val_loss,1.67751


Finished Training


#Model dowloading & Test eval

In [44]:
checkpoint = torch.load('/content/cnn-gru-2.pth.tar')
loaded_model = CNNGRU(device = device) # Create model object
loaded_model.load_state_dict(checkpoint['model']) # Load weights
loaded_model = loaded_model.to(device)
print(f"Loaded epoch {checkpoint['epoch']} model")

print('CNNGRU-model')
print('validate', evaluate(val_loader_CNNGRU, model_CNNGRU).item())
print('test', evaluate(test_loader_CNNGRU, model_CNNGRU).item())

Loaded epoch 4 model
CNNGRU-model


  0%|          | 0/91 [00:00<?, ?it/s]

validate 1.6775100231170654


  0%|          | 0/110 [00:00<?, ?it/s]

test 1.1583175659179688
