In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import math as m
import os
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the CSV reads later in
# this notebook use paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install wandb
!wandb login
import wandb
wandb.init(project="multitask", entity="sophied")
sweep_config = {'method': 'random'}
metric = {'name': 'loss', 'goal': 'minimize'   }
sweep_config['metric'] = metric
parameters_dict = {
    # 'optimizer': {
    #     'values': ['adam', 'sgd']
    #     },
    'latent_dim': {
        'min': 1,
        'max': 10,
        'distribution': 'int_uniform'
        # 'values': [1, 2, 3, 4, 5, 7, 10, 20]
    },
    'loss_scalar': {
        'min': 1,
        'max': 3,
        'distribution': 'uniform'
        # 'values': [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3]
    },
    'learning_rate': {
        'min': 0.0001,
        'max': 0.005,
        'distribution': 'uniform'
        # 'values': [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01]
    },
    'batch_size': {
        'min': 50,
        'max': 500,
        'distribution': 'int_uniform'
        # 'values': [64, 100, 128, 200, 256, 300, 500]
    },
    'ae_layer': {
        'min': 7,
        'max': 20,
        'distribution': 'int_uniform'
    },
    'branch1_layer': {
        'min': 4,
        'max': 10,
        'distribution': 'int_uniform'
    }
}

sweep_config['parameters'] = parameters_dict
import pprint
pprint.pprint(sweep_config)
sweep_id = wandb.sweep(sweep_config, project="multitask")

[34m[1mwandb[0m: Currently logged in as: [33msophied[0m (use `wandb login --relogin` to force relogin)




{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'ae_layer': {'distribution': 'int_uniform',
                             'max': 20,
                             'min': 7},
                'batch_size': {'distribution': 'int_uniform',
                               'max': 500,
                               'min': 50},
                'branch1_layer': {'distribution': 'int_uniform',
                                  'max': 10,
                                  'min': 4},
                'latent_dim': {'distribution': 'int_uniform',
                               'max': 10,
                               'min': 1},
                'learning_rate': {'distribution': 'uniform',
                                  'max': 0.005,
                                  'min': 0.0001},
                'loss_scalar': {'distribution': 'uniform', 'max': 3, 'min': 1}}}
Create sweep with ID: k3hiuw5i
Sweep URL: https://wandb.ai/sophied/multitask/sweeps/k3hiuw5i


In [None]:
# https://github.com/hcarlens/pytorch-tabular/blob/master/fast_tensor_data_loader.py
class FastTensorDataLoader:
    """
    A DataLoader-like object for a set of tensors that can be much faster than
    TensorDataset + DataLoader because dataloader grabs individual indices of
    the dataset and calls cat (slow).
    Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6

    The object is its own iterator: the read position lives on the instance,
    so two simultaneous iterations over the same loader share that position.
    """
    def __init__(self, *tensors, batch_size=32, shuffle=False):
        """
        Initialize a FastTensorDataLoader.
        :param *tensors: tensors to store. Must have the same length @ dim 0.
        :param batch_size: batch size to load. Must be at least 1.
        :param shuffle: if True, the stored tensors are replaced by randomly
            permuted copies (one shared permutation for all tensors) whenever
            an iterator is created out of this object.
        :raises ValueError: if batch_size is smaller than 1.
        :returns: A FastTensorDataLoader.
        """
        # A zero batch size cannot be divided by, and a negative one would
        # yield a negative length and an iterator that never terminates.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        assert all(t.shape[0] == tensors[0].shape[0] for t in tensors)
        self.tensors = tensors

        self.dataset_len = self.tensors[0].shape[0]
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Number of batches, counting a trailing partial batch.
        n_batches, remainder = divmod(self.dataset_len, self.batch_size)
        if remainder > 0:
            n_batches += 1
        self.n_batches = n_batches

    def __iter__(self):
        """Restart iteration from the first row, reshuffling first if enabled."""
        if self.shuffle:
            # Apply the same permutation to every tensor so rows stay aligned.
            r = torch.randperm(self.dataset_len)
            self.tensors = [t[r] for t in self.tensors]
        self.i = 0
        return self

    def __next__(self):
        """Return the next batch as a tuple with one slice per stored tensor."""
        if self.i >= self.dataset_len:
            raise StopIteration
        start, stop = self.i, self.i + self.batch_size
        batch = tuple(t[start:stop] for t in self.tensors)
        self.i = stop
        return batch

    def __len__(self):
        """Number of batches produced by one full pass."""
        return self.n_batches

In [None]:
class Bear(torch.nn.Module):
  """Autoencoder with an extra prediction head fed from the latent code.

  The encoder maps an input row to a `dim`-wide latent vector. Two heads read
  that vector: the decoder reconstructs the input, and `branch1` produces a
  single scalar per row. Both heads apply a ReLU to the latent vector first.
  """

  def __init__(self, dim, ae_layer, branch1_layer, input_dim=100):
    """
    :param dim: width of the latent vector.
    :param ae_layer: width of the hidden layer in the encoder and the decoder.
    :param branch1_layer: width of the hidden layer in the prediction head.
    :param input_dim: number of input features, which is also the width of
        the reconstruction. Defaults to 100, the previously fixed value.
    """
    super().__init__()
    self.encoder = torch.nn.Sequential(
      torch.nn.Linear(input_dim, ae_layer),
      torch.nn.ReLU(),
      torch.nn.Linear(ae_layer, dim)
    )
    self.decoder = torch.nn.Sequential(
      torch.nn.ReLU(),
      torch.nn.Linear(dim, ae_layer),
      torch.nn.ReLU(),
      torch.nn.Linear(ae_layer, input_dim)
    )
    self.branch1 = torch.nn.Sequential(
      torch.nn.ReLU(),
      torch.nn.Linear(dim, branch1_layer),
      torch.nn.ReLU(),
      torch.nn.Linear(branch1_layer, 1)
    )

  def forward(self, x):
    """Return `(latent, reconstruction, prediction)` for the input batch `x`.

    :param x: tensor whose last dimension has `input_dim` entries.
    :returns: the encoder output, the decoder output and the `branch1` output,
        in that order.
    """
    latent_space = self.encoder(x)
    return latent_space, self.decoder(latent_space), self.branch1(latent_space)

NameError: ignored

In [None]:
# Folder on the mounted Drive that holds the train/validate/test CSV files.
DATA_DIR = '/content/drive/MyDrive/Project_MTL/data/SyntheticData'


def _read_split(split_name):
  """Read `<DATA_DIR>/<split_name>.csv` and drop its 'Unnamed: 0' column.

  'Unnamed: 0' is presumably the row index written when the CSV was saved.
  """
  return pd.read_csv(f'{DATA_DIR}/{split_name}.csv').drop(columns='Unnamed: 0')


# Parse train.csv once and reuse the frame for the tensor.
# NOTE(review): requires_grad=True on data tensors makes autograd track
# gradients with respect to the data itself; kept because other cells may
# rely on it -- confirm whether input gradients are actually needed.
train = _read_split('train')
train_data = torch.tensor(train.values, requires_grad=True, dtype=torch.float32)
validate = torch.tensor(_read_split('validate').values, requires_grad=True, dtype=torch.float32)

# The first 100 columns are the model inputs; the remaining columns are the
# regression targets, multiplied by 0.001.
X = train_data[:, :100]
Y = train_data[:, 100:].mul(0.001)
X_v = validate[:, :100]
Y_v = validate[:, 100:].mul(0.001)

# The test split is separated by column name instead of by position.
test = _read_split('test')#.sample(n=100)
X_test = torch.tensor(test.drop(columns=['half_life']).values, requires_grad=True, dtype=torch.float32)
Y_test = torch.tensor(test['half_life'].to_numpy(), requires_grad=True, dtype=torch.float32).unsqueeze(dim=1).mul(0.001)

In [None]:
loss_func = torch.nn.MSELoss()
def train(epochs=25, batch_size=200, config=None, learning_rate=0.005):
  """Train one `Bear` model inside a W&B run and log metrics every step.

  Hyperparameters are read from `wandb.config` after `wandb.init`:
  `latent_dim`, `ae_layer`, `branch1_layer`, `batch_size`, `learning_rate`
  and `loss_scalar`. The optimized loss is
  `loss_scalar * reconstruction MSE + prediction MSE`.

  Note: defining this function rebinds the module-level name `train`, which
  the data-loading cell also assigns.

  :param epochs: number of passes over the training data.
  :param batch_size: overwritten by `config.batch_size`; kept for interface
      compatibility.
  :param config: passed to `wandb.init` (None when a sweep agent supplies
      the values).
  :param learning_rate: not used; the optimizer reads `config.learning_rate`.
      Kept for interface compatibility.
  """
  with wandb.init(config=config):
    config = wandb.config
    model = Bear(config.latent_dim, config.ae_layer, config.branch1_layer)
    batch_size = config.batch_size
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # The data tensors were created with requires_grad=True. Detaching them
    # keeps backward() from propagating into the data, so the graph no longer
    # has to be retained between steps.
    inputs, targets = X.detach(), Y.detach()
    inputs_v, targets_v = X_v.detach(), Y_v.detach()

    train_batches = FastTensorDataLoader(inputs, targets, batch_size=batch_size, shuffle=False)
    for epoch in range(epochs):
      for x_batch, y_batch in train_batches:
        optimizer.zero_grad()
        # Train on the current mini-batch, not on the whole training set.
        latent_space, reconstructed, preds = model(x_batch)
        branch1_loss = loss_func(preds, y_batch)
        ae_loss = loss_func(reconstructed, x_batch)
        loss = config.loss_scalar*ae_loss + branch1_loss
        loss.backward()
        optimizer.step()

        # Collect everything for this iteration and log it in one call so
        # all values share one W&B step. The logged 'loss' is the unweighted
        # sum, as before.
        metrics = {
          'epoch': epoch,
          'loss': (ae_loss + branch1_loss).item(),
          'ae_loss': ae_loss.item(),
          'branch1_loss': branch1_loss.item(),
        }
        for i in range(config.latent_dim):
          metrics[str(i+1)] = latent_space[:, i: i+1].detach()

        # Validation needs no gradients.
        with torch.no_grad():
          _, reconstructed_v, preds_v = model(inputs_v)
          branch1_loss_v = loss_func(preds_v, targets_v)
          # NOTE(review): the validation reconstruction loss is weighted by a
          # fixed 2 rather than config.loss_scalar, while the logged training
          # 'loss' is unweighted. Kept as-is so existing charts stay
          # comparable -- confirm intent.
          ae_loss_v = 2*loss_func(reconstructed_v, inputs_v)
          loss_v = ae_loss_v + branch1_loss_v
        metrics['v_loss'] = loss_v.item()
        metrics['v_ae_loss'] = ae_loss_v.item()
        metrics['v_branch1_loss'] = branch1_loss_v.item()

        wandb.log(metrics)

In [None]:
# Start a W&B sweep agent for `sweep_id`: it calls `train` once per sampled
# hyperparameter configuration and stops after at most 10000 runs.
wandb.agent(sweep_id, train, count=10000)