<a href="https://colab.research.google.com/github/TheCodingCvrlo/bachelor-thesis/blob/main/best_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at ./drive; the data and checkpoint paths used later in this
# notebook all start with "drive//MyDrive".
from google.colab import drive
drive.mount("drive", force_remount = True)

Mounted at drive


In [None]:
# Install Weights & Biases for experiment tracking.
# NOTE(review): the version is not pinned, while the patch applied below targets
# specific line numbers of wandb/apis/public.py — TODO pin the wandb version the
# patch was written against.
%pip install wandb --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.7/211.7 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
#@title Patch to allow for continued wandb runs
#patch extrapolated from the following open github issue https://github.com/wandb/wandb/issues/5194

%%writefile public.py.patched
diff --git a/public.py b/public.py
index 4e961b6..8b68922 100644
--- a/public.py
+++ b/public.py
@@ -2784,14 +2784,14 @@ class File(Attrs):
         path = os.path.join(root, self.name)
         if os.path.exists(path) and not replace:
             if exist_ok:
-                return open(path)
+                return open(path, "rb")
             else:
                 raise ValueError(
                     "File already exists, pass replace=True to overwrite or exist_ok=True to leave it as is and don't error."
                 )

         util.download_file_from_url(path, self.url, Api().api_key)
-        return open(path)
+        return open(path, "rb")

     @normalize_exceptions
     def delete(self):

Writing public.py.patched


In [None]:
#apply patch
# Patches the installed wandb `public.py` in place with the diff written by the previous cell.
# NOTE(review): the target path hard-codes the python3.10 dist-packages directory;
# it has to match the runtime's actual wandb install location — TODO confirm when the
# Colab image changes.
!patch /usr/local/lib/python3.10/dist-packages/wandb/apis/public.py public.py.patched

patching file /usr/local/lib/python3.10/dist-packages/wandb/apis/public.py


In [None]:
#@title Imports
import os
import sys
import numpy as np
import pandas as pd
import pickle
from copy import deepcopy as cp
from datetime import datetime

#viz
import matplotlib.pyplot as plt
import seaborn as sns
import wandb

#pytorch
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, Subset
from torchvision.transforms import ToTensor, Lambda
import torch.optim as optim




In [None]:

# Root folder (on the mounted Drive) under which model checkpoints are written.
PATH_CHECKS = "drive//MyDrive//thesis//model_checkpoints"
# Dataset locations keyed by dataset name. The loading cell distinguishes two shapes:
#   - list value: [train features, train labels, test features, test labels] CSV files
#                 (already split; read by position)
#   - str value : a single CSV that is split into train/test at load time
PATHS_DATA = {
    'pca30' : [
        "drive//MyDrive//thesis//data//reduced//30//train//features.csv",
        "drive//MyDrive//thesis//data//reduced//30//train//labels.csv",
        "drive//MyDrive//thesis//data//reduced//30//test//features.csv",
        "drive//MyDrive//thesis//data//reduced//30//test//labels.csv"
        ],

    'pca100' : [
        "drive//MyDrive//thesis//data//reduced//100//train//features.csv",
        "drive//MyDrive//thesis//data//reduced//100//train//labels.csv",
        "drive//MyDrive//thesis//data//reduced//100//test//features.csv",
        "drive//MyDrive//thesis//data//reduced//100//test//labels.csv"
        ],
    'full30' : "drive//MyDrive//thesis//data//filtered//df_30.csv",
    'full100' : "drive//MyDrive//thesis//data//filtered//df_100.csv",
}


In [None]:
#@title Device selection
# default pytorch device selection snippet (credits https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html)
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

Using cuda device


In [None]:
#@title  Load df (test size only works for full reps)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

dataset = "pca30" #@param ["pca30","pca100","full30","full100"]{allow-input:true}
test_size = 0.15 #@param {type:"slider", min:0, max:1, step:0.05}
# Seed for the train/test split of the "full" datasets, so the split is reproducible across runs.
split_seed = 42 #@param {type:"integer"}

PATH_DF = PATHS_DATA[dataset]

# Feature columns are the ones whose name consists only of digits.
feats_pattern = r'^\d+$'

if isinstance(PATH_DF, str):
  # Single-CSV dataset: build one-hot labels from the `artist` column and split here.
  df = pd.read_csv(PATH_DF, index_col=0)

  oh_enc = OneHotEncoder(sparse_output=False)
  labels_oh = oh_enc.fit_transform(np.array(df['artist']).reshape(-1,1))

  labs_cols = ['lab_'+i for i in oh_enc.categories_[0]]
  labels_df = pd.DataFrame(labels_oh, index=df.index, columns=labs_cols)

  df_full = pd.merge(df, labels_df, left_index=True, right_index=True)

  feats_cols = df.filter(regex=feats_pattern).columns

  # NOTE(review): stratification uses a `label` column while the one-hot targets are
  # derived from `artist`; presumably both encode the same classes — TODO confirm.
  targets = df_full.label

  train_idx, valid_idx = train_test_split(
      np.arange(len(targets)),
      test_size=test_size,
      shuffle=True,
      stratify=targets,
      random_state=split_seed
      )

  df_train = df_full.iloc[train_idx]
  df_test = df_full.iloc[valid_idx]

  train_feats = df_train[feats_cols].values
  train_labs = df_train[labs_cols].values

  test_feats = df_test[feats_cols].values
  test_labs = df_test[labs_cols].values


else:
  # Pre-split dataset: PATH_DF is [train features, train labels, test features, test labels].
  train_feats = pd.read_csv(PATH_DF[0], index_col=0).filter(regex=feats_pattern).values
  test_feats = pd.read_csv(PATH_DF[2], index_col=0).filter(regex=feats_pattern).values

  train_labs = pd.read_csv(PATH_DF[1], index_col=0).values
  test_labs = pd.read_csv(PATH_DF[3], index_col=0).values

# m: number of features, k: number of classes, n: total number of samples.
n_train, m = train_feats.shape
n_test, _ = test_feats.shape

n = n_train + n_test

_, k = train_labs.shape


sys.stdout.write(f'Loaded dataset: {dataset}\n--------------------------\n')
sys.stdout.write(f' N_FEATURES: {m} (reference: m)\n N_CLASSES: {k} (reference: k)\n--------------------------\n')
sys.stdout.write(f' TRAIN SET SIZE: {n_train}\n TEST SET SIZE: {n_test}')

# Free the intermediate frames. They only exist on the single-CSV path, so a
# NameError here is expected for the pre-split datasets and is the only error ignored.
try:
  del df, df_full, df_train, df_test
except NameError:
  pass


Loaded dataset: pca30
--------------------------
 N_FEATURES: 256 (reference: m)
 N_CLASSES: 137 (reference: k)
--------------------------
 TRAIN SET SIZE: 9563
 TEST SET SIZE: 1688

## Helper functions

In [None]:
#@title Dynamic Architecture Model
class DNN(nn.Module):
  """
  MLP with tunable number of layers and nodes. Declare sizes as a list with the following structure: [dim_in, dim_in_hl1, ..., dim_out].
  Default dropout rate is 0.5, dropout happens after each activation (ReLU).

  Args:
    dropout: dropout probability applied after every hidden activation.
    sizes: layer widths ``[dim_in, ..., dim_out]``. When omitted, the notebook
      globals ``[m, k]`` (number of features, number of classes) are used; they
      are read when the model is built, so the default follows the dataset
      that is currently loaded.

  Raises:
    ValueError: if ``sizes`` has fewer than two entries (no layer can be built).

  The forward pass returns softmax probabilities over the last (class) dimension,
  for both a single sample of shape ``(dim_in,)`` and a batch ``(batch, dim_in)``.

  NOTE(review): ``nn.CrossEntropyLoss`` applies log-softmax itself and expects raw
  logits; feeding it these probabilities normalizes twice. Kept as-is so that the
  model output contract does not change — consider returning logits.
  """
  def __init__(self, dropout=0.5, sizes=None):
    super().__init__()
    if sizes is None:
      sizes = [m, k]
    n = len(sizes)
    if n < 2:
      raise ValueError("sizes must contain at least an input and an output dimension")

    stack = nn.ModuleList()
    drop = nn.Dropout(dropout)
    act = nn.ReLU()

    self.dropout = drop
    self.n_layers = n
    self.activation = act

    # Layout is Linear, ReLU, Dropout, Linear, ... , Linear. The positions inside
    # `stack` determine the state_dict keys, so this ordering must stay stable for
    # existing checkpoints to load.
    for i in range(n-1):
      stack.append(nn.Linear(sizes[i], sizes[i+1]))
      if i != n-2:
        # No activation/dropout after the output layer.
        stack.append(act)
        stack.append(drop)

    self.stack = stack


  def forward(self, x):
    for layer in self.stack:
      x = layer(x)

    # Normalize over the class axis (last dim). dim=0 would normalize across the
    # samples of a batch instead of across classes.
    return nn.Softmax(dim=-1)(x)

In [None]:
#@title dataset class

class MyDataset(Dataset):
  """Map-style dataset that pairs a feature container ``x`` with a label container ``y``.

  Both containers are indexed with the same key; the dataset length is the
  size of the first dimension of ``x``.
  """

  def __init__(self, x, y):
    self.x, self.y = x, y
    self.l = x.shape[0]

  def __len__(self):
    return self.l

  def __getitem__(self, idx):
    sample = self.x[idx]
    target = self.y[idx]
    return sample, target


In [None]:
#@title get dataloaders
def get_dataloaders(train_feats, test_feats, train_labels, test_labels, batch_size=1, shuffle=True, device=device):
  """Wrap the train and test arrays into ``DataLoader`` objects.

  Every numpy array is converted to a float tensor and moved to ``device``
  up front. Both loaders use the same ``batch_size`` and ``shuffle`` setting.

  Returns:
    (train_dataloader, test_dataloader)
  """

  def _as_dataset(feats, labels):
    # numpy -> float tensor on the target device, for features and labels alike
    x = torch.from_numpy(feats).float().to(device)
    y = torch.from_numpy(labels).float().to(device)
    return MyDataset(x, y)

  train_set = _as_dataset(train_feats, train_labels)
  test_set = _as_dataset(test_feats, test_labels)

  loader_opts = dict(batch_size=batch_size, shuffle=shuffle)

  return DataLoader(train_set, **loader_opts), DataLoader(test_set, **loader_opts)



In [None]:
#@title run setup
def run_setup(train_feats, test_feats, train_labels, test_labels, bs=64, shuffle=True, lr=1e-3, dropout=0.5, sizes=[m,k], device=device):
  """Build everything needed for a training run.

  Args:
    train_feats, test_feats, train_labels, test_labels: arrays handed to
      ``get_dataloaders``.
    bs: batch size for both dataloaders.
    shuffle: whether the dataloaders shuffle their samples.
    lr: learning rate of the SGD optimizer.
    dropout: dropout rate of the model.
    sizes: layer sizes of the model, ``[dim_in, ..., dim_out]``.
    device: device the model and the data are moved to.

  Returns:
    (model, optimizer, train_dataloader, test_dataloader)
  """
  model = DNN(dropout=dropout, sizes=sizes).to(device)
  optimizer = optim.SGD(model.parameters(), lr=lr)
  # optimizer = optim.Adam(model.parameters(), lr=lr)
  train_dl, test_dl = get_dataloaders(train_feats,
                                      test_feats,
                                      train_labels,
                                      test_labels,
                                      batch_size=bs,
                                      # forward the caller's choice instead of a hard-coded True
                                      shuffle=shuffle,
                                      device=device)

  return model, optimizer, train_dl, test_dl

In [None]:
#@title train/test loop
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimization pass (one epoch) over ``dataloader``.

    The model is put in training mode first, so dropout is active regardless
    of the mode a previous evaluation left it in.

    Args:
        dataloader: yields ``(X, y)`` batches; ``y`` is indexed with ``argmax(1)``,
            i.e. one score/indicator per class along dim 1.
        model: module being trained.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer updating the parameters of ``model``.

    Returns:
        (train_loss, train_acc): the mean of the per-batch losses as a Python
        float, and the fraction of correctly classified samples.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss, train_acc = 0.0, 0.0

    model.train()

    for X, y in dataloader:

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() detaches the value: summing the loss tensor itself would chain the
        # autograd history of every batch and return a tensor instead of a number.
        train_loss += loss.item()
        train_acc += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

    train_loss /= num_batches
    train_acc /= size

    return train_loss, train_acc

def test_loop(dataloader, model, loss_fn, verbose=False, t=1):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, test_acc = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            test_acc += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()


    test_loss /= num_batches
    test_acc /= size

    if verbose and t%100 == 0:
      sys.stdout.write('\r' + f"Validation accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    return test_loss, test_acc

In [None]:
# #DEBUG
# dlt, dls = get_dataloaders(train_feats, test_feats, train_labs, test_labs)
# print('get_dataloaders ok')
# dls_ls = [i for i in dls]
# print(f'datatype: {dls_ls[0][0].dtype, dls_ls[0][1].dtype}')
# mm, oo, dlt, dls = run_setup(train_feats, test_feats, train_labs, test_labs)
# print(mm)
# print(oo)
# train_loop(dls, mm, nn.CrossEntropyLoss(), oo)
# print('train_loop ok')
# test_loop(dlt, mm,nn.CrossEntropyLoss())
# print('checks ok!')

# Training Best Model
The following cells allow training a model over multiple rounds, saving a checkpoint at the end of each round and resuming from it later, while tracking every round within the same WandB run. Pretty swag.

In [None]:
# Hyperparameters handed to `run_setup` in the next cell.
bs = 16  # batch size
lr = 6e-2  # learning rate of the optimizer
dropout = 0.25  # dropout rate of the model
# Layer sizes: input m, one hidden layer of width (m+k)//2, output k.
sizes = [m, (m+k)//2, k]

In [None]:
# Build the model, its optimizer and both dataloaders from the hyperparameters above.
(model, optimizer, train_dataloader, test_dataloader) = run_setup(
    train_feats,
    test_feats,
    train_labs,
    test_labs,
    bs=bs,
    shuffle=True,
    lr=lr,
    dropout=dropout,
    sizes=sizes,
    device=device,
)




In [None]:
# Number of epochs to train in this round; when a checkpoint is resumed, the
# training cell counts them on top of the checkpoint's epoch.
epochs = 200

In [None]:
# List the checkpoints already saved for the selected dataset.
# NOTE(review): the directory is only created by the training cell below, so this
# presumably fails on a fresh Drive before the first run — TODO confirm.
os.listdir(f"drive//MyDrive//thesis//model_checkpoints//jukebox-best-models//{dataset}")

['dvt2xmy9_epoch_200']

In [None]:
#@title Launch Training
# NOTE(review): keep tokens/credentials out of cell titles and source. Authenticate
# with `wandb login` or the WANDB_API_KEY environment variable instead.
from datetime import datetime

PROJ_NAME = "jukebox-best-models"

PATH_PROJ = PATH_CHECKS + f"//{PROJ_NAME}"
os.makedirs(PATH_PROJ, exist_ok=True)
run_name = "pca30_v1"

# Set to the id of an existing wandb run to continue it; a fresh id is generated otherwise.
run_id = None

model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
epoch = 0

dt = str(datetime.now().strftime('%b_%d_%Y-%H_%M'))

if not run_id:
  run_id = wandb.util.generate_id()


# Checkpoint file that is restored when the wandb run is resumed.
chk = "sansy3uj_epoch_50"
PATH_CHECK = PATH_PROJ+f"//{dataset}"
os.makedirs(PATH_CHECK, exist_ok=True)
PATH_CHECK += f"//{chk}"



# comment "resume" parameter to initialize new run, comment "reinit" to load a checkpoint
run = wandb.init(
    project=PROJ_NAME,
    name=run_name,
    # resume='must',
    id=run_id,
    reinit=True
    )


if wandb.run.resumed:
  # map_location lets a checkpoint saved on one device (e.g. CUDA) load on the current one.
  checkpoint = torch.load(PATH_CHECK, map_location=device)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  train_loss = checkpoint['train_loss']
  test_loss = checkpoint['test_loss']
  train_acc = checkpoint['train_acc']
  test_acc = checkpoint['test_acc']


model.train()

sys.stdout.write(f"Training run {run_id}\n--------------------------\nEpoch {epoch}")

# t is the 1-based epoch number, continuing after the checkpoint's epoch when resuming.
for t in range(epoch+1, epochs + epoch+1):
    if t%100==0:
      # Report t itself: it is already 1-based and matches the epoch that is logged and saved.
      sys.stdout.write('\r'+ f"Epoch {t}\n-------------------------------")

    train_loss, train_acc = train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loss, test_acc =  test_loop(test_dataloader, model, loss_fn, verbose=True, t=t)

    # float() makes sure plain numbers are logged and stored even if a loss comes back as a tensor.
    train_loss = float(train_loss)
    test_loss = float(test_loss)

    metrics = {
        'train/train_loss': train_loss,
        'train/train_accuracy': train_acc,
        'train/epoch' : t,
        'test/epoch' : t,
        'test/test_loss': test_loss,
        'test/test_acc': test_acc
    }

    wandb.log(metrics)


# Save a single checkpoint at the end of the round, named after the run id and last epoch.
PATH_CHECK = PATH_PROJ+f'//{dataset}//{run_id}_epoch_{t}'

torch.save({

      'epoch': t,

      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),

      'train_loss': train_loss,
      'train_acc': train_acc,

      'test_loss': test_loss,
      'test_acc': test_acc,

      }, PATH_CHECK)

print("Done!")

0,1
test/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
test/test_acc,▁▂▂▃▂▄▃▃▂▄▄▅▄▃▅▂▄▂▃▃▆▄▆▄▃▅█▃▅▄▆▂▇▄▄▆▄▇█▃
test/test_loss,▇▅█▄▇▂▃█▇█▄▄▆▆▄▅▄▄▄▅▄▅▃▅▆▆▂▆▅▄▆▆▄▄▇▂▄▁▁▅
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/train_accuracy,▁▁▂▂▂▃▂▃▃▄▄▄▄▃▄▄▅▅▅▅▆▅▅▅▆▆▆▆▆▆▇▆▇█▇▇▇███
train/train_loss,██▇▇▇▆▆▅▅▆▅▆▅▅▅▄▄▅▃▄▃▃▄▄▃▃▄▃▄▃▂▃▃▂▃▃▃▂▂▁

0,1
test/epoch,150.0
test/test_acc,0.50501
test/test_loss,3.02618
train/epoch,150.0
train/train_accuracy,0.75398
train/train_loss,2.86516


Training run dvt2xmy9
--------------------------
Epoch 101
Validation accuracy: 37.1%, Avg loss: 4.338785 
Epoch 201
Validation accuracy: 38.3%, Avg loss: 4.352123 
Done!
