<a href="https://colab.research.google.com/github/TeoAle/Kaggle_ML2/blob/master/notebooks/ml2_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import os
import sys
from dataloader import load_train_df
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, Subset

%reload_ext autoreload

In [2]:
%pip install wandb --quiet

In [3]:
PATH = 'data'
df = load_train_df(
    PATH=PATH,
    decode_dummies=True,
    add_geo_features=True
)

In [4]:
y = df['Cover_Type'].values
X = df.drop(['Cover_Type'], axis=1)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

cols_numeric = X.select_dtypes(include=['float']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cols_numeric)]
)

target_encoder = OneHotEncoder(sparse_output=False)


In [7]:
X_train_array = preprocessor.fit_transform(X_train)
X_val_array = preprocessor.transform(X_val)

In [8]:
y_train_array = target_encoder.fit_transform(y_train.reshape(-1, 1))
y_val_array = target_encoder.transform(y_val.reshape(-1, 1))

## Helper Functions

In [9]:
#@title Dynamic Architecture Model
m = X_train_array.shape[1]
k = y_train_array.shape[1]

class DNN(nn.Module):
  """
  MLP with tunable number of layers and nodes. Declare sizes as a list with the following structure: [dim_in, dim_in_hl1, ..., dim_out].
  Default dropout rate is 0.5, dropout happens after each activation (ReLU).
  """
  def __init__(self, dropout=0.5, sizes=[m,k]):
    super().__init__()
    n = len(sizes)

    stack = nn.ModuleList()
    drop = nn.Dropout(dropout)
    act = nn.ReLU()

    self.dropout = drop
    self.n_layers = n
    self.activation = act

    for i in range(n-1):
      d_in = sizes[i]
      d_out = sizes[i+1]

      linear_layer = nn.Linear(d_in, d_out)

      stack.append(linear_layer)
      if i != n-2:
        stack.append(act)
        stack.append(drop)

    self.stack = stack


  def forward(self, x):
    m = nn.Softmax(dim=0)

    for layer in self.stack:
      x = layer(x)

    x = m(x)

    return x

In [10]:
#@title Device selection
# default pytorch device selection snippet (credits https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html)
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using {device} device")

Using cpu device


In [11]:
#@title dataset class

class MyDataset(Dataset):

  def __init__(self,x,y):
    self.x=x
    self.y=y
    self.l = x.shape[0]

  def __len__(self):
    return self.l

  def __getitem__(self,idx):
    return self.x[idx], self.y[idx]

In [12]:
#@title get dataloaders
def get_dataloaders(train_feats, test_feats, train_labels, test_labels, batch_size=1, shuffle=True, device=device):

  train_dataset = MyDataset(torch.from_numpy(train_feats).float().to(device), torch.tensor(train_labels).float().to(device))
  test_dataset = MyDataset(torch.from_numpy(test_feats).float().to(device), torch.tensor(test_labels).float().to(device))

  train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
  test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle)

  return train_dataloader, test_dataloader

In [13]:
#@title train/test loop
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss, train_acc = 0, 0


    for batch, (X, y) in enumerate(dataloader):

        optimizer.zero_grad()

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation

        loss.backward()
        optimizer.step()

        train_loss += loss
        train_acc += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

    train_loss /= num_batches
    train_acc /= size

    return train_loss, train_acc

def test_loop(dataloader, model, loss_fn, verbose=False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, test_acc = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            test_acc += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()


    test_loss /= num_batches
    test_acc /= size

    if verbose:
      sys.stdout.write('\r' + f"Validation accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    return test_loss, test_acc

In [14]:
#@title run setup
def run_setup(train_feats, test_feats, train_labels, test_labels, bs=2, shuffle=True, lr=1e-2, dropout=0.2, sizes=[m,k], device=device):
  model = DNN(dropout=dropout, sizes=sizes).to(device)
  optimizer = optim.SGD(model.parameters(), lr=lr)
  # optimizer = optim.Adam(model.parameters(), lr=lr)
  train_dl, test_dl = get_dataloaders(train_feats,
                                      test_feats,
                                      train_labels,
                                      test_labels,
                                      batch_size=bs,
                                      shuffle=True,
                                      device=device)

  return model, optimizer, train_dl, test_dl

## Prepare run

In [15]:
import wandb
# api-key: 4ca4dc012e66db4ce63df479d47564975c346ca3

In [16]:
#@title Configure Sweep
proj_name = "ml2-challenge"
sweep_name = "first-sweep" #@param [""]{allow-input:true}

sweep_configuration = {

    'method': 'random',

    'name': sweep_name,

    'metric': {
        'goal': 'maximize',
        'name': 'test_acc'
        },

    'parameters':
     {
        'batch_size': {
            'values': [4, 8, 16]
            },

        'lr': {
            'max': 0.1,
            'min': 1e-3
            },

        'dropout': {
            'values': [0.25, 0.5]
            },

        'epochs': {
            'values': [10]
            },

        'sizes': {
            'values': [
                [m, 128, 128, 64, k]
                ]
            }
        },
    'early_terminate' : {
            'type' : 'hyperband',
            's' : 3,
            'eta' : 2,
            'max_iter' : 10
            }
    }


In [17]:
sweep_id = wandb.sweep(sweep=sweep_configuration, project=proj_name)
# api-key: 4ca4dc012e66db4ce63df479d47564975c346ca3

Create sweep with ID: kb0vkc4h
Sweep URL: https://wandb.ai/torch-carlo/ml2-challenge/sweeps/kb0vkc4h


In [18]:
#@title  main loop
def main():

    run = wandb.init()


    lr  =  wandb.config.lr
    bs = wandb.config.batch_size
    drop = wandb.config.dropout
    epochs = wandb.config.epochs
    sizes = wandb.config.sizes

    model, op, train_dl, test_dl = run_setup(X_train_array,
                                           X_val_array,
                                           y_train_array,
                                           y_val_array,
                                           lr=lr,
                                           bs=bs,
                                           shuffle=True,
                                           dropout=drop,
                                           sizes=sizes,
                                           device=device
                                           )

    model = model.to(device)

    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(1, epochs):

      train_loss, train_acc = train_loop(train_dl, model, loss_fn, op)
      test_loss, test_acc = test_loop(test_dl, model, loss_fn)

      wandb.log({
        'epoch': epoch,
        'train_loss': train_loss,
        'test_acc': test_acc,
        'test_loss': test_loss
      }, step=epoch)

In [19]:
#@title # Start run
n_iterations = 3 #@param
wandb.agent(sweep_id, function=main, count=n_iterations)

[34m[1mwandb[0m: Agent Starting Run: a8qe7lms with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	lr: 0.05274841544680981
[34m[1mwandb[0m: 	sizes: [10, 128, 128, 64, 7]
[34m[1mwandb[0m: Currently logged in as: [33mcarlopatti1[0m ([33mtorch-carlo[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded\r'), FloatProgress(value=0.09642597968069666, max=1.…

0,1
epoch,▁▂▃▄▅▅▆▇█
test_acc,▆▆▅▄▃▁▅█▆
test_loss,█▆▅▆▆▄▆▅▁
train_loss,█▃▂▂▁▁▁▂▂

0,1
epoch,9.0
test_acc,0.34491
test_loss,1.70791
train_loss,1.72165


[34m[1mwandb[0m: Agent Starting Run: 5kprjd4p with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	lr: 0.0028193188679699107
[34m[1mwandb[0m: 	sizes: [10, 128, 128, 64, 7]


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▄▅▅▆▇█
test_acc,▁▁▁▃▄▆▆▆█
test_loss,██▇▇▆▆▅▄▁
train_loss,██▇▇▆▅▄▃▁

0,1
epoch,9.0
test_acc,0.22983
test_loss,1.93629
train_loss,1.93832


[34m[1mwandb[0m: Agent Starting Run: paib953c with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.25
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	lr: 0.04963525443622027
[34m[1mwandb[0m: 	sizes: [10, 128, 128, 64, 7]


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▄▅▅▆▇█
test_acc,▁▅▆▇▇▇▇██
test_loss,█▄▃▂▂▂▂▁▁
train_loss,█▄▃▂▂▁▁▁▁

0,1
epoch,9.0
test_acc,0.5205
test_loss,1.67375
train_loss,1.67531
