In [51]:
import torch
from torch import nn
from torch import optim
import pandas as pd
import numpy as np
import wandb

wandb.login()

sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

parameters_dict = {
    'optimizer': {
        'values': ['adam', 'sgd']
        },
    'fc_layer_size': {
        'values': [128, 256, 512]
        },
    'dropout': {
          'values': [0.3, 0.4, 0.5]
        },
    }

sweep_config['parameters'] = parameters_dict

parameters_dict.update({
    'epochs': {
        'value': 2}
    })

parameters_dict.update({
    'learning_rate': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0,
        'max': 0.1
      },
    'batch_size': {
        # integers between 32 and 256
        # with evenly-distributed logarithms 
        'distribution': 'q_log_uniform_values',
        'q': 8,
        'min': 32,
        'max': 256,
      }
    })

import pprint
pprint.pprint(sweep_config)

sweep_id = wandb.sweep(sweep_config, project="digitrecognizer-sweeps")

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'batch_size': {'distribution': 'q_log_uniform_values',
                               'max': 256,
                               'min': 32,
                               'q': 8},
                'dropout': {'values': [0.3, 0.4, 0.5]},
                'epochs': {'value': 2},
                'fc_layer_size': {'values': [128, 256, 512]},
                'learning_rate': {'distribution': 'uniform',
                                  'max': 0.1,
                                  'min': 0},
                'optimizer': {'values': ['adam', 'sgd']}}}
Create sweep with ID: hb9e9nup
Sweep URL: https://wandb.ai/nunoduarte/digitrecognizer-sweeps/sweeps/hb9e9nup


In [56]:
# load the data
df = pd.read_csv('train.csv')

# creating tensor from targets_df 
df_x = df.iloc[:,1:]
df_y = df.iloc[:,0]
# normalize from 0:1
torch_x = torch.tensor(df_x.values).float() / 255
torch_y = torch.tensor(df_y.values).long()

# Convert to one-hot encoding
num_classes = 10  # Assuming you have 10 classes (0 to 9)
one_hot_encoded = torch.eye(num_classes)[torch_y]

# split train, val, test set
split = 38000
train_x = torch_x[:split]
train_y = torch_y[:split]
# train_y = one_hot_encoded[:split]

val_x = torch_x[split:]
val_y = torch_y[split:]
# val_y = one_hot_encoded[split:]


In [57]:
def build_network(fc_layer_size, dropout):
    model = nn.Sequential(  # fully-connected
        nn.Linear(28*28, fc_layer_size),
        nn.ReLU(),
        nn.Linear(fc_layer_size, fc_layer_size),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(fc_layer_size, 10))

    return model.cuda()
        

def build_optimizer(model, optimizer, learning_rate):
    if optimizer == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learning_rate, momentum=0.9)
    elif optimizer == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learning_rate)
    return optimizer


def train_epoch(network, train_loader, val_loader, optimizer, epoch):
    losses = list()
    accuracy = list()
    cumu_loss_t = 0
    for batch in train_loader():
        x, y = batch
        
        # b = x.size(0)
        # x = x.view(b, -1)   

        l = model(x.cuda())    # l:logits
        
        loss = nn.CrossEntropyLoss()
        # 2. compute the objective function
        J = loss(l, y.cuda())

        # 3. cleaning the gradients
        model.zero_grad()
            
        # 4. accumulate the partial derivatives of J wrt params
        J.backward()

        # 5. step in hte opposite direction of the gradient
        optimizer.step()

        losses.append(J.item())
        cumu_loss_t += J.item()
        accuracy.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())
        
        wandb.log({"batch loss": J.item()})

    print(f' Epoch {epoch +1}, train loss: {torch.tensor(losses).mean()}', end=', ')
    print(f' train accuracy: {torch.tensor(accuracy).mean()}')

    losses = list()
    accuracy = list()
    cumu_loss = 0
    for batch in val_loader():
        x, y = batch
        
        # # for torchvision dataset
        # b = x.size(0)
        # x = x.view(b, -1)

        # 1. forward
        with torch.no_grad():
            l = model(x.cuda())    # l:logits

        # 2. compute the objective function
        J = loss(l, y.cuda())

        losses.append(J.item())
        cumu_loss += J.item()
        accuracy.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

    print(f' Epoch {epoch +1}, validation loss: {torch.tensor(losses).mean()}', end=', ')
    print(f' val accuracy: {torch.tensor(accuracy).mean()}')

    return cumu_loss_t, cumu_loss


In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        def train_loader():
            num_batches = train_x.shape[0] // config.batch_size
            for i in range(num_batches):
                batch_start = i * config.batch_size
                batch_end = (i + 1) * config.batch_size
                batch_X = train_x[batch_start:batch_end,:]
                batch_Y = train_y[batch_start:batch_end]
                yield batch_X, batch_Y

        def val_loader():
            num_batches = val_x.shape[0] // config.batch_size
            for i in range(num_batches):
                batch_start = i * config.batch_size
                batch_end = (i + 1) * config.batch_size
                batch_X = val_x[batch_start:batch_end,:]
                batch_Y = val_y[batch_start:batch_end]
                yield batch_X, batch_Y

        network = build_network(config.fc_layer_size, config.dropout)
        optimizer = build_optimizer(network, config.optimizer, config.learning_rate)

        for epoch in range(config.epochs):
            avg_train_loss, avg_val_loss = train_epoch(network, train_loader, val_loader, optimizer, epoch)
            wandb.log({"loss": avg_train_loss, "val loss": avg_val_loss, "epoch": epoch})           

In [62]:
wandb.agent(sweep_id, train, count=5)

[34m[1mwandb[0m: Agent Starting Run: p7udbjql with config:
[34m[1mwandb[0m: 	batch_size: 112
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_layer_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.030106120876737297
[34m[1mwandb[0m: 	optimizer: adam


 Epoch 1, train loss: 0.0640685185790062,  train accuracy: 0.9800094366073608
 Epoch 1, validation loss: 0.12194699048995972,  val accuracy: 0.9647958874702454
 Epoch 2, train loss: 0.0640685185790062,  train accuracy: 0.9800094366073608
 Epoch 2, validation loss: 0.12194699048995972,  val accuracy: 0.9647958874702454


0,1
batch loss,▃▃▃▃▄▂▂▄▅█▆▇▇█▁▇▁▄▃▂▃▃▅▄▁▅▂▂▂▂▃▃▂▆▄▂▁▅▅▃
epoch,▁█
loss,▁▁
val loss,▁▁

0,1
batch loss,0.0366
epoch,1.0
loss,21.71923
val loss,4.26814


[34m[1mwandb[0m: Agent Starting Run: ynwyz3ue with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_layer_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0033090616601750057
[34m[1mwandb[0m: 	optimizer: sgd


 Epoch 1, train loss: 0.06408777832984924,  train accuracy: 0.9800010323524475
 Epoch 1, validation loss: 0.1222604289650917,  val accuracy: 0.9649697542190552
 Epoch 2, train loss: 0.06408777832984924,  train accuracy: 0.9800010323524475
 Epoch 2, validation loss: 0.1222604289650917,  val accuracy: 0.9649697542190552


0,1
batch loss,▁▂▃▂▂▁▁▁▁▁▂▂▅▄▁▂▂▃▂▂▄▂▃▁▂▁▂▁█▄▁▂▃▁▂▁▁▁▂▂
epoch,▁█
loss,▁▁
val loss,▁▁

0,1
batch loss,0.0421
epoch,1.0
loss,38.00405
val loss,7.58015


[34m[1mwandb[0m: Agent Starting Run: 66vj3hrp with config:
[34m[1mwandb[0m: 	batch_size: 80
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_layer_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.007051036531566491
[34m[1mwandb[0m: 	optimizer: adam


 Epoch 1, train loss: 0.06406784802675247,  train accuracy: 0.9800000786781311
 Epoch 1, validation loss: 0.12133482098579407,  val accuracy: 0.9652499556541443
 Epoch 2, train loss: 0.06406784802675247,  train accuracy: 0.9800000786781311
 Epoch 2, validation loss: 0.12133482098579407,  val accuracy: 0.9652499556541443


0,1
batch loss,▄▃▃▃▁▁▂▂▂█▃▆█▅▃▃▁▂▃▂▁▄▁▂▁▁▁▄▂▄▄▃▂▄▂▅▄▂▄▁
epoch,▁█
loss,▁▁
val loss,▁▁

0,1
batch loss,0.0426
epoch,1.0
loss,30.43223
val loss,6.06674


[34m[1mwandb[0m: Agent Starting Run: 61xnv9x1 with config:
[34m[1mwandb[0m: 	batch_size: 136
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_layer_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.09940310825863936
[34m[1mwandb[0m: 	optimizer: adam


 Epoch 1, train loss: 0.06408625841140747,  train accuracy: 0.9799967408180237
 Epoch 1, validation loss: 0.1229841485619545,  val accuracy: 0.9647565484046936
 Epoch 2, train loss: 0.06408625841140747,  train accuracy: 0.9799967408180237
 Epoch 2, validation loss: 0.1229841485619545,  val accuracy: 0.9647565484046936


0,1
batch loss,▂▁▃▃█▂▃▂▅▂▄▅▁▃▂▄▄▅▂▄▃▃▃▄▄▁▁▁▃▃▅▃▂▇▅▄▂▄▃▅
epoch,▁█
loss,▁▁
val loss,▁▁

0,1
batch loss,0.04226
epoch,1.0
loss,17.88007
val loss,3.56654


[34m[1mwandb[0m: Agent Starting Run: t4ck3fb1 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_layer_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.003257377582173338
[34m[1mwandb[0m: 	optimizer: sgd


 Epoch 1, train loss: 0.06404935568571091,  train accuracy: 0.980017900466919
 Epoch 1, validation loss: 0.12133480608463287,  val accuracy: 0.9652500152587891
 Epoch 2, train loss: 0.06404935568571091,  train accuracy: 0.980017900466919
 Epoch 2, validation loss: 0.12133480608463287,  val accuracy: 0.9652500152587891


0,1
batch loss,▂▂▁▅▁▁▁▁▂▁▃▂▆▅▂▁▄▆▁▂█▁▁▂▂▁▃▁▃▆▁▂▆▁▃▁▂▁▃▁
epoch,▁█
loss,▁▁
val loss,▁▁

0,1
batch loss,0.01848
epoch,1.0
loss,76.02658
val loss,15.16685


In [7]:
# My Training loops
nb_epochs = 50
for epoch in range(nb_epochs):
    losses = list()
    accuracy = list()
    for batch in train_loader():
        x, y = batch

        # print(x.size())
        # print(y)

        # # for torchvision dataset
        b = x.size(0)
        x = x.view(b, -1)

        # print(x.size())

        # batch b
        # x 28*28

        # 1. forward
        # print(x[0])
        l = model(x.cuda())    # l:logits

        # 2. compute the objective function
        J = loss(l, y.cuda())

        # 3. cleaning the gradients
        model.zero_grad()
            
        # 4. accumulate the partial derivatives of J wrt params
        J.backward()

        # 5. step in hte opposite direction of the gradient
        optimizer.step()

        losses.append(J.item())
        accuracy.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

    print(f' Epoch {epoch +1}, train loss: {torch.tensor(losses).mean()}', end=', ')
    print(f' train accuracy: {torch.tensor(accuracy).mean()}')


    losses = list()
    accuracy = list()
    for batch in val_loader():
        x, y = batch
        
        # for torchvision dataset
        b = x.size(0)
        x = x.view(b, -1)

        # 1. forward
        with torch.no_grad():
            l = model(x.cuda())    # l:logits

        # 2. compute the objective function
        J = loss(l, y.cuda())

        losses.append(J.item())
        accuracy.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

    print(f' Epoch {epoch +1}, validation loss: {torch.tensor(losses).mean()}', end=', ')
    print(f' val accuracy: {torch.tensor(accuracy).mean()}')
    

 Epoch 1, train loss: 0.46435442566871643,  train accuracy: 0.8583350777626038
 Epoch 1, validation loss: 0.21963706612586975,  val accuracy: 0.9387500286102295
 Epoch 2, train loss: 0.18161554634571075,  train accuracy: 0.9446872472763062
 Epoch 2, validation loss: 0.16401249170303345,  val accuracy: 0.9522500038146973
 Epoch 3, train loss: 0.12903910875320435,  train accuracy: 0.9611151814460754
 Epoch 3, validation loss: 0.13805542886257172,  val accuracy: 0.9585000276565552
 Epoch 4, train loss: 0.10031295567750931,  train accuracy: 0.9703559279441833
 Epoch 4, validation loss: 0.1297679841518402,  val accuracy: 0.9610000252723694
 Epoch 5, train loss: 0.08099465072154999,  train accuracy: 0.9761478304862976
 Epoch 5, validation loss: 0.13459056615829468,  val accuracy: 0.9607499837875366


KeyboardInterrupt: 

In [54]:
# printing out result
print(torch_x.shape)
print(torch_y.shape)

torch_x_image = torch_x.view(-1, 1, 28, 28)

# split train, val, test set
split = 38000
train_x_image = torch_x_image[:split]
train_y = torch_y[:split]
# train_y = one_hot_encoded[:split]

val_x_image = torch_x_image[split:]
val_y = torch_y[split:]
# val_y = one_hot_encoded[split:]

# printing out result
print(train_x_image.shape)
print(val_x_image.shape)

# batching data
batch_size = 32
def train_loader():
    num_batches = train_x.shape[0] // batch_size
    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = (i + 1) * batch_size
        batch_X_image = train_x_image[batch_start:batch_end,:]
        batch_Y = train_y[batch_start:batch_end]
        yield batch_X_image, batch_Y

def val_loader():
    num_batches = val_x.shape[0] // batch_size
    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = (i + 1) * batch_size
        batch_X_image = val_x_image[batch_start:batch_end,:]
        batch_Y = val_y[batch_start:batch_end]
        yield batch_X_image, batch_Y
        
# Creating a CNN class
class ConvNeuralNet(nn.Module):
	#  Determine what layers and their order in CNN object 
    def __init__(self, num_classes):
        super(ConvNeuralNet, self).__init__()
        self.conv_layer1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)
        self.conv_layer2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3)
        self.max_pool1 = nn.MaxPool2d(kernel_size = 2, stride = 2)
        
        self.conv_layer3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.conv_layer4 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3)
        self.max_pool2 = nn.MaxPool2d(kernel_size = 2, stride = 2)
        
        self.fc1 = nn.Linear(1024, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)
    
    # Progresses data across layers    
    def forward(self, x):
        out = self.conv_layer1(x)
        out = self.conv_layer2(out)
        out = self.max_pool1(out)
        
        out = self.conv_layer3(out)
        out = self.conv_layer4(out)
        out = self.max_pool2(out)
                
        out = out.reshape(out.size(0), -1)
        
        out = self.fc1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        return out

model = ConvNeuralNet(10).cuda()

torch.Size([42000, 784])
torch.Size([42000])
torch.Size([38000, 1, 28, 28])
torch.Size([4000, 1, 28, 28])


In [57]:
# My Training loops
nb_epochs = 50
for epoch in range(nb_epochs):
    losses = list()
    accuracy = list()
    for batch in train_loader():
        x, y = batch

        # print(x.size())
        # print(y)

        # # for torchvision dataset
        # b = x.size(0)
        # x = x.view(b, -1)

        # print(x.size())

        # batch b
        # x 28*28

        # 1. forward
        # print(x[0])
        l = model(x.cuda())    # l:logits

        # 2. compute the objective function
        J = loss(l, y.cuda())

        # 3. cleaning the gradients
        model.zero_grad()
            
        # 4. accumulate the partial derivatives of J wrt params
        J.backward()

        # 5. step in hte opposite direction of the gradient
        optimizer.step()

        losses.append(J.item())
        accuracy.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

    print(f' Epoch {epoch +1}, train loss: {torch.tensor(losses).mean()}', end=', ')
    print(f' train accuracy: {torch.tensor(accuracy).mean()}')


    losses = list()
    accuracy = list()
    for batch in val_loader():
        x, y = batch
        
        # # for torchvision dataset
        # b = x.size(0)
        # x = x.view(b, -1)

        # 1. forward
        with torch.no_grad():
            l = model(x.cuda())    # l:logits

        # 2. compute the objective function
        J = loss(l, y.cuda())

        losses.append(J.item())
        accuracy.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

    print(f' Epoch {epoch +1}, validation loss: {torch.tensor(losses).mean()}', end=', ')
    print(f' val accuracy: {torch.tensor(accuracy).mean()}')
    

 Epoch 1, train loss: 0.2350294440984726,  train accuracy: 0.9244155287742615
 Epoch 1, validation loss: 0.09493966400623322,  val accuracy: 0.9670000076293945
 Epoch 2, train loss: 0.06232268735766411,  train accuracy: 0.9809130430221558
 Epoch 2, validation loss: 0.07293308526277542,  val accuracy: 0.9782500267028809
 Epoch 3, train loss: 0.041058119386434555,  train accuracy: 0.9876000285148621
 Epoch 3, validation loss: 0.07747603207826614,  val accuracy: 0.9787499904632568
 Epoch 4, train loss: 0.03091043420135975,  train accuracy: 0.9905223250389099
 Epoch 4, validation loss: 0.0742223709821701,  val accuracy: 0.981249988079071
 Epoch 5, train loss: 0.022718852385878563,  train accuracy: 0.9927864074707031
 Epoch 5, validation loss: 0.09567372500896454,  val accuracy: 0.9769999980926514
 Epoch 6, train loss: 0.017699792981147766,  train accuracy: 0.9942607283592224
 Epoch 6, validation loss: 0.0882871225476265,  val accuracy: 0.9789999723434448
 Epoch 7, train loss: 0.01640133932

In [69]:
# A more flexible model
class ResNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(28 * 28, 64)
        self.l2 = nn.Linear(64, 64)
        self.l3 = nn.Linear(64, 10)
        self.do = nn.Dropout(0.1)

    def forward(self, x):
        h1 = nn.functional.relu(self.l1(x))
        h2 = nn.functional.relu(self.l2(h1))
        do = self.do(h1 + h2)  # this allows for the partial gradients in the deeper layers (first ones) to update faster
        logits = self.l3(do)
        return logits

model = ResNet().cuda()