In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
import torch

%matplotlib inline

In [5]:
W = torch.tensor([0.2, -0.5, 0.1, 2.0, 1.5, 1.3, 2.1, 0.0, 0.0, 0.25, 0.2, -0.3]).view(3, 4).float().clone()
b = torch.tensor([1.1, 3.2, -1.2]).view(3, 1).float().clone()
x = torch.tensor([56, 231, 24, 2]).view(4, 1).float().clone()

In [6]:
def linear_layer(x, W, b):
    return W @ x + b

In [7]:
linear_layer(x, W, b)

tensor([[-96.8000],
        [437.9000],
        [ 60.7500]])

In [12]:
new_W = torch.cat((W, b), dim=1)
new_x = torch.cat((x, torch.tensor([1]).view(1, 1)), dim=0)

In [11]:
new_W

tensor([[ 0.2000, -0.5000,  0.1000,  2.0000,  1.1000],
        [ 1.5000,  1.3000,  2.1000,  0.0000,  3.2000],
        [ 0.0000,  0.2500,  0.2000, -0.3000, -1.2000]])

In [13]:
new_x

tensor([[ 56.],
        [231.],
        [ 24.],
        [  2.],
        [  1.]])

In [14]:
def linear_layer_bias_trick(x, W):
    return W @ x

In [15]:
linear_layer_bias_trick(new_x, new_W)

tensor([[-96.8000],
        [437.9000],
        [ 60.7500]])

In [18]:
linear_layer(x, W, b).tolist() == linear_layer_bias_trick(new_x, new_W).tolist()

True

In [19]:
y_true = torch.tensor([0, 1, 2])

In [20]:
y_pred = linear_layer(x, W, b).view(-1)

In [67]:
def svm_loss_single(y_pred, y_true, delta=1.0):
    margins = torch.maximum(torch.tensor(0), y_pred - y_pred[y_true] + delta)
    margins[y_true] = 0
    loss = margins.sum().item()
    return loss

In [68]:
def svm_loss_vect(y_pred, y_true, delta=1.0):
    num_samples = y_pred.shape[0]
    correct_values = y_pred[torch.arange(num_samples), y_true].unsqueeze(1)
    margins = torch.maximum(torch.tensor(0), y_pred - correct_values + torch.tensor(delta))
    margins[torch.arange(num_samples), y_true] = 0
    loss = margins.sum().item() / num_samples
    return loss

In [79]:
torch.manual_seed(0)
num_samples = 50_000
X = torch.randn(num_samples, 32, 32, 3).view(num_samples, -1)
y = torch.randint(0, 10, (num_samples,))
W = torch.randn(10, 32 * 32 * 3)

In [80]:
import time

start_time = time.time()
loss = 0.0
y_pred = X @ W.T
for i in range(num_samples):
    loss += svm_loss_single(y_pred[i], y[i])
loss /= num_samples
print(f'Loss is {loss:.2f}')
elapsed_time = time.time() - start_time
print(f'It took {elapsed_time:.5f}s ({elapsed_time // elapsed_time}x speedup)')

Loss is 286.05
It took 2.72030s (1.0x speedup)


In [82]:
start_time = time.time()
loss = svm_loss_vect(X @ W.T, y)
print(f'Loss is {loss:.2f}')
fast_elapsed_time = time.time() - start_time
print(f'It took {fast_elapsed_time:.5f}s ({elapsed_time // fast_elapsed_time}x speedup)')

Loss is 286.05
It took 0.17112s (15.0x speedup)


In [104]:
def softmax(x):
    normalized_x = x - x.max(dim=1, keepdims=True)[0]
    exp_x = normalized_x.exp()
    sum_exp_x = exp_x.sum(dim=1, keepdims=True)
    if torch.any(sum_exp_x == 0):
        print('Warning: Zero sum detected')
    return exp_x / sum_exp_x

In [111]:
def cross_entropy_loss(y_pred, y_true, epsilon=1e-9):
    num_samples = y_pred.shape[0]
    softmax_output = softmax(y_pred)
    picked_values = softmax_output[torch.arange(num_samples), y_true] + epsilon
    if torch.any(picked_values == 0):
        print('Warning: Zero picked value detected')
    loss = -picked_values.log().sum().item() / num_samples
    return loss

In [110]:
cross_entropy_loss(y_pred, y)

17.9293575

In [113]:
def l1_regularization(W):
    return W.abs().sum()

In [114]:
def l2_regularization(W):
    return (W**2).sum()

In [117]:
reg_factor = 0.1
loss = svm_loss_vect(y_pred, y) + reg_factor * l1_regularization(W)
loss

tensor(2742.6738)

In [116]:
reg_factor = 0.1
loss = cross_entropy_loss(y_pred, y) + reg_factor * l1_regularization(W)
loss

tensor(2474.5576)

In [115]:
reg_factor = 0.1
loss = cross_entropy_loss(y_pred, y) + reg_factor * l2_regularization(W)
loss

tensor(3109.6365)