In [1]:
import os
import argparse
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
import torch.nn.functional as F
from tensorboardX import SummaryWriter
import my_classes as mc
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import my_classes as mc

iris_data = load_iris()
features, pre_labels = iris_data.data, iris_data.target

root = './iris'

if not os.path.exists(root):
    os.mkdir(root)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

labels = mc.one_hot_encode(pre_labels)

feature_train, feature_test, labels_train, labels_test = train_test_split(features, labels, random_state = 17)

# Load the standard scaler
sc = StandardScaler()

# Compute the mean and standard deviation based on the training data
sc.fit(feature_train)

# Scale the training data to be of mean 0 and of unit variance
feature_train = sc.transform(feature_train)

# Scale the test data to be of mean 0 and of unit variance
feature_test = sc.transform(feature_test)

# Data type for tensors
dtype = torch.float

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [123]:
def cross_ent(y_hat, y):
    soft = softmax(y_hat)
    #print('soft',soft.shape)
    #print('y',y.shape)
    return torch.mean( -torch.sum(y.float() * torch.log(soft), 1))

# SOFTMAX
def softmax(X):
    '''
    X: output of last fc layer, dim = [num_examples, num_classes]
    output: 
    '''
    exps = torch.exp(X - torch.max(X))
    return exps / exps.sum()

def delta_cross_ent(X, y):
    '''
    X: output of last fully connected layer, dim = [num_examples, num_classes]
    y: labels (not one-hot encoded), dim = [num_examples, 1]
    output: gradient of cross entropy function wrt output, dim = [num_examples, num_classes]
    '''
    m = y.shape[1]
    grad = softmax(X)
    
    #print(grad.shape)
    #print(y.shape)
    debug['grad'] = grad
    
    grad[range(m),y.long()] -= 1
    grad = grad/m
    return grad

def predict(x, w1, w2, act):
    h = x.mm(w1)
    if act is 'ReLU':
        h_act = h.clamp(min=0)
    elif act is 'Sig':
        h_act = h * (1-h)
    y_pred = h_act.mm(w2)
    y_pred = softmax(y_pred)
    return torch.max(y_pred, 1)

In [126]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 112, 4, 7, 3

#
x = torch.as_tensor(torch.from_numpy(feature_train), device=device, dtype=dtype)
y = torch.as_tensor(torch.from_numpy(labels_train), device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

loss_list = []

learning_rate = 1e-6

num_steps = 50000

# MSE or CE
loss_fn = 'CE'
criterion = nn.CrossEntropyLoss()

# ReLU or Sig
activ_fn = 'ReLU'

debug = {}

for t in range(num_steps):
    # Forward pass: compute predicted y
    h = x.mm(w1)

    if activ_fn is 'ReLU':
        h_activation = h.clamp(min=0)
    elif activ_fn is 'Sig':
        h_activation = mc.sigmoid_activation(h)

    y_pred = h_activation.mm(w2)

    debug['y_pred'] = y_pred
    debug['y'] = y
    
    # Compute and print loss
    if loss_fn is 'MSE':
        loss = mc.mean_sum_square_errors(y_pred, y)
    elif loss_fn is 'CE':
        #print(y_pred.shape, y.shape)
        #print(y_pred, y)
        loss = cross_ent(y_pred, y)
        #loss = criterion(torch.argmax(y_pred), y.long())

    loss_list.append(loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    if loss_fn is 'MSE':
        grad_y_pred = mc.sum_square_errors_delta(y_pred, y)
    elif loss_fn is 'CE':
        grad_y_pred = delta_cross_ent(y_pred, y)

    #print('grad_y_pred', grad_y_pred.shape)
    grad_w2 = h_activation.t().mm(grad_y_pred)
    #print('grad_w2', grad_w2.shape)
    grad_h_activation = grad_y_pred.mm(w2.t())
    #print('grad_h_activation', grad_h_activation.shape)
    grad_h = grad_h_activation.clone()

    if activ_fn is 'ReLU':
        grad_h[h < 0] = 0
    elif activ_fn is 'Sig':
        grad_h = grad_h * (1 - grad_h)

    #print('grad_h', grad_h.shape)
    
    grad_w1 = x.t().mm(grad_h)

    #print('grad_w1', grad_w1.shape)
    
    #print('w1', w1.shape)
    #print('w2', w2.shape)
    #print()
    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

    if (t + 1) % 10000 == 0:
        print(f'Step [{t+1}/{num_steps}], Loss: {loss:.4}')



Step [1000/50000], Loss: 8.477
Step [2000/50000], Loss: 8.458
Step [3000/50000], Loss: 8.439
Step [4000/50000], Loss: 8.42
Step [5000/50000], Loss: 8.402
Step [6000/50000], Loss: 8.383
Step [7000/50000], Loss: 8.365
Step [8000/50000], Loss: 8.348
Step [9000/50000], Loss: 8.33
Step [10000/50000], Loss: 8.313
Step [11000/50000], Loss: 8.296
Step [12000/50000], Loss: 8.28
Step [13000/50000], Loss: 8.263
Step [14000/50000], Loss: 8.247
Step [15000/50000], Loss: 8.231
Step [16000/50000], Loss: 8.215
Step [17000/50000], Loss: 8.199
Step [18000/50000], Loss: 8.184
Step [19000/50000], Loss: 8.169
Step [20000/50000], Loss: 8.154
Step [21000/50000], Loss: 8.139
Step [22000/50000], Loss: 8.124
Step [23000/50000], Loss: 8.11
Step [24000/50000], Loss: 8.096
Step [25000/50000], Loss: 8.082
Step [26000/50000], Loss: 8.068
Step [27000/50000], Loss: 8.054
Step [28000/50000], Loss: 8.04
Step [29000/50000], Loss: 8.027
Step [30000/50000], Loss: 8.014
Step [31000/50000], Loss: 8.001
Step [32000/50000], Lo

grad_y_pred torch.Size([112, 3])
grad_w2 torch.Size([7, 3])
grad_h_activation torch.Size([112, 7])
grad_h torch.Size([112, 7])
grad_w1 torch.Size([4, 7])
w1 torch.Size([4, 7])
w2 torch.Size([7, 3])

In [None]:
grad_y_pred torch.Size([112, 3])
grad_w2 torch.Size([7, 3])
grad_h_activation torch.Size([112, 7])
grad_h torch.Size([112, 7])
grad_w1 torch.Size([4, 7])
w1 torch.Size([4, 7])
w2 torch.Size([7, 3])

In [107]:
grad = debug['grad']

grad[range(3), y.long()].shape

torch.Size([112, 3])

In [127]:
# Testing
x_test = torch.from_numpy(feature_test).float()
y_test = torch.from_numpy(labels_test).float()

y_preds = predict(x_test, w1, w2, activ_fn)[1]

# Compute accuracy
_, argmax = torch.max(y_test, 1)
accuracy = (y_preds == argmax.squeeze()).float().mean()

print(f'Acc: {accuracy:.2}')

Acc: 0.13


In [53]:
for k, v in debug.items():
    print(k)
    print(v.shape)

y_pred
torch.Size([112, 3])
y
torch.Size([112, 3])


In [46]:
def cross_ent(y_hat, y):
    y_soft = softmax(y_hat)
    #print(y.shape)
    return torch.mean( torch.sum(- y.float() * torch.log(y_soft), 1))

# SOFTMAX
def softmax(X):
    exps = torch.exp(X - torch.max(X))
    return exps / exps.sum()

def delta_cross_ent(X, y):
    '''
    X: output of last fully connected layer, dim = [num_examples, num_classes]
    y: labels (not one-hot encoded), dim = [num_examples, 1]
    output: gradient of cross entropy function wrt output, dim = [num_examples, num_classes]
    '''
    m = y.shape[0]
    labels = y.argmax(1)
    #print(y.shape)
    grad = softmax(X)
    grad[range(m),labels] -= 1
    grad = grad/m
    return grad

In [41]:
y.byte()

tensor([[0, 1, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [0, 0, 1],
        [0, 

In [31]:
cross_ent(debug['y_pred'], debug['y'])

tensor(9.7122)

In [29]:
y_hat = debug['y_pred']

labels = y.argmax(1)
print(labels.shape)
y_soft = softmax(y_hat)
torch.mean( torch.sum(- y * torch.log(y_soft), 1))

torch.Size([112])


tensor(9.7122)

In [19]:
labels

tensor([1, 0, 1, 1, 0, 1, 2, 1, 1, 2, 1, 0, 2, 1, 1, 1, 1, 0, 1, 2, 2, 0, 0, 2,
        0, 2, 2, 0, 2, 0, 0, 1, 2, 0, 0, 1, 0, 2, 2, 0, 0, 1, 2, 2, 0, 0, 2, 0,
        0, 2, 2, 2, 2, 0, 2, 1, 0, 1, 0, 0, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1,
        0, 2, 2, 1, 2, 1, 0, 1, 0, 1, 1, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 0, 2,
        0, 1, 2, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 2, 2, 2])

In [24]:
delta_cross_ent(y_pred, labels)

tensor([[ 1.5195e-06, -8.9279e-03,  5.3537e-07],
        [-8.9242e-03,  8.5554e-08,  2.4602e-05],
        [ 1.1009e-06, -8.9279e-03,  1.1824e-06],
        [ 7.1905e-06, -8.9273e-03,  7.6243e-08],
        [-8.9268e-03,  4.1026e-10,  2.2414e-05],
        [ 1.1489e-06, -8.9275e-03,  8.0467e-07],
        [ 2.8277e-05,  3.7228e-06, -8.9285e-03],
        [ 1.6129e-06, -8.9272e-03,  5.9732e-07],
        [ 1.5055e-06, -8.9285e-03,  1.7925e-06],
        [ 2.5622e-06,  1.7824e-06, -8.9282e-03],
        [ 1.9689e-05, -8.9266e-03,  3.4864e-08],
        [-8.9263e-03,  1.3587e-08,  5.5463e-05],
        [ 3.8374e-06,  9.4689e-09, -8.9280e-03],
        [ 4.1309e-06, -8.9272e-03,  1.1331e-07],
        [ 1.2248e-06, -8.9275e-03,  7.6069e-07],
        [ 1.3436e-05, -8.9262e-03,  4.7847e-08],
        [ 1.5011e-06, -8.9273e-03,  6.3622e-07],
        [-8.9228e-03,  3.1889e-09,  7.7023e-04],
        [ 3.5268e-05, -8.9263e-03,  1.6989e-08],
        [ 2.5622e-06,  1.7824e-06, -8.9282e-03],
        [ 6.4734e-06

In [63]:
soft = softmax(y_hat)

In [65]:
y.shape[0]

112