In [1]:
import numpy as np
import torch
import torch.nn as nn

In [2]:
# import dataset from seaborn
import seaborn as sns
iris = sns.load_dataset("iris")

# organize the data

# the first four columns are the numeric measurements; pull them out as a
# NumPy array and build a float tensor from it
featureArray = iris.iloc[:, 0:4].to_numpy()
data = torch.tensor(featureArray, dtype=torch.float32)

# encode the species name as an integer class index
# (every row starts at 0, so "setosa" needs no explicit assignment)
labels = torch.zeros(len(data), dtype=torch.long)
for classIdx, speciesName in ((1, "versicolor"), (2, "virginica")):
    labels[iris.species == speciesName] = classIdx

labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])

In [18]:
# size of the training partition (propTraining is a fraction, not a percent)
propTraining = .8
nSamples = len(labels)
nTraining = int(nSamples * propTraining)

# Taking the first nTraining rows would be the naive split:
# traintestBool[range(nTraining)] = True
# but the labels are stored sorted by class, so that split would leave the
# test set with samples from the last class only. Drawing row indices at
# random (without replacement) mixes the classes across both partitions.
items2use4train = np.random.choice(nSamples, size=nTraining, replace=False)

# boolean mask over the rows: True -> training sample, False -> test sample
traintestBool = np.zeros(nSamples, dtype=bool)
traintestBool[items2use4train] = True

traintestBool

array([ True,  True, False,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False,  True,  True, False,  True,  True, False,  True,
        True,  True,  True, False, False,  True,  True, False,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True, False, False,
       False,  True, False, False,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True, False,

In [19]:
# test whether it's balanced
# The classes are coded 0/1/2, so the mean label is 1 whenever class 0 and
# class 2 are equally represented; a mean away from 1 signals that the
# partition holds more samples of one of those classes than the other.
fullMean = labels.float().mean()                  # =1 by definition
trainMean = labels[traintestBool].float().mean()  # ideally close to 1
testMean = labels[~traintestBool].float().mean()  # ideally close to 1

print("Avg of full data:")
print(fullMean)
print(" ")

print("Avg of training data:")
print(trainMean)
print(" ")

print("Avg of test data:")
print(testMean)

Avg of full data:
tensor(1.)
 
Avg of training data:
tensor(0.9917)
 
Avg of test data:
tensor(1.0333)


In [20]:
# create ANN model

# fully connected network: 4 input features -> 64 -> 64 -> 3 output units
# (one output per class index used in `labels`)
ANNiris = nn.Sequential(
    nn.Linear(in_features=4, out_features=64),   # input layer
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=64),  # hidden layer
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=3),   # output layer
)

# loss function: cross-entropy computed on the raw network outputs
lossfun = nn.CrossEntropyLoss()

# optimizer: plain stochastic gradient descent over all model parameters
optimizer = torch.optim.SGD(params=ANNiris.parameters(), lr=0.01)

In [21]:
# print the shape of the entire dataset, then of the training rows
# (mask True) and of the test rows (mask False), in that order
for subset in (data, data[traintestBool, :], data[~traintestBool, :]):
    print(subset.shape)

torch.Size([150, 4])
torch.Size([120, 4])
torch.Size([30, 4])


In [23]:
# train the model (full-batch gradient descent on the training partition)
numepochs = 1000

# per-epoch bookkeeping: loss values and training accuracy (in percent)
losses = torch.zeros(numepochs)
ongoingAcc = []

# The training partition is the same in every epoch, so select it once here
# instead of re-applying the boolean mask three times per iteration.
trainData = data[traintestBool, :]
trainLabels = labels[traintestBool]

for epochi in range(numepochs):

    # forward pass
    yHat = ANNiris(trainData)

    # compute accuracy: fraction of rows whose highest-scoring output unit
    # matches the label, stored as a percentage (0-dim tensor)
    accuracy = (torch.argmax(yHat, dim=1) == trainLabels).float().mean()
    ongoingAcc.append(100 * accuracy)

    # compute loss and store it as a plain Python number (not a tensor)
    loss = lossfun(yHat, trainLabels)
    losses[epochi] = loss.item()

    # backprop: clear old gradients, compute new ones, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [24]:
# compute train and test accuracies

# These forward passes are for evaluation only, so run them with autograd
# disabled: no computation graph is recorded for the predictions.
with torch.no_grad():

    # final forward pass using training data
    predictions = ANNiris(data[traintestBool, :])
    trainacc = 100*torch.mean((torch.argmax(predictions, dim=1) == labels[traintestBool]).float())

    # final forward pass using test data (rows where the mask is False)
    predictions = ANNiris(data[~traintestBool, :])
    testacc = 100*torch.mean((torch.argmax(predictions, dim=1) == labels[~traintestBool]).float())

In [25]:
# report accuracies: one line per partition, value formatted with %g
for reportLine in (
    "Final TRAIN accuracy is: %g%%" % trainacc,
    "Final TEST accuracy is: %g%%" % testacc,
):
    print(reportLine)

Final TRAIN accuracy is: 98.3333%
Final TEST accuracy is: 100%
