In [None]:
# import libraries
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# load the iris measurements as a pandas dataframe
iris = sns.load_dataset("iris")

# take the first four columns and convert them to a float tensor
featureValues = iris.iloc[:,0:4].to_numpy()
data = torch.tensor(featureValues).float()

# encode species as integer class labels
# (every label starts at 0, which is the code 'setosa' keeps, so only the
#  other two species need to be assigned explicitly)
labels = torch.zeros(len(data), dtype=torch.long)
for speciesName,speciesCode in (('versicolor',1), ('virginica',2)):
  labels[iris.species==speciesName] = speciesCode

# Use Numpy to split data into train-test

In [None]:
#  (no devset here)

# size of the training set, given as a proportion (not a percent) of all samples
propTraining = .8
nTraining = int(propTraining*len(labels))

# draw nTraining distinct sample indices at random;
# random selection (rather than taking the first nTraining rows) keeps the
# label mix of the training and test sets roughly comparable
train_idx = np.random.choice(len(labels), size=nTraining, replace=False)

# boolean mask over all samples: True -> training set, False -> test set
traintestBool = np.zeros(len(labels), dtype=bool)
traintestBool[train_idx] = True

traintestBool


In [None]:
# test whether it's balanced:
# the mean label of the full data is 1 by definition, so the mean label of
# the training and test subsets should be close to 1 as well
balanceReport = (
    ('Average of full data:',     labels),
    ('Average of training data:', labels[traintestBool]),
    ('Average of test data:',     labels[~traintestBool]),
)

for position,(caption,labelSubset) in enumerate(balanceReport):

  # separator line between consecutive reports (none before the first)
  if position > 0:
    print(' ')

  print(caption)
  print( torch.mean(labelSubset.float()) )

In [None]:
def createANewModel():
  """Build a fresh classifier together with its loss function and optimizer.

  Returns
  -------
  tuple
      (model, loss function, optimizer):
      - model: nn.Sequential with layers Linear(4,64) -> ReLU -> Linear(64,64)
        -> ReLU -> Linear(64,3)
      - loss function: nn.CrossEntropyLoss
      - optimizer: SGD over the model's parameters with learning rate .01
  """

  # layers listed in input -> output order
  layerStack = [
      nn.Linear(4,64),   # input layer
      nn.ReLU(),         # activation unit
      nn.Linear(64,64),  # hidden layer
      nn.ReLU(),         # activation unit
      nn.Linear(64,3),   # output units
  ]
  net = nn.Sequential(*layerStack)

  # loss function
  criterion = nn.CrossEntropyLoss()

  # optimizer
  sgd = torch.optim.SGD(net.parameters(),lr=.01)

  return net,criterion,sgd

In [None]:
# create the ANN model
# (createANewModel returns the network together with its loss function and optimizer)
ANNiris,lossfun,optimizer = createANewModel()

In [None]:
# print the shape of, in order: the entire dataset, the training set, the test set
for dataSubset in (data, data[traintestBool,:], data[~traintestBool,:]):
  print( dataSubset.shape )

# Train and test the model

In [None]:
# train the model

numepochs = 1000

# per-epoch records: loss values and training accuracy
losses = torch.zeros(numepochs)
ongoingAcc = []

# the training subset is the same in every epoch, so select it once up front
trainData   = data[traintestBool,:]
trainLabels = labels[traintestBool]

# loop over epochs
for epochi in range(numepochs):

  # forward pass
  yHat = ANNiris(trainData)

  # compute accuracy (note: denser than previous code!)
  ongoingAcc.append( 100*torch.mean(
              (torch.argmax(yHat,axis=1) == trainLabels).float()) )

  # compute loss
  loss = lossfun(yHat,trainLabels)

  # store only the numeric value: assigning the graph-attached `loss` tensor
  # itself would make `losses` require grad and keep it linked to the
  # autograd graph of every epoch
  losses[epochi] = loss.item()

  # backprop
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

In [None]:
# compute train and test accuracies

# evaluation only: no gradients are needed here, so do not build an autograd graph
with torch.no_grad():

  # final forward pass USING TRAINING DATA
  predictions = ANNiris(data[traintestBool,:])
  trainacc = 100*torch.mean((torch.argmax(predictions,axis=1) == labels[traintestBool]).float())

  # final forward pass USING TEST DATA!
  predictions = ANNiris(data[~traintestBool,:])
  testacc = 100*torch.mean((torch.argmax(predictions,axis=1) == labels[~traintestBool]).float())

In [None]:
# report accuracies (one line per dataset partition)
accuracyLines = (
    ('Final TRAIN accuracy: %g%%', trainacc),
    ('Final TEST accuracy:  %g%%', testacc),
)

for lineTemplate,accuracyValue in accuracyLines:
  print(lineTemplate %accuracyValue)


# Using scikit-learn to split the data into train and test

In [None]:
# import libraries
from sklearn.model_selection import train_test_split

# Use scikit-learn to split the data into train and test

In [None]:
# Define a function that trains the model

def trainTheModel(numepochs):
  """Train the notebook-level model on the whole training set, one update per epoch.

  Reads the following names from the notebook namespace: ANNiris, lossfun,
  optimizer, X_train, y_train, X_test, y_test.

  Parameters
  ----------
  numepochs : int
      Number of training epochs.

  Returns
  -------
  trainAcc : list of float
      Training accuracy (%) per epoch, taken from the forward pass that
      precedes that epoch's weight update.
  testAcc : list of float
      Test accuracy (%) per epoch, measured after that epoch's weight update.
  """

  # per-epoch accuracies
  trainAcc = []
  testAcc  = []

  # loop over epochs
  for epochi in range(numepochs):

    # forward pass and loss
    yHat = ANNiris(X_train)
    loss = lossfun(yHat,y_train)

    # backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # compute training accuracy
    trainAcc.append( 100*torch.mean((torch.argmax(yHat,axis=1) == y_train).float()).item() )

    # test accuracy (evaluation only, so no autograd graph is needed)
    with torch.no_grad():
      predlabels = torch.argmax( ANNiris(X_test),axis=1 )
    testAcc.append( 100*torch.mean((predlabels == y_test).float()).item() )

  # function output
  return trainAcc,testAcc


# Test the model by running it once

In [None]:
# training settings
numepochs = 200   # number of epochs
trainProp = .8    # proportion of the samples used for training

# start from a newly created model
ANNiris,lossfun,optimizer = createANewModel()

# random train/test partition of the full dataset
X_train,X_test, y_train,y_test = train_test_split(data,labels, train_size=trainProp)

# train the model and collect the per-epoch accuracies
trainAcc,testAcc = trainTheModel(numepochs)


In [None]:
# plot the results (dark figure and axes background)
fig,ax = plt.subplots(figsize=(10,5), facecolor='#24273a')
ax.set_facecolor('#24273a')

# accuracy curves, Catppuccin Macchiato colors
ax.plot(trainAcc, color='#a6da95', marker='o', linestyle='-', linewidth=2, markersize=6)
ax.plot(testAcc, color='#f5a97f', marker='s', linestyle='-', linewidth=2, markersize=6)

# axis labels and legend in the light text color
ax.set_xlabel('Epochs', color='#cad3f5')
ax.set_ylabel('Accuracy (%)', color='#cad3f5')
ax.legend(['Train','Test'], facecolor='#363a4f', edgecolor='#5b6078', labelcolor='#cad3f5')

# tick colors, then the same color for all four axes borders
ax.tick_params(colors='#cad3f5')
for spineSide in ('bottom','top','right','left'):
  ax.spines[spineSide].set_color('#5b6078')

plt.show()

# PyTorch DataLoaders

In [None]:
# import libraries

from torch.utils.data import DataLoader, TensorDataset

# A brief aside on using DataLoader

In [None]:
# create fake dataset

# 10 rows x 4 columns: row k (k = 1..10) is [1,2,3,4] + 10*k,
# built by broadcasting a row vector against a column vector
fakedata = np.array([1,2,3,4]) + 10*np.arange(1,11).reshape(-1,1)

# boolean labels: False for the first five rows, True for the last five
fakelabels = np.arange(10)>4

print(fakedata)
print(' ')
print(fakelabels)

In [None]:
# dataloader object with all data
# (only the data array is passed here -- no labels are attached yet)
fakedataLdr = DataLoader(fakedata, shuffle=True)
print( fakedataLdr )
# no batch_size was passed above, so this prints DataLoader's default value
print( fakedataLdr.batch_size )

In [None]:
# iterate through the data
# (for every item the loader yields, print its running index, its contents and its shape)
for i,oneSample in enumerate(fakedataLdr):
  print(i,oneSample,oneSample.shape)


In [None]:
# a Dataset is needed to keep the data and the labels together
fakeFeatures = torch.Tensor(fakedata)
fakeTargets  = torch.Tensor(fakelabels)
fakeDataset  = TensorDataset(fakeFeatures,fakeTargets)
# print( fakeDataset.tensors ), print(' ')

# a DataLoader built on that Dataset yields (data,label) pairs
fakedataLdr = DataLoader(fakeDataset, shuffle=True)

# iterate through the data
for dat,lab in fakedataLdr:
  print(dat,lab)

In [None]:
# use scikitlearn to split the data (20% of the rows go to the test set)
train_data,test_data, train_labels,test_labels = train_test_split(
                                   fakedata, fakelabels, test_size=.2)

# wrap each partition in a PyTorch Dataset
# (TensorDataset is the class imported from torch.utils.data earlier in the notebook)
train_data = TensorDataset( torch.Tensor(train_data), torch.Tensor(train_labels) )
test_data  = TensorDataset( torch.Tensor(test_data),  torch.Tensor(test_labels) )

# finally, translate into dataloader objects
# notice the batches (see next cell)!
train_loader = DataLoader(train_data,batch_size=4)
test_loader  = DataLoader(test_data)

In [None]:
# examine the contents of the dataloaders (batching is an advantage of dataloader!)
loadersToShow = (
    ('TRAINING DATA', train_loader),
    ('TESTING DATA',  test_loader),
)

for position,(sectionTitle,loader) in enumerate(loadersToShow):

  # separator line between the two listings (none before the first)
  if position > 0:
    print(' ')

  print(sectionTitle)
  for batch,label in loader: # iterable
    print(batch,label)
    print(' ')

# Now back to the real data!

In [None]:
# use scikitlearn to split the data (80% of the samples go to the training set)
train_features,test_features, train_labels,test_labels = train_test_split(
                                          data, labels, train_size=.8)

# wrap each partition in a PyTorch Dataset (note: already converted to tensors)
train_data = TensorDataset(train_features,train_labels)
test_data  = TensorDataset(test_features,test_labels)

# finally, translate into dataloader objects:
# shuffled mini-batches of 12 for training, one single batch holding the whole test set
train_loader = DataLoader(train_data,shuffle=True,batch_size=12)
test_loader  = DataLoader(test_data,batch_size=len(test_data))

In [None]:
# check sizes of data batches
for X,y in train_loader:
  print(X.shape,y.shape)

# cell output: the last batch (data and labels) drawn by the loop above
X,y

In [None]:
# Define a function that trains the model in batches

def trainTheModel(numepochs):
  """Train the notebook-level model in mini-batches drawn from train_loader.

  NOTE: this redefines the earlier trainTheModel in this notebook; once this
  cell has run, calls to trainTheModel use this mini-batch version.

  Reads the following names from the notebook namespace: ANNiris, lossfun,
  optimizer, train_loader, test_loader.

  Parameters
  ----------
  numepochs : int
      Number of passes over train_loader.

  Returns
  -------
  trainAcc : list of float
      Per epoch, the mean of the per-batch training accuracies (%).
  testAcc : list of float
      Per epoch, the accuracy (%) on the first batch yielded by test_loader,
      measured after that epoch's weight updates.
  """

  # initialize accuracies as empties (not storing losses here)
  trainAcc = []
  testAcc  = []

  # loop over epochs
  for epochi in range(numepochs):

    # loop over training data batches
    batchAcc = []
    for X,y in train_loader:

      # forward pass and loss
      yHat = ANNiris(X)
      loss = lossfun(yHat,y)

      # backprop
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # compute training accuracy just for this batch
      batchAcc.append( 100*torch.mean((torch.argmax(yHat,axis=1) == y).float()).item() )
    # end of batch loop...

    # now that we've trained through the batches, get their average training accuracy
    trainAcc.append( np.mean(batchAcc) )

    # test accuracy (evaluation only, so no autograd graph is needed)
    X,y = next(iter(test_loader)) # extract X,y from test dataloader
    with torch.no_grad():
      predlabels = torch.argmax( ANNiris(X),axis=1 )
    testAcc.append( 100*torch.mean((predlabels == y).float()).item() )

  # function output
  return trainAcc,testAcc


# Modeling

In [None]:
# create a model
# (createANewModel builds a new network, so nothing trained in earlier cells carries over)
ANNiris,lossfun,optimizer = createANewModel()

# train the model
# (trainTheModel resolves to the most recently executed definition -- the mini-batch version
#  when the notebook is run top to bottom)
numepochs = 500
trainAcc,testAcc = trainTheModel(numepochs)

In [None]:
# plot the results on a Catppuccin Macchiato background
fig, ax = plt.subplots(figsize=(10,5))
fig.patch.set_facecolor('#24273a')
ax.set_facecolor('#24273a')

# accuracy curves: train in Catppuccin peach, test in Catppuccin green
ax.plot(trainAcc, marker='o', linestyle='-', color='#f5a97f', linewidth=2, markersize=4, label='Train')
ax.plot(testAcc, marker='s', linestyle='-', color='#a6da95', linewidth=2, markersize=4, label='Test')

# axis labels and legend in the Catppuccin text color
ax.set_xlabel('Epochs', color='#cad3f5')
ax.set_ylabel('Accuracy (%)', color='#cad3f5')
ax.legend(frameon=False, labelcolor='#cad3f5')

# style the axes: tick colors, then the same color for all four borders
ax.tick_params(colors='#cad3f5')
for spineSide in ('bottom','top','right','left'):
  ax.spines[spineSide].set_color('#5b6078')

# optional zoom-in to final epochs
# ax.set_xlim([300,500])
# ax.set_ylim([90,100.5])

plt.tight_layout()
plt.show()

In [None]:
# Something to try on your own: 

# We have only shown how to divide the data into train and test sets. Can you split it into train, dev, and test sets instead?
# Can you apply this to the regression problem we discussed earlier?