In [111]:
import numpy as np
from numpy import loadtxt
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import random
from matplotlib.colors import ListedColormap

# part 5, 5 - softmax classifier on music data

class SoftmaxClassifier:

    def __init__(self, epochs, learning_rate, batch_size, regularization, momentum):

        self.num_classes = 120  # for 1900 to 2020 range
        
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.regularization = regularization
        self.momentum = momentum
        self.velocity = None
        self.weights = None

    def one_hot(self, y):
        # get a vector of labels, convert into 1 hot

        self.num_classes = 120  # range is 1900-2020
        y = np.asarray(y, dtype='int32')  # convert type to int
        y = y.reshape(-1)  # convert into a list of numbers
        y_one_hot = np.zeros((len(y), self.num_classes))  # init shape of len y, and out 3 (num of classes)
        y_one_hot[np.arange(len(y)), y] = 1  # set the right indexes to 1, based on y (a list)
        return y_one_hot  # shape N by num_classes (3)
    
    def calc_mse(self, pred, yt):

        # make sure to floor by converting to int()
        diff = pred - yt
        mse = (np.square(diff)).mean()

        return mse
    
#     def calc_accuracy(self, x, y):
#         #  predict the class, then compare with the correct label.  return the average correct %
#         pred = np.argmax(x.dot(self.weights), 1)  # predict
#         pred = pred.reshape((-1, 1))  # convert to column vector
#         return np.mean(np.equal(y, pred))  # return average over all the 1's (over the total)

    def softmax(self, x):
        # calc the softmax
        exp_x = np.exp(x - np.max(x))  # make sure it doesn't blow up by sub max

        # make sure sum along columns, and keep dims keeps the exact same dim when summing
        # ie keep cols, instead of converting to rows
        y = np.sum(exp_x, axis=1, keepdims=True)
        return exp_x / y

    def loss_and_gradient(self, x, y):
        # calc the loss and gradient.  forward prop, get softmax, calc the neg loss loss, and total loss.
        # calc dW by taking the residual, then dot with X,  + regularization
        # find average for both

        n_samples = x.shape[0]  # num of examples

        # forward prop
        f = np.dot(x, self.weights)  # mult X by W
        probs = self.softmax(f)  # pass f to softmax

        # take neg log of the highest prob. for that row
        neg_log_loss = -np.log(probs[np.arange(n_samples), np.argmax(probs, axis=1)])
        loss = np.sum(neg_log_loss)  # sum to get total loss across all samples
        # calc the regularization loss too
        reg_loss = 0.5 * self.regularization * np.sum(self.weights * self.weights)
        total_loss = (loss / n_samples) + reg_loss  # sum to get total, divide for avg

        # calc dW
        y_one_hot = self.one_hot(y)  # need one hot

        # calc derivative of loss (including regularization derivative)
        dW = x.T.dot( (probs - y_one_hot) ) + (self.regularization * self.weights) 
        dW /= n_samples  # compute average dW

        return total_loss, dW

    def train_phase(self, x_train, y_train):
        # shuffle data together, and forward prop by batch size, and add momentum

        num_train = x_train.shape[0]
        losses = []
        # Randomize the data (using sklearn shuffle)
        x_train, y_train = shuffle(x_train, y_train)

        # get the next batch (loop through number of training samples, step by batch size)
        for i in range(0, num_train, self.batch_size):

            # grab the next batch size
            x_train_batch = x_train[i:i + self.batch_size]
            y_train_batch = y_train[i:i + self.batch_size]

            # forward prop
            loss, dW = self.loss_and_gradient(x_train_batch, y_train_batch)  # calc loss and dW
            # calc velocity
            self.velocity = (self.momentum * self.velocity) + (self.learning_rate * dW)
            self.weights -= self.velocity  # update the weights
            losses.append(loss)  # save the losses

        return np.average(losses)  # return the average

    def test_phase(self, x, y_test):
        # extra, but more explicit calc of loss and gradient during testing (no back prop)

        loss, _ = self.loss_and_gradient(x, y_test)  # calc loss and dW (don't need)
        return loss

    def run_epochs(self, x_train, y_train, x_test, y_test):
        # start the training/valid by looping through epochs

        num_dim = x_train.shape[1]  # num of dimensions

        # create weights array/matrix size (num features x output)
        self.weights = 0.001 * np.random.rand(num_dim, self.num_classes)
        self.velocity = np.zeros(self.weights.shape)

        # store losses and accuracies here
        train_losses = []
        test_losses = []
        train_mse_arr = []
        test_mse_arr = []

        for e in range(self.epochs): # loop through epochs

            print('Ephoch {} / {}...'.format(e + 1, self.epochs))

            # calc loss and accuracies
            train_loss = self.train_phase(x_train, y_train)
            test_loss = self.test_phase(x_test, y_test)
            train_mse = self.calc_mse(x_train, y_train)
            test_mse = self.calc_mse(x_test, y_test)
            
            print('train loss: ', train_loss)
            print('test loss: ', test_loss)
            print('train MSE: ', train_mse)
            print('test MSE: ', test_mse)

            # append vals to lists
            train_losses.append(train_loss)
            test_losses.append(test_loss)
            train_mse_arr.append(train_mse)
            test_mse_arr.append(test_mse)

        return train_losses, test_losses, train_mse_arr, test_mse_arr  # return all the vals

    def plot_graph(self, train_losses, test_losses, train_mse_arr, test_mse_arr):
        # plot graph
        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label="Train loss")
        plt.plot(test_losses, label="Test loss")
        plt.legend(loc='best')
        plt.title("Epochs vs. Loss")
        plt.xlabel("Iterations")
        plt.ylabel("Loss (Cross entropy)")

        plt.subplot(1, 2, 2)
        plt.plot(train_acc, label="Train Accuracy")
        plt.plot(test_acc, label="Test Accuracy")
        # plt.legend(loc='best')
        plt.title("Softmax Class. Epochs vs Accuracy")
        plt.xlabel("Iterations")
        plt.ylabel("Accuracy")
        plt.show()

In [112]:
def normalize_feat(x, mean=None, std=None):
    # normalize the feature data.  test data must pass mean and std

    # calc feature-wise mean
    if mean is None:
        mean = np.mean(x, axis=0)

    # calc feature-wise std
    if std is None:
        std = np.std(x, axis=0)

    # sub the mean per column
    x_norm = x - mean

    # div by the standard dev.
    x_norm = x_norm / std

    return x_norm, mean, std

In [113]:
# define data loader

def load_data(fname, bias=1):

    data = loadtxt(fname, delimiter=',')

    # loads data, normalizes, and appends a bias vector to the data

    TRAIN_NUM = 463714  # training data up to this point

    # process training data
    x_train = data[:TRAIN_NUM,1:].astype(float)  # parse train

    x_train, train_mean, train_std = normalize_feat(x_train)  # normalize data

    # create a col vector of ones
    col_bias = np.ones((x_train.shape[0], 1))

    # append bias with hstack
    x_train = np.hstack((x_train, col_bias))

    # convert label vals to int and to vector
    y_train = data[:TRAIN_NUM,0].astype(int)
    y_train = y_train.reshape((-1, 1))

    # -------------------

    # process test data
    x_test = data[TRAIN_NUM:,1:].astype(float)  # parse test
    x_test, _, _ = normalize_feat(x_test, train_mean, train_std)  # normalize data

    # create a col vector of ones
    col_bias = np.ones((x_test.shape[0], 1))

    # append bias with hstack
    x_test = np.hstack((x_test, col_bias))    

    # convert label vals to int and to vector
    y_test = data[TRAIN_NUM:,0].astype(int)
    y_test = y_test.reshape((-1, 1))  # convert to column vector

    return x_train, y_train, x_test, y_test

In [114]:
def offset_labels(y):
    OFFSET = 1900 # starting the index 0 with year 1900
    return y - OFFSET
    

In [115]:
# # load data
# fname = 'YearPredictionMSD.txt'

# # note, features are normalized
# x_train, y_train, x_test, y_test = load_data(fname)

In [116]:
# offset to make labels start at index 0
y_train_off = offset_labels(y_train)
y_test_off = offset_labels(y_test)

In [117]:
# set hyperparameters here
epochs = 100
learning_rate = 0.0001  # [0.1, 0.01, 0.001]
batch_size = 1000  # try powers of 2
regularization = 0.1  # L2 weight decay, range [1, 0.1, 0.01, 0.001]
momentum = 0.30  # started with 0 to 1

smc = SoftmaxClassifier(epochs, learning_rate, batch_size, regularization, momentum)
train_losses, test_losses, train_mse, test_mse = smc.run_epochs(x_train, y_train_off, x_test, y_test_off)
smc.plot_graph(train_losses, test_losses, train_mse, test_mse)
# smc.plot_decision_boundary(x_train, y_train)
# smc.plot_decision_boundary(x_test, y_test)

Ephoch 1 / 100...
train loss:  4.780427104649142
test loss:  4.778249878771481
train MSE:  9798.508138769364
test MSE:  9817.965857640336
Ephoch 2 / 100...
train loss:  4.775569207763842
test loss:  4.77240604586792
train MSE:  9798.508138769364
test MSE:  9817.965857640336
Ephoch 3 / 100...
train loss:  4.769595838068432
test loss:  4.766139416190811
train MSE:  9798.508138769364
test MSE:  9817.965857640336
Ephoch 4 / 100...
train loss:  4.763327944186613
test loss:  4.759733931687148
train MSE:  9798.508138769364
test MSE:  9817.965857640336
Ephoch 5 / 100...
train loss:  4.757013153980332
test loss:  4.753272240339001
train MSE:  9798.508138769364
test MSE:  9817.965857640336
Ephoch 6 / 100...
train loss:  4.750684989873597
test loss:  4.7467951157502934
train MSE:  9798.508138769364
test MSE:  9817.965857640336
Ephoch 7 / 100...


KeyboardInterrupt: 