In [266]:
import numpy as np
from numpy import loadtxt
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import random
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import OneHotEncoder

# part 5, 5 - softmax classifier on music data

class SoftmaxClassifier:

    def __init__(self, epochs, learning_rate, batch_size, regularization, momentum, num_classes):

        self.num_classes = num_classes
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.regularization = regularization
        self.momentum = momentum
        self.velocity = None
        self.weights = None
    
    def calc_mse(self, probs, yt_off):
        
#         print('Evaluation phase --')
        
#         f = np.dot(x, self.weights)
#         pred = self.softmax(f)
        
#         print('pred shape: ', pred.shape)
        
        # make prediction
#         pred = np.argmax(np.dot(x, self.weights), 1)  # predict

        # make sure to floor by converting to int()
#         diff = pred - yt_off + self.offset
        preds = np.argmax(probs, 1).reshape(-1, 1)
        diff = preds - yt_off.reshape(-1, 1)
        mse = (np.square(diff)).mean()
        
#         print('pred shape: ', preds.shape)
#         print('yt_off shape: ', yt_off.shape)

        return mse

    def softmax(self, x):
        # calc the softmax
        exp_x = np.exp(x - np.max(x))  # make sure it doesn't blow up by sub max

        # make sure sum along columns, and keep dims keeps the exact same dim when summing
        # ie keep cols, instead of converting to rows
        y = np.sum(exp_x, axis=1, keepdims=True)
        return exp_x / y

    def loss_and_gradient(self, x, y_off, y_one_off):
        # calc the loss and gradient.  forward prop, get softmax, calc the neg loss loss, and total loss.
        # calc dW by taking the residual, then dot with X,  + regularization
        # find average for both

        n_samples = x.shape[0]  # num of examples

        # forward prop
        f = np.dot(x, self.weights)  # mult X by W
        probs = self.softmax(f)  # pass f to softmax

        # take neg log of the highest prob. for that row
        neg_log_loss = -np.log(probs[np.arange(n_samples), np.argmax(probs, axis=1)])
        loss = np.sum(neg_log_loss)  # sum to get total loss across all samples
        # calc the regularization loss too
        reg_loss = 0.5 * self.regularization * np.sum(self.weights * self.weights)
        total_loss = (loss / n_samples) + reg_loss  # sum to get total, divide for avg

        # calc derivative of loss (including regularization derivative)
        dW = x.T.dot( (probs - y_one_off) ) + (self.regularization * self.weights) 
        dW /= n_samples  # compute average dW

        return total_loss, dW, probs

    def train_phase(self, x_train, y_train_off, y_train_one_off):
        # shuffle data together, and forward prop by batch size, and add momentum

        print('TRAINING PHASE --')
        
        num_train = x_train.shape[0]
        losses = []
        probs_arr = []

        # Randomize the data
        x_train, y_train_off, y_train_one_off = shuffle(x_train, y_train_off, y_train_one_off)

        # get the next batch (loop through number of training samples, step by batch size)
        for i in range(0, num_train, self.batch_size):

            # grab the next batch size
            x_train_batch = x_train[i:i + self.batch_size]
            y_train_batch_off = y_train_off[i:i + self.batch_size]
            y_train_batch_one_off = y_train_one_off[i:i + self.batch_size]

            # forward prop
            loss, dW, probs = self.loss_and_gradient(x_train_batch, y_train_batch_off, y_train_batch_one_off)  # calc loss and dW
            
            probs_arr.extend(probs)
            
            # calc velocity
            self.velocity = (self.momentum * self.velocity) + (self.learning_rate * dW)
            self.weights -= self.velocity  # update the weights
            losses.append(loss)  # save the losses

        return np.average(losses), np.asarray(probs_arr)  # return the average

    def test_phase(self, x, y_test_off, y_test_one_off):
        # extra, but more explicit calc of loss and gradient during testing (no back prop)

        print('Test PHASE --')
        
        loss, _, probs = self.loss_and_gradient(x, y_test_off, y_test_one_off)  # calc loss and dW (don't need)
        return loss, probs

    def run_epochs(self, x_train, y_train_off, y_train_one_off, x_test, y_test_off, y_test_one_off):
        # start the training/valid by looping through epochs

        num_dim = x_train.shape[1]  # num of dimensions
        
        # create weights array/matrix size (num features x output)
        self.weights = 0.0001 * np.random.rand(num_dim, self.num_classes)
        self.velocity = np.zeros(self.weights.shape)

        # store losses and accuracies here
        train_losses = []
        test_losses = []
        train_mse_arr = []
        test_mse_arr = []

        for e in range(self.epochs): # loop through epochs

            print('Ephoch {} / {}...'.format(e + 1, self.epochs))

            # calc loss and accuracies
            train_loss, train_probs = self.train_phase(x_train, y_train_off, y_train_one_off)
            test_loss, test_probs = self.test_phase(x_test, y_test_off, y_test_one_off)
            train_mse = self.calc_mse(train_probs, y_train_off)
            test_mse = self.calc_mse(test_probs, y_test_off)
            
            print('train loss: ', train_loss)
            print('test loss: ', test_loss)
            print('train MSE: ', train_mse)
            print('test MSE: ', test_mse)

            # append vals to lists
            train_losses.append(train_loss)
            test_losses.append(test_loss)
            train_mse_arr.append(train_mse)
            test_mse_arr.append(test_mse)

        return train_losses, test_losses, train_mse_arr, test_mse_arr  # return all the vals

    def plot_graph(self, train_losses, test_losses, train_mse_arr, test_mse_arr):
        # plot graph
        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label="Train loss")
        plt.plot(test_losses, label="Test loss")
        plt.legend(loc='best')
        plt.title("Softmax Class. Loss vs Epochs")
        plt.xlabel("Iterations")
        plt.ylabel("Loss (Cross entropy)")

        plt.subplot(1, 2, 2)
        plt.plot(train_mse_arr, label="Train MSE")
        plt.plot(test_mse_arr, label="Test MSE")
        # plt.legend(loc='best')
        plt.title("Softmax Class. MSE vs Epochs")
        plt.xlabel("Iterations")
        plt.ylabel("MSE")
        plt.show()

In [267]:
# ============= preprocessing code ========================

def normalize_feat(x, mean=None, std=None):
    # normalize the feature data.  test data must pass mean and std

    # calc feature-wise mean
    if mean is None:
        mean = np.mean(x, axis=0)

    # calc feature-wise std
    if std is None:
        std = np.std(x, axis=0)

    # sub the mean per column
    x_norm = x - mean

    # div by the standard dev.
    x_norm = x_norm / std

    return x_norm, mean, std

In [268]:
# define data loader

def load_data(fname, bias=1):

    data = loadtxt(fname, delimiter=',')

    # loads data, normalizes, and appends a bias vector to the data

    TRAIN_NUM = 463714  # training data up to this point

    # process training data
    x_train = data[:TRAIN_NUM,1:].astype(float)  # parse train

    x_train, train_mean, train_std = normalize_feat(x_train)  # normalize data

    # create a col vector of ones
    col_bias = np.ones((x_train.shape[0], 1))

    # append bias with hstack
    x_train = np.hstack((x_train, col_bias))

    # convert label vals to int and to vector
    y_train = data[:TRAIN_NUM,0].astype(int)
    y_train = y_train.reshape((-1, 1))

    # -------------------

    # process test data
    x_test = data[TRAIN_NUM:,1:].astype(float)  # parse test
    x_test, _, _ = normalize_feat(x_test, train_mean, train_std)  # normalize data

    # create a col vector of ones
    col_bias = np.ones((x_test.shape[0], 1))

    # append bias with hstack
    x_test = np.hstack((x_test, col_bias))    

    # convert label vals to int and to vector
    y_test = data[TRAIN_NUM:,0].astype(int)
    y_test = y_test.reshape((-1, 1))  # convert to column vector

    return x_train, y_train, x_test, y_test

In [269]:
def offset_labels(y):
    OFFSET = 1900 # starting the index 0 with year 1923
    return y - OFFSET
    

In [270]:
# # # load data
# fname = 'YearPredictionMSD.txt'

# # # note, features are normalized
# x_train, y_train, x_test, y_test = load_data(fname)

In [271]:
# offset to make labels start at index 0
y_train_off = offset_labels(y_train)
y_test_off = offset_labels(y_test)

In [272]:
# create one hot on y labels

def one_hot_vary(y_train, y_test):

    train_size = len(y_train)
    test_size = len(y_test)

    stacked = np.vstack((y_train, y_test))
    one_h = OneHotEncoder().fit_transform(stacked).toarray()
    
    y_train = one_h[0:train_size, :]
    y_test = one_h[train_size:, :]
    return y_train, y_test  # new one hots

In [273]:
# obtain one hots on the variable number of classes
y_train_one_off, y_test_one_off = one_hot_vary(y_train_off, y_test_off)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [274]:
# list of years represented by list indexes
train_years_key = np.unique(y_train_off)

In [275]:
# set hyperparameters here
num_classes = y_train_one_off.shape[1]
epochs = 100
learning_rate = 0.00001  # [0.1, 0.01, 0.001]
batch_size = 100  # try powers of 2
regularization = 0.5  # L2 weight decay, range [1, 0.1, 0.01, 0.001]
momentum = 0.05  # started with 0 to 1, tried 2

smc = SoftmaxClassifier(epochs, learning_rate, batch_size, regularization, momentum, num_classes)




In [None]:
subsample = 100000
train_losses, test_losses, train_mse, test_mse = smc.run_epochs(x_train[:subsample], y_train_off[:subsample], y_train_one_off[:subsample], x_test[:subsample], y_test_off[:subsample], y_test_one_off[:subsample])
smc.plot_graph(train_losses, test_losses, train_mse, test_mse)

Ephoch 1 / 100...
TRAINING PHASE --
Test PHASE --
train loss:  4.4877758097368385
test loss:  4.4873277124072715
train MSE:  1420.19596
test MSE:  505.3695841645523
Ephoch 2 / 100...
TRAINING PHASE --
Test PHASE --
train loss:  4.486847791311746
test loss:  4.486260266201404
train MSE:  461.41105
test MSE:  420.7141058666305
Ephoch 3 / 100...
TRAINING PHASE --
Test PHASE --
train loss:  4.485761764878209
test loss:  4.4851591114354
train MSE:  434.11365
test MSE:  416.99980631790976
Ephoch 4 / 100...
TRAINING PHASE --
Test PHASE --
train loss:  4.484700989400915
test loss:  4.484049886352825
train MSE:  434.52605
test MSE:  416.6447870465418
Ephoch 5 / 100...
TRAINING PHASE --
Test PHASE --
train loss:  4.4836095135623975
test loss:  4.482938990081306
train MSE:  433.29852
test MSE:  416.8707946776162
Ephoch 6 / 100...
TRAINING PHASE --
Test PHASE --
train loss:  4.4825142273848195
test loss:  4.481828495080987
train MSE:  433.62426
test MSE:  416.86661114446747
Ephoch 7 / 100...
TRAIN

In [None]:
y_train_off[0]