In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None).values
data[np.where(data == ' <=50K')] = -1
data[np.where(data == ' >50K')] = 1
data = data[:, [0,2,4,10,11,12,-1]]# only use numerical features and the last column(label)
mean = np.mean(data[:, 0:-1], axis=0)
std = np.std(data[:, 0:-1].astype(int), axis=0)
split = int(0.9*data.shape[0]) 

# data unit normalization
data[:, 0:-1] = np.divide(np.subtract(data[:, 0:-1], mean), std)

lambdas = np.array([0.001, 0.01, 0.1, 1])
train_acc = np.ones((4, 10))
test_acc = np.ones((4, 1))




In [5]:
split

29304

In [6]:
# train and evaluate model for different lambdas
for i in range(4): 
    
    best_acc = 0
    for epo in range(50): # train the model for 50 epochs

        # randomly split the data set into 90% training and 10% testing
        rand_idx = np.arange(data.shape[0])
        np.random.shuffle(rand_idx)
        train = data[rand_idx[0:split]]
        test = data[rand_idx[split:]]

        # select a small portion of training data as held out and the rest to be epoch dataset
        held_out = train[0:50]
        epoch = train[50:]
        steps = 300
        batch_size = int(epoch.shape[0]/steps) # batch_size = epoch_size / step
        held_out_acc = []
        
        # initialize a and b
        a = np.ones((1,6))
        b = 1
        
        for s in range(steps):
            
            step_length = 1/(0.01*s+20)# variant step length
            batch = epoch[ s*batch_size : (s+1)*batch_size ]
            boundary = np.dot(batch[:,-1].T, (np.dot(batch[:, 0:-1], a.T) + b))# y*(a*x+b)
            if boundary >= 1:
                a = a - step_length * lambdas[i] * a
            else:
                a = a - step_length * (lambdas[i] * a - batch[-1, -1] * batch[-1, 0:-1])
                b = b + step_length * batch[-1, -1]
                
            if s % 30 == 0: # examine the model accuracy on held out data for every 30 steps
                held_out_pred = np.sign(np.dot(held_out[:, 0:-1], a.T) + b)
                held_out_err = np.where(held_out_pred.T != held_out[:, -1])[0].shape[0]
                held_out_acc = 1 - held_out_err / held_out.shape[0]
                mark = int(s/30)
                train_acc[i, mark] = held_out_acc
                
        pred = np.sign(np.dot(test[:, 0:-1], a.T) + b)
        err = np.where(pred.T != test[:, -1])[0].shape[0]
        acc = 1 - err / test.shape[0]        
        if acc > best_acc:
            best_acc = acc
            
    test_acc[i] = best_acc


In [11]:
rand_idx[0:split]

array([22848,  5989,  6151, ...,  5358, 12294, 31009])

In [9]:
test

array([[1.496922235546068, 0.36076007825384554, 1.1347387637961643, ...,
        -0.21665952703259014, -0.27839874412538596, 1],
       [0.8371089803598137, 1.9065918166306237, 1.1347387637961643, ...,
        -0.21665952703259014, -0.8453271041481403, -1],
       [1.863485155093987, 1.8839386927861834, -1.5861584148750223, ...,
        -0.21665952703259014, -2.465122418498867, 1],
       ...,
       [3.109799081556912, -0.5454785674878533, -3.1409568026871293, ...,
        -0.21665952703259014, -1.6552247613235038, -1],
       [1.7901725711844034, 1.342973378997036, -1.5861584148750223, ...,
        -0.21665952703259014, -0.03542944697277691, -1],
       [1.2769844838173166, 0.05580033324972849, -1.5861584148750223,
        ..., -0.21665952703259014, -0.4403782755604586, 1]], dtype=object)