In [12]:
import pandas as pd
def read_data(s, delete_name):
    '''
    s, <string>, which dataset to access
    delete_name, <set>, which features do not take into account
    '''
    if s=='test':
        df = pd.read_csv('pro_data_test.csv', delimiter=',')
    elif s=='train':
        df = pd.read_csv('pro_data_train.csv', delimiter=',')
    else:
        return
    columns = df.columns
    for column in columns:
        if column in delete_name:
            del df[column]
    #print(df.head())
    feature_len = len(df.columns)-1
    feature = df[df.columns[0:feature_len-1]]
    label = df[df.columns[feature_len:feature_len+1]]
    X = feature.values
    Y_pre = label.values
    Y = np.zeros((len(Y_pre),1))
    for j in range(len(Y)):
        Y[j] = 2*(Y_pre[j]-0.5)
    return X, Y
delete_name = {'fnlwgt', 'capital-gain', 'capital-loss'}
X_train, Y_train = read_data('train', delete_name)
X_test, Y_test = read_data('test', delete_name)

In [107]:
import numpy as np
import random
class SVM:
    def __init__(self, data_len, lbd):
        self.W = np.zeros(data_len)
        self.b = 0
        self.lbd = lbd
    
    def train(self, X, Y, eta, epsilon, batch_size):
        batch_size = min(batch_size, len(Y))
        
        cnt = 0
        sqr_sum_W = np.ones(len(self.W))
        sqr_sum_b = 1
        e = 1
        
        stop = 0
        while True:
            batch = random.sample(range(len(Y)), batch_size)
            X_batch, Y_batch = X[batch], Y[batch]
            dev_W, dev_b = self.derivative(X_batch, Y_batch)
            sqr_sum_W = 0.9*sqr_sum_W + 0.1*dev_W*dev_W
            sqr_sum_b = 0.9*sqr_sum_b + 0.1*dev_b*dev_b
            
            self.W = self.W - eta*dev_W/np.sqrt(sqr_sum_W + e*np.ones(len(self.W)))
            self.b = self.b - eta*dev_b/np.sqrt(sqr_sum_b + e)
            
            if (np.dot(dev_W, dev_W) + np.dot(dev_b, dev_b))/(np.dot(self.W, self.W) + np.dot(self.b, self.b)) < epsilon:
                break
            
            #if cnt==20:
                #print(np.dot(dev_W, dev_W) + np.dot(dev_b, dev_b), self.loss(X,Y))
            #    cnt = 0
            #    stop = stop + 1
            #cnt = cnt + 1
            stop = stop + 1
        return
    
    def derivative(self, X, Y):
        dev_W = np.zeros(len(self.W))
        dev_b = 0
        for x, y in zip(X, Y):
            if 1 > y*(np.dot(self.W, x) + self.b):
                dev_W = dev_W - y*x
                dev_b = dev_b - y
        dev_W = dev_W / len(Y)
        dev_b = dev_b / len(Y)
        dev_W = dev_W + self.lbd * 2 * self.W
        #print(dev_W, dev_b)
        return dev_W, dev_b
        
    def loss(self, X, Y):
        l = 0
        for x, y in zip(X, Y):
            l += max(0, 1 - y*(np.dot(self.W, x) + self.b))
        l /= len(Y)
        l += self.lbd * np.dot(self.W, self.W)
        return l
    
    def output(self, x):
        if np.dot(self.W, x) + self.b > 0:
            return 1
        else:
            return -1
    
    def test(self, X, Y):
        error = 0
        for x, y in zip(X, Y):
            if y!=self.output(x):
                error = error + 1
        return error/len(Y)

# initialize svm
lbd = 0.05
svm = SVM(len(X_train[0]), lbd)

# training svm
epsilon = 0.005
eta = 0.01
batch_size = 200
svm.train(X_test, Y_test, eta, epsilon, batch_size)

In [113]:
print('Accuracy:', 1-svm.test(X_test,Y_test))

Accuracy: 0.8068393094289509


In [57]:
# support vector machine using sklearn package
import sklearn as skl
clf = skl.svm.SVC(kernel='linear')
clf.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [112]:
Y_pred = clf.predict(X_test)
print('Accuracy:', skl.metrics.accuracy_score(Y_pred, Y_test))

Accuracy: 0.8146746347941567
