# loading

In [3]:
import os
import struct
import random
import numpy as np
import matplotlib.pyplot as plt

def load_mnist(path, kind='train'):
    """Load MNIST data from `path`"""
    labels_path = os.path.join(path,
                               '%s-labels.idx1-ubyte'
                                % kind)
    images_path = os.path.join(path,
                               '%s-images.idx3-ubyte'
                               % kind)

    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II',
                                 lbpath.read(8))
        labels = np.fromfile(lbpath,
                             dtype=np.uint8)

    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack(">IIII",
                                               imgpath.read(16))
        images = np.fromfile(imgpath,
                             dtype=np.uint8).reshape(len(labels), 784)

    return images, labels

#### Loading the data

X_train, y_train = load_mnist('./data', kind='train')
X_test, y_test = load_mnist('./data', kind='t10k')

In [4]:
print('train_sample_number:\t:%d, column_number:%d' %(X_train.shape[0], X_train.shape[1]))
print('test_sample_number\t:%d, column_number:%d' %(X_test.shape[0], X_test.shape[1]))

train_sample_number:	:60000, column_number:784
test_sample_number	:10000, column_number:784


# preprocessing

In [5]:
X_train=X_train/255
X_test=X_test/255

# classifer

class myClassifier(object):
    """
    ovr
    """
    def __init__(self, C=0.1, eta=0.001, batch_size=500, num_iter=25, epsilon=0.001, class_num=0, shuffle=True):
        self.C=C
        self.eta=eta
        self.batch_size=batch_size
        self.num_iter=num_iter
        self.epsilon=epsilon
        self.class_num=class_num
        self.shuffle=shuffle
        
    def fit(self, X, y):
        X_num, X_fea = np.shape(X)
        #X_num=60000 X_fea=28*28
        self.class_num=len(np.unique(y))
        #class_num=10
        
        w=np.random.rand(self.class_num,X_fea)
        #w=[class_num][28*28]
        aver_w=w
        b=np.random.rand(self.class_num,1)
        #b=[class_num][1]
        aver_b=b
        
        if self.shuffle:
            s_data, s_labels = self.shuffling(X, y)
            #s_data[60000][28*28]
            #s_labels[60000][1]
        
        encoded_y=self.encoding(s_labels)
        #encoded_y[60000][class_num]
        
        cnt=0
        batch_count=X_num/self.batch_size
        for Xi in range(self.num_iter):
            #minibatch training
            for t in range(int(batch_count)):
                ###sgd
                batch_X, batch_y, bs=self.batching(s_data, encoded_y, t)
                #batch_X[batch_size][784] batch_y[batch_size][class_num] last_size=?
                
                batch_X=np.reshape(batch_X,(bs,X_fea))
                batch_y=np.reshape(batch_y,(bs,self.class_num))
                
                delta_w=np.zeros((self.class_num,X_fea))
                #delta_w[class_num][28*28]
                delta_b=np.zeros((self.class_num,1))
                #delta_b[10][1]
                
                loss=self.hinge_loss(batch_X, batch_y, w, b)
                #loss[batch_size][class_num]
                loss=1-loss
                
                loss[loss<=0]=0
                loss[loss>0]=1
                
                l_M_y=loss*batch_y
                #l_M_y  =loss[bs][class_num] batch_y[bs][class_num]
                
                temp_w=np.dot(np.transpose(l_M_y),batch_X)
                #temp[10][28*28]=batch_y[batch_size][28*28]
                delta_w=-(1/bs)*np.array(temp_w)+(1/self.C)*np.array(w)
                #delta_w[10][28*28]=c*temp[10][28*28]+c*w[10][28*28]
                
                temp_b=np.sum(np.transpose(l_M_y),axis=1)
                temp_b=np.reshape(temp_b,(self.class_num,1))
                #temp[class_num][1]
                delta_b=-(1/bs) * temp_b
                #delta_b[10][1]=c*[class_num][1]
                cnt+=1
            ###algorism
            
            w=np.array(w)-(self.eta*np.array(delta_w))
            #w[class_num][28*28]
            b=np.subtract(b,(self.eta * delta_b))
            #b[class_num][1]
            
            temp_w=(cnt/cnt+1)*aver_w + (1/cnt+1)*w
            temp_b=(cnt/cnt+1)*aver_b + (1/cnt+1)*b
            
            aver_w=np.where(aver_w>temp_w,aver_w,temp_w)
            aver_b=np.where(aver_b>temp_b,aver_b,aver_b)
            
        return aver_w, aver_b
           
    def encoding(self, y):
        encoded_y=-1*np.ones((np.shape(y)[0],self.class_num))
        #encoded_y[60000][class_num]
        for i in range(np.shape(y)[0]):
            encoded_y[i,y[i]] = 1
        return encoded_y
                
    def shuffling(self, X, y):
        temp=list(zip(X,y))
        random.shuffle(temp)
        X,y=zip(*temp)
        return X,y
    
    def batching(self, X, y, t):                         
        batch_X=X[t*self.batch_size:min(len(X),(t+1)*self.batch_size)]
        #batch_X[batch_size][28*28]
        batch_y=y[t*self.batch_size:min(len(X),(t+1)*self.batch_size)]
        #batch_y[batch_size][class_num]
        last_size=min(len(X), (t+1)*self.batch_size)-t*self.batch_size
        #last_size[size][28*28]
        
        return batch_X, batch_y,last_size
    
    def hinge_loss(self, X, y, w, b):
        net_v=self.net_input(X,w)
        #net_v[batch_size][class_num]
        temp=np.array(net_v)+np.transpose(b)
        #temp[batch_size][class_num]
        loss=y*temp
        #loss[batch_size][class_num]
        return loss
    
    def net_input(self, X, w):
        #X[batch_size][28*28] w[class_num][28*28]
        return np.dot(X,np.transpose(w))#[batch_size][class_num]
                      
    def test(self,X,y,w,b):
        net_v=self.net_input(X,w)
        #net_v[batch_size][class_num]
        temp=np.array(net_v)+np.transpose(b)
        #temp[batch_size][class_num]
        
        predicted=np.argmax(temp,axis=1)
        
        correct=np.count_nonzero(np.where(predicted==y,1,0))
        return correct
    
    def img(self, row, X, y, p):
        image = np.zeros((28,28))
        for i in range(0,28):
            for j in range(0,28):
                pix = 28*i+j
                image[i,j] = X[row, pix]
        plt.imshow(image, cmap = 'gray')
        plt.title('%d)true_value: %d pridicted_value: %d' %(row+1, y[row], p))
        plt.show()
        return print(X[row,0])

In [6]:
class MySVM(object):
    """
    1 vs 1 SVM (binary classification)
    """
    def __init__(self, C=0.1, eta=0.001, batch_size=1, max_iter=25, epsilon=1e-8):
        """
        constructor of MyBinarySVM class.
        """
        
        self.max_iter = max_iter
        self.batch_size = batch_size
        self.eta = eta
        self.C = C
        self.epsilon = epsilon
        self.num_classes = 0
#         self.beta1 = 0.9
#         self.beta2 = 0.99
    
    def fit(self, X, y=None, params=None):
        """
        fit method for training svm
        
        Arguments:
        --------------------------
        X: image data. (60000, 784)
        y: label data. (60000, 1)
        
        Returns:
        --------------------------
        Z: class score
        """

        m = np.shape(X)[0] #행 개수 60000
        n = np.shape(X)[1] #열 개수 784
        self.num_classes = len(np.unique(y)) #클래스 수 = 10
        
        y_encoded = self.encode_y(y)
        
        # create weights.
        if params is None:
            self.params = {
                'W': np.random.randn(n, self.num_classes), #(784,10) 정규분포난수
                'b': np.random.randn(1, self.num_classes)
            }
#             self.M = {
#                 'W': np.zeros((n, self.num_classes)),
#                 'b': np.zeros((1, self.num_classes))
#             }
#             self.V = {
#                 'W': np.zeros((n, self.num_classes)),
#                 'b': np.zeros((1, self.num_classes))
#             }

        cnt = 1
        
        # main loop: how much iterate on entire dataset.
        for epoch in range(self.max_iter):
            # before dive into SGD, shuffle dataset
            X_shuffled, y_shuffled = self.shuffle(X, y_encoded)
            
            # cost variable for printing/logging
            avg_loss = 0
            
            # batch_count = dataset_size / batch_size
            batch_count = int(np.ceil(np.shape(X)[0] / self.batch_size))
            
            # mini-batch loop
            for t in range(batch_count):
                # draw the {batch_size} number of samples from X and y
                X_batch, y_batch, bs = self.next_batch(X_shuffled, y_shuffled, t)
                
                # just in case, reshape batch of X and y into proper shape.
                X_batch = np.reshape(X_batch, (bs, n))
                y_batch = np.reshape(y_batch, (bs, self.num_classes))
                
                # prediction phase
                Z = self.forward_prop(X_batch)
                Z = np.reshape(Z, (bs, self.num_classes))
                
                # compute cost phase
                loss = self.compute_cost(y_batch, Z)
                
                # update weights phase
                self.backward_prop(X_batch, y_batch, Z, bs, cnt)
                
                # accumulate loss
                avg_loss += loss
                cnt += 1
        
            # logging
            avg_loss /= batch_count
            if epoch % (self.max_iter / 10) == 0:
                print('Cost at epoch {0}: {1}'.format(epoch, avg_loss))

        return self
    
    def encode_y(self, y):
        y_encoded = np.ones((np.shape(y)[0], self.num_classes)) #1로 이루어진 배열(60000,10)
        
        for i in range(self.num_classes):
            y_encoded[:, i][y != i] = -1
            
        return y_encoded
    
    def shuffle(self, X, y):
        """
        Random selection is required for SGD.
        But, my approach is to shuffle entire data before every iteration.
        This has same effect as random selection.
        
        Arguments:
        ---------------------------
        X: images (BATCH_SIZE, 784)
        y: labels (BATCH_SIZE, 1)
        
        Returns:
        ---------------------------
        shuffled data
        """
        
        # the number of dataset samples
        m = np.shape(X)[0]
        
        # variable for shuffle
        r = np.arange(0, m)
        
        np.random.shuffle(r)
        
        return X[r], y[r]
    
    def next_batch(self, X, y, t):
        """
        Get next batch.
        If it is SGD, next_batch function just pick one sample from dataset.
        
        Arguments:
        ---------------------------------
        X: images (60000, 784)
        y: labels (60000, 1)
        
        Returns:
        ---------------------------------
        X_batch: small subset of X (BATCH_SIZE, 784)
        y_batch: small subset of y (BATCH_SIZE, 1)
        """
        
        # the number of dataset samples
        m = np.shape(X)[0] #60000
        
        # draw the {batch_size} number of samples from X and y
        X_batch = X[t * self.batch_size : min(m, (t + 1) * self.batch_size)]
        y_batch = y[t * self.batch_size : min(m, (t + 1) * self.batch_size)]
        bs = min(m, (t + 1) * self.batch_size) - t * self.batch_size
        
        return X_batch, y_batch, bs
    
    def forward_prop(self, X):
        """
        Process of inference (prediction).
        
        Arguments:
        -----------------------
        X: images e.g (BATCH_SIZE, 784)
        params: weights dictionary(map in other programming language)
        
        Returns:
        -----------------------
        A: 
        """
        
        # prediction
        Z = np.matmul(X, self.params['W']) + self.params['b']
        
        return Z
    
#     def sigmoid(self, Z):
#         """
#         sigmoid activation for binary classification
        
#         Arguments:
#         ----------------------
#         Z: class score (W.T * X)
        
#         Returns:
#         ----------------------
#         sigmoid activation
#         """
        
#         return 1 / (1 + np.exp(-Z))
    
    def compute_cost(self, y, Z):
        """
        compute cost function (loss function)
        
        Arguments:
        ------------------------------
        y: true label
        Z: class score (W.T * X)
        
        Returns:
        ------------------------------
        loss: total cost (loss)
        """
        
        # compute loss function
        temp = 1 - np.multiply(y, Z)
        temp[temp < 0] = 0
        loss = np.mean(temp)
        return loss
    
    def backward_prop(self, X, y, Z, bs, cnt):
        """
        update weights
        
        Arguments:
        ----------------------------
        X: images e.g (BATCH_SIZE, 784)
        y: labels e.g (BATCH_SIZE, 1)
        Z: class score after forward propagation
        params: weights dictionary(map in other programming language)
        eta: learning rate
        
        Returns:
        ----------------------------
        params: weights dictionary
        """
        
        # number of features
        n = np.shape(X)[1]
        
        # differential vector of loss function to update weights
        dw = np.zeros(self.params['W'].shape)
        db = np.zeros(self.params['b'].shape)
        
        Z = np.reshape(Z, (bs, self.num_classes))
        temp = np.multiply(y, Z)
        temp = 1 - temp
        
        temp[temp <= 0] = 0
        temp[temp > 0] = 1
        
        y_temp = np.multiply(y, temp.reshape(bs, self.num_classes))
        
        dw = -(1 / bs) * np.matmul(X.T, y_temp) + (1 / self.C) * self.params['W']
        db = -(1 / bs) * np.sum(y_temp, axis=0)

#         if cnt == 1:
#             self.M['W'] = dw
#             self.M['b'] = db
#         else:
#             self.M['W'] = (self.beta1 * self.M['W'] + (1 - self.beta1) * dw)
#             self.M['b'] = (self.beta1 * self.M['b'] + (1 - self.beta1) * db)
        
#         if cnt == 1:
#             self.V['W'] = dw ** 2
#             self.V['b'] = db ** 2
#         else:
#             self.V['W'] = (self.beta2 * self.V['W'] + (1 - self.beta2) * (dw ** 2))
#             self.V['b'] = (self.beta2 * self.V['b'] + (1 - self.beta2) * (db ** 2))
    
#         self.M['W'] = (self.beta1 * self.M['W'] + (1 - self.beta1) * dw) / (1 - self.beta1 ** cnt)
#         self.M['b'] = (self.beta1 * self.M['b'] + (1 - self.beta1) * db) / (1 - self.beta1 ** cnt)
        
#         self.V['W'] = (self.beta2 * self.V['W'] + (1 - self.beta2) * np.square(dw)) / (1 - self.beta2 ** cnt)
#         self.V['b'] = (self.beta2 * self.V['b'] + (1 - self.beta2) * np.square(db)) / (1 - self.beta2 ** cnt)

        # update weights
#         self.params['W'] = self.params['W'] - np.divide(self.eta * self.M['W'], np.sqrt(self.V['W']) + self.epsilon)
#         self.params['b'] = self.params['b'] - np.divide(self.eta * self.M['b'], np.sqrt(self.V['b']) + self.epsilon)
        
        self.params['W'] = self.params['W'] - (self.eta / (1 + self.epsilon * cnt)) * dw
        self.params['b'] = self.params['b'] - (self.eta / (1 + self.epsilon * cnt)) * db

#         self.params['W'] = self.params['W'] - (self.eta / (1 + self.epsilon * cnt)) * dw
#         self.params['b'] = self.params['b'] - (self.eta / (1 + self.epsilon * cnt)) * db
        
        return self.params
    
    def predict(self, X, y=None):
        m = np.shape(X)[0]
        
        class_score = self.forward_prop(X)
        pred = np.argmax(class_score, axis=1)
        
        return pred
    
    def score(self, X, y=None):
        pred = self.predict(X)
        score = np.mean(pred == y)
        
        return score
    
    def get_parameters(self):
        return self.params

In [7]:
mine=MySVM()
mine.fit(X_train, y_train)

Cost at epoch 0: 0.20668101207912445
Cost at epoch 5: 0.19930864461253286
Cost at epoch 10: 0.19925636133943408
Cost at epoch 15: 0.1992415575053469
Cost at epoch 20: 0.19929039183303537


<__main__.MySVM at 0x7f80828bcdd0>

In [8]:
result=mine.test(X_test,y_test,w,b)
result

AttributeError: 'MySVM' object has no attribute 'test'

In [72]:
#result = pd.DataFrame(lr_grid.cv_results_)
#result.to_csv("cv_result_유현상.csv")