In [1]:
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd, numpy as np
matplotlib.rcParams[u'font.sans-serif'] = ['simhei']
matplotlib.rcParams['axes.unicode_minus'] = False

In [288]:
def get_data3(ratio = 0.6):
    data = pd.read_csv('./data3.csv')
    data.drop(axis=1,columns=['编号'],inplace=True)
    data[['好瓜']] = data[['好瓜']].apply(lambda x: 1.0 if (x == '是').bool() else 0.0, axis=1)
    
    discrete_fea = data.columns[data.dtypes == 'object']
    for fea in discrete_fea:
        onehot_fea = pd.get_dummies(data[[fea]], dtype='float')
        data.drop(fea, axis=1, inplace=True)
        data = pd.concat([onehot_fea, data], axis=1)
        
#     data = data.sample(frac = 1).reset_index(drop=True)
    data_posi = data[data['好瓜'] == 1].sample(frac = 1).reset_index(drop=True)
    data_nega = data[data['好瓜'] == 0].sample(frac = 1).reset_index(drop=True)
    data_train = data_posi[:int(len(data_posi)*ratio)].append(data_nega[:int(len(data_nega)*ratio)]).reset_index(drop=True)
    data_test = data_posi[int(len(data_posi)*ratio):].append(data_nega[int(len(data_nega)*ratio):]).reset_index(drop=True)
    
    return (data_train, data_test)

In [289]:
def sigmoid(x):
    return 1.0 / (np.exp(-x) + 1.0)

In [310]:
class NN(object):
    def __init__(self, data_pair, by_whole = False, lamda = 2e-8):
        (self.data_train, self.data_test) = data_pair
        self.by_whole = by_whole
        self.lamda = lamda ## ratio of non-regular item
        self.d = self.data_train.shape[1] - 1
        self.q = self.d
        self.l = 1
        self.data = [ np.zeros(self.d), np.zeros(self.q), np.zeros(self.l) ] ## [x, b, y]
        self.weight = [ np.random.random((self.d, self.q)), np.random.random((self.q, self.l)) ] ## [v, w]
        self.threshold = [ np.random.random(self.q), np.random.random(self.l) ] ## [gamma, theta]
        self.clear()
        
    def clear(self):
        self.delta_w = np.zeros((self.q, self.l))
        self.delta_theta = np.float64(0.0)
        self.delta_v = np.zeros((self.d, self.q))
        self.delta_gamma = np.zeros(self.q)
    
    def update(self, n):
        ## regular item added
        self.weight[1]+=(1-self.lamda)*self.delta_w/float(n) - self.lamda*np.square(self.weight[1]).sum()
        self.threshold[1]+=(1-self.lamda)*self.delta_theta/float(n) - self.lamda*np.square(self.threshold[1]).sum()
        self.weight[0]+=(1-self.lamda)*self.delta_v/float(n) - self.lamda*np.square(self.weight[0]).sum()
        self.threshold[0]+=(1-self.lamda)*self.delta_gamma/float(n) - self.lamda*np.square(self.threshold[0]).sum()
        self.clear()
        
    ## input: idx of data_train
    ## output: (y^, y)
    def forward(self, sample):
        sample = sample.values
        self.data[0] = sample[:-1]
        self.data[1] = sigmoid(np.dot(self.weight[0].T, self.data[0]) - self.threshold[0])
        self.data[2] = sigmoid(np.dot(self.weight[1].T, self.data[1]) - self.threshold[1])
        (_y, y) = self.data[2][0], sample[-1]
        return (_y, y)
    
    ## input: (y^, y)
    def bp(self, _y, y, alpha = 4.0, beta = 1.0, to_update = True):    
        g = _y * (1.0 - _y) * (y - _y) ## (1, )
        e = self.data[1] * (1.0 - self.data[1]) * np.dot(self.weight[1].T, self.data[1]) ## (q, )

        self.delta_w += beta * g * self.data[1].reshape(self.delta_w.shape[0], 1) ## (q, )
        self.delta_theta += -beta * g ## (1, )
        self.delta_v += alpha * np.outer(self.data[0], e) ## (d, q)
        self.delta_gamma += -alpha * e ## (q, 1)
        if to_update: self.update(1)
        
    def fit(self, iteration = 10000):
        
        for T in range(iteration + 1):
            if T % 1000 == 0: print ('iter {}:'.format(T))
            
            ## data_train
            acc, n = 0.0, self.data_train.shape[0]
            for i in range(n):
                (_y, y) = self.forward(self.data_train.iloc[i])
                acc += (_y < 0.5 and y == 0.0) or (_y >= 0.5 and y == 1.0)
                self.bp(_y, y, self.by_whole)
            if T % 1000 == 0: print ('train_accuracy: {}'.format(acc / float(n)))
            if self.by_whole == True: self.update(n)

            ## data_test
            acc, n = 0.0, self.data_test.shape[0]
            if n == 0: continue
            for i in range(n):
                (_y, y) = self.forward(self.data_test.iloc[i])
                acc += (_y < 0.5 and y == 0.0) or (_y >= 0.5 and y == 1.0)
            if T % 1000 == 0: print ('test_accuracy: {}'.format(acc / float(n)))

In [312]:
nn = NN(data_pair=get_data3(), by_whole=False)
nn.fit()

iter 0:
train_accuracy: 0.4444444444444444
test_accuracy: 0.5
iter 1000:
train_accuracy: 0.8888888888888888
test_accuracy: 0.5
iter 2000:
train_accuracy: 0.8888888888888888
test_accuracy: 0.875
iter 3000:
train_accuracy: 1.0
test_accuracy: 0.875
iter 4000:
train_accuracy: 1.0
test_accuracy: 0.875
iter 5000:
train_accuracy: 1.0
test_accuracy: 0.875
iter 6000:
train_accuracy: 1.0
test_accuracy: 0.875
iter 7000:
train_accuracy: 1.0
test_accuracy: 0.875
iter 8000:
train_accuracy: 1.0
test_accuracy: 0.875
iter 9000:
train_accuracy: 1.0
test_accuracy: 0.875
iter 10000:
train_accuracy: 1.0
test_accuracy: 0.875


In [345]:
nn.data

[array([0.   , 1.   , 0.   , 1.   , 0.   , 0.   , 1.   , 0.   , 0.   ,
        0.   , 1.   , 1.   , 0.   , 0.   , 0.   , 0.   , 1.   , 0.243,
        0.267]),
 array([0.92303218, 0.93575052, 0.50301132, 0.88127795, 0.74235609,
        0.93526469, 0.81453194, 0.75906516, 0.72673143, 0.77506768,
        0.9633189 , 0.63793674, 0.9452934 , 0.83496401, 0.75918474,
        0.90171826, 0.68978209, 0.95505926, 0.94493792]),
 array([0.25486036])]

In [342]:
nn.threshold

[array([ 0.24344668,  0.6516109 ,  0.75907478,  0.60997328,  0.79474004,
         0.17206575,  0.15094391,  0.82964143,  0.98708875,  0.3858398 ,
         0.01434463,  0.86190606,  0.11832951,  0.81139068,  0.82637912,
         0.65209751, -0.01151704,  0.71379395,  0.03926378]),
 array([-1.86558582])]