In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
# Relative path to the dataset CSV; it is resolved against the kernel's working directory.
dataroot = 'watermelon.csv'
# Load the whole table into a DataFrame. Later cells read its columns by position.
df = pd.read_csv(dataroot)

In [3]:
# Encode the raw table as numeric arrays: `samples` holds one row per record with
# eight feature columns (six integer-coded categorical attributes followed by two
# numeric ones) and `labels` is a flat 0/1 vector.
n_rows = df.shape[0]  # size the arrays from the data rather than a literal row count
samples = np.zeros((n_rows, 8))
labels = np.zeros(n_rows)

# Integer codes for the categorical attributes stored in CSV columns 1-6.
# NOTE(review): the attribute names in the trailing comments are inferred from the
# category values, not read from the CSV header -- confirm against the file.
feature1 = {'青绿': 1, '乌黑': 2, '浅白': 3}  # presumably color
feature2 = {'蜷缩': 1, '稍蜷': 2, '硬挺': 3}  # presumably stem shape
feature3 = {'浊响': 1, '沉闷': 2, '清脆': 3}  # presumably knock sound
feature4 = {'清晰': 1, '稍糊': 2, '模糊': 3}  # presumably surface texture
feature5 = {'凹陷': 1, '稍凹': 2, '平坦': 3}  # presumably navel shape
feature6 = {'硬滑': 1, '软粘': 2}  # presumably touch

# Mapping for output column k is applied to CSV column k + 1 (CSV column 0 is skipped).
categorical_maps = (feature1, feature2, feature3, feature4, feature5, feature6)

for row in range(n_rows):
    # Two-axis .iloc[row, col] is purely positional. The chained form
    # df.iloc[row][col] depends on the deprecated integer fallback of Series[...].
    for col, mapping in enumerate(categorical_maps):
        # A category missing from its mapping raises KeyError instead of being
        # silently encoded.
        samples[row, col] = mapping[df.iloc[row, col + 1]]
    # CSV columns 7 and 8 are copied through unchanged as numeric features.
    samples[row, 6] = df.iloc[row, 7]
    samples[row, 7] = df.iloc[row, 8]
    # CSV column 9 is the class: '是' ("yes") becomes 1, anything else 0.
    labels[row] = 1 if df.iloc[row, 9] == '是' else 0

In [79]:
class LogisticRegression(object):
    """Binary logistic-regression classifier fitted by batch gradient descent.

    The model estimates P(y = 1 | x) = sigmoid(w . x + b) and predicts the
    positive class when that probability is strictly greater than
    ``self.threshold``.

    Attributes:
        sample_dim: number of features every sample must have.
        threshold: decision threshold on the positive-class probability (0.5).
        w: weight vector of shape ``(sample_dim,)``; set by ``train``.
        b: scalar bias; set by ``train``.
    """

    def __init__(self, sample_dim):
        """Create an untrained classifier for ``sample_dim``-dimensional samples."""
        self.sample_dim = sample_dim
        self.threshold = 0.5

    def train(self, training_samples, training_labels, lr=0.001, iteration=10000, tol=1e-5):
        """Fit ``self.w`` and ``self.b`` by gradient descent on the negative log-likelihood.

        Args:
            training_samples: array-like of shape ``(n, sample_dim)``.
            training_labels: array-like of ``n`` labels in {0, 1}; it is flattened,
                so shape ``(n, 1)`` is accepted as well.
            lr: step size of each gradient update.
            iteration: maximum number of updates.
            tol: stop early once the summed absolute change of the weights in
                one update falls below this value.

        Raises:
            ValueError: if the samples are not 2-D with ``sample_dim`` columns,
                if there are no samples, or if the number of labels differs
                from the number of samples.
        """
        X = self._check_samples(training_samples)
        y = np.asarray(training_labels, dtype=float).reshape(-1)
        if X.shape[0] == 0:
            raise ValueError("Cannot train on an empty sample set!")
        if y.shape[0] != X.shape[0]:
            raise ValueError("Number of labels does not match number of samples!")

        # Explicit float parameters: the result must not depend on the input dtype.
        w = np.ones(self.sample_dim, dtype=float)
        b = 1.0

        for _ in range(iteration):
            # Both gradients are evaluated at the same (w, b) before either is
            # updated, so this is a plain simultaneous gradient step.
            grad_w = self.derivative_over_w(y, X, w, b)
            grad_b = self.derivative_over_b(y, X, w, b)

            step_w = lr * grad_w
            w = w - step_w
            b = b - lr * grad_b

            # Sum of absolute changes: a signed sum would let opposite-signed
            # components cancel and report convergence while w is still moving.
            if np.abs(step_w).sum() < tol:
                break

        self.w = w
        self.b = b

    def predict(self, samples):
        """Return the predicted labels (0.0 or 1.0) for ``samples`` as a float array.

        Raises:
            ValueError: if ``samples`` is not 2-D with ``sample_dim`` columns.
            AttributeError: if the classifier has not been trained yet.
        """
        X = self._check_samples(samples)
        if not hasattr(self, 'w') or not hasattr(self, 'b'):
            raise AttributeError("Classifier has not been trained; call train() first.")
        probabilities = self._positive_probabilities(X, self.w, self.b)
        return (probabilities > self.threshold).astype(float)

    def test(self, testing_samples, testing_labels=None):
        """Print the predicted labels and, when labels are given, the accuracy.

        Args:
            testing_samples: array-like of shape ``(n, sample_dim)``.
            testing_labels: optional true labels; when provided the accuracy of
                the predictions is printed as well.

        Raises:
            ValueError: if the samples do not have ``sample_dim`` columns.
        """
        predicted_labels = self.predict(testing_samples)

        print(predicted_labels)
        if testing_labels is not None:
            print('ACC of test:{}'.format(accuracy_score(testing_labels, predicted_labels)))

    def sigmoid(self, x, b, w):
        """Return sigmoid(w . x + b) for a single sample ``x``."""
        return self._logistic(np.dot(np.transpose(w), x) + b)

    def prob_positive(self, w, x, b):
        """Return P(y = 1 | x) for a single sample; same value as ``sigmoid``."""
        return self.sigmoid(x, b, w)

    def derivative_over_w(self, y, x, w, b):
        """Gradient of the negative log-likelihood with respect to ``w``.

        Computes ``-sum_i x_i * (y_i - p_i)`` over all rows of ``x``, where
        ``p_i`` is the predicted positive-class probability of row ``i``.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        residual = y - self._positive_probabilities(x, w, b)
        return -x.T.dot(residual)

    def derivative_over_b(self, y, x, w, b):
        """Gradient of the negative log-likelihood with respect to ``b``.

        Computes ``-sum_i (y_i - p_i)`` over all rows of ``x``.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        residual = y - self._positive_probabilities(x, w, b)
        return -residual.sum()

    def _check_samples(self, samples):
        """Return ``samples`` as a 2-D float array after validating its width."""
        samples = np.asarray(samples, dtype=float)
        if samples.ndim != 2 or samples.shape[1] != self.sample_dim:
            raise ValueError("Input samples are not compatible with this classifier!")
        return samples

    def _positive_probabilities(self, samples, w, b):
        """Return P(y = 1 | x) for every row of the 2-D array ``samples``."""
        return self._logistic(samples.dot(w) + b)

    @staticmethod
    def _logistic(z):
        """Evaluate 1 / (1 + exp(-z)) element-wise without overflow."""
        z = np.asarray(z, dtype=float)
        # exp(-|z|) is at most 1, so neither branch below can overflow or
        # produce inf / inf, whatever the magnitude of z.
        decay = np.exp(-np.abs(z))
        out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Indexing with () turns a 0-d result into a NumPy scalar and leaves
        # higher-dimensional results as arrays.
        return out[()]

In [80]:
# Take the feature count from the encoded matrix instead of repeating it as a
# literal, so the classifier width cannot drift from the data.
LR = LogisticRegression(samples.shape[1])

In [81]:
# Randomly hold out a few rows for testing. A seeded generator keeps the split,
# and therefore the reported accuracy, identical across kernel restarts.
SPLIT_SEED = 0
N_TRAIN = 14  # rows used for fitting; every remaining row is held out

n_samples = samples.shape[0]
assert 0 < N_TRAIN < n_samples, "need at least one training and one testing row"

idx = np.random.default_rng(SPLIT_SEED).permutation(n_samples)

training_idx = idx[:N_TRAIN]
training_samples = samples[training_idx]
training_labels = labels[training_idx]

testing_idx = idx[N_TRAIN:]
testing_samples = samples[testing_idx]
testing_labels = labels[testing_idx]

In [82]:
# Fit the classifier on the training split; the learned weights and bias are stored on LR.
LR.train(training_samples, training_labels)

In [83]:
# Print the predicted labels for the held-out rows, then their accuracy against the true labels.
LR.test(testing_samples, testing_labels)

[0. 1. 0.]
ACC of test:0.6666666666666666
