# Project3 - 二元分类器

本部分实现的逻辑回归的二元分类，同时使用了BGD,SGD,MBGD三种不同方法。使用的数据集是糖尿病数据集pima-indians-diabetes。

In [11]:
import numpy as np
import random
import pandas as pd

# 用于对比sklearn结果所需的库
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler

In [12]:
# 切割数据为训练集和测试集，及归一化
def split_data(data):
    m = 500
    f = 8

    x = data[:,:-1]
    y = data[:,-1]

    # 归一化数据
    x = (x-np.mean(x))/np.std(x)

    new_X_train = x[:m,:f]
    new_X_train = np.c_[np.ones(len(new_X_train),dtype='int64'),new_X_train]
    new_y_train = y[:m]
    new_X_test = x[m:,:f]
    new_X_test = np.c_[np.ones(len(new_X_test),dtype='int64'),new_X_test]
    new_y_test = y[m:]

    return new_X_train,new_X_test,new_y_train,new_y_test

In [13]:
# 初始化学习率和迭代次数
alpha = 0.005
iters = 2000

使用了BGD,SGD,MBGD三种不同的梯度下降方式来训练模型，并使用了L2正则化。在MBGD中，每个批次包含100个数据。

In [14]:
# BGD
def BGD(X,y,B,alpha,iterations):
    x_transposed = X.T
    m = len(y)

    for itreation in range(iterations):
        h = sigmoid(np.dot(X,B))
        loss = h - y

        gradient = x_transposed.dot(loss) / m
        # L2正则化
        b = B[0]
        B = (1 - alpha * 0.5 / m) * B - alpha * gradient
        B[0] = b - alpha * gradient [0] / m

    return B

In [15]:
# SGD
def SGD(X,y,B,alpha,iterations):
    m = len(y)

    for itreation in range(iterations):
        for i in range(m):
            index=int(random.uniform(0,m))
            h = sigmoid(np.dot(X[index],B))
            loss = h - y[index]

            # L2正则化
            b = B[0]
            B = (1 - alpha * 0.5) * B - alpha * (loss * X[index])
            B[0] = b - alpha * alpha * (loss * X[index]) [0]

    return B

In [16]:
# MBSG
def MBGD(X,y,B,alpha,batch_size,iterations):
    m = len(y)

    for itreation in range(iterations):
        batch = int(np.ceil(m / batch_size))
        for i in range(batch):
            batch_id = batch_size * i
            x_batch = X[batch_id:min(batch_id+batch_size,len(X))]
            y_batch = y[batch_id:min(batch_id+batch_size,len(y))]

            h = sigmoid(np.dot(x_batch,B))
            loss = h - y_batch

            b = B[0]
            bm = len(x_batch)

            B = (1 - alpha * 0.5 / bm) * B - alpha * (np.dot(x_batch.T,loss)) / bm
            B[0] = b - alpha * (np.dot(x_batch.T,loss)) [0] / bm

    return B

In [17]:
def sigmoid(x):
    return 1.0/(1+np.exp(-x))

In [18]:
def predict(x,B):
    return sigmoid(np.dot(x,B)).T[0]

### 运行

分别使用BGD,SGD,MBGD三种方式进行训练，同时使用测试集来测试三种方式的准确率

In [19]:
# 运行
# 读取数据
data = pd.read_csv("binaryLR.csv")

# 归一化及切割数据
X_train,X_test,y_train,y_test = split_data(data.values)

B = np.ones(X_train.shape[1])

# BGD
new_beta1 = BGD(X_train,y_train,B,alpha,iters)
new_beta1 = np.atleast_2d(new_beta1).reshape((9,1))
y_pred1 = np.round(predict(X_test,new_beta1))
print("BGD准确率为:",np.mean(np.equal(y_pred1,y_test)))

# SGD
new_beta2 = SGD(X_train,y_train,B,alpha,iters)
new_beta2 = np.atleast_2d(new_beta2).reshape((9,1))
y_pred2 = np.round(predict(X_test,new_beta2))
print("SGD准确率为:",np.mean(np.equal(y_pred2,y_test)))

# MBGD
new_beta3 = MBGD(X_train,y_train,B,alpha,100,iters)
new_beta3 = np.atleast_2d(new_beta3).reshape((9,1))
y_pred3 = np.round(predict(X_test,new_beta3))
print("MBGD准确率为:",np.mean(np.equal(y_pred3,y_test)))

BGD准确率为: 0.7910447761194029
SGD准确率为: 0.6828358208955224
MBGD准确率为: 0.8134328358208955


### sklearn方式

调用sklearn提供的方法训练，并使用测试集测试该方法的准确率

In [20]:
#与sklearn比较
sk_data = pd.read_csv("binaryLR.csv")
sc = StandardScaler()
sk_data = sc.fit_transform(sk_data)
sk_x = sk_data[:,:-1]
sk_y = sk_data[:,-1]
m = 500
f = 8

X_train = sk_x[:m,:f]
X_train = np.c_[np.ones(len(X_train),dtype='int64'),X_train]

Y_train = sk_y[:m]

X_test = sk_x[m:,:f]
X_test = np.c_[np.ones(len(X_test),dtype='int64'),X_test]

Y_test = sk_y[m:]

model = linear_model.LogisticRegression(penalty='l2',max_iter=2000)
model.fit(X_train,Y_train.astype('int'))
print("sklearn准确率为:",model.score(X_test,Y_test.astype('int')))


BGD准确率为: 0.7910447761194029
SGD准确率为: 0.6828358208955224
MBGD准确率为: 0.8134328358208955
sklearn准确率为: 0.8097014925373134
