# In [41]:  (Jupyter notebook cell prompt; the cell's code follows)
import numpy as np
import requests


def getData(url):
    """Download a whitespace-separated data file and parse it into arrays.

    Every non-blank line is read as a sequence of feature values followed by
    an integer label in the last column. A constant ``1`` is prepended to
    each feature row so the first weight acts as the bias term.

    Args:
        url: Address of the text file, fetched with an HTTP GET.

    Returns:
        A tuple ``(x, y)`` of numpy arrays: ``x`` holds one row
        ``[1, f1, f2, ...]`` per data line and ``y`` the matching labels.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
        ValueError: If a field cannot be parsed as a number.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of trying to parse an HTML error page as data.
    response.raise_for_status()
    text = response.content.decode('utf-8')

    x = []
    y = []
    # splitlines() keeps the last row even when the file lacks a trailing
    # newline; split() tolerates repeated spaces, tabs and '\r'.
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue  # skip blank lines
        y.append(int(fields[-1]))
        x.append([1] + [float(value) for value in fields[:-1]])
    return np.array(x), np.array(y)


def ridgeReg(x, y, lamda):
    """Fit ridge regression weights in closed form.

    Solves the regularized normal equations
    ``(x^T x + lamda * I) w = x^T y`` for ``w``.

    Args:
        x: 2-D array-like of shape (samples, features).
        y: Array-like of targets with one entry per sample.
        lamda: Regularization strength multiplying the identity matrix.

    Returns:
        The weight vector ``w`` as a numpy array with one entry per feature.

    Raises:
        numpy.linalg.LinAlgError: If the regularized matrix is singular.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    xT = x.transpose()
    regularized = np.dot(xT, x) + lamda * np.eye(x.shape[1])
    # Solving the linear system directly is more accurate (and cheaper) than
    # forming the explicit inverse and multiplying, which matters for the
    # tiny lamda values where the matrix is badly conditioned.
    return np.linalg.solve(regularized, np.dot(xT, y))


def sign(v):
    """Return 1 when ``v`` is strictly positive, otherwise -1 (zero maps to -1)."""
    return 1 if v > 0 else -1


def errorRate(w, x, y):
    """Return the fraction of samples whose predicted label differs from ``y``.

    The score of each row of ``x`` is its dot product with ``w``; scores that
    are ``>= 0`` are labelled 1 and all others -1.
    """
    scores = np.dot(w, x.transpose())
    predicted = np.where(scores >= 0, 1, -1)
    mismatches = np.array(y) != np.array(predicted)
    return np.sum(mismatches) / len(y)


def main():
    """Run the ridge-regression experiments and print their error rates.

    Downloads the training and test sets, then prints, under the labels
    "#13" through "#20":

    * #13: errors for lamda = 10 trained on the full training set;
    * #14,15: errors for lamda = 10^-10 .. 10^2 on the full training set;
    * #16,17: the same sweep, training on the first 120 rows and validating
      on the remaining rows;
    * #18: errors for lamda = 1 on the full training set;
    * #19: 5-fold cross-validation error (folds of 40 consecutive rows) for
      the same sweep;
    * #20: errors for lamda = 10^-8 on the full training set.
    """
    print("#13")
    lamda = 10
    trainUrl = 'https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_algo/hw4_train.dat'
    testUrl = 'https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_algo/hw4_test.dat'
    X, Y = getData(trainUrl)
    testX, testY = getData(testUrl)
    w = ridgeReg(X, Y, lamda)
    print("in sample error:", errorRate(w, X, Y))
    print("out of sample error:", errorRate(w, testX, testY))

    # Exponents of the lamda sweep shared by the experiments below.
    logLamdas = range(-10, 3)

    print("#14,15")
    for logLamda in logLamdas:
        w = ridgeReg(X, Y, pow(10, logLamda))
        errorIn = errorRate(w, X, Y)
        errorOut = errorRate(w, testX, testY)
        print("log:", logLamda, "in:", errorIn, "out:", errorOut)

    print("#16,17")
    splitAt = 120
    trainX, valX = X[:splitAt], X[splitAt:]
    trainY, valY = Y[:splitAt], Y[splitAt:]
    for logLamda in logLamdas:
        w = ridgeReg(trainX, trainY, pow(10, logLamda))
        errorIn = errorRate(w, trainX, trainY)
        errorVal = errorRate(w, valX, valY)
        errorOut = errorRate(w, testX, testY)
        print("log:", logLamda)
        print("in:", errorIn, "validataion:", errorVal, "out:", errorOut)

    print("#18")
    w = ridgeReg(X, Y, 1)
    print("in:", errorRate(w, X, Y), "out:", errorRate(w, testX, testY))

    print("#19")
    foldCount = 5
    foldSize = 40
    # The fold boundaries do not depend on lamda, so the splits are built
    # once with array slicing instead of per (lamda, fold) list round-trips.
    folds = []
    for fold in range(foldCount):
        start = fold * foldSize
        stop = start + foldSize
        folds.append((
            np.concatenate((X[:start], X[stop:])),
            np.concatenate((Y[:start], Y[stop:])),
            X[start:stop],
            Y[start:stop],
        ))
    for logLamda in logLamdas:
        lamda = pow(10, logLamda)
        errorVal = 0
        for foldTrainX, foldTrainY, foldValX, foldValY in folds:
            w = ridgeReg(foldTrainX, foldTrainY, lamda)
            errorVal += errorRate(w, foldValX, foldValY)
        print("log:", logLamda, "cv:", errorVal / foldCount)

    print("#20")
    w = ridgeReg(X, Y, pow(10, -8))
    print("in:", errorRate(w, X, Y), "out:", errorRate(w, testX, testY))


# Run the experiments only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


#13
[[-0.93238149]
 [ 1.04618645]
 [ 1.046171  ]]
in sample error: 0.05
out of sample error: 0.05
#14,15
[[-1.49779077e+00]
 [ 3.65275388e+03]
 [-3.64962662e+03]]
log: -10 in: 0.015 out: 0.015
[[-1.49804810e+00]
 [ 3.48531234e+03]
 [-3.48218502e+03]]
log: -9 in: 0.015 out: 0.015
[[-1.49973117e+00]
 [ 2.38999673e+03]
 [-2.38686906e+03]]
log: -8 in: 0.015 out: 0.015
[[  -1.50251562]
 [ 577.91270972]
 [-574.78443932]]
log: -7 in: 0.03 out: 0.03
[[ -1.50329802]
 [ 68.68360275]
 [-65.55516458]]
log: -6 in: 0.035 out: 0.035
[[-1.50338988]
 [ 8.3884692 ]
 [-5.26001258]]
log: -5 in: 0.03 out: 0.03
[[-1.50339141]
 [ 2.24779126]
 [ 0.88065311]]
log: -4 in: 0.03 out: 0.03
[[-1.5033133 ]
 [ 1.63252051]
 [ 1.49578124]]
log: -3 in: 0.03 out: 0.03
[[-1.50252334]
 [ 1.57027554]
 [ 1.55659884]]
log: -2 in: 0.03 out: 0.03
[[-1.49466456]
 [ 1.55702271]
 [ 1.55565251]]
log: -1 in: 0.035 out: 0.035
[[-1.42004196]
 [ 1.48896417]
 [ 1.48882476]]
log: 0 in: 0.035 out: 0.035
[[-0.93238149]
 [ 1.04618645]
 [ 1.