In [2]:
import numpy as np
from scipy.sparse import csr_matrix

def read_data(filename):
    with open(filename) as f:
        lines = f.readlines()

        labels = []

        rows = []
        cols = []
        vals = []

        max_row = 0
        max_col = 0

        for line in lines:
            label = int(line.split()[0])
            if (label != 2 and label != 6):
                continue
            labels.append(label)

            feature_row = line.split()[1:]
            for feature in feature_row:
                col, val = feature.split(':')

                rows.append(max_row)
                cols.append(int(col))
                vals.append(float(val))

                if int(col) > max_col:
                    max_col = int(col)

            max_row += 1

        # change to numpy arrays
        labels = np.array(labels)
        features = csr_matrix((vals, (rows, cols)), shape=(max_row, max_col + 1)).toarray()

        print(f">> read {filename}")
        print("read labels: ", labels.shape)
        print("read features: ", features.shape)

    return labels, features

In [3]:
from liblinear.liblinearutil import *

def problem10(result_file, round=1):
    labels, features = read_data('mnist.scale.txt')
    lables_t, features_t = read_data('mnist.scale.t.txt')

    with open(result_file, 'w') as f:
        f.write("lambda,eout,non-zero\n")

    lambdas = [0.01, 0.1, 1, 10, 100, 1000]

    for i in range(round):
        # find best lambda with min ein
        min_ein = 1
        best_lambda = 0
        best_model = None

        for l in lambdas:
            cost = 1 / l
            # train
            prob = problem(labels, features)
            param = parameter(f'-s 6 -c {cost}')
            m = train(prob, param)

            _, p_acc, _ = predict(labels, features, m)
            ein = 1 - p_acc[0] / 100
            # print(f"train on lambda: {l}, ein: {ein}")

            if ein < min_ein:
                min_ein = ein
                best_lambda = l
                best_model = m
            elif ein == min_ein:
                # choose larger lambda
                if l > best_lambda:
                    best_lambda = l
                    best_model = m

        # test on best lambda
        _, p_acc_t, _ = predict(lables_t, features_t, best_model)
        eout = 1 - p_acc_t[0] / 100

        # calculate non-zero components of model
        nz = np.count_nonzero(best_model.get_decfun()[0])

        with open(result_file, 'a') as f:
            f.write(f"{best_lambda},{eout},{nz}\n")

    print("problem 10 done")

In [4]:
from liblinear.liblinearutil import *

def problem11(result_file, round=1):
    labels, features = read_data('mnist.scale.txt')
    lables_t, features_t = read_data('mnist.scale.t.txt')

    with open(result_file, 'w') as f:
        f.write("lambda,eout\n")

    lambdas = [0.01, 0.1, 1, 10, 100, 1000]

    for i in range(round):
        # randomly select 8000 samples for training, others for validation
        idx = np.random.permutation(labels.shape[0])
        idx_train = idx[:8000]
        idx_val = idx[8000:]

        labels_train = labels[idx_train]
        features_train = features[idx_train]

        labels_val = labels[idx_val]
        features_val = features[idx_val]

        min_eval = 1
        best_lambda = 0

        for l in lambdas:
            cost = 1 / l
            # train
            prob = problem(labels_train, features_train)
            param = parameter(f'-s 6 -c {cost}')
            m = train(prob, param)

            _, p_acc, _ = predict(labels_val, features_val, m)
            eval_ = 1 - p_acc[0] / 100

            if eval_ < min_eval:
                min_eval = eval_
                best_lambda = l
            elif eval_ == min_eval:
                # choose larger lambda
                if l > best_lambda:
                    best_lambda = l

        # re-run training with best lambda on whole training set
        prob = problem(labels, features)
        param = parameter(f'-s 6 -c {1/best_lambda}')
        best_model = train(prob, param)

        _, p_acc_t, _ = predict(lables_t, features_t, best_model)
        eout = 1 - p_acc_t[0] / 100

        with open(result_file, 'a') as f:
            f.write(f"{best_lambda},{eout}\n")

    print("problem 11 done")

In [11]:
from liblinear.liblinearutil import *

def problem12(result_file, round=1):
    labels, features = read_data('mnist.scale.txt')
    lables_t, features_t = read_data('mnist.scale.t.txt')

    with open(result_file, 'w') as f:
        f.write("lambda,eout\n")

    lambdas = [0.01, 0.1, 1, 10, 100, 1000]

    for i in range(round):
        # spilt data set 3-fold cross validation
        np.random.seed(i)
        idx = np.random.permutation(labels.shape[0])

        min_ecv = 1
        best_lambda = 0

        for l in lambdas:
            cost = 1 / l

            ecv = 1
            # 3-fold cross validation
            for m in range(3):
                idx_train = idx[int(labels.shape[0] * m / 3):int(labels.shape[0] * (m+1) / 3)]

                labels_train = labels[idx_train]
                features_train = features[idx_train]

                labels_val = labels[~idx_train]
                features_val = features[~idx_train]

                # train
                prob = problem(labels_train, features_train)
                param = parameter(f'-s 6 -c {cost}')
                m = train(prob, param)

                _, p_acc, _ = predict(labels_val, features_val, m)
                em = 1 - p_acc[0] / 100

                if em < ecv:
                    ecv = em

            if ecv < min_ecv:
                min_ecv = ecv
                best_lambda = l
            elif ecv == min_ecv:
                # choose larger lambda
                if l > best_lambda:
                    best_lambda = l

        # re-run training with best lambda on whole training set
        prob = problem(labels, features)
        param = parameter(f'-s 6 -c {1/best_lambda}')
        best_model = train(prob, param)

        _, p_acc_t, _ = predict(lables_t, features_t, best_model)
        eout = 1 - p_acc_t[0] / 100

        with open(result_file, 'a') as f:
            f.write(f"{best_lambda},{eout}\n")

    print("problem 12 done")

In [None]:
problem10('result10.csv', 1126)
problem11('result11.csv', 1126)
problem12('result12.csv', 1126)

>> read mnist.scale.txt
read labels:  (11876,)
read features:  (11876, 728)
>> read mnist.scale.t.txt
read labels:  (1990,)
read features:  (1990, 725)
Accuracy = 98.8631% (3913/3958) (classification)
Accuracy = 98.6865% (3907/3959) (classification)
Accuracy = 98.5855% (3903/3959) (classification)
Accuracy = 98.9136% (3915/3958) (classification)
Accuracy = 98.6865% (3907/3959) (classification)
Accuracy = 98.5097% (3900/3959) (classification)
Accuracy = 98.5599% (3901/3958) (classification)
Accuracy = 98.5602% (3902/3959) (classification)
Accuracy = 98.3582% (3894/3959) (classification)
Accuracy = 97.1703% (3846/3958) (classification)
Accuracy = 97.3983% (3856/3959) (classification)
Accuracy = 97.07% (3843/3959) (classification)
Accuracy = 93.7847% (3712/3958) (classification)
Accuracy = 94.2662% (3732/3959) (classification)
Accuracy = 93.559% (3704/3959) (classification)
Accuracy = 50.1769% (1986/3958) (classification)
Accuracy = 49.0275% (1941/3959) (classification)
Accuracy = 50.6441