In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from tqdm import tqdm

from scripts import save_csv

# Problem2: Data Partitioning

In [None]:
def Partitioning_MNIST(data, labels, validation_size=10000, seed=None):
    """Shuffle MNIST and split it into training and validation sets.

    Every sample is flattened into a 1-D feature vector, so an input of shape
    ``(n, 1, 28, 28)`` produces rows of length 784.

    Args:
        data: array whose first axis indexes samples.
        labels: array aligned with ``data`` along the first axis.
        validation_size: number of shuffled samples held out for validation.
            The default of 10000 reproduces the 50000/10000 split for the
            60000-sample training set.
        seed: optional seed forwarded to ``np.random.default_rng``; ``None``
            draws fresh entropy, so every call shuffles differently.

    Returns:
        Tuple ``(training_data, training_labels, validation_data,
        validation_labels)``.

    Raises:
        ValueError: if ``data`` and ``labels`` differ in length, or if
            ``validation_size`` is not within ``[0, number of samples]``.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    n_samples = data.shape[0]
    if labels.shape[0] != n_samples:
        raise ValueError("data and labels must contain the same number of samples")
    if not 0 <= validation_size <= n_samples:
        raise ValueError("validation_size must be between 0 and the number of samples")

    rng = np.random.default_rng(seed)
    index = rng.permutation(n_samples)

    # Apply one permutation to both arrays so samples stay paired with labels.
    data_copy = data[index].reshape(n_samples, -1)
    labels_copy = labels[index]

    n_train = n_samples - validation_size

    training_data = data_copy[:n_train]
    training_labels = labels_copy[:n_train]

    validation_data = data_copy[n_train:]
    validation_labels = labels_copy[n_train:]

    return training_data, training_labels, validation_data, validation_labels

In [None]:
def Partitioning_spam(data, labels, train_fraction=0.8, seed=None):
    """Shuffle the spam dataset and split it into training and validation sets.

    Args:
        data: array whose first axis indexes samples.
        labels: array aligned with ``data`` along the first axis.
        train_fraction: fraction of the shuffled samples kept for training;
            the remainder becomes the validation set. The training count is
            ``int(number_of_samples * train_fraction)``.
        seed: optional seed forwarded to ``np.random.default_rng``; ``None``
            draws fresh entropy, so every call shuffles differently.

    Returns:
        Tuple ``(training_data, training_labels, validation_data,
        validation_labels)``.

    Raises:
        ValueError: if ``data`` and ``labels`` differ in length, or if
            ``train_fraction`` is outside ``[0, 1]``.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    n_samples = data.shape[0]
    if labels.shape[0] != n_samples:
        raise ValueError("data and labels must contain the same number of samples")
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    rng = np.random.default_rng(seed)
    # Permute every sample that is actually present, so the split point below
    # always refers to the same array length.
    index = rng.permutation(n_samples)
    data_copy, labels_copy = data[index], labels[index]

    num = int(n_samples * train_fraction)

    training_data = data_copy[:num]
    training_labels = labels_copy[:num]

    validation_data = data_copy[num:]
    validation_labels = labels_copy[num:]

    return training_data, training_labels, validation_data, validation_labels

In [None]:
def Partitioning_cifar10(data, labels, validation_size=5000, seed=None):
    """Shuffle CIFAR-10 and split it into training and validation sets.

    Args:
        data: array whose first axis indexes samples.
        labels: array aligned with ``data`` along the first axis.
        validation_size: number of shuffled samples held out for validation.
            The default of 5000 reproduces the 45000/5000 split for the
            50000-sample training set.
        seed: optional seed forwarded to ``np.random.default_rng``; ``None``
            draws fresh entropy, so every call shuffles differently.

    Returns:
        Tuple ``(training_data, training_labels, validation_data,
        validation_labels)``.

    Raises:
        ValueError: if ``data`` and ``labels`` differ in length, or if
            ``validation_size`` is not within ``[0, number of samples]``.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    n_samples = data.shape[0]
    if labels.shape[0] != n_samples:
        raise ValueError("data and labels must contain the same number of samples")
    if not 0 <= validation_size <= n_samples:
        raise ValueError("validation_size must be between 0 and the number of samples")

    rng = np.random.default_rng(seed)
    index = rng.permutation(n_samples)
    data_copy, labels_copy = data[index], labels[index]

    n_train = n_samples - validation_size

    training_data = data_copy[:n_train]
    training_labels = labels_copy[:n_train]

    validation_data = data_copy[n_train:]
    validation_labels = labels_copy[n_train:]

    return training_data, training_labels, validation_data, validation_labels

# Problem3: Support Vector Machines: Coding

In [None]:
def SVM_MNIST():
    """Train a linear SVM on MNIST with increasing numbers of training examples.

    The model is fitted on the first 100, 200, 500, 1,000, 2,000, 5,000 and
    10,000 samples of a freshly shuffled training split. After each fit the
    accuracy is measured on those same training samples and on the full
    validation split.

    Returns:
        Tuple ``(samples, training_accuracy, validation_accuracy)`` of three
        equally long lists, one entry per training-set size.
    """
    # A forward slash is used because "\m" is an invalid escape sequence and a
    # backslash separator only resolves on Windows.
    with np.load("data/mnist-data.npz") as data:
        training_data, training_labels, validation_data, validation_labels = Partitioning_MNIST(
            data["training_data"], data["training_labels"]
        )
    training_accuracy, validation_accuracy = [], []

    clf = LinearSVC(max_iter=10000, loss='hinge', C=0.1)
    samples = [100, 200, 500, 1000, 2000, 5000, 10000]

    for num in tqdm(samples):
        subset_data, subset_labels = training_data[:num], training_labels[:num]
        clf.fit(subset_data, subset_labels)
        training_accuracy.append(accuracy_score(subset_labels, clf.predict(subset_data)))
        validation_accuracy.append(accuracy_score(validation_labels, clf.predict(validation_data)))
    return samples, training_accuracy, validation_accuracy

In [None]:
def SVM_spam():
    """Train a linear SVM on the spam data with increasing numbers of training examples.

    The model is fitted on the first 100, 200, 500, 1,000 and 2,000 samples of
    a freshly shuffled training split, and finally on the whole training
    split. After each fit the accuracy is measured on those same training
    samples and on the full validation split.

    Returns:
        Tuple ``(samples, training_accuracy, validation_accuracy)`` of three
        equally long lists. The last entry of ``samples`` is the number of
        samples in the training split, i.e. the amount of data the final
        model was actually fitted on.
    """
    # A forward slash is used because "\s" is an invalid escape sequence and a
    # backslash separator only resolves on Windows.
    with np.load("data/spam-data.npz") as data:
        training_data, training_labels, validation_data, validation_labels = Partitioning_spam(
            data["training_data"], data["training_labels"]
        )
    training_accuracy, validation_accuracy = [], []

    clf = LinearSVC(max_iter=100000, loss='hinge', C=1)
    # "ALL" means every sample of the training split. Reporting the size of the
    # unsplit dataset would put the last point of the curve at the wrong x.
    samples = [100, 200, 500, 1000, 2000, len(training_data)]

    for num in tqdm(samples):
        subset_data, subset_labels = training_data[:num], training_labels[:num]
        clf.fit(subset_data, subset_labels)
        training_accuracy.append(accuracy_score(subset_labels, clf.predict(subset_data)))
        validation_accuracy.append(accuracy_score(validation_labels, clf.predict(validation_data)))
    return samples, training_accuracy, validation_accuracy

In [None]:
def SVM_cifar10():
    """Train a linear SVM on CIFAR-10 with increasing numbers of training examples.

    The model is fitted on the first 100, 200, 500, 1,000, 2,000 and 5,000
    samples of a freshly shuffled training split. After each fit the accuracy
    is measured on those same training samples and on the full validation
    split.

    Returns:
        Tuple ``(samples, training_accuracy, validation_accuracy)`` of three
        equally long lists, one entry per training-set size.
    """
    # A forward slash is used because "\c" is an invalid escape sequence and a
    # backslash separator only resolves on Windows.
    with np.load("data/cifar10-data.npz") as data:
        training_data, training_labels, validation_data, validation_labels = Partitioning_cifar10(
            data["training_data"], data["training_labels"]
        )
    training_accuracy, validation_accuracy = [], []

    clf = LinearSVC(max_iter=100000, loss='hinge', C=1)
    samples = [100, 200, 500, 1000, 2000, 5000]

    for num in tqdm(samples):
        subset_data, subset_labels = training_data[:num], training_labels[:num]
        clf.fit(subset_data, subset_labels)
        training_accuracy.append(accuracy_score(subset_labels, clf.predict(subset_data)))
        validation_accuracy.append(accuracy_score(validation_labels, clf.predict(validation_data)))
    return samples, training_accuracy, validation_accuracy

In [None]:
def SVM_Plt(samples, training_accuracy, validation_accuracy, dataset):
    """Plot training and validation accuracy against the number of training samples.

    The figure is saved as ``Image/<dataset>-Problem3.png`` (the ``Image``
    directory is created when missing) and then displayed.

    Args:
        samples: training-set sizes, used as x values.
        training_accuracy: accuracy on the training subset, one per size.
        validation_accuracy: accuracy on the validation set, one per size.
        dataset: dataset name, used as the plot title and in the file name.
    """
    # A dedicated figure keeps curves from earlier calls out of this plot.
    fig, ax = plt.subplots()
    ax.set_xlabel("samples")
    ax.set_ylabel("accuracy")
    ax.plot(samples, training_accuracy, color='blue', label='training_accuracy')
    ax.plot(samples, validation_accuracy, color='red', label='validation_accuracy')
    ax.set_title(dataset)
    ax.legend()

    # A forward slash is used because "\{" is an invalid escape sequence and a
    # backslash separator only resolves on Windows.
    os.makedirs('Image', exist_ok=True)
    fig.savefig('Image/{}-Problem3.png'.format(dataset))
    plt.show()
    plt.close(fig)

In [None]:
# MNIST classification: train on increasing sample sizes, then plot the accuracy curves.
samples, training_accuracy, validation_accuracy = SVM_MNIST()
SVM_Plt(samples, training_accuracy, validation_accuracy, 'MNIST')

In [None]:
# spam classification: train on increasing sample sizes, then plot the accuracy curves.
samples, training_accuracy, validation_accuracy = SVM_spam()
SVM_Plt(samples, training_accuracy, validation_accuracy, 'spam')

In [None]:
# CIFAR-10 classification: train on increasing sample sizes, then plot the accuracy curves.
samples, training_accuracy, validation_accuracy = SVM_cifar10()
SVM_Plt(samples, training_accuracy, validation_accuracy, 'cifar10')

# Problem4: Hyperparameter Tuning
## The result is saved in Problem4.txt

In [None]:
def train_with_different_C():
    """Train linear SVMs on MNIST for several values of C and return the best one.

    Eight values ``0.01 * 2**i`` (``i = 0..7``) are tried. Each model is fitted
    on the whole training split and scored on the validation split. The
    accuracy obtained for every C is printed.

    Returns:
        The C with the highest validation accuracy; on a tie, the smallest
        such C (the first one in the list).
    """
    # A forward slash is used because "\m" is an invalid escape sequence and a
    # backslash separator only resolves on Windows.
    with np.load("data/mnist-data.npz") as data:
        training_data, training_labels, validation_data, validation_labels = Partitioning_MNIST(
            data["training_data"], data["training_labels"]
        )
    validation_accuracy = []

    C_list = [0.01 * pow(2, i) for i in range(8)]

    for C in tqdm(C_list):
        clf = LinearSVC(max_iter=100000, loss='hinge', C=C)
        clf.fit(training_data, training_labels)
        validation_accuracy.append(accuracy_score(validation_labels, clf.predict(validation_data)))

    # Iterate over the pairs so the report always matches the length of C_list.
    for C, accuracy in zip(C_list, validation_accuracy):
        print('C:{}, validation_accuracy:{}'.format(C, accuracy))

    chosen_C = C_list[validation_accuracy.index(max(validation_accuracy))]

    return chosen_C

In [None]:
def test_with_best_C():
    """Retrain the SVM on all MNIST training data with the best C and predict the test set.

    The best C comes from ``train_with_different_C``. Images are flattened to
    one feature vector per sample before fitting and predicting. The predicted
    test labels are passed to ``save_csv.results_to_csv``.
    """
    chosen_C = train_with_different_C()

    # A forward slash is used because "\m" is an invalid escape sequence and a
    # backslash separator only resolves on Windows.
    with np.load("data/mnist-data.npz") as data:
        train_images = data["training_data"]
        train_labels = data["training_labels"]
        test_images = data["test_data"]

    # Derive the sample counts from the arrays instead of hard-coding them.
    X_train = np.reshape(train_images, (train_images.shape[0], -1))
    X_test = np.reshape(test_images, (test_images.shape[0], -1))

    clf = LinearSVC(max_iter=1000000, loss='hinge', C=chosen_C)
    clf.fit(X_train, train_labels)

    Y_test = clf.predict(X_test)

    save_csv.results_to_csv(Y_test)
    

In [None]:
# Problem4: tune C on MNIST, retrain with the best value and export the test predictions.
test_with_best_C()

# Problem5: K-Fold Cross-Validation

In [None]:
def spam_K_FOLD_partitioning(K, seed=None):
    """Shuffle the spam training set and split it into K folds.

    The first ``K - 1`` folds each hold ``n // K`` samples; the last fold also
    takes the remainder, so every sample lands in exactly one fold.

    Args:
        K: number of folds.
        seed: optional seed forwarded to ``np.random.default_rng``; ``None``
            draws fresh entropy, so every call shuffles differently.

    Returns:
        Tuple ``(data_list, labels_list)`` of two lists with K arrays each;
        ``labels_list[i]`` holds the labels of ``data_list[i]``.

    Raises:
        ValueError: if ``K`` is smaller than 1 or larger than the number of
            samples.
    """
    # A forward slash is used because "\s" is an invalid escape sequence and a
    # backslash separator only resolves on Windows.
    with np.load("data/spam-data.npz") as data:
        features = data["training_data"]
        targets = data["training_labels"]

    n_samples = features.shape[0]
    if not 1 <= K <= n_samples:
        raise ValueError("K must be between 1 and the number of samples")

    rng = np.random.default_rng(seed)
    index = rng.permutation(n_samples)
    data_shuffled, labels_shuffled = features[index], targets[index]

    num = n_samples // K
    starts = [i * num for i in range(K)]
    # The last fold runs to the end so the remainder is not dropped.
    stops = starts[1:] + [n_samples]

    data_list = [data_shuffled[lo:hi] for lo, hi in zip(starts, stops)]
    labels_list = [labels_shuffled[lo:hi] for lo, hi in zip(starts, stops)]
    return data_list, labels_list
    

In [None]:
def K_fold_cross_validation(K, C):
    """Run K-fold cross-validation of a linear SVM on the spam data.

    The data is partitioned by ``spam_K_FOLD_partitioning``. Each fold serves
    once as the validation set while a fresh model is trained on the other
    ``K - 1`` folds.

    Args:
        K: number of folds; at least 2 so that a training set remains.
        C: regularisation parameter passed to ``LinearSVC``.

    Returns:
        The mean validation accuracy over the K folds.

    Raises:
        ValueError: if ``K`` is smaller than 2.
    """
    if K < 2:
        raise ValueError("K must be at least 2 for cross-validation")

    data_list, labels_list = spam_K_FOLD_partitioning(K)
    score = []

    for i in tqdm(range(K)):
        clf = LinearSVC(loss='hinge', C=C, max_iter=1000000)

        # Stack all folds except the held-out one into the training arrays.
        training_data = np.concatenate(
            [fold for index, fold in enumerate(data_list) if index != i]
        )
        training_labels = np.concatenate(
            [fold for index, fold in enumerate(labels_list) if index != i]
        )
        validation_data, validation_labels = data_list[i], labels_list[i]

        clf.fit(training_data, training_labels)
        score.append(accuracy_score(validation_labels, clf.predict(validation_data)))
    return sum(score) / len(score)

In [None]:
def best_C(K):
    """Pick the C with the highest K-fold cross-validation accuracy.

    Eight candidates ``0.01 * 2**i`` (``i = 0..7``) are each scored with
    ``K_fold_cross_validation``, and the accuracy of every candidate is
    printed. On a tie the first (smallest) candidate is returned.
    """
    candidates = [0.01 * pow(2, exponent) for exponent in range(8)]
    accuracies = [K_fold_cross_validation(K, candidate) for candidate in tqdm(candidates)]

    for candidate, accuracy in zip(candidates, accuracies):
        print('C:{}, validation_accuracy:{}'.format(candidate, accuracy))

    top_accuracy = max(accuracies)
    return candidates[accuracies.index(top_accuracy)]

In [None]:
# Store the result under its own name: assigning it to `best_C` would replace
# the function with a float and make this cell fail when it is run again.
spam_best_C = best_C(5)
print('best_C:{}'.format(spam_best_C))

# Problem6: Kaggle

略

# Problem7: Theory of Hard-Margin Support Vector Machines

test