In [33]:
import numpy as np

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Seed of the notebook-wide random generator; keeping it fixed means a fresh
# top-to-bottom run draws the same samples every time.
SEED = 1
rng = np.random.default_rng(seed=SEED)

In this problem, you will use simulation to evaluate the expected misclassification error rate given a particular generating model.

The data:
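- X is ten-dimensional and normally distributed with identity covariance: class 0 has mean 0 in every coordinate, class 1 has mean 1 in the first five coordinates and mean 0 in the last five
- y has classes 0 and 1, in equal proportions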

In [30]:
# data generator used for both the training set and the simulated test sets
def generate_ordered_data(nrSamples, generator=None):
    '''
    Simulate a two-class, 10-dimensional Gaussian data set.

    Both classes share an identity covariance matrix. Class 0 is centred at
    the origin; class 1 has mean 1 in the first five coordinates and mean 0
    in the remaining five.

    Parameters
    ----------
    nrSamples : int
        Number of samples drawn for EACH class, so the result has
        2 * nrSamples rows.
    generator : numpy.random.Generator, optional
        Random generator to draw from. When omitted, the module-level
        ``rng`` is used (the original behaviour).

    Returns
    -------
    X : numpy.ndarray of shape (2 * nrSamples, 10)
        The samples, ordered by class: all class-0 rows first, then all
        class-1 rows.
    y : numpy.ndarray of shape (2 * nrSamples,)
        Class labels (0 or 1) matching the rows of X.
    '''
    # Only touch the global generator when the caller did not supply one.
    gen = rng if generator is None else generator

    n_features = 10
    cov = np.eye(n_features)

    mean_0 = np.zeros(n_features)
    mean_1 = np.zeros(n_features)
    mean_1[:5] = 1  # the classes differ only in the first five coordinates

    # Class 0 is drawn before class 1; this order determines how the random
    # stream is consumed, so it must not be swapped.
    X_0 = gen.multivariate_normal(mean_0, cov, nrSamples)
    X_1 = gen.multivariate_normal(mean_1, cov, nrSamples)

    X = np.concatenate((X_0, X_1), axis=0)
    y = np.repeat([0, 1], nrSamples)
    return X, y

# Training set: 50 draws per class.
training_X, training_y = generate_ordered_data(50)

# SVC with C=10; no kernel is passed, so the library's default kernel applies.
svm = SVC(C=10)
svm.fit(training_X, training_y)

# Monte Carlo estimate of the expected test error. The model is NOT refit here:
# it is only scored on 500 freshly generated test sets (1000 draws per class each).
error_rates = []
for trial in range(500):
    test_X, test_y = generate_ordered_data(1000)
    error_rates.append(1 - accuracy_score(test_y, svm.predict(test_X)))

# average misclassification rate over all simulated test sets
mean_error = sum(error_rates) / len(error_rates)
print(mean_error)

0.21719499999999983


Now the same with a linear kernel

In [32]:
# Linear-kernel SVC (C=10), trained on the existing training_X / training_y
# (this cell does not generate them).
svm_linear = SVC(kernel='linear', C=10)
svm_linear.fit(training_X, training_y)

# Score the fitted model (no refitting) on 500 newly generated test sets of
# 1000 draws per class and collect the misclassification rate of each.
error_rates = []
for trial in range(500):
    test_X, test_y = generate_ordered_data(1000)
    error_rates.append(1 - accuracy_score(test_y, svm_linear.predict(test_X)))

# average misclassification rate over all simulated test sets
mean_error = sum(error_rates) / len(error_rates)
print(mean_error)

0.15674900000000003


And for LDA (linear discriminant analysis), fitted on the same training data:

In [34]:
# Linear discriminant analysis with default settings, trained on the existing
# training_X / training_y (this cell does not generate them).
lda = LDA()
lda.fit(training_X, training_y)

# Score the fitted model (no refitting) on 500 newly generated test sets of
# 1000 draws per class and collect the misclassification rate of each.
error_rates = []
for trial in range(500):
    test_X, test_y = generate_ordered_data(1000)
    error_rates.append(1 - accuracy_score(test_y, lda.predict(test_X)))

# average misclassification rate over all simulated test sets
mean_error = sum(error_rates) / len(error_rates)
print(mean_error)

0.15401700000000013
