# Testing the GPC on a given toy model


In [103]:
# importing all needed modules
import numpy as np
import scipy as sc

# defining the used toy model
import tensorflow_probability as tfp
tfd = tfp.distributions

bg_dist = tfd.MultivariateNormalDiag(np.linspace(1., -1, 7), np.linspace(1.1, 0.9, 7))
sig_dist = tfd.MultivariateNormalDiag(np.linspace(-1, 1, 7), np.linspace(0.9, 1.1, 7))

def generate_data(n, shuffle=True, add_noise=False):
    # Sample n bg and signal samples
    bg = bg_dist.sample(n).numpy()[..., np.newaxis]
    sig = sig_dist.sample(n).numpy()[..., np.newaxis]
        
    if add_noise:
        bg = bg + np.random.normal(0., 0.1, size=bg.shape)
        sig = sig + np.random.normal(0., 0.1, size=sig.shape)
    
    data = np.append(bg, sig, axis=0)
    labels = np.append(np.zeros(n), np.ones(n))

    if shuffle:
        shuffle = np.random.permutation(2 * n)
        data = (data[shuffle],)
        labels = labels[shuffle]
    else:
        data = (data, )

    return data, labels

In [104]:
# instantiating the gpc estimator
from sklearn.gaussian_process import GaussianProcessClassifier

gpc = GaussianProcessClassifier()
print(gpc)

GaussianProcessClassifier()


In [105]:
# extracting the data
data, labels = generate_data(100, add_noise=True)
print(np.shape(data))
print(np.shape(labels))

X = data[0][:,:,0]
y = labels
print(np.shape(X))
print(np.shape(y))

(1, 200, 7, 1)
(200,)
(200, 7)
(200,)


In [106]:
# fitting the data
gpc.fit(X, y)

GaussianProcessClassifier()

## evaluation method 1: train and test on the entire dataset

In [107]:
# predicting the response values for the observations in X
y_pred = gpc.predict(X)


In [108]:
# train and test on the entire dataset
from sklearn import metrics
print(metrics.accuracy_score(y, y_pred))

0.99


## evaluation method 2: train/test split


In [109]:
# split X and y intro training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [110]:
# train the gpc on the training set
gpc.fit(X_train, y_train)

GaussianProcessClassifier()

In [111]:
# make predictions on the testing set
y_pred = gpc.predict(X_test)

# compare actual response values with predicted response values
print(metrics.accuracy_score(y_test, y_pred))

0.8875


In [112]:
# optimizing accuracy by varying the used kernel
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
# from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

kernels = [1*RBF(), 1*DotProduct(), 1*Matern(), 1*WhiteKernel()]

for i in kernels:
    gpc = GaussianProcessClassifier(kernel=i)
    gpc.fit(X_train, y_train)
    y_pred = gpc.predict(X_test)
    print('Used Kernel:', i, 'Testing accuracy:', metrics.accuracy_score(y_test, y_pred))

Used Kernel: 1**2 * RBF(length_scale=1) Testing accuracy: 0.9
Used Kernel: 1**2 * DotProduct(sigma_0=1) Testing accuracy: 0.8875
Used Kernel: 1**2 * Matern(length_scale=1, nu=1.5) Testing accuracy: 0.9
Used Kernel: 1**2 * WhiteKernel(noise_level=1) Testing accuracy: 0.5125


## using cross validation to reduce the variance of the testing accuracy


In [130]:
from sklearn.model_selection import cross_val_score

for i in kernels:
    gpc = GaussianProcessClassifier(kernel=i)
    print('Used Kernel:', i, 'Cross-validated accuracy:', cross_val_score(gpc, X, y, cv=10, scoring='accuracy').mean())

Used Kernel: 1**2 * RBF(length_scale=1) Cross-validated accuracy: 0.945
Used Kernel: 1**2 * DotProduct(sigma_0=1) Cross-validated accuracy: 0.9399999999999998
Used Kernel: 1**2 * Matern(length_scale=1, nu=1.5) Cross-validated accuracy: 0.945
Used Kernel: 1**2 * WhiteKernel(noise_level=1) Cross-validated accuracy: 0.5


In [131]:
# using the RepeatedStratifiedKFold as the cross validator
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)

for i in kernels:
    gpc = GaussianProcessClassifier(kernel=i)
    print('Used Kernel:', i, 'RepeatedStratifiedKFold accuracy:', cross_val_score(gpc, X, y, cv=cv, scoring='accuracy').mean())


Used Kernel: 1**2 * RBF(length_scale=1) RepeatedStratifiedKFold accuracy: 0.9299999999999999
Used Kernel: 1**2 * DotProduct(sigma_0=1) RepeatedStratifiedKFold accuracy: 0.9366666666666666
Used Kernel: 1**2 * Matern(length_scale=1, nu=1.5) RepeatedStratifiedKFold accuracy: 0.9249999999999997
Used Kernel: 1**2 * WhiteKernel(noise_level=1) RepeatedStratifiedKFold accuracy: 0.5


# Parameter tuning using GridSearchCV: Finding the best kernel


In [118]:
from sklearn.model_selection import GridSearchCV
# create a parameter grid
param_grid = dict(kernel=kernels)

In [122]:
# Instantiate the grid
grid = GridSearchCV(gpc, param_grid, cv=cv, scoring='accuracy')

In [126]:
# Fit the grid with data
grid.fit(X, y)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=None),
             estimator=GaussianProcessClassifier(kernel=1**2 * WhiteKernel(noise_level=1)),
             param_grid={'kernel': [1**2 * RBF(length_scale=1),
                                    1**2 * DotProduct(sigma_0=1),
                                    1**2 * Matern(length_scale=1, nu=1.5),
                                    1**2 * WhiteKernel(noise_level=1)]},
             scoring='accuracy')

In [127]:
# examine the best model
print('Best score:', grid.best_score_)
print('Best kernel:', grid.best_params_)

Best score: 0.9283333333333332
Best kernel: {'kernel': 1**2 * RBF(length_scale=1)}
