In [11]:
# evaluate a gaussian process classifier model on the dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel


import pandas as pd

### Importing data

In [10]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
concrete_compressive_strength = fetch_ucirepo(id=165) 
  
# data (as pandas dataframes) 
X = concrete_compressive_strength.data.features 
y = concrete_compressive_strength.data.targets 
  
# metadata 
print(concrete_compressive_strength.metadata) 
  
# variable information 
print(concrete_compressive_strength.variables) 


{'uci_id': 165, 'name': 'Concrete Compressive Strength', 'repository_url': 'https://archive.ics.uci.edu/dataset/165/concrete+compressive+strength', 'data_url': 'https://archive.ics.uci.edu/static/public/165/data.csv', 'abstract': 'Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. ', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 1030, 'num_features': 8, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Concrete compressive strength'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Sun Feb 11 2024', 'dataset_doi': '10.24432/C5PK67', 'creators': ['I-Cheng Yeh'], 'intro_paper': {'title': 'Modeling of strength of high-performance concrete using artificial neural networks', 'authors': 'I. Yeh', 'published_in': 'Cement and Concrete Res

In [12]:
# Convert features and targets to numeric values
features_df = pd.DataFrame(concrete_compressive_strength.data.features)
targets_df = pd.DataFrame(concrete_compressive_strength.data.targets)

targets_df.head(10)

Unnamed: 0,Concrete compressive strength
0,79.99
1,61.89
2,40.27
3,41.05
4,44.3
5,47.03
6,43.7
7,36.45
8,45.85
9,39.29


In [14]:
# Convert features and targets to numeric values
features_df = pd.DataFrame(concrete_compressive_strength.data.features)
targets_df = pd.DataFrame(concrete_compressive_strength.data.targets)

# Define threshold
threshold = 50  # Define your threshold value here

# Create binary target based on threshold
targets_df = (targets_df > threshold).astype(int)

targets_df.head(10)

Unnamed: 0,Concrete compressive strength
0,1
1,1
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


### EDA

In [16]:
# define dataset
X = features_df.apply(pd.to_numeric, errors='coerce').dropna(axis=1)
y = targets_df.apply(pd.to_numeric, errors='coerce').fillna(0)

print(X.shape, y.shape)

(1030, 8) (1030, 1)


In [17]:
# define model
model = GaussianProcessClassifier()
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean Accuracy: 0.894 (0.023)


In [19]:
# fit model
model.fit(X, y)
# define new data
row = [2.47475454,0.40165523,1.68081787,2.88940715,0.91704519,-2.21290585,-3.139579, 0.91704519]
# make a prediction
yhat = model.predict([row])
# summarize prediction
print('Predicted Class: %d' % yhat)

  y = column_or_1d(y, warn=True)


Predicted Class: 0


  print('Predicted Class: %d' % yhat)


In [20]:
# define grid
grid = dict()
grid['kernel'] = [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]
# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)

In [22]:
# perform the search
results = search.fit(X, y)
# summarize best
print('Best Mean Accuracy: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)
# summarize all
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']
for mean, param in zip(means, params):
    print(">%.3f with: %r" % (mean, param))

  y = column_or_1d(y, warn=True)


Best Mean Accuracy: 0.933
Best Config: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
>0.927 with: {'kernel': 1**2 * RBF(length_scale=1)}
>0.846 with: {'kernel': 1**2 * DotProduct(sigma_0=1)}
>0.933 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
>0.930 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.796 with: {'kernel': 1**2 * WhiteKernel(noise_level=1)}


In [23]:
# fit model
#search.fit(X, y)
# define new data
row = [2.47475454,0.40165523,1.68081787,2.88940715,0.91704519,-2.21290585,-3.139579, 0.91704519]
# make a prediction
yhat = search.predict([row])
# summarize prediction
print('Predicted Class: %d' % yhat)

Predicted Class: 0


  print('Predicted Class: %d' % yhat)


In [41]:
import matplotlib.pyplot as plt
import numpy as np


xx, yy = np.meshgrid(np.linspace(-3, 8, 8), np.linspace(-3, 3, 50))
rng = np.random.RandomState(0)


xx.shape

(50, 8)

In [42]:
# fit the model
plt.figure(figsize=(10, 5))
kernels = [1.0 * RBF(length_scale=1.15), 1.0 * DotProduct(sigma_0=1.0) ** 2]
for i, kernel in enumerate(kernels):
    clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, y)

    # plot the decision function for each datapoint on the grid
    Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel(), )).T)[:, 1]
    Z = Z.reshape(xx.shape)

    plt.subplot(1, 2, i + 1)
    image = plt.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        aspect="auto",
        origin="lower",
        cmap=plt.cm.PuOr_r,
    )
    contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, colors=["k"])
    plt.scatter(X[:, 0], X[:, 1], s=30, c=y, cmap=plt.cm.Paired, edgecolors=(0, 0, 0))
    plt.xticks(())
    plt.yticks(())
    plt.axis([-3, 3, -3, 3])
    plt.colorbar(image)
    plt.title(
        "%s\n Log-Marginal-Likelihood:%.3f"
        % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)),
        fontsize=12,
    )

plt.tight_layout()
plt.show()

  y = column_or_1d(y, warn=True)


ValueError: X has 2 features, but GaussianProcessClassifier is expecting 8 features as input.

<Figure size 1000x500 with 0 Axes>