# Features optimisation (cont.)

This notebook explores optimisation approaches for linear and polynomial kernels for continuous data.

In [1]:
from pathlib import Path

import h5py
import numpy as np
from numba import njit

from hsic_optimization.hsic_tools import generate

## Dataset generation

For testing purposes, we will use only 1 feature with 1,000,000 samples.

In [2]:
# Dataset dimensions: a single feature, which is also the only active one,
# observed over one million samples.
n_features = n_active = 1
n_samples = 1_000_000
# The file name embeds both dimensions of the dataset.
dset_file = Path(f"../data/test_dataset_{n_features}_by_{n_samples}_continous.h5")

In [3]:
# Generate the dataset only when the file is not already on disk, so that
# re-running the notebook does not repeat the generation step.
if not dset_file.exists():
    generate(
        dset_file,
        samples=n_samples,
        feats=n_features,
        active=n_active,
        seed=1234,  # fixed seed passed to the generator
        continuous_features=True,
        continuous_target=True,
    )

In [4]:
# Open the HDF5 file read-only and copy its "X" dataset into a NumPy array.
with h5py.File(dset_file, "r") as fd:
    X = np.array(fd["X"])

## Accelerated linear feature function

Here we'll compare the speed of the linear kernel with and without JIT compilation.

In [5]:
# parameters for kernel generation, as consumed by compute_kernel:
# B is the number of samples per block, M the number of random permutations
B = 20
M = 3

# test feature vector: first row of X divided by its standard deviation
# (assumes X is laid out as (features, samples) — TODO confirm)
x = X[0] / X[0].std()

In [6]:
def compute_kernel(x, kernel, B=0, M=1, discarded=0):
    """Return the flattened, centred and normalised block kernels of ``x``.

    ``x`` is shuffled ``M`` times. Before shuffle ``m`` the NumPy random
    generator is re-seeded with ``m``, and each shuffle permutes the index
    order left by the previous one. Every shuffled vector is cut into
    consecutive slices of ``B`` samples; ``kernel`` is evaluated on each
    slice, the result is multiplied on both sides by the centring matrix
    ``I - 1/B``, divided by its Frobenius norm (plus ``1e-9``) and written,
    flattened, to the output buffer.

    Parameters
    ----------
    x : 1-D array
        Feature vector to process.
    kernel : callable
        Maps a 1-D slice of ``x`` to a square matrix.
    B : int
        Number of samples per block. The centring matrix is ``B x B``, so
        each slice handed to ``kernel`` must yield a ``B x B`` matrix.
    M : int
        Number of shuffles.
    discarded : int
        Block start offsets are drawn from ``range(0, len(x) - discarded, B)``.

    Returns
    -------
    1-D ``float32`` array of length ``len(x) * B * M``; entries not covered
    by a block stay at zero.

    Notes
    -----
    Calls ``np.random.seed`` and therefore resets the random state.
    """
    n_obs = len(x)
    block_entries = B**2  # number of values in one flattened B x B block
    centring = np.eye(B, dtype=np.float32) - np.full((B, B), 1 / B, dtype=np.float32)
    K = np.zeros(n_obs * B * M, dtype=np.float32)

    cursor = 0  # write position of the next block inside K
    order = np.arange(n_obs)
    for rep in range(M):
        np.random.seed(rep)
        # each shuffle starts from the order produced by the previous one
        order = np.random.permutation(order)
        shuffled = x[order]

        for lo in range(0, n_obs - discarded, B):
            hi = min(n_obs, lo + B)

            gram = kernel(shuffled[lo:hi])
            gram = (centring @ gram) @ centring
            # the small constant guards against division by zero
            gram = gram / (np.sqrt(np.sum(gram**2)) + 1e-9)

            K[cursor : cursor + block_entries] = gram.ravel()
            cursor += block_entries

    return K


# Numba-compiled variant of the very same function (GIL released, compiled
# code cached); below it is only called with njit-compiled kernels.
compute_kernel_fast = njit(nogil=True, cache=True)(compute_kernel)

Let's explore multiple implementations of the dot product.
We'll try to leverage the fact that it is an outer product of an input vector with itself, to see if we can make it faster this way.

In [7]:
def kernel_linear_(X_in_1, X_in_2):
    """Linear kernel: the matrix product of ``X_in_1`` transposed and ``X_in_2``."""
    return np.dot(X_in_1.T, X_in_2)


def kernel_linear_0(x):
    """Reference linear kernel of a 1-D vector, computed via ``kernel_linear_``."""
    # kernel_linear_ works on 2-D inputs: view the vector as a single row
    row = x[np.newaxis, :]
    return kernel_linear_(row, row)


@njit(nogil=True, cache=True)
def kernel_linear_1(x):
    """Linear kernel of a 1-D vector, computed with ``np.outer``."""
    return np.outer(x, x)


@njit(nogil=True, cache=True)
def kernel_linear_2(x):
    """Linear kernel of a 1-D vector, computed by broadcasting a column against a row."""
    column = x[:, None]
    return column * x


@njit(nogil=True, cache=True)
def kernel_linear_3(x):
    """Linear kernel of a 1-D vector, computed with explicit loops.

    Each product is computed once for the upper triangle (diagonal included)
    and mirrored into the lower one. The result is always ``float32``.
    """
    size = len(x)
    gram = np.empty((size, size), dtype=np.float32)
    for row in range(size):
        x_row = x[row]
        for col in range(row, size):
            product = x_row * x[col]
            gram[row, col] = product
            gram[col, row] = product
    return gram

In [8]:
%%timeit
# baseline: plain-Python driver with the un-jitted reference kernel
compute_kernel(x, kernel_linear_0, B, M)

4.84 s ± 34.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
# plain-Python driver calling the jitted np.outer kernel
compute_kernel(x, kernel_linear_1, B, M)

4.66 s ± 38.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
# plain-Python driver calling the jitted broadcasting kernel
compute_kernel(x, kernel_linear_2, B, M)

4.7 s ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
# plain-Python driver calling the jitted explicit-loop kernel
compute_kernel(x, kernel_linear_3, B, M)

4.69 s ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


And using the accelerated feature function.

In [12]:
%%timeit
# jitted driver calling the jitted np.outer kernel
compute_kernel_fast(x, kernel_linear_1, B, M)

1.04 s ± 9.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
# jitted driver calling the jitted broadcasting kernel
compute_kernel_fast(x, kernel_linear_2, B, M)

950 ms ± 10.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%timeit
# jitted driver calling the jitted explicit-loop kernel
compute_kernel_fast(x, kernel_linear_3, B, M)

972 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


All versions run at almost the same speed.
We'll check that the accelerated result is close to the one from the original version.

In [15]:
# reference: plain-Python driver with the un-jitted kernel
k_ref = compute_kernel(x, kernel_linear_0, B, M)
# candidate: jitted driver with the jitted broadcasting kernel
k_2 = compute_kernel_fast(x, kernel_linear_2, B, M)
# element-wise comparison with np.allclose's default tolerances;
# the result is displayed as the cell output
np.allclose(k_ref, k_2)

True

## Accelerated polynomial feature function

We'll do the same exercise but for the polynomial kernel with degree = 2.

In [16]:
def kernel_polynomial_(X_in_1, X_in_2, degree):
    """Polynomial kernel: the linear kernel raised element-wise to ``degree``."""
    gram = np.dot(X_in_1.T, X_in_2)
    return gram ** degree


def kernel_quadratic_0(x):
    """Reference degree-2 polynomial kernel of a 1-D vector, via ``kernel_polynomial_``."""
    # kernel_polynomial_ works on 2-D inputs: view the vector as a single row
    row = x[np.newaxis, :]
    return kernel_polynomial_(row, row, 2)


@njit(nogil=True, cache=True)
def kernel_quadratic_1(x):
    """Quadratic kernel: build the linear kernel first, then square every entry."""
    linear = kernel_linear_3(x)
    return linear ** 2


@njit(nogil=True, cache=True)
def kernel_quadratic_2(x):
    """Quadratic kernel: square the vector first, then build its linear kernel."""
    squared = x**2
    return kernel_linear_3(squared)


@njit(nogil=True, cache=True)
def kernel_quadratic_3(x):
    """Quadratic kernel of a 1-D vector, computed with explicit loops.

    Each squared product is computed once for the upper triangle (diagonal
    included) and mirrored into the lower one. The result is always
    ``float32``.
    """
    size = len(x)
    gram = np.empty((size, size), dtype=np.float32)
    for row in range(size):
        x_row = x[row]
        for col in range(row, size):
            squared = (x_row * x[col]) ** 2
            gram[row, col] = squared
            gram[col, row] = squared
    return gram

In [17]:
%%timeit
# baseline: plain-Python driver with the un-jitted reference kernel
compute_kernel(x, kernel_quadratic_0, B, M)

5.09 s ± 31.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit
# plain-Python driver calling the jitted "square the linear kernel" variant
compute_kernel(x, kernel_quadratic_1, B, M)

4.87 s ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit
# plain-Python driver calling the jitted "linear kernel of the squared vector" variant
compute_kernel(x, kernel_quadratic_2, B, M)

4.65 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
# plain-Python driver calling the jitted explicit-loop kernel
compute_kernel(x, kernel_quadratic_3, B, M)

4.51 s ± 32.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


And the accelerated feature versions.

In [21]:
%%timeit
# jitted driver calling the jitted "square the linear kernel" variant
compute_kernel_fast(x, kernel_quadratic_1, B, M)

1.04 s ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit
# jitted driver calling the jitted "linear kernel of the squared vector" variant
compute_kernel_fast(x, kernel_quadratic_2, B, M)

1 s ± 4.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit
# jitted driver calling the jitted explicit-loop kernel
compute_kernel_fast(x, kernel_quadratic_3, B, M)

979 ms ± 9.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# reference: plain-Python driver with the un-jitted kernel
k_ref = compute_kernel(x, kernel_quadratic_0, B, M)
# candidate: jitted driver with the jitted "square the linear kernel" variant
k_1 = compute_kernel_fast(x, kernel_quadratic_1, B, M)
# element-wise comparison with np.allclose's default tolerances;
# the result is displayed as the cell output
np.allclose(k_ref, k_1)

True

## Linear algebra libraries

Let's double check which version of BLAS is used.

In [25]:
# print NumPy's build information, including the BLAS/LAPACK libraries it found
np.show_config()

Build Dependencies:
  blas:
    detection method: pkgconfig
    found: true
    include directory: /usr/local/include
    lib directory: /usr/local/lib
    name: openblas64
    openblas configuration: USE_64BITINT=1 DYNAMIC_ARCH=1 DYNAMIC_OLDER= NO_CBLAS=
      NO_LAPACK= NO_LAPACKE= NO_AFFINITY=1 USE_OPENMP= HASWELL MAX_THREADS=2
    pc file directory: /usr/local/lib/pkgconfig
    version: 0.3.23.dev
  lapack:
    detection method: internal
    found: true
    include directory: unknown
    lib directory: unknown
    name: dep139863411681952
    openblas configuration: unknown
    pc file directory: unknown
    version: 1.26.4
Compilers:
  c:
    args: -fno-strict-aliasing
    commands: cc
    linker: ld.bfd
    linker args: -Wl,--strip-debug, -fno-strict-aliasing
    name: gcc
    version: 10.2.1
  c++:
    commands: c++
    linker: ld.bfd
    linker args: -Wl,--strip-debug
    name: gcc
    version: 10.2.1
  cython:
    commands: cython
    linker: cython
    name: cython
    versio