## Reranking an existing model

Can we improve on the existing LR model?

Ideally we'd first know how it's poor, then impose constraints to correct for that.

In [1]:
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.linear_model import LogisticRegression
import maxentropy
import maxentropy.utils as utils

import plotly.io as pio
pio.renderers.default = 'plotly_mimetype'

import plotly.express as px

from sklearn.datasets import load_iris, load_breast_cancer

iris = load_iris()
cancer = load_breast_cancer(as_frame=True)

## First example: find the model with minimum relative entropy to some prior model subject to a non-negativity constraint

In [2]:
cancer['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [3]:
df_cancer = cancer['data']
X_cancer = cancer['data'].values
y_cancer = cancer['target']

### Question: Can we fit a neural network for classification, remove the final softmax layer, and then apply this?

In [None]:
from sklearn.neural_network import MLPClassifier

# MLP training is stochastic (weight initialisation, mini-batch shuffling).
# Fix the seed so the fitted network, and every output derived from it, is
# reproducible under Restart & Run All. The value 7 matches the seed used for
# the wine network later in this notebook.
net = MLPClassifier(hidden_layer_sizes=(100,), random_state=7)

net.fit(X_cancer, y_cancer)

# Training-set accuracy (no held-out split is used in this notebook).
net.score(X_cancer, y_cancer)

In [6]:
net._predict??

[0;31mSignature:[0m [0mnet[0m[0;34m.[0m[0m_predict[0m[0;34m([0m[0mX[0m[0;34m,[0m [0mcheck_input[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0m_predict[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0mcheck_input[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Private predict method with optional input validation"""[0m[0;34m[0m
[0;34m[0m        [0my_pred[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_forward_pass_fast[0m[0;34m([0m[0mX[0m[0;34m,[0m [0mcheck_input[0m[0;34m=[0m[0mcheck_input[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mn_outputs_[0m [0;34m==[0m [0;36m1[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0my_pred[0m [0;34m=[0m [0my_pred[0m[0;34m.[0m[0mravel[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m       

In [7]:
# outputs = net._forward_pass_fast(X_cancer, check_input=True)

In [8]:
import toolz as tz

In [9]:
@tz.curry
def forward_pass(net, X, target=slice(None), apply_output_activation=True):
    """Run a fitted scikit-learn MLP forward by hand and return selected outputs.

    This mirrors the network's own forward pass layer by layer so that the
    final (output-layer) activation can optionally be skipped, giving access
    to the raw pre-activation scores.

    Parameters
    ----------
    net : fitted MLP estimator
        Must expose ``n_layers_``, ``coefs_``, ``intercepts_``, ``activation``
        and ``out_activation_`` (as a fitted ``MLPClassifier`` does).
    X : array-like
        Input rows to propagate through the network.
    target : int or slice, default ``slice(None)``
        Column selector applied to the output layer; the default returns
        every output column.
    apply_output_activation : bool, default True
        If True (the original behaviour), apply ``net.out_activation_`` to the
        last layer. If False, return the raw output-layer scores instead.

    Returns
    -------
    ndarray
        ``activation[:, target]`` for the final layer.
    """
    from sklearn.neural_network._base import ACTIVATIONS
    from sklearn.utils.extmath import safe_sparse_dot

    hidden_activation = ACTIVATIONS[net.activation]
    last_layer = net.n_layers_ - 2

    # Initialize first layer
    activation = X

    # Forward propagate. The matrix product yields a new array each time, so
    # the in-place updates below never modify the caller's X.
    for i in range(net.n_layers_ - 1):
        activation = safe_sparse_dot(activation, net.coefs_[i])
        activation += net.intercepts_[i]
        if i != last_layer:
            # The functions in ACTIVATIONS are used for their in-place effect;
            # their return value is deliberately ignored.
            hidden_activation(activation)

    if apply_output_activation:
        output_activation = ACTIVATIONS[net.out_activation_]
        output_activation(activation)
    return activation[:, target]

In [10]:
forward_pass(net, X_cancer, 0)[:10]

array([1.51656078e-16, 2.32867637e-10, 9.29204775e-08, 6.32877304e-01,
       1.79076208e-02, 2.80405857e-02, 3.44312011e-08, 1.32751855e-03,
       2.30689234e-01, 4.01083480e-03])

## Now: apply it to a multi-class classification problem (n_classes > 2)

In [None]:
from sklearn.datasets import load_wine

wine = load_wine(as_frame=True)

df_wine = wine['data']
X_wine = wine['data'].values
y_wine = wine['target']

In [14]:
# wine['feature_names']

In [15]:
df_wine[:3]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0


In [16]:
X_wine[:3]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
        3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
        1.185e+03]])

In [17]:
model_lr = LogisticRegression(max_iter=5_000)
model_lr.fit(X_wine, y_wine)
model_lr.score(X_wine, y_wine)

0.9943820224719101

In [18]:
net = MLPClassifier(hidden_layer_sizes=(100,), learning_rate_init=0.01, max_iter=1000, random_state=7)
net.fit(X_wine, y_wine)
net.score(X_wine, y_wine)

0.8820224719101124

In [19]:
forward_pass(net, X_wine)[:10]

array([[9.99997359e-01, 1.74069242e-06, 9.00339952e-07],
       [9.99999998e-01, 1.14744691e-09, 4.92073562e-10],
       [1.00000000e+00, 1.53418384e-11, 2.32317785e-11],
       [1.00000000e+00, 3.20726835e-19, 3.52267199e-17],
       [9.35703608e-01, 5.87869811e-02, 5.50941106e-03],
       [1.00000000e+00, 8.85365192e-19, 4.49256502e-17],
       [1.00000000e+00, 1.93530313e-16, 8.48028138e-16],
       [1.00000000e+00, 4.77580015e-12, 8.62673306e-11],
       [9.99999990e-01, 7.44782829e-09, 2.49191611e-09],
       [9.99999958e-01, 9.93821203e-09, 3.16033288e-08]])

In [20]:
net.predict_proba(X_wine)[:10]

array([[9.99997359e-01, 1.74069242e-06, 9.00339952e-07],
       [9.99999998e-01, 1.14744691e-09, 4.92073562e-10],
       [1.00000000e+00, 1.53418384e-11, 2.32317785e-11],
       [1.00000000e+00, 3.20726835e-19, 3.52267199e-17],
       [9.35703608e-01, 5.87869811e-02, 5.50941106e-03],
       [1.00000000e+00, 8.85365192e-19, 4.49256502e-17],
       [1.00000000e+00, 1.93530313e-16, 8.48028138e-16],
       [1.00000000e+00, 4.77580015e-12, 8.62673306e-11],
       [9.99999990e-01, 7.44782829e-09, 2.49191611e-09],
       [9.99999958e-01, 9.93821203e-09, 3.16033288e-08]])

In [21]:
# net.predict_log_proba(X_wine[:10])

In [22]:
# net.predict_proba(X_wine[:10])

#### Now define a sampler

In [23]:
# auxiliary = scipy.stats.uniform(-0.2, 1.2)   # i.e. from -0.2 to 1.0

# sampler = maxentropy.utils.auxiliary_sampler_scipy(auxiliary, n_samples=10_000)

In [24]:
# Build the auxiliary sampler over the wine feature space.
# NOTE(review): bounds_stretched presumably returns per-feature minima and
# maxima of X_wine widened by the factor 10.0 — confirm against maxentropy.utils.
stretched_minima, stretched_maxima = utils.bounds_stretched(X_wine, 10.0)
# scipy.stats.uniform(loc, scale) is uniform on [loc, loc + scale], so this
# distribution spans [stretched_minima, stretched_maxima] in each dimension.
uniform_dist = scipy.stats.uniform(
    stretched_minima, stretched_maxima - stretched_minima
)
# One sampled dimension per wine feature. No random seed is set here, so the
# draws (and everything fitted from them) differ between runs.
sampler = utils.auxiliary_sampler_scipy(
    uniform_dist, n_dims=len(wine["feature_names"]), n_samples=100_000
)

In [25]:
np.mean(next(sampler)[0] < 0)

0.4059430769230769

In [26]:
@tz.curry
def non_neg(column, x):
    """Indicator feature: elementwise ``x[:, column] >= 0``.

    Curried, so ``non_neg(i)`` yields a one-argument feature function that
    tests column ``i`` of a 2-D sample array.
    """
    selected = x[:, column]
    return selected >= 0

In [27]:
# def non_neg(x):
#     return x >= 0

In [28]:
def scalar(x):
    """Constant feature function: ignore ``x`` and return 1.0."""
    constant = 1.0
    return constant

In [29]:
feature_functions = [non_neg(i) for i in range(len(wine['feature_names']))]

In [30]:
from maxentropy.utils import feature_sampler

In [31]:
sampleFgen = feature_sampler(
            feature_functions,
            sampler,
            vectorized=True,
            matrix_format='ndarray',
        )

In [32]:
next(sampleFgen)[0].shape

(100000, 13)

In [33]:
next(sampleFgen)[0].mean()

0.5944407692307693

In [34]:
next(sampleFgen)[1].shape

(100000,)

In [35]:
next(sampleFgen)[2].shape

(100000, 13)

#### The neural network has fit K different models for the K different target classes.

Here we just twiddle the density for the first target class:

In [36]:
outputs = forward_pass(net, X_wine)
outputs[:3]

array([[9.99997359e-01, 1.74069242e-06, 9.00339952e-07],
       [9.99999998e-01, 1.14744691e-09, 4.92073562e-10],
       [1.00000000e+00, 1.53418384e-11, 2.32317785e-11]])

In [37]:
outputs = net.predict_proba(X_wine)
outputs[:3]

array([[9.99997359e-01, 1.74069242e-06, 9.00339952e-07],
       [9.99999998e-01, 1.14744691e-09, 4.92073562e-10],
       [1.00000000e+00, 1.53418384e-11, 2.32317785e-11]])

In [38]:
outputs.mean(axis=0)

array([0.42208937, 0.28603808, 0.29187255])

In [39]:
np.unique(y_wine, return_counts=True)[1]

array([59, 71, 48])

In [40]:
np.bincount(y_wine)

array([59, 71, 48])

In [41]:
centered_outputs = outputs - outputs.mean(axis=0)

In [42]:
np.round(outputs.mean(axis=0), 2)

array([0.42, 0.29, 0.29])

In [43]:
centered_outputs[:3]

array([[ 0.57790799, -0.28603634, -0.29187165],
       [ 0.57791063, -0.28603808, -0.29187255],
       [ 0.57791063, -0.28603808, -0.29187255]])

In [44]:
@tz.curry
def forward_pass_centered(net, target_class, xs):
    """Predicted probability of ``target_class`` for ``xs``, minus a baseline mean.

    The baseline is the mean of column ``target_class`` of the notebook-level
    global ``outputs`` (set in an earlier cell from ``net.predict_proba``), so
    this function only works once that global exists.

    Note that the result is a centered probability, not a log-probability.
    Curried, so ``forward_pass_centered(net, k)`` gives a function of ``xs``.
    """
    class_proba = net.predict_proba(xs)[:, target_class]
    baseline = outputs[:, target_class].mean()
    return class_proba - baseline

The above seems to work, but the logic is wrong: it returns a centered probability, whereas `prior_log_pdf` expects a log-density.

#### Now do the sensible thing.

In [45]:
@tz.curry
def log_p_x_given_k(net, class_probabilities, target_class, xs):
    """
    Log of p(x | k = target_class), up to an additive term that is the same for every class k.

    By Bayes' rule:

        p(x | k) = p(k | x) / p(k) * p(x)

    so:

        log p(x | k) = log p(k | x) - log p(k) + log p(x)

    The log p(x) term does not depend on k, so it is dropped.

    Parameters
    ----------
    net : fitted classifier exposing ``predict_log_proba``
        Supplies log p(k | x).
    class_probabilities : float or array-like
        Prior class probability p(k). Either a scalar (already the prior of
        ``target_class``) or a per-class vector, which is indexed by
        ``target_class``.
    target_class : int or slice
        Column(s) of the classifier output to use.
    xs : array-like
        Points at which to evaluate.

    Returns
    -------
    ndarray
        ``log p(target_class | xs) - log p(target_class)``.
    """
    log_prior = np.log(np.asarray(class_probabilities))
    if log_prior.ndim > 0:
        # A per-class vector was supplied: use only the prior belonging to the
        # requested class. Subtracting the whole vector from the per-sample
        # column would mis-broadcast or raise.
        log_prior = log_prior[target_class]
    return net.predict_log_proba(xs)[:, target_class] - log_prior

In [46]:
np.bincount(y_cancer) / np.bincount(y_cancer).sum()

array([0.37258348, 0.62741652])

In [47]:
from sklearn.utils.validation import check_is_fitted, check_array

In [48]:
from sklearn.utils.estimator_checks import check_estimator

In [50]:
forward_pass_centered(net, 0, X_wine)[:3]

array([0.57790799, 0.57791063, 0.57791063])

In [51]:
target_class = 0

model0 = maxentropy.SamplingMinKLDensity(
    feature_functions,
    sampler,
    prior_log_pdf = forward_pass_centered(net, target_class),
    matrix_format='ndarray',
    vectorized=True
)

In [52]:
np.array([X_wine.mean()])

array([69.13366292])

In [53]:
# X_wine[y_wine==target_class]

In [54]:
X_wine_subset = X_wine[y_wine == target_class]
X_wine_subset.shape

(59, 13)

In [55]:
k = model0.features(X_wine_subset).mean(axis=0)

In [56]:
model0.fit(k)

In [57]:
model0.feature_expectations()

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [58]:
model0.params

array([18.00820299, 18.83243279, 18.81425216, 18.92228727, 19.13200853,
       18.92074615, 18.75665235, 19.06740194, 18.97790287, 18.92145215,
       19.15076487, 19.02495085, 19.24442547])

In [59]:
model0.predict_log_proba(X_wine)[:5]

array([-49.42234311, -49.42234047, -49.42234047, -49.42234047,
       -49.48663686])

#### Very low values! Let's proceed anyway. These will be compared against other very low values (for the other classes).

In [60]:
target_class = 1

model1 = maxentropy.SamplingMinKLDensity(
    feature_functions,
    sampler,
    prior_log_pdf = forward_pass_centered(net, target_class),
    matrix_format='ndarray',
    vectorized=True
)
X_wine_subset = X_wine[y_wine == target_class]
# Target feature expectations for this class, computed with the model being
# fitted rather than model0 (all models share feature_functions, so the values
# match, but each model should not depend on another one existing).
k1 = model1.features(X_wine_subset).mean(axis=0)
model1.fit(k1)

target_class = 2

model2 = maxentropy.SamplingMinKLDensity(
    feature_functions,
    sampler,
    prior_log_pdf = forward_pass_centered(net, target_class),
    matrix_format='ndarray',
    vectorized=True
)
X_wine_subset = X_wine[y_wine == target_class]
# As above: use this class's own model to evaluate its features.
k2 = model2.features(X_wine_subset).mean(axis=0)
model2.fit(k2)

In [61]:
log_scores = np.array([
    model0.predict_log_proba(X_wine),
    model1.predict_log_proba(X_wine),
    model2.predict_log_proba(X_wine)
]).T
log_scores.shape

(178, 3)

In [62]:
from scipy.special import softmax

In [63]:
log_proba = softmax(log_scores, axis=1)

In [64]:
log_proba[:5]

array([[0.36623087, 0.29668391, 0.33708522],
       [0.36623179, 0.29668335, 0.33708486],
       [0.36623179, 0.29668335, 0.33708486],
       [0.36623179, 0.29668335, 0.33708486],
       [0.34445195, 0.3155878 , 0.33996025]])

In [65]:
net.n_outputs_

3

In [66]:
pred = net._label_binarizer.inverse_transform(log_proba)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1,
       1, 1, 2, 1, 2, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 0, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [67]:
from sklearn.metrics import accuracy_score

In [68]:
accuracy_score(y_wine, pred)

0.9325842696629213

### Can we do it just using the neural network's `predict_proba` outputs?

In [69]:
np.sort([4, 1, 2, 3])

array([1, 2, 3, 4])

In [70]:
def thing1(xs):
    """Centered class-0 score for ``xs`` via the curried ``forward_pass_centered``."""
    centered_class0 = forward_pass_centered(net, 0)
    return centered_class0(xs)

In [71]:
thing1(X_wine)[:5]

array([0.57790799, 0.57791063, 0.57791063, 0.57791063, 0.51361424])

In [72]:
def thing2(xs):
    """Centered class-0 score for ``xs``, written out directly.

    Uses the notebook-level globals ``net`` and ``outputs``.
    """
    class0_proba = net.predict_proba(xs)[:, 0]
    class0_mean = outputs[:, 0].mean()
    return class0_proba - class0_mean

In [73]:
thing2(X_wine)[:5]

array([0.57790799, 0.57791063, 0.57791063, 0.57791063, 0.51361424])

#### By hand ...

In [74]:
# Fit one SamplingMinKLDensity per class label, keyed by the label. Each model
# uses the network's centered output for that class as its prior.
models = {}
for target_class in np.sort(np.unique(y_wine)):
    print(f'Target class {target_class}')
    model = maxentropy.SamplingMinKLDensity(
        feature_functions,
        sampler,
        # Doesn't work:
        # NOTE(review): a lambda would close over the loop variable
        # `target_class` by reference, so any evaluation after the loop ends
        # would see the last class for every model — possibly one reason this
        # variant failed; confirm.
        # prior_log_pdf = lambda xs: net.predict_log_proba(xs)[:, target_class] - outputs[:, target_class].mean(),
        # The curried call binds target_class now, so each model keeps its own class.
        prior_log_pdf = forward_pass_centered(net, target_class),
        # prior_log_pdf = lambda xs: net.predict_proba(xs)[:, target_class],
        matrix_format='ndarray',
        vectorized=True
    )
    # Rows of X_wine whose label is the current class.
    X_wine_subset = X_wine[y_wine == target_class]
    # Target feature expectations: mean feature value over this class's rows.
    k = model.features(X_wine_subset).mean(axis=0)
    print(k)
    model.fit(k)
    print(model.params)
    models[target_class] = model

Target class 0
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[16.55447654 17.75266809 17.63298667 17.86341458 17.49765145 17.60300951
 17.63360092 17.77760719 17.72621586 17.90124024 17.35349792 17.62800338
 17.86200984]
Target class 1
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[18.15476455 19.44608149 18.86927603 18.82158421 18.77860779 19.32584373
 19.27428345 19.16393571 19.41111538 19.57281794 19.10686713 19.29184125
 19.10285827]
Target class 2
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[18.24899412 19.30685926 19.23940691 18.87335722 19.09669044 19.23775746
 19.76148338 19.48096982 19.47282365 19.30959839 19.10089376 19.26124504
 19.27892856]


In [75]:
log_scores = np.array([
    model.predict_log_proba(X_wine)
    for model in models.values()
]).T
log_scores.shape

(178, 3)

In [76]:
log_scores[:5]

array([[-49.37621184, -49.40148889, -49.44148178],
       [-49.37620921, -49.40149063, -49.44148268],
       [-49.3762092 , -49.40149064, -49.44148268],
       [-49.3762092 , -49.40149064, -49.44148268],
       [-49.4405056 , -49.34270365, -49.43597326]])

In [77]:
from scipy.special import logsumexp

In [78]:
log_proba = (log_scores.T - logsumexp(log_scores, axis=1)).T
log_proba[:5]

array([[-1.06879009, -1.09406714, -1.13406002],
       [-1.06878749, -1.09406892, -1.13406096],
       [-1.06878749, -1.09406892, -1.13406096],
       [-1.06878749, -1.09406892, -1.13406096],
       [-1.13374999, -1.03594805, -1.12921766]])

In [79]:
np.exp(log_proba)[:5]

array([[0.34342378, 0.33485183, 0.32172439],
       [0.34342467, 0.33485124, 0.32172409],
       [0.34342467, 0.33485124, 0.32172409],
       [0.34342467, 0.33485124, 0.32172409],
       [0.32182415, 0.35488977, 0.32328608]])

In [80]:
proba = softmax(log_scores, axis=1)
proba[:5]

array([[0.34342378, 0.33485183, 0.32172439],
       [0.34342467, 0.33485124, 0.32172409],
       [0.34342467, 0.33485124, 0.32172409],
       [0.34342467, 0.33485124, 0.32172409],
       [0.32182415, 0.35488977, 0.32328608]])

In [81]:
np.argmax(log_proba, axis=1)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [85]:
pred = net._label_binarizer.inverse_transform(log_proba)

In [86]:
pred

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [87]:
accuracy_score(y_wine, pred)

0.9662921348314607

In [89]:
net.score(X_wine, y_wine)

0.8820224719101124

### Using MinKLClassifier

In [90]:
y_freq = np.bincount(y_wine)
y_freq = y_freq / np.sum(y_freq)

In [91]:
clf = maxentropy.MinKLClassifier(
    feature_functions,
    sampler,
    prior_clf=net,
    prior_class_probs=y_freq,
    # prior_log_proba_fn=lambda xs: forward_pass_centered(net, slice(None), xs),
    matrix_format='ndarray',
    vectorized=True
)
clf.fit(X_wine, y_wine)


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log



In [93]:
# check_estimator(clf)

In [None]:
# clf.predict_proba(X_wine)

In [94]:
clf.predict(X_wine)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [95]:
clf.score(X_wine, y_wine)

0.9775280898876404

In [None]:
# NOTE(review): this cell repeats the per-class fitting loop from the
# "By hand ..." section (minus the diagnostic prints) and rebinds `models`,
# `model`, `k`, `X_wine_subset` and `target_class`; consider removing it or
# factoring the loop into a function.
models = {}
for target_class in np.sort(np.unique(y_wine)):
    print(f'Target class {target_class}')
    model = maxentropy.SamplingMinKLDensity(
        feature_functions,
        sampler,
        # Doesn't work:
        # prior_log_pdf = lambda xs: net.predict_log_proba(xs)[:, target_class] - outputs[:, target_class].mean(),
        # The curried call binds target_class at this point in the loop.
        prior_log_pdf = forward_pass_centered(net, target_class),
        # prior_log_pdf = lambda xs: net.predict_proba(xs)[:, target_class],
        matrix_format='ndarray',
        vectorized=True
    )
    # Rows of X_wine whose label is the current class.
    X_wine_subset = X_wine[y_wine == target_class]
    # Target feature expectations: mean feature value over this class's rows.
    k = model.features(X_wine_subset).mean(axis=0)
    model.fit(k)
    models[target_class] = model

### Ideas for improving the usability

##### Current API

In [None]:
BREAK

In [None]:
# NOTE: this example rebinds `non_neg`, `sampler`, `model` and `k`, which
# earlier cells define with different meanings (e.g. the curried, per-column
# `non_neg`). Run it only as a standalone illustration.
def non_neg(x):
    """Elementwise indicator of ``x >= 0``."""
    return x >= 0

# Fit a normal prior to the observed 'mean concavity' values, then freeze the
# distribution so that its `logpdf` can serve as the prior log-density.
prior_model_params = scipy.stats.norm.fit(df_cancer['mean concavity'])
prior_model = scipy.stats.norm(*prior_model_params)

auxiliary = scipy.stats.uniform(-0.2, 1.2)   # i.e. from -0.2 to 1.0

sampler = maxentropy.utils.auxiliary_sampler_scipy(auxiliary, n_samples=10_000)

model = maxentropy.SamplingMinKLDensity(
    [non_neg], sampler, prior_log_pdf = prior_model.logpdf, matrix_format='ndarray',
)

# X_cancer is a plain ndarray and cannot be indexed by column name; take the
# column from the DataFrame instead.
k = model.features(np.array([df_cancer['mean concavity'].mean()]))

model.fit(k)

##### Desired API

In [None]:
model = maxentropy.SamplingMinKLDensity(sampler='uniform', matrix_format='ndarray', sampling_stretch_factor=0.1, n_samples=10_000)

In [None]:
feature_functions = [non_neg] * X_cancer.shape[1]

model.fit(X_cancer, feature_functions=feature_functions)

In [None]:
def non_neg(x):
    """Return the (elementwise) truth value of ``x >= 0``."""
    is_non_negative = x >= 0
    return is_non_negative