## Now: apply it to a multi-class classification problem (n_classes > 2)

In [None]:
from sklearn.datasets import load_wine

# Load the wine dataset with pandas containers (as_frame=True).
wine = load_wine(as_frame=True)

# Feature table, its raw array view for model fitting, and the class labels.
df_wine = wine['data']
X_wine = df_wine.values
y_wine = wine['target']

In [None]:
# Baseline logistic-regression classifier on the wine data.
# max_iter is set high, presumably so the solver converges on the unscaled features.
model_lr = LogisticRegression(max_iter=5_000)
model_lr.fit(X_wine, y_wine)
# Scored on the same data it was fitted on, so this is training accuracy.
model_lr.score(X_wine, y_wine)

In [None]:
# outputs = net._forward_pass_fast(X_cancer, check_input=True)

### Neat API

In [None]:
# Empirical class frequencies: per-class counts divided by the number of labels.
y_freq = np.bincount(y_wine) / len(y_wine)

In [None]:
y_freq

In [None]:
# Build a MinKLClassifier that uses the fitted logistic regression as its prior classifier
# and the empirical class frequencies as prior class probabilities.
# NOTE: feature_functions and sampler are defined in later cells of this notebook,
# so those cells must have been run before this one.
clf = maxentropy.MinKLClassifier(
    feature_functions,
    sampler,
    prior_clf=model_lr,
    prior_class_probs=y_freq,
    matrix_format='ndarray',
    vectorized=True
)
clf.fit(X_wine, y_wine)

In [None]:
%debug

In [None]:
clf.score(X_wine, y_wine)

In [None]:
clf.predict(X_wine)

In [None]:
clf.models[0].params

In [None]:
# wine['feature_names']

In [None]:
df_wine[:3]

In [None]:
X_wine[:3]

### Old stuff

In [None]:
@tz.curry
def forward_pass(net, X, target=slice(None)):
    """Propagate ``X`` through the layers of ``net`` and return the selected output columns.

    Curried, so it can be partially applied (e.g. ``forward_pass(net)``).

    Parameters
    ----------
    net : object exposing ``n_layers_``, ``coefs_``, ``intercepts_``,
        ``activation`` and ``out_activation_`` (assumed to be a fitted
        scikit-learn MLP -- TODO confirm).
    X : input samples fed to the first layer.
    target : column selector applied to the final activations; the default
        ``slice(None)`` keeps every output column.

    Returns
    -------
    The output-layer activations, indexed as ``[:, target]``.
    """
    from sklearn.neural_network._base import ACTIVATIONS
    from sklearn.utils.extmath import safe_sparse_dot

    hidden_fn = ACTIVATIONS[net.activation]
    n_transitions = net.n_layers_ - 1
    output_layer = n_transitions - 1

    # Start from the raw inputs and apply each affine layer in turn.
    layer_values = X
    for layer in range(n_transitions):
        layer_values = safe_sparse_dot(layer_values, net.coefs_[layer])
        layer_values += net.intercepts_[layer]
        if layer == output_layer:
            # The output layer gets its own activation below.
            continue
        # The activation functions are called for their in-place effect.
        hidden_fn(layer_values)

    # Open question from the author: should the output activation be applied at all?
    output_fn = ACTIVATIONS[net.out_activation_]
    output_fn(layer_values)
    return layer_values[:, target]

In [None]:
forward_pass(net, X_cancer, 0)[:10]

### Question: Can we fit a neural network for classification, remove the final softmax layer, and then apply this?

In [None]:
from sklearn.neural_network import MLPClassifier

# Neural-network classifier with a single hidden layer of 100 units.
net = MLPClassifier(hidden_layer_sizes=(100,))

net.fit(X_cancer, y_cancer)

# Accuracy on the same data the network was fitted on.
net.score(X_cancer, y_cancer)

In [None]:
model_lr = LogisticRegression(max_iter=5_000)
model_lr.fit(X_wine, y_wine)
model_lr.score(X_wine, y_wine)

In [None]:
# Rebind `net` to a network fitted on the wine data; random_state fixes the seed.
net = MLPClassifier(hidden_layer_sizes=(100,), learning_rate_init=0.01, max_iter=1000, random_state=7)
net.fit(X_wine, y_wine)
# Accuracy on the same data the network was fitted on.
net.score(X_wine, y_wine)

In [None]:
forward_pass(net, X_wine)[:10]

In [None]:
net.predict_proba(X_wine)[:10]

In [None]:
# net.predict_log_proba(X_wine[:10])

In [None]:
# net.predict_proba(X_wine[:10])

#### Now define a sampler

In [None]:
# auxiliary = scipy.stats.uniform(-0.2, 1.2)   # i.e. from -0.2 to 1.0

# sampler = maxentropy.utils.auxiliary_sampler_scipy(auxiliary, n_samples=10_000)

In [None]:
# Build a uniform auxiliary sampler over a box around the wine data.
# NOTE(review): bounds_stretched presumably widens each feature's [min, max] range
# by the given factor -- confirm against maxentropy.utils.
stretched_minima, stretched_maxima = utils.bounds_stretched(X_wine, 10.0)
# scipy.stats.uniform takes (loc, scale), so the support is [minima, maxima].
uniform_dist = scipy.stats.uniform(
    stretched_minima, stretched_maxima - stretched_minima
)
# One sampling dimension per wine feature.
sampler = utils.auxiliary_sampler_scipy(
    uniform_dist, n_dims=len(wine["feature_names"]), n_samples=100_000
)

In [None]:
np.mean(next(sampler)[0] < 0)

In [None]:
@tz.curry
def non_neg(column, x):
    """Indicator feature: for each row of ``x``, whether the value in ``column`` is >= 0.

    Curried, so ``non_neg(i)`` yields a one-argument feature function for column ``i``.
    """
    selected = x[:, column]
    return selected >= 0

In [None]:
# def non_neg(x):
#     return x >= 0

In [None]:
def scalar(x):
    """Constant feature: ignore ``x`` and always return 1.0."""
    constant = 1.0
    return constant

In [None]:
feature_functions = [non_neg(i) for i in range(len(wine['feature_names']))]

In [None]:
from maxentropy.utils import feature_sampler

In [None]:
sampleFgen = feature_sampler(
            feature_functions,
            sampler,
            vectorized=True,
            matrix_format='ndarray',
        )

In [None]:
next(sampleFgen)[0].shape

In [None]:
next(sampleFgen)[0].mean()

In [None]:
next(sampleFgen)[1].shape

In [None]:
next(sampleFgen)[2].shape

#### The neural network has fit K different models for the K different target classes.

Here we just twiddle the density for the first target class:

In [None]:
outputs = forward_pass(net, X_wine)
outputs[:3]

In [None]:
outputs = net.predict_proba(X_wine)
outputs[:3]

In [None]:
outputs.mean(axis=0)

In [None]:
np.unique(y_wine, return_counts=True)[1]

In [None]:
np.bincount(y_wine)

In [None]:
centered_outputs = outputs - outputs.mean(axis=0)

In [None]:
np.round(outputs.mean(axis=0), 2)

In [None]:
centered_outputs[:3]

In [None]:
@tz.curry
def forward_pass_centered(net, target_class, xs):
    """Predicted probability of ``target_class`` for ``xs``, minus a fixed baseline.

    The baseline is the mean of the module-level ``outputs`` array over the
    ``target_class`` column. Curried, so ``forward_pass_centered(net, k)`` is a
    function of ``xs`` alone.
    """
    class_proba = net.predict_proba(xs)[:, target_class]
    baseline = outputs[:, target_class].mean()
    return class_proba - baseline

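The above seems to work, but the logic is wrong: it returns a mean-centered class probability, whereas it is passed below as `prior_log_pdf`, which should be a log-density.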

#### Now do the sensible thing.

In [None]:
@tz.curry
def log_p_x_given_k(net, class_probabilities, target_class, xs):
    """
    Calculate log p(x | k = target_class) up to an additive term that does not depend on k.

    By Bayes' rule:

        p(x | k) = p(k | x) * p(x) / p(k)

    so:

        log p(x | k) = log p(k | x) - log p(k) + log p(x)

    The log p(x) term is the same for every class k and is dropped here.

    Parameters
    ----------
    net : classifier exposing ``predict_log_proba``.
    class_probabilities : scalar or array-like
        Prior class probabilities p(k), indexed by class. A scalar is taken
        to be p(k = target_class) already.
    target_class : index of the class (column of ``predict_log_proba``).
    xs : samples passed to ``net.predict_log_proba``.

    Returns
    -------
    ``net.predict_log_proba(xs)[:, target_class] - log p(target_class)``.
    """
    priors = np.asarray(class_probabilities)
    # Subtract only the target class's log-prior: subtracting the whole vector
    # would broadcast it against the per-sample column and give wrong results
    # (or a shape error).
    prior_k = priors if priors.ndim == 0 else priors[target_class]
    return net.predict_log_proba(xs)[:, target_class] - np.log(prior_k)

In [None]:
np.bincount(y_cancer) / np.bincount(y_cancer).sum()

In [None]:
from sklearn.utils.validation import check_is_fitted, check_array

In [None]:
from sklearn.utils.estimator_checks import check_estimator

In [None]:
forward_pass_centered(net, 0, X_wine)[:3]

In [None]:
# Class whose density is modelled by model0.
target_class = 0

model0 = maxentropy.SamplingMinKLDensity(
    feature_functions,
    sampler,
    # Partially applied curried function: what remains is a function of xs only.
    # NOTE(review): this is a mean-centered probability, not a log-density,
    # yet it is passed as prior_log_pdf.
    prior_log_pdf = forward_pass_centered(net, target_class),
    matrix_format='ndarray',
    vectorized=True
)

In [None]:
np.array([X_wine.mean()])

In [None]:
# X_wine[y_wine==target_class]

In [None]:
X_wine_subset = X_wine[y_wine == target_class]
X_wine_subset.shape

In [None]:
k = model0.features(X_wine_subset).mean(axis=0)

In [None]:
model0.fit(k)

In [None]:
model0.feature_expectations()

In [None]:
model0.params

In [None]:
model0.predict_log_proba(X_wine)[:5]

#### Very low values! Let's proceed anyway. These will be compared against other very low values (for the other classes).

In [None]:
# Fit one density model per remaining class, mirroring what was done for class 0.
target_class = 1

model1 = maxentropy.SamplingMinKLDensity(
    feature_functions,
    sampler,
    prior_log_pdf=forward_pass_centered(net, target_class),
    matrix_format='ndarray',
    vectorized=True
)
X_wine_subset = X_wine[y_wine == target_class]
# Mean feature values over this class's samples, computed with the model being
# fitted (previously computed via model0, a copy-paste slip).
k1 = model1.features(X_wine_subset).mean(axis=0)
model1.fit(k1)

target_class = 2

model2 = maxentropy.SamplingMinKLDensity(
    feature_functions,
    sampler,
    prior_log_pdf=forward_pass_centered(net, target_class),
    matrix_format='ndarray',
    vectorized=True
)
X_wine_subset = X_wine[y_wine == target_class]
# Same as above, for class 2.
k2 = model2.features(X_wine_subset).mean(axis=0)
model2.fit(k2)

In [None]:
log_scores = np.array([
    model0.predict_log_proba(X_wine),
    model1.predict_log_proba(X_wine),
    model2.predict_log_proba(X_wine)
]).T
log_scores.shape

In [None]:
from scipy.special import softmax

In [None]:
# NOTE: despite the name, softmax yields normalised probabilities (each row sums to 1),
# not log-probabilities.
log_proba = softmax(log_scores, axis=1)

In [None]:
log_proba[:5]

In [None]:
net.n_outputs_

In [None]:
# Pick the highest-scoring column per row and map it back to its class label
# through the public classes_ attribute rather than the private _label_binarizer.
pred = net.classes_[np.argmax(log_proba, axis=1)]
pred

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_wine, pred)

### Can we do it just using the neural network's `predict_proba` outputs?

In [None]:
np.sort([4, 1, 2, 3])

In [None]:
def thing1(xs):
    """Centered class-0 score for ``xs``, obtained through the curried ``forward_pass_centered``."""
    class0_score = forward_pass_centered(net, 0)
    return class0_score(xs)

In [None]:
thing1(X_wine)[:5]

In [None]:
def thing2(xs):
    """Centered class-0 score for ``xs``, written out directly.

    Uses the module-level ``net`` and ``outputs``: the predicted class-0
    probability minus the mean of column 0 of ``outputs``.
    """
    class0_proba = net.predict_proba(xs)[:, 0]
    class0_mean = outputs[:, 0].mean()
    return class0_proba - class0_mean

In [None]:
thing2(X_wine)[:5]

#### By hand ...

In [None]:
# Fit one SamplingMinKLDensity per class and collect them keyed by class label.
models = {}
# np.unique already returns its values sorted, so no extra np.sort is needed.
for target_class in np.unique(y_wine):
    print(f'Target class {target_class}')
    model = maxentropy.SamplingMinKLDensity(
        feature_functions,
        sampler,
        # Doesn't work (presumably because the lambda looks up target_class when
        # called, after the loop has moved on, rather than binding it here):
        # prior_log_pdf = lambda xs: net.predict_log_proba(xs)[:, target_class] - outputs[:, target_class].mean(),
        # The curried call binds target_class immediately:
        prior_log_pdf=forward_pass_centered(net, target_class),
        # prior_log_pdf = lambda xs: net.predict_proba(xs)[:, target_class],
        matrix_format='ndarray',
        vectorized=True
    )
    X_wine_subset = X_wine[y_wine == target_class]
    # Mean feature values over this class's samples; passed to fit below.
    k = model.features(X_wine_subset).mean(axis=0)
    print(k)
    model.fit(k)
    print(model.params)
    models[target_class] = model

In [None]:
log_scores = np.array([
    model.predict_log_proba(X_wine)
    for model in models.values()
]).T
log_scores.shape

In [None]:
log_scores[:5]

In [None]:
from scipy.special import logsumexp

In [None]:
# Normalise each row in log space by subtracting that row's log-sum-exp.
log_proba = log_scores - logsumexp(log_scores, axis=1, keepdims=True)
log_proba[:5]

In [None]:
np.exp(log_proba)[:5]

In [None]:
proba = softmax(log_scores, axis=1)
proba[:5]

In [None]:
np.argmax(log_proba, axis=1)

In [None]:
# Pick the highest-scoring column per row and map it back to its class label
# through the public classes_ attribute rather than the private _label_binarizer.
pred = net.classes_[np.argmax(log_proba, axis=1)]

In [None]:
pred

In [None]:
accuracy_score(y_wine, pred)

In [None]:
net.score(X_wine, y_wine)

### Using MinKLClassifier

In [None]:
# Empirical class frequencies, used as the prior class probabilities.
y_freq = np.bincount(y_wine) / len(y_wine)

# Same classifier as before, but with the neural network as the prior classifier.
clf = maxentropy.MinKLClassifier(
    feature_functions,
    sampler,
    matrix_format='ndarray',
    vectorized=True,
    prior_clf=net,
    prior_class_probs=y_freq,
    # Alternative left disabled by the author:
    # prior_log_proba_fn=lambda xs: forward_pass_centered(net, slice(None), xs),
)
clf.fit(X_wine, y_wine)

In [None]:
# check_estimator(clf)

In [None]:
# clf.predict_proba(X_wine)

In [None]:
clf.predict(X_wine)

In [None]:
clf.score(X_wine, y_wine)

In [None]:
# Rebuild the per-class density models by hand (same loop as in the "By hand" section,
# without the diagnostic prints of k and the fitted params).
models = {}
for target_class in np.sort(np.unique(y_wine)):
    print(f'Target class {target_class}')
    model = maxentropy.SamplingMinKLDensity(
        feature_functions,
        sampler,
        # Doesn't work:
        # prior_log_pdf = lambda xs: net.predict_log_proba(xs)[:, target_class] - outputs[:, target_class].mean(),
        # Curried call: target_class is bound here, leaving a function of xs only.
        prior_log_pdf = forward_pass_centered(net, target_class),
        # prior_log_pdf = lambda xs: net.predict_proba(xs)[:, target_class],
        matrix_format='ndarray',
        vectorized=True
    )
    # Rows of X_wine belonging to the current class.
    X_wine_subset = X_wine[y_wine == target_class]
    # Mean feature values over this class's samples; passed to fit below.
    k = model.features(X_wine_subset).mean(axis=0)
    model.fit(k)
    models[target_class] = model

### Ideas for improving the usability

##### Current API

In [None]:
BREAK

In [None]:
def non_neg(x):
    """Return whether ``x`` is non-negative (elementwise when ``x`` is an array)."""
    return 0 <= x

# Fit a normal distribution to the feature; fit returns its (loc, scale) parameters.
prior_model_params = scipy.stats.norm.fit(df_cancer['mean concavity'])
# Freeze the fitted distribution so that its logpdf can serve as the prior log-density
# (prior_model was previously used below without ever being defined).
prior_model = scipy.stats.norm(*prior_model_params)

auxiliary = scipy.stats.uniform(-0.2, 1.2)   # i.e. from -0.2 to 1.0

sampler = maxentropy.utils.auxiliary_sampler_scipy(auxiliary, n_samples=10_000)

model = maxentropy.SamplingMinKLDensity(
    [non_neg], sampler, prior_log_pdf=prior_model.logpdf, matrix_format='ndarray',
)

# NOTE(review): the fit above reads this column from df_cancer; confirm that X_cancer
# also supports column-name indexing.
k = model.features(np.array([X_cancer['mean concavity'].mean()]))

model.fit(k)

##### Desired API

In [None]:
model = maxentropy.SamplingMinKLDensity(sampler='uniform', matrix_format='ndarray', sampling_stretch_factor=0.1, n_samples=10_000)

In [None]:
feature_functions = [non_neg] * X_cancer.shape[1]

model.fit(X_cancer, feature_functions=feature_functions)

In [None]:
def non_neg(x):
    """Return whether ``x`` is non-negative (elementwise when ``x`` is an array)."""
    return 0 <= x