In [None]:
# Softmax regression.
# aka multinomial logistic regression.
# Generalization of (binary) logistic regression to multi-class.

# For mutually exclusive labels (or classes). 
# It is not multi-output.

# Builds a collection of linear decision boundaries between classes.
# Always predicts the class with the highest probability (even if that prob is low).

# First,
# Compute prob ( label=k ) for each label k.
# Softmax score for label k = s_k(X) = XT * theta_k

# Second,
# Softmax function aka normalized exponential
#   = exp(s_k(X)) / sum_over_j(exp(s_j(X)))   (j runs over all classes)

# Classifier: yhat = argmax_over_k(softmax(s_k(X)))

# Cost function: cross entropy.
# This is used generally to compare distributions.
# Cost (pred) = -(1/m)sum_over_m(sum_over_k(y_ki*log(prob_ki))).
# Note y_ki is 0 or 1, so it is an indicator variable.
# Note for only 2 classes, cost reduces to log loss i.e. logistic regression.

# Gradient vector DEL_k w.r.t. theta_k = (1/m)sum_over_m((prob_ki - y_ki)*Xi)

In [1]:
# Rebuild the same dataset that the previous notebook used.
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
# Keep only feature columns 2 and 3 as the model inputs
# (presumably petal length and petal width -- confirm via iris['feature_names']).
X = iris['data'][:, [2, 3]]
# Integer class labels; three possible species.
y = iris['target']
# Preview the first five feature rows and their labels.
X[:5], y[:5]

(array([[1.4, 0.2],
        [1.4, 0.2],
        [1.3, 0.2],
        [1.5, 0.2],
        [1.4, 0.2]]),
 array([0, 0, 0, 0, 0]))

In [3]:
from sklearn.linear_model import LogisticRegression

# scikit-learn expresses regularization through C, the inverse of the
# regularization strength, so alpha = 0.1 corresponds to C = 10.
alpha = 0.1
alpha_inverse = 1.0 / alpha
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases (since 1.5) and scheduled for removal. With the
# lbfgs solver and more than two classes, the default already fits a
# multinomial (softmax) model on scikit-learn >= 0.22, so the result is the same.
sm = LogisticRegression(solver="lbfgs", C=alpha_inverse)
sm.fit(X, y)
# A single sample given in the same two feature columns the model was fit on.
sample_data = [5, 2]
# predict() expects a 2-D input, hence the wrapping list; returns the argmax class.
sm.predict([sample_data])

array([2])

In [8]:
# Per-class probabilities for the sample (one row, one column per species):
#   species 0: essentially zero
#   species 1: about 5.7%
#   species 2: about 94%
p = sm.predict_proba([sample_data])
p

array([[6.38014896e-07, 5.74929995e-02, 9.42506362e-01]])

In [9]:
# Sanity check: the class probabilities of the single sample row add up to 1
# (up to floating-point rounding).
sum(p[0, :])

0.9999999999999999