This notebook demonstrates multiclass classification with class-probability estimation using logistic regression.

Many other models can also estimate class probabilities, including:
- Random Forest
- Gradient Boosting Machines (GBMs) such as XGBoost and LightGBM
- Neural Networks
- etc

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Load the Iris dataset: feature matrix X and integer class labels y.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, so no test-set statistics leak
# into preprocessing, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# `multi_class` is deprecated since scikit-learn 1.5 (passing it raises a
# FutureWarning and it is scheduled for removal). With the lbfgs solver and
# more than two classes, the default (scikit-learn >= 0.22) already fits a
# multinomial (softmax) model, so omitting the argument leaves the fit unchanged.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_scaled, y_train)

In [4]:
# Estimate class-membership probabilities for the scaled held-out observations.
probabilities = model.predict_proba(X_test_scaled)
# Displayed below: one row per test observation, one column per class.
# NOTE(review): column order presumably follows `model.classes_` — confirm.
probabilities  # Each row corresponds to an observation, each column to a class

array([[1.14571961e-02, 8.75978526e-01, 1.12564278e-01],
       [9.64411302e-01, 3.55882864e-02, 4.11287569e-07],
       [3.77322994e-08, 2.88231142e-03, 9.97117651e-01],
       [1.32093187e-02, 7.59399159e-01, 2.27391523e-01],
       [1.88856076e-03, 7.52135755e-01, 2.45975684e-01],
       [9.32198746e-01, 6.77997236e-02, 1.52998880e-06],
       [8.92318972e-02, 8.78433572e-01, 3.23345305e-02],
       [8.41214553e-05, 6.41588413e-02, 9.35757037e-01],
       [7.39566554e-04, 5.77274330e-01, 4.21986104e-01],
       [3.02277568e-02, 9.25748179e-01, 4.40240642e-02],
       [1.18146351e-03, 2.10259681e-01, 7.88558855e-01],
       [9.49659823e-01, 5.03396873e-02, 4.89734158e-07],
       [9.60143539e-01, 3.98562053e-02, 2.55678334e-07],
       [9.51922881e-01, 4.80766480e-02, 4.71031383e-07],
       [9.89928422e-01, 1.00714866e-02, 9.09160454e-08],
       [1.91579798e-02, 7.24444237e-01, 2.56397783e-01],
       [4.03118891e-05, 3.17741225e-02, 9.68185566e-01],
       [2.66588942e-02, 9.34943