In [232]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from functools import partial
from collections import namedtuple, defaultdict
import inspect
import configparser
from prettytable import PrettyTable
import ast
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [277]:
config = configparser.ConfigParser()
config.read('../config.ini')

['../config.ini']

In [261]:
binary_classification_model_parameters = {
    "max_iter": ast.literal_eval(config["CLASSIFICATION"]["maxiterations"]),
    "cv": ast.literal_eval(config["CLASSIFICATION"]["cvfolds"]),
    "tolerance": ast.literal_eval(config["CLASSIFICATION"]["tolerance"]),
    "random_state": ast.literal_eval(config["CLASSIFICATION"]["randomstate"]),
    "verbose": ast.literal_eval(config["CLASSIFICATION"]["verbose"]),
}

binary_classification_model_parameters = defaultdict(lambda: None, binary_classification_model_parameters)

# Simple Classification Models

## Binary Classification

1. `LogisticRegression`: Logistic Regression (aka logit, MaxEnt) classifier.

2. `SGDClassifier`: Linear classifiers (SVM, logistic regression, etc.) with SGD training.

3. `Perceptron`: The perceptron is a simple classification algorithm suitable for large scale learning.

4. `PassiveAggressiveClassifier`: Passive Aggressive Classifier.

5. `RidgeClassifier`: Classifier using Ridge regression.

6. `RidgeClassifierCV`: Ridge classifier with built-in cross-validation.

7. `LinearSVC`: Linear Support Vector Classification.

8. `SVC`: C-Support Vector Classification.

9. `NuSVC`: Nu-Support Vector Classification.

10. `DecisionTreeClassifier`: A decision tree classifier.

11. `RandomForestClassifier`: A random forest classifier.

12. `ExtraTreesClassifier`: An extra-trees classifier.

13. `GradientBoostingClassifier`: Gradient Boosting for classification.

14. `HistGradientBoostingClassifier`: Histogram-based Gradient Boosting Classification Tree.

15. `AdaBoostClassifier`: An AdaBoost classifier.

16. `BaggingClassifier`: A Bagging classifier.

17. `VotingClassifier`: Soft Voting/Majority Rule classifier for unfitted estimators.

18. `StackingClassifier`: Stacking classifier for unfitted estimators.

19. `KNeighborsClassifier`: Classifier implementing the k-nearest neighbors vote.

20. `RadiusNeighborsClassifier`: Classifier implementing a vote among neighbors within a given radius.

21. `MLPClassifier`: Multi-layer Perceptron classifier.

22. `GaussianNB`: Gaussian Naive Bayes (GaussianNB).

23. `BernoulliNB`: Naive Bayes classifier for multivariate Bernoulli models.

24. `ComplementNB`: The Complement Naive Bayes classifier.

25. `MultinomialNB`: Naive Bayes classifier for multinomial models.

Please refer to the [Scikit-learn documentation](https://scikit-learn.org/stable/supervised_learning.html#supervised-learning) for more details on each of these methods.


### Data Prep

In [214]:
from sklearn.datasets import load_breast_cancer

iris_df = load_breast_cancer(as_frame=True).frame

y = iris_df.pop('target').values
X_df = iris_df

scaler = StandardScaler()

for column in X_df.columns:
    X_df[column] = scaler.fit_transform(X_df[column].values.reshape(-1, 1))

In [215]:
train_X, test_X, train_y, test_y = train_test_split(X_df.values, y, test_size=0.3, random_state=42)
test_X, val_X, test_y, val_y = train_test_split(test_X, test_y, test_size=0.5, random_state=42)

### Running Models

In [216]:
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.neural_network import MLPClassifier

In [265]:
def add_appropriate_kwargs(function, config_parameters):
    signature = inspect.signature(function)
    parameters = signature.parameters
    keywords = {k: v.default for k, v in parameters.items() if v.default is not inspect.Parameter.empty}

    for key, _ in keywords.items():
        if key in config_parameters:
            keywords[key] = config_parameters[key]

    return partial(function, **keywords)        

In [253]:
model_and_metrics = namedtuple('model_and_metrics', ['model', 'accuracy', 'log_loss', 'confusion_matrix'])

def train_model(model, train_X, train_y, test_X, test_y):
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    cm = confusion_matrix(test_y, pred_y)
    return model_and_metrics(model, accuracy_score(test_y, pred_y), log_loss(test_y, pred_y), cm)

models = [
    LogisticRegressionCV,
    SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier,
    RidgeClassifierCV,
    LinearSVC,
    SVC,
    NuSVC,
    DecisionTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    KNeighborsClassifier,
    MLPClassifier,
    GaussianNB,
    BernoulliNB,
]

curried_models = [add_appropriate_kwargs(model, binary_classification_model_parameters) for model in models]

In [219]:
all_keywords = set()
def get_kwargs(function):
    signature = inspect.signature(function)
    parameters = signature.parameters
    keywords = {k: v.default for k, v in parameters.items() if v.default is not inspect.Parameter.empty}
    all_keywords.update(keywords.keys())
    return keywords

[get_kwargs(model) for model in models]

[{'Cs': 10,
  'fit_intercept': True,
  'cv': None,
  'dual': False,
  'penalty': 'l2',
  'scoring': None,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'max_iter': 100,
  'class_weight': None,
  'n_jobs': None,
  'verbose': 0,
  'refit': True,
  'intercept_scaling': 1.0,
  'multi_class': 'auto',
  'random_state': None,
  'l1_ratios': None},
 {'loss': 'hinge',
  'penalty': 'l2',
  'alpha': 0.0001,
  'l1_ratio': 0.15,
  'fit_intercept': True,
  'max_iter': 1000,
  'tol': 0.001,
  'shuffle': True,
  'verbose': 0,
  'epsilon': 0.1,
  'n_jobs': None,
  'random_state': None,
  'learning_rate': 'optimal',
  'eta0': 0.0,
  'power_t': 0.5,
  'early_stopping': False,
  'validation_fraction': 0.1,
  'n_iter_no_change': 5,
  'class_weight': None,
  'warm_start': False,
  'average': False},
 {'penalty': None,
  'alpha': 0.0001,
  'l1_ratio': 0.15,
  'fit_intercept': True,
  'max_iter': 1000,
  'tol': 0.001,
  'shuffle': True,
  'verbose': 0,
  'eta0': 1.0,
  'n_jobs': None,
  'random_state': 0,
  'early_

In [220]:
all_keywords

{'C',
 'Cs',
 'activation',
 'algorithm',
 'alpha',
 'alphas',
 'average',
 'batch_size',
 'beta_1',
 'beta_2',
 'binarize',
 'bootstrap',
 'break_ties',
 'cache_size',
 'categorical_features',
 'ccp_alpha',
 'class_prior',
 'class_weight',
 'coef0',
 'criterion',
 'cv',
 'decision_function_shape',
 'degree',
 'dual',
 'early_stopping',
 'epsilon',
 'eta0',
 'fit_intercept',
 'fit_prior',
 'force_alpha',
 'gamma',
 'hidden_layer_sizes',
 'init',
 'interaction_cst',
 'intercept_scaling',
 'kernel',
 'l1_ratio',
 'l1_ratios',
 'l2_regularization',
 'leaf_size',
 'learning_rate',
 'learning_rate_init',
 'loss',
 'max_bins',
 'max_depth',
 'max_features',
 'max_fun',
 'max_iter',
 'max_leaf_nodes',
 'max_samples',
 'metric',
 'metric_params',
 'min_impurity_decrease',
 'min_samples_leaf',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'momentum',
 'monotonic_cst',
 'multi_class',
 'n_estimators',
 'n_iter_no_change',
 'n_jobs',
 'n_neighbors',
 'nesterovs_momentum',
 'nu',
 'oob_score

In [222]:
pbar = tqdm(total=len(curried_models))
all_models = []
for curried_model in curried_models:
    pbar.set_description(f"Processing {curried_model.func.__name__}")
    all_models.append(train_model(curried_model(), train_X, train_y, test_X, test_y))
    pbar.update(1)
pbar.close()

Processing BernoulliNB: 100%|██████████| 17/17 [00:08<00:00,  2.02it/s]                   


In [223]:
#  all_models = [train_model(model(), train_X, train_y, test_X, test_y) for model in curried_models]

In [224]:
binary_classification_table = PrettyTable()
binary_classification_table.field_names = ["Model", "Accuracy", "Log Loss"]
for curried_model in all_models:
    binary_classification_table.add_row([curried_model.model.__class__.__name__, curried_model.accuracy, curried_model.log_loss])

print(binary_classification_table)

+--------------------------------+--------------------+--------------------+
|             Model              |      Accuracy      |      Log Loss      |
+--------------------------------+--------------------+--------------------+
|      LogisticRegressionCV      | 0.9882352941176471 | 0.4240429810484373 |
|         SGDClassifier          | 0.9764705882352941 | 0.8480859620968744 |
|           Perceptron           | 0.9294117647058824 | 2.5442578862906227 |
|  PassiveAggressiveClassifier   | 0.9764705882352941 | 0.8480859620968744 |
|       RidgeClassifierCV        | 0.9647058823529412 | 1.2721289431453116 |
|           LinearSVC            | 0.9764705882352941 | 0.8480859620968744 |
|              SVC               | 0.9529411764705882 | 1.6961719241937483 |
|             NuSVC              | 0.9529411764705882 | 1.6961719241937483 |
|     DecisionTreeClassifier     | 0.9411764705882353 | 2.1202149052421855 |
|     RandomForestClassifier     | 0.9647058823529412 | 1.2721289431453116 |

## Multiclass Classification

### Data Prep

In [225]:
from sklearn.datasets import load_iris

iris_df = load_iris(as_frame=True).frame

iris_df = iris_df.sample(frac=1, random_state=42).reset_index(drop=True)

y = iris_df.pop('target').values
X_df = iris_df

scaler = StandardScaler()

for column in X_df.columns:
    X_df[column] = scaler.fit_transform(X_df[column].values.reshape(-1, 1))

In [226]:
train_X, test_X, train_y, test_y = train_test_split(X_df.values, y, test_size=0.3, random_state=42)
test_X, val_X, test_y, val_y = train_test_split(test_X, test_y, test_size=0.5, random_state=42)

In [278]:
multiclass_classification_model_parameters = {
    "max_iter": ast.literal_eval(config["MULTICLASS CLASSIFICATION"]["maxiterations"]),
    "cv": ast.literal_eval(config["MULTICLASS CLASSIFICATION"]["cvfolds"]),
    "tolerance": ast.literal_eval(config["MULTICLASS CLASSIFICATION"]["tolerance"]),
    "random_state": ast.literal_eval(config["MULTICLASS CLASSIFICATION"]["randomstate"]),
    "verbose": ast.literal_eval(config["MULTICLASS CLASSIFICATION"]["verbose"]),
    "multi_class": ast.literal_eval(config["MULTICLASS CLASSIFICATION"]["multiclass"]),
    "labels": ast.literal_eval(config["MULTICLASS CLASSIFICATION"]["labels"]),
}

multiclass_classification_model_parameters = defaultdict(lambda: None, multiclass_classification_model_parameters)

In [279]:
multiclass_classification_model_parameters

defaultdict(<function __main__.<lambda>()>,
            {'max_iter': 10000,
             'cv': 5,
             'tolerance': 0.0001,
             'random_state': 42,
             'verbose': False,
             'multi_class': 'multinomial',
             'labels': [0, 1, 2]})

In [280]:
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [281]:
models = [
    LogisticRegressionCV,
    SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier,
    RidgeClassifierCV,
    SVC,
    NuSVC,
    LinearSVC,
    KNeighborsClassifier,
    RadiusNeighborsClassifier,
    NearestCentroid,
    GaussianProcessClassifier,
    DecisionTreeClassifier,
    ExtraTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier,
    GaussianNB,
    MultinomialNB,
    ComplementNB,
    BernoulliNB,
    MLPClassifier,
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis
]

In [325]:
model_and_metrics = namedtuple('model_and_metrics', ['model', 'accuracy', 'confusion_matrix'])

def train_model(model, train_X, train_y, test_X, test_y):
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    cm = confusion_matrix(y_true=test_y, y_pred=pred_y)
    return model_and_metrics(model, accuracy_score(y_true=test_y, y_pred=pred_y),  cm)

In [326]:
curried_models = [add_appropriate_kwargs(model, multiclass_classification_model_parameters) for model in models]

In [327]:
curried_models[0]

functools.partial(<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>, Cs=10, fit_intercept=True, cv=5, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=10000, class_weight=None, n_jobs=None, verbose=False, refit=True, intercept_scaling=1.0, multi_class='multinomial', random_state=42, l1_ratios=None)

In [328]:
multiclass_classification_models = []
pbar = tqdm(total=len(curried_models))
for curried_model in curried_models:
    pbar.set_description(f"Processing {curried_model.func.__name__}")
    multiclass_classification_models.append(train_model(curried_model(), train_X, train_y, test_X, test_y))
    pbar.update(1)

Processing LogisticRegressionCV:   0%|          | 0/29 [03:17<?, ?it/s]
Processing LinearSVC:  24%|██▍       | 7/29 [00:00<00:02,  8.19it/s]                  

InvalidParameterError: The 'multi_class' parameter of LinearSVC must be a str among {'crammer_singer', 'ovr'}. Got 'multinomial' instead.

Processing LinearSVC:  24%|██▍       | 7/29 [00:12<00:02,  8.19it/s]

In [317]:
lg_model = LogisticRegressionCV(max_iter=1000, cv=5, random_state=42, multi_class='multinomial')

In [318]:
lg_model.fit(train_X, train_y)

In [319]:
y_pred = lg_model.predict(test_X)

In [320]:
accuracy_score(test_y, y_pred)

1.0

In [322]:
help(log_loss)

Help on function log_loss in module sklearn.metrics._classification:

log_loss(y_true, y_pred, *, eps='auto', normalize=True, sample_weight=None, labels=None)
    Log loss, aka logistic loss or cross-entropy loss.
    
    This is the loss function used in (multinomial) logistic regression
    and extensions of it such as neural networks, defined as the negative
    log-likelihood of a logistic model that returns ``y_pred`` probabilities
    for its training data ``y_true``.
    The log loss is only defined for two or more labels.
    For a single sample with true label :math:`y \in \{0,1\}` and
    a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, the log
    loss is:
    
    .. math::
        L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p))
    
    Read more in the :ref:`User Guide <log_loss>`.
    
    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels for n_samples samples.
    
    y_pred : array-like

In [324]:
y_pred

array([2, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 2, 1, 2, 1, 2, 0, 1, 2, 1, 1])

In [323]:
log_loss(test_y, y_pred, labels=[0, 1, 2])

ValueError: The number of classes in labels is different from that in y_pred. Classes found in labels: [0 1 2]

In [251]:
curried_models[0]

functools.partial(<class 'sklearn.linear_model._logistic.LogisticRegressionCV'>, Cs=10, fit_intercept=True, cv=5, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=10000, class_weight=None, n_jobs=None, verbose=False, refit=True, intercept_scaling=1.0, multi_class='auto', random_state=42, l1_ratios=None)

In [250]:
lg_model.predict_proba(test_X)

array([[1.04741578e-06, 2.85604443e-02, 9.71438508e-01],
       [9.96921010e-01, 3.07899024e-03, 2.50170077e-14],
       [9.99935442e-01, 6.45578167e-05, 5.49210809e-18],
       [9.99816331e-01, 1.83669411e-04, 3.10237113e-17],
       [9.96428092e-01, 3.57190798e-03, 7.96396729e-16],
       [9.96586544e-01, 3.41345632e-03, 3.42934519e-15],
       [1.34950844e-03, 9.86331994e-01, 1.23184979e-02],
       [3.77511672e-06, 3.25949964e-02, 9.67401228e-01],
       [3.78253120e-08, 4.68999362e-03, 9.95309969e-01],
       [5.16236747e-03, 9.94810778e-01, 2.68549347e-05],
       [7.78957531e-05, 9.66759050e-01, 3.31630542e-02],
       [9.99176831e-01, 8.23169227e-04, 1.68064591e-15],
       [1.87590274e-12, 8.19217067e-05, 9.99918078e-01],
       [3.75713422e-05, 9.40207134e-01, 5.97552947e-02],
       [1.17997569e-09, 3.74100100e-03, 9.96258998e-01],
       [2.66312305e-03, 9.97112203e-01, 2.24673885e-04],
       [2.45591130e-06, 8.04774113e-02, 9.19520133e-01],
       [9.96264870e-01, 3.73512