In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from functools import partial
from collections import namedtuple, defaultdict
import inspect
import configparser
from prettytable import PrettyTable
import ast
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [42]:
config = configparser.ConfigParser()
config.read('../config.ini')
config_dict = {section: dict(config[section]) for section in config.sections()}

In [48]:
model_parameters = {
    "max_iter": ast.literal_eval(config["CLASSIFICATION"]["maxiterations"]),
    "cv": ast.literal_eval(config["CLASSIFICATION"]["cvfolds"]),
    "tolerance": ast.literal_eval(config["CLASSIFICATION"]["tolerance"]),
    "random_state": ast.literal_eval(config["CLASSIFICATION"]["randomstate"]),
    "verbose": ast.literal_eval(config["CLASSIFICATION"]["verbose"]),
}

model_parameters = defaultdict(lambda: None, model_parameters)

# Simple Classification Models

## Binary Classification

1. `LogisticRegression`: Logistic Regression (aka logit, MaxEnt) classifier.

2. `SGDClassifier`: Linear classifiers (SVM, logistic regression, etc.) with SGD training.

3. `Perceptron`: The perceptron is a simple classification algorithm suitable for large scale learning.

4. `PassiveAggressiveClassifier`: Passive Aggressive Classifier.

5. `RidgeClassifier`: Classifier using Ridge regression.

6. `RidgeClassifierCV`: Ridge classifier with built-in cross-validation.

7. `LinearSVC`: Linear Support Vector Classification.

8. `SVC`: C-Support Vector Classification.

9. `NuSVC`: Nu-Support Vector Classification.

10. `DecisionTreeClassifier`: A decision tree classifier.

11. `RandomForestClassifier`: A random forest classifier.

12. `ExtraTreesClassifier`: An extra-trees classifier.

13. `GradientBoostingClassifier`: Gradient Boosting for classification.

14. `HistGradientBoostingClassifier`: Histogram-based Gradient Boosting Classification Tree.

15. `AdaBoostClassifier`: An AdaBoost classifier.

16. `BaggingClassifier`: A Bagging classifier.

17. `VotingClassifier`: Soft Voting/Majority Rule classifier for unfitted estimators.

18. `StackingClassifier`: Stacking classifier for unfitted estimators.

19. `KNeighborsClassifier`: Classifier implementing the k-nearest neighbors vote.

20. `RadiusNeighborsClassifier`: Classifier implementing a vote among neighbors within a given radius.

21. `MLPClassifier`: Multi-layer Perceptron classifier.

22. `GaussianNB`: Gaussian Naive Bayes (GaussianNB).

23. `BernoulliNB`: Naive Bayes classifier for multivariate Bernoulli models.

24. `ComplementNB`: The Complement Naive Bayes classifier.

25. `MultinomialNB`: Naive Bayes classifier for multinomial models.

Please refer to the [Scikit-learn documentation](https://scikit-learn.org/stable/supervised_learning.html#supervised-learning) for more details on each of these methods.


### Data Prep

In [50]:
from sklearn.datasets import load_breast_cancer

iris_df = load_breast_cancer(as_frame=True).frame

y = iris_df.pop('target').values
X_df = iris_df

scaler = StandardScaler()

for column in X_df.columns:
    X_df[column] = scaler.fit_transform(X_df[column].values.reshape(-1, 1))

In [51]:
train_X, test_X, train_y, test_y = train_test_split(X_df.values, y, test_size=0.3, random_state=42)
test_X, val_X, test_y, val_y = train_test_split(test_X, test_y, test_size=0.5, random_state=42)

### Running Models

In [52]:
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.neural_network import MLPClassifier

In [53]:
def add_appropriate_kwargs(function):
    signature = inspect.signature(function)
    parameters = signature.parameters
    keywords = {k: v.default for k, v in parameters.items() if v.default is not inspect.Parameter.empty}

    for key, _ in keywords.items():
        if key in model_parameters:
            keywords[key] = model_parameters[key]

    return partial(function, **keywords)        

In [62]:
model_and_metrics = namedtuple('model_and_metrics', ['model', 'accuracy', 'log_loss', 'confusion_matrix'])

def train_model(model, train_X, train_y, test_X, test_y):
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    cm = confusion_matrix(test_y, pred_y)
    return model_and_metrics(model, accuracy_score(test_y, pred_y), log_loss(test_y, pred_y), cm)

models = [
    LogisticRegressionCV,
    SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier,
    RidgeClassifierCV,
    LinearSVC,
    SVC,
    NuSVC,
    DecisionTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    KNeighborsClassifier,
    MLPClassifier,
    GaussianNB,
    BernoulliNB,
]

curried_models = [add_appropriate_kwargs(model) for model in models]

In [63]:
all_keywords = set()
def get_kwargs(function):
    signature = inspect.signature(function)
    parameters = signature.parameters
    keywords = {k: v.default for k, v in parameters.items() if v.default is not inspect.Parameter.empty}
    all_keywords.update(keywords.keys())
    return keywords

[get_kwargs(model) for model in models]

[{'Cs': 10,
  'fit_intercept': True,
  'cv': None,
  'dual': False,
  'penalty': 'l2',
  'scoring': None,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'max_iter': 100,
  'class_weight': None,
  'n_jobs': None,
  'verbose': 0,
  'refit': True,
  'intercept_scaling': 1.0,
  'multi_class': 'auto',
  'random_state': None,
  'l1_ratios': None},
 {'loss': 'hinge',
  'penalty': 'l2',
  'alpha': 0.0001,
  'l1_ratio': 0.15,
  'fit_intercept': True,
  'max_iter': 1000,
  'tol': 0.001,
  'shuffle': True,
  'verbose': 0,
  'epsilon': 0.1,
  'n_jobs': None,
  'random_state': None,
  'learning_rate': 'optimal',
  'eta0': 0.0,
  'power_t': 0.5,
  'early_stopping': False,
  'validation_fraction': 0.1,
  'n_iter_no_change': 5,
  'class_weight': None,
  'warm_start': False,
  'average': False},
 {'penalty': None,
  'alpha': 0.0001,
  'l1_ratio': 0.15,
  'fit_intercept': True,
  'max_iter': 1000,
  'tol': 0.001,
  'shuffle': True,
  'verbose': 0,
  'eta0': 1.0,
  'n_jobs': None,
  'random_state': 0,
  'early_

In [64]:
all_keywords

{'C',
 'Cs',
 'activation',
 'algorithm',
 'alpha',
 'alphas',
 'average',
 'batch_size',
 'beta_1',
 'beta_2',
 'binarize',
 'bootstrap',
 'break_ties',
 'cache_size',
 'categorical_features',
 'ccp_alpha',
 'class_prior',
 'class_weight',
 'coef0',
 'criterion',
 'cv',
 'decision_function_shape',
 'degree',
 'dual',
 'early_stopping',
 'epsilon',
 'eta0',
 'fit_intercept',
 'fit_prior',
 'force_alpha',
 'gamma',
 'hidden_layer_sizes',
 'init',
 'interaction_cst',
 'intercept_scaling',
 'kernel',
 'l1_ratio',
 'l1_ratios',
 'l2_regularization',
 'leaf_size',
 'learning_rate',
 'learning_rate_init',
 'loss',
 'max_bins',
 'max_depth',
 'max_features',
 'max_fun',
 'max_iter',
 'max_leaf_nodes',
 'max_samples',
 'metric',
 'metric_params',
 'min_impurity_decrease',
 'min_samples_leaf',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'momentum',
 'monotonic_cst',
 'multi_class',
 'n_estimators',
 'n_iter_no_change',
 'n_jobs',
 'n_neighbors',
 'nesterovs_momentum',
 'nu',
 'oob_score

In [68]:
curried_models[0].func.__name__

'LogisticRegressionCV'

In [69]:
pbar = tqdm(total=len(curried_models))
all_models = []
for curried_model in curried_models:
    pbar.set_description(f"Processing {curried_model.func.__name__}")
    all_models.append(train_model(curried_model(), train_X, train_y, test_X, test_y))
    pbar.update(1)
pbar.close()

  0%|          | 0/17 [00:41<?, ?it/s]
Processing BernoulliNB: 100%|██████████| 17/17 [00:20<00:00,  1.22s/it]


In [71]:
#  all_models = [train_model(model(), train_X, train_y, test_X, test_y) for model in curried_models]

In [23]:
binary_classification_table = PrettyTable()
binary_classification_table.field_names = ["Model", "Accuracy", "Log Loss"]
for curried_model in all_models:
    binary_classification_table.add_row([curried_model.model.__class__.__name__, curried_model.accuracy, curried_model.log_loss])

print(binary_classification_table)

+--------------------------------+--------------------+--------------------+
|             Model              |      Accuracy      |      Log Loss      |
+--------------------------------+--------------------+--------------------+
|      LogisticRegressionCV      | 0.9882352941176471 | 0.4240429810484373 |
|         SGDClassifier          | 0.9764705882352941 | 0.8480859620968744 |
|           Perceptron           | 0.9294117647058824 | 2.5442578862906227 |
|  PassiveAggressiveClassifier   | 0.9764705882352941 | 0.8480859620968744 |
|       RidgeClassifierCV        | 0.9647058823529412 | 1.2721289431453116 |
|           LinearSVC            | 0.9764705882352941 | 0.8480859620968744 |
|              SVC               | 0.9529411764705882 | 1.6961719241937483 |
|             NuSVC              | 0.9529411764705882 | 1.6961719241937483 |
|     DecisionTreeClassifier     | 0.9411764705882353 | 2.1202149052421855 |
|     RandomForestClassifier     | 0.9647058823529412 | 1.2721289431453116 |

## Multiclass Classification

### Data Prep

In [31]:
from sklearn.datasets import load_iris

iris_df = load_iris(as_frame=True).frame

iris_df = iris_df.sample(frac=1, random_state=42).reset_index(drop=True)

y = iris_df.pop('target').values
X_df = iris_df

scaler = StandardScaler()

for column in X_df.columns:
    X_df[column] = scaler.fit_transform(X_df[column].values.reshape(-1, 1))

In [72]:
train_X, test_X, train_y, test_y = train_test_split(X_df.values, y, test_size=0.3, random_state=42)
test_X, val_X, test_y, val_y = train_test_split(test_X, test_y, test_size=0.5, random_state=42)

In [75]:
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [76]:
models = [
    LogisticRegressionCV,
    SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier,
    RidgeClassifierCV,
    SVC,
    NuSVC,
    LinearSVC,
    KNeighborsClassifier,
    RadiusNeighborsClassifier,
    NearestCentroid,
    GaussianProcessClassifier,
    DecisionTreeClassifier,
    ExtraTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier,
    GaussianNB,
    MultinomialNB,
    ComplementNB,
    BernoulliNB,
    MLPClassifier,
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis
]

In [77]:
model_and_metrics = namedtuple('model_and_metrics', ['model', 'accuracy', 'log_loss', 'confusion_matrix'])

def train_model(model, train_X, train_y, test_X, test_y):
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    cm = confusion_matrix(test_y, pred_y)
    return model_and_metrics(model, accuracy_score(test_y, pred_y), log_loss(test_y, pred_y), cm)

In [78]:
curried_models = [add_appropriate_kwargs(model) for model in models]