# Example Notebook for classifier finder

## 1. libraries

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from sam_ml.models import CTest

## 2. data

In [2]:
df = load_iris()
y = pd.Series(df.target)
X = pd.DataFrame(df.data, columns=df.feature_names)
x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.80, random_state=42)

## 3. model

## 3.1. evaluation of the models

### 3.1.1. small dataset crossvalidation

In [3]:
tester = CTest()
tester.eval_models_cv(X ,y , avg="macro", small_data_eval=True)
tester.output_scores_as_pd(sort_by="recall", console_out=False)

Crossvalidation: 100%|██████████| 18/18 [02:35<00:00,  8.63s/it]


Unnamed: 0,accuracy,precision,recall,s_score,l_score,avg train score,avg train time
LinearDiscriminantAnalysis,0.98,0.980125,0.98,0.9904373,1.0,0.98,0:00:00
MLP Classifier,0.973333,0.975309,0.973333,0.9888789,1.0,0.979866,0:00:00
QuadraticDiscriminantAnalysis,0.973333,0.973825,0.973333,0.9894085,1.0,0.980045,0:00:00
LogisticRegression,0.966667,0.966787,0.966667,0.9885291,1.0,0.973468,0:00:00
SupportVectorClassifier (rbf-kernel),0.966667,0.966787,0.966667,0.9885291,1.0,0.972796,0:00:00
KNeighborsClassifier,0.966667,0.966787,0.966667,0.9885291,1.0,0.966935,0:00:00
BaggingClassifier (DTC based),0.96,0.96,0.96,0.9874448,1.0,0.995884,0:00:00
BaggingClassifier (RFC based),0.96,0.96,0.96,0.9874448,1.0,0.985101,0:00:00
LinearSupportVectorClassifier,0.96,0.96,0.96,0.9874448,1.0,0.967562,0:00:00
GradientBoostingMachine,0.953333,0.953448,0.953333,0.9861117,1.0,1.0,0:00:00


### 3.1.2. multiple split crossvalidation

In [4]:
tester = CTest()
tester.eval_models_cv(X, y, avg="macro", small_data_eval=False)
tester.output_scores_as_pd(sort_by="recall", console_out=False)

Crossvalidation: 100%|██████████| 18/18 [00:07<00:00,  2.33it/s]


Unnamed: 0,accuracy,precision,recall,s_score,l_score,avg train score,avg train time
LogisticRegression,0.0,0.0,0.0,4e-06,0.0,0.986667,0:00:00
QuadraticDiscriminantAnalysis,0.0,0.0,0.0,6e-06,0.0,0.99,0:00:00
BaggingClassifier (DTC based),0.0,0.0,0.0,4e-06,0.0,1.0,0:00:00
GaussianProcessClassifier,0.0,0.0,0.0,4e-06,0.0,0.986667,0:00:00
BernoulliNB,0.0,0.0,0.0,6e-06,0.0,0.5,0:00:00
GaussianNB,0.0,0.0,0.0,6e-06,0.0,0.98,0:00:00
ExtraTreesClassifier,0.0,0.0,0.0,6e-06,0.0,1.0,0:00:00
KNeighborsClassifier,0.0,0.0,0.0,4e-06,0.0,0.983333,0:00:00
AdaBoostClassifier (RFC based),0.0,0.0,0.0,4e-06,0.0,1.0,0:00:00
AdaBoostClassifier (DTC based),0.0,0.0,0.0,4e-06,0.0,1.0,0:00:00


### 3.1.3. evaluate on given train-test-split

In [5]:
tester = CTest()
tester.eval_models(x_train, y_train, x_test, y_test, avg="macro")
tester.output_scores_as_pd(sort_by="recall", console_out=False)

Crossvalidation: 100%|██████████| 18/18 [00:01<00:00, 17.06it/s]


Unnamed: 0,accuracy,precision,recall,s_score,l_score,train_score,train_time
LogisticRegression,1.0,1.0,1.0,0.9926,1.0,0.975,0:00:00
GradientBoostingMachine,1.0,1.0,1.0,0.9926,1.0,1.0,0:00:00
BaggingClassifier (DTC based),1.0,1.0,1.0,0.9926,1.0,1.0,0:00:00
GaussianProcessClassifier,1.0,1.0,1.0,0.9926,1.0,0.966667,0:00:00
GaussianNB,1.0,1.0,1.0,0.9926,1.0,0.95,0:00:00
ExtraTreesClassifier,1.0,1.0,1.0,0.9926,1.0,1.0,0:00:00
KNeighborsClassifier,1.0,1.0,1.0,0.9926,1.0,0.966667,0:00:00
AdaBoostClassifier (RFC based),1.0,1.0,1.0,0.9926,1.0,1.0,0:00:00
AdaBoostClassifier (DTC based),1.0,1.0,1.0,0.9926,1.0,0.966667,0:00:00
SupportVectorClassifier (rbf-kernel),1.0,1.0,1.0,0.9926,1.0,0.975,0:00:00


### 3.2. find best model

#### 3.2.1. creating scores in find_best_model method

In [6]:
tester = CTest()
tester.find_best_model(x_train, y_train, x_test, y_test, scoring="recall", avg="macro", rand_search=True)

no scores are already created -> creating scores using 'eval_models()'


Crossvalidation: 100%|██████████| 18/18 [00:01<00:00, 17.10it/s]


                                      accuracy  precision    recall   s_score  \
LogisticRegression                    1.000000   1.000000  1.000000  0.992600   
LinearDiscriminantAnalysis            1.000000   1.000000  1.000000  0.992600   
MLP Classifier                        1.000000   1.000000  1.000000  0.992600   
LinearSupportVectorClassifier         1.000000   1.000000  1.000000  0.992600   
DecisionTreeClassifier                1.000000   1.000000  1.000000  0.992600   
RandomForestClassifier                1.000000   1.000000  1.000000  0.992600   
SupportVectorClassifier (rbf-kernel)  1.000000   1.000000  1.000000  0.992600   
GradientBoostingMachine               1.000000   1.000000  1.000000  0.992600   
AdaBoostClassifier (DTC based)        1.000000   1.000000  1.000000  0.992600   
AdaBoostClassifier (RFC based)        1.000000   1.000000  1.000000  0.992600   
KNeighborsClassifier                  1.000000   1.000000  1.000000  0.992600   
ExtraTreesClassifier        

<sam_ml.models.LogisticRegression.LR at 0x7fa86a7ae970>

#### 3.2.2. creating scores using eval_models_cv

In [7]:
tester = CTest()
tester.eval_models_cv(X ,y , avg="macro", small_data_eval=True, secondary_scoring="recall", pos_label=-1, strength=4)
tester.find_best_model(x_train, y_train, x_test, y_test, scoring="s_score", avg="macro", rand_search=True, secondary_scoring="recall", pos_label=-1, strength=4)

Crossvalidation: 100%|██████████| 18/18 [02:32<00:00,  8.45s/it]


-> using already created scores for the models. Please run 'eval_models()'/'eval_models_cv()' again if something changed with the data

best model type (s_score):  LinearDiscriminantAnalysis  -  0.9762483732195558
starting to hyperparametertune best model type (rand_search = True)...


Best: 0.943683 using {'solver': 'lsqr', 'shrinkage': 'auto'}


... hyperparameter tuning finished

accuracy:  1.0
precision:  1.0
recall:  1.0
s_score:  0.9926004570086354
l_score:  1.0

classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



<sam_ml.models.LinearDiscriminantAnalysis.LDA at 0x7fa86b081070>