In [32]:
%reload_ext autoreload
%autoreload 2


import numpy as np
from shared.models_store import Store

In [33]:
# Create the model store pointing at ../dist/store.json and load its contents.
# NOTE(review): the meaning of the first argument (None) is defined by
# shared.models_store.Store and is not visible here - confirm before changing it.
store = Store(None, '../dist/store.json')
store.load()

import json 

# English model names (as they appear inside stored model names) mapped to the
# Polish labels used in tables and plot legends.
names_translations = {
    "Conv1d v2": "Konwolucja 1D wariant 2",
    "Decision Tree": "Drzewo decyzyjne",
    "Gaussian Naive Bayes": "Gaussian Naive Bayes",
    "Multi-Layer Perceptron": "Perceptron wielowarstwowy",
    "SVC": "SVC",
    "Quadratic Discriminant Analysis": "Kwadratowa analiza dyskryminacyjna",
    "Logistic Regression": "Regresja logistyczna",
    "Conv1d Deep": "Konwolucja 1D wariant 3",
    "Conv1d": "Konwolucja 1D wariant 1",
    "XGBoost": "XGBoost",
    "Random Forest": "Las losowy",
}


def rename(model):
    """Translate the English model name inside ``model['name']`` in place.

    Keys are applied in reverse lexicographic order so that a longer key such as
    "Conv1d v2" is substituted before its prefix "Conv1d".

    Args:
        model: Mapping with a string stored under ``'name'``.

    Returns:
        The same ``model`` object, with ``model['name']`` updated.
    """
    translated = model['name']
    for english in sorted(names_translations, reverse=True):
        translated = translated.replace(english, names_translations[english])
    model['name'] = translated
    return model

# All model entries held by the loaded store.
configuration = store.configuration

# Translate the display names, leaving out the Ada Boost and Gradient Boosting entries.
models = [
    rename(entry)
    for entry in configuration.values()
    if not ('Ada Boost' in entry['name'] or 'Gradient Boosting' in entry['name'])
]

# Split the remaining models by the dataset recorded in their metadata.
imdb_models = [
    entry for entry in models
    if entry['metadata'].get('Dataset') == 'Internet Movie Database'
]
food_models = [
    entry for entry in models
    if entry['metadata'].get('Dataset') == 'Amazon Fine Food Reviews'
]

In [64]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from pylab import rcParams
from IPython.core.display import display, HTML
# Notebook-wide matplotlib defaults: figure size (width, height) and base font size.
rcParams.update({
    'figure.figsize': (15, 10),
    'font.size': 19,
})

def plot_roc_curve(models, phase):
    """Plot the ROC curves of ``models`` for one phase on a single figure.

    The false/true positive rates are read precomputed from each model; only the
    area under the curve (AUC) is calculated here. Curves are drawn in descending
    AUC order and each legend entry starts with the AUC formatted to three decimals.

    Args:
        models: Iterable of model dicts. Each must provide
            ``model[phase]['roc']['fpr']`` and ``model[phase]['roc']['tpr']``,
            a ``'name'`` string and a ``'metadata'`` dict.
        phase: Key of the phase to plot. ``'train'`` selects the training-data
            title; any other value gets the test-data title.

    Side effects:
        Writes the formatted AUC string to ``model['metadata']['ROCAUC']`` for
        every model, draws on the current pyplot figure and calls ``plt.show()``.
    """
    plt.title('Krzywa ROC dla {}'.format('danych treningowych' if phase == 'train' else 'danych testowych'))
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('Czułość')
    plt.xlabel('1 - swoistość')

    # Compute every AUC exactly once and reuse it for both ordering and labelling.
    scored = [
        (metrics.auc(model[phase]['roc']['fpr'], model[phase]['roc']['tpr']), model)
        for model in models
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # Cycle through three line styles so neighbouring curves stay distinguishable;
    # None leaves the line style at matplotlib's default.
    line_styles = [None, '--', '-.']

    for i, (auc_value, model) in enumerate(scored):
        roc = model[phase]['roc']
        roc_auc = '{:.3f}'.format(auc_value)
        model['metadata']['ROCAUC'] = roc_auc
        # Drop the dataset prefix; a figure only ever shows one dataset's models.
        name = model['name'].replace('Food Reviews ', '').replace('IMDB ', '')
        plt.plot(
            roc['fpr'],
            roc['tpr'],
            label='{} {}'.format(roc_auc, name),
            linewidth=3,
            linestyle=line_styles[i % len(line_styles)],
        )

    plt.legend(loc='lower right')
    plt.show()

    
def _metrics_frame(models, phase, metric_cols):
    """Build a table of ``metric_cols`` for ``phase``, indexed by model name, best MCC first."""
    rows = [
        {
            # Drop the dataset prefix so the train and test tables share index labels.
            'Name': m['name'].replace('Food Reviews ', '').replace('IMDB ', ''),
            **{k: v for k, v in m[phase]['metrics'].items() if k in metric_cols},
        }
        for m in models
    ]
    return pd.DataFrame(rows).sort_values('MCC', ascending=False).set_index('Name')


def show_accuracy_summary(models):
    """Print and display train/test metric summaries for ``models``.

    Prints the column means of the train and test tables, then displays three
    gradient-coloured tables: train metrics, test metrics, and the per-model
    difference ``test - train`` (each sorted by MCC, descending).

    Args:
        models: Iterable of model dicts. Each must provide a ``'name'`` string and
            ``model['train']['metrics']`` / ``model['test']['metrics']`` mappings
            that include at least an ``'MCC'`` entry.
    """
    # 'Name' is deliberately not in this whitelist: a metric key with that name
    # would otherwise overwrite the model label built by the helper.
    metric_cols = ('Accuracy', 'F1-Score', 'MCC', 'ROC AUC')

    train_df = _metrics_frame(models, 'train', metric_cols)
    test_df = _metrics_frame(models, 'test', metric_cols)

    print('Train mean')
    print(train_df.mean())
    print('Test mean')
    print(test_df.mean())

    # Subtraction aligns rows on the 'Name' index, so the differing sort order of
    # the two tables does not matter.
    delta_df = (test_df - train_df).sort_values('MCC', ascending=False)

    # Hand the Styler objects straight to display(): they render themselves as
    # HTML, and Styler.render() no longer exists in pandas >= 2.0.
    for frame in (train_df, test_df, delta_df):
        display(frame.style.background_gradient(cmap='viridis'))
    

In [65]:
# Summary tables for the models whose metadata Dataset is 'Internet Movie Database'.
# The ROC plots are disabled; uncomment the two lines below to draw them.
# plot_roc_curve(imdb_models, 'train')
# plot_roc_curve(imdb_models, 'test')
show_accuracy_summary(imdb_models)

Train mean
Accuracy    0.860607
F1-Score    0.857855
MCC         0.723271
ROC AUC     0.925679
dtype: float64
Test mean
Accuracy    0.801415
F1-Score    0.797785
MCC         0.606106
ROC AUC     0.884101
dtype: float64


Unnamed: 0_level_0,Accuracy,F1-Score,MCC,ROC AUC
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Konwolucja 1D wariant 2,0.95544,0.956606,0.912199,0.986134
Konwolucja 1D wariant 1,0.94964,0.951324,0.901441,0.986669
Las losowy,0.93004,0.928638,0.860744,0.980382
Konwolucja 1D wariant 3,0.92688,0.923219,0.857669,0.978798
Perceptron wielowarstwowy,0.89212,0.894578,0.785094,0.958553
Regresja logistyczna,0.85904,0.859254,0.718083,0.933385
XGBoost,0.84188,0.841747,0.683761,0.921478
Kwadratowa analiza dyskryminacyjna,0.83368,0.818094,0.677379,0.932427
SVC,0.82,0.818877,0.640047,0.898058
Drzewo decyzyjne,0.74828,0.753419,0.496992,0.817846


Unnamed: 0_level_0,Accuracy,F1-Score,MCC,ROC AUC
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Konwolucja 1D wariant 2,0.86156,0.869381,0.728362,0.941052
Konwolucja 1D wariant 1,0.85852,0.86944,0.727288,0.947424
Konwolucja 1D wariant 3,0.85372,0.842037,0.715309,0.942331
Regresja logistyczna,0.84684,0.846391,0.693692,0.924898
Perceptron wielowarstwowy,0.84348,0.846627,0.687539,0.924151
SVC,0.8112,0.808114,0.622722,0.889972
XGBoost,0.81036,0.80933,0.620756,0.893351
Las losowy,0.79288,0.79016,0.585957,0.875874
Kwadratowa analiza dyskryminacyjna,0.72624,0.69781,0.460709,0.827563
Drzewo decyzyjne,0.70816,0.713117,0.416569,0.776151


Unnamed: 0_level_0,Accuracy,F1-Score,MCC,ROC AUC
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gaussian Naive Bayes,-0.00708,-0.0074226,-0.014305,-0.0063979
SVC,-0.0088,-0.0107626,-0.0173251,-0.00808578
Regresja logistyczna,-0.0122,-0.0128625,-0.0243915,-0.00848682
XGBoost,-0.03152,-0.0324167,-0.0630048,-0.0281268
Drzewo decyzyjne,-0.04012,-0.0403014,-0.080423,-0.0416949
Perceptron wielowarstwowy,-0.04864,-0.0479512,-0.0975551,-0.0344024
Konwolucja 1D wariant 3,-0.07316,-0.081182,-0.14236,-0.0364678
Konwolucja 1D wariant 1,-0.09112,-0.0818841,-0.174153,-0.0392451
Konwolucja 1D wariant 2,-0.09388,-0.087225,-0.183837,-0.045082
Kwadratowa analiza dyskryminacyjna,-0.10744,-0.120284,-0.21667,-0.104864


In [66]:
# Summary tables for the models whose metadata Dataset is 'Amazon Fine Food Reviews'.
# The ROC plots are disabled; uncomment the two lines below to draw them.
# plot_roc_curve(food_models, 'train')
# plot_roc_curve(food_models, 'test')
show_accuracy_summary(food_models)

Train mean
Accuracy    0.867150
F1-Score    0.863439
MCC         0.735428
ROC AUC     0.925317
dtype: float64
Test mean
Accuracy    0.840745
F1-Score    0.838200
MCC         0.682977
ROC AUC     0.910535
dtype: float64


Unnamed: 0_level_0,Accuracy,F1-Score,MCC,ROC AUC
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Konwolucja 1D wariant 1,0.994073,0.994069,0.988147,0.99498
Konwolucja 1D wariant 2,0.988075,0.987995,0.97622,0.99351
Las losowy,0.9306,0.92966,0.861473,0.980636
Konwolucja 1D wariant 3,0.926472,0.928611,0.854562,0.97657
Perceptron wielowarstwowy,0.924142,0.9227,0.848823,0.977952
Regresja logistyczna,0.85064,0.849363,0.701349,0.925708
XGBoost,0.825485,0.823155,0.651165,0.905981
SVC,0.82405,0.82264,0.648441,0.904326
Kwadratowa analiza dyskryminacyjna,0.815488,0.804356,0.634965,0.901814
Drzewo decyzyjne,0.739007,0.738327,0.478015,0.814626


Unnamed: 0_level_0,Accuracy,F1-Score,MCC,ROC AUC
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Konwolucja 1D wariant 1,0.9363,0.937099,0.872586,0.978064
Konwolucja 1D wariant 2,0.9319,0.931136,0.864678,0.974078
Konwolucja 1D wariant 3,0.9029,0.907268,0.80776,0.965005
Perceptron wielowarstwowy,0.8825,0.881157,0.765833,0.947929
Las losowy,0.8541,0.853382,0.708614,0.931812
Regresja logistyczna,0.8462,0.846599,0.692512,0.92383
SVC,0.8205,0.81958,0.641406,0.90186
XGBoost,0.8151,0.814339,0.630556,0.899456
Kwadratowa analiza dyskryminacyjna,0.8048,0.795087,0.61421,0.889387
Gaussian Naive Bayes,0.7254,0.705048,0.457527,0.8012


Unnamed: 0_level_0,Accuracy,F1-Score,MCC,ROC AUC
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gaussian Naive Bayes,0.00478332,0.0080908,0.0109834,-0.00118148
SVC,-0.00355,-0.00306013,-0.00703567,-0.00246592
Regresja logistyczna,-0.00444,-0.00276375,-0.00883697,-0.0018781
XGBoost,-0.0103852,-0.00881606,-0.0206096,-0.00652457
Kwadratowa analiza dyskryminacyjna,-0.0106885,-0.00926848,-0.0207549,-0.0124273
Drzewo decyzyjne,-0.0105068,-0.00882652,-0.0209492,-0.0113636
Konwolucja 1D wariant 3,-0.0235719,-0.0213429,-0.0468021,-0.011565
Perceptron wielowarstwowy,-0.0416415,-0.0415433,-0.0829906,-0.0300228
Konwolucja 1D wariant 2,-0.0561753,-0.0568592,-0.111542,-0.0194316
Konwolucja 1D wariant 1,-0.0577734,-0.0569705,-0.11556,-0.0169157
