In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tabulate import tabulate

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler #scalers, robust works the best, needed for KNN

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix

import joblib

In [39]:
df = pd.read_csv("../data/processed/256_20k_14possibilidades_dataset_F1_F9_teste.csv",index_col=[0])
print(df.shape)
print(df.head)

X = df.drop('tipo',axis=1)
y = df['tipo']

(20000, 10)
<bound method NDFrame.head of         K1   K2   K3   K4   K5   K6   K7   K8    K9  \
0     1.00 0.00 0.01 0.08 1.65 0.08 0.01 0.00  0.00   
1     1.56 0.00 0.04 0.11 2.43 0.10 0.01 0.02  0.37   
2     1.00 0.00 0.02 0.08 2.24 0.08 0.01 0.01  1.21   
3     1.03 0.00 0.01 0.08 1.63 0.08 0.01 0.01  0.00   
4     1.02 0.00 0.02 0.09 3.47 0.09 0.01 0.02  3.34   
...    ...  ...  ...  ...  ...  ...  ...  ...   ...   
19995 1.01 0.00 0.01 0.09 7.71 0.09 0.01 0.01 21.18   
19996 1.00 0.00 0.01 0.07 1.53 0.07 0.01 0.01 -0.28   
19997 1.00 0.00 0.01 0.08 1.66 0.08 0.01 0.00  0.00   
19998 1.00 0.00 0.01 0.08 2.32 0.08 0.01 0.00  0.01   
19999 1.01 0.00 0.02 0.09 2.70 0.08 0.01 0.00  2.13   

                                     tipo  
0                                senoidal  
1                      swell_and_harmonic  
2                        sag_and_harmonic  
3                                 flicker  
4      oscillatory_transient_and_harmonic  
...                              

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

scaler = RobustScaler() #handles the presence of outliers
scaler.fit(X_train)

X_train_original = X_train
X_test_original = X_test
X_train_KNN = scaler.transform(X_train)
X_test_KNN = scaler.transform(X_test)

In [41]:
path = "../models/"

xgb_model = joblib.load(path + "xgb_model.pkl")
svm_model = joblib.load(path + "svm_model.pkl")
knn_model = joblib.load(path + "knn_model.pkl")
nb_model = joblib.load(path + "nb_model.pkl")
dt_model = joblib.load(path + "dt_model.pkl")
bdt_model = joblib.load(path + "bdt_model.pkl")
rf_model = joblib.load(path + "rf_model.pkl")
gb_model = joblib.load(path + "gb_model.pkl")

In [42]:
models = []
models.extend([xgb_model, svm_model, knn_model, nb_model, dt_model, bdt_model, rf_model, gb_model])

model_names = ["XGBoost", "SVM", "KNN", "Naive_Bayes", "Decision_Tree", "Bagged_Decision_Tree", "Random_Forest", "Gradient_Boosting_Tree"]
metrics_being_compared = ["Accuracy Score", "Precision Score", "Recall Score", "F1 Score"]

In [43]:
comparative_data = []

for index, model in enumerate(models):
    X_train = X_train_original
    X_test = X_test_original
    if (model_names[index] == "KNN"):
        X_train = X_train_KNN
        X_test = X_test_KNN

    predictions = model.predict(X_test)
    print("")
    print("---------------------------//---------------------------")
    print("")

    print("Model: ", model_names[index])
    print("")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("")
    print("Classification Report:")
    print(classification_report(y_test, predictions))

    accuracy_score_training = model.score(X_train, y_train)
    accuracy_score_validation = model.score(X_test, y_test)
    print("Accuracy score (training):  {0:.3f}".format(accuracy_score_training))
    print("Accuracy score (validation): {0:.3f}".format(accuracy_score_validation))

    M1 = accuracy_score(y_test, predictions)    
    M2 = precision_score(y_test, predictions, average='weighted')
    M3 = recall_score(y_test, predictions, average='weighted')
    M4 = f1_score(y_test, predictions, average='weighted')

    df_compare_values = np.array([M1,M2, M3, M4])
    comparative_data.append(df_compare_values)   


---------------------------//---------------------------

Model:  XGBoost

Confusion Matrix:
[[476   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 364  21   0   0   0   1   0   0   0   0   0   0   0]
 [  0  16 397   0   0   0   1   0   1   0   0   0   0   0]
 [  0   0   0 394   0   0   0   3   0   0   0   0   0   0]
 [  0   1   0   0 431   0   0   0  18   0   0   0   0   0]
 [  0   0   0   0   0 472   0   0   0   0   0   0   0   0]
 [  0   1   1   0   0   0 429   0   0   0   0   0   0   0]
 [  0   0   0   2   0   0   0 421   0   0   0   0   0   0]
 [  0   2   1   0  19   0   0   0 387   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 441   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   2 406   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0 455   0   0]
 [  0   0   0   0   0   0   0   0   1   0   0   0 423   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 413]]

Classification Report:
                                    precision    recall

In [44]:
#creating dataframe
comparative_df = pd.DataFrame(data=comparative_data,columns=metrics_being_compared)
comparative_df['Modelos'] = np.transpose(model_names)
comparative_df = comparative_df.round(3)
column_name="Modelos"
first_column = comparative_df.pop(column_name)
comparative_df.insert(0, column_name, first_column)

In [45]:
print("")
print("---------------------------//---------------------------")
print("")


---------------------------//---------------------------



In [46]:
print(tabulate(comparative_df, headers='keys', tablefmt='psql'))
comparative_df.to_html('comparative_df.html') #export the table

+----+------------------------+------------------+-------------------+----------------+------------+
|    | Modelos                |   Accuracy Score |   Precision Score |   Recall Score |   F1 Score |
|----+------------------------+------------------+-------------------+----------------+------------|
|  0 | XGBoost                |            0.985 |             0.985 |          0.985 |      0.985 |
|  1 | SVM                    |            0.934 |             0.936 |          0.934 |      0.934 |
|  2 | KNN                    |            0.918 |             0.92  |          0.918 |      0.918 |
|  3 | Naive_Bayes            |            0.828 |             0.849 |          0.828 |      0.828 |
|  4 | Decision_Tree          |            0.98  |             0.98  |          0.98  |      0.98  |
|  5 | Bagged_Decision_Tree   |            0.981 |             0.981 |          0.981 |      0.981 |
|  6 | Random_Forest          |            0.988 |             0.988 |          0.988 |    

In [68]:
''' #we can use lazyPredict to see diferent results for diferents classifiers (not the best strategy though)
import sklearn.utils
from lazypredict.Supervised import LazyClassifier

lazy_model = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = lazy_model.fit(X_train, X_test, y_train, y_test)

print(models)
'''

 90%|████████▉ | 26/29 [01:45<00:12,  4.20s/it]



100%|██████████| 29/29 [02:00<00:00,  4.16s/it]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
ExtraTreesClassifier               0.95               0.95    None      0.95   
RandomForestClassifier             0.95               0.95    None      0.95   
LGBMClassifier                     0.95               0.95    None      0.95   
XGBClassifier                      0.95               0.95    None      0.95   
BaggingClassifier                  0.94               0.93    None      0.94   
DecisionTreeClassifier             0.93               0.93    None      0.93   
QuadraticDiscriminantAnalysis      0.92               0.92    None      0.92   
ExtraTreeClassifier                0.91               0.90    None      0.91   
LogisticRegression                 0.85               0.85    None      0.85   
SVC                                0.85               0.84    None      0.85   
LinearSVC                          0.84 


