In [300]:
import sqlite3
import json

import pandas as pd
import numpy as np

In [301]:
con = sqlite3.connect("meter3.db")
cur = con.cursor()

In [302]:
data = []

sql = """
SELECT predictions.ida, predictions.idb, predictions.scores, ground_truth.label
FROM predictions
INNER JOIN ground_truth
ON predictions.ida = ground_truth.ida AND predictions.idb = ground_truth.idb
"""

for sid, tid, scores, label in cur.execute(sql).fetchall():
    data.append(
        {
            "sid": sid,
            "tid": tid,
            "scores": list(json.loads(scores).values()),
            "label_multi": label,
            "label_binary": 0 if label == "nd" else 1,
        }
    )

df = pd.DataFrame(data)
df

Unnamed: 0,sid,tid,scores,label_multi,label_binary
0,162,595,"[0.6269430051813472, 0.32225063938618925, 0.21...",pd,1
1,162,596,"[0.6453900709219859, 0.3155893536121673, 0.173...",nd,0
2,162,597,"[0.6861924686192469, 0.4491682070240296, 0.343...",pd,1
3,162,598,"[0.9633507853403142, 0.8232323232323232, 0.753...",wd,1
4,163,599,"[0.7659574468085106, 0.4342629482071713, 0.230...",pd,1
...,...,...,...,...,...
939,245,867,"[0.6867469879518072, 0.3565217391304348, 0.235...",pd,1
940,245,868,"[0.7619047619047619, 0.36470588235294116, 0.22...",wd,1
941,245,869,"[0.5480769230769231, 0.4218978102189781, 0.339...",pd,1
942,246,870,"[0.5572519083969466, 0.30456852791878175, 0.18...",pd,1


In [303]:
data = []

sql2 = """
SELECT ida, idb, scores, "nd" as label
FROM extra_predictions
"""

for sid, tid, scores, label in cur.execute(sql2).fetchall():
    data.append(
        {
            "sid": sid,
            "tid": tid,
            "scores": list(json.loads(scores).values()),
            "label_multi": label,
            "label_binary": 0 if label == "nd" else 1,
        }
    )

df2 = pd.DataFrame(data)
df2

Unnamed: 0,sid,tid,scores,label_multi,label_binary
0,5,426,"[0.14906832298136646, 0.04403131115459882, 0.0...",nd,0
1,5,445,"[0.16923076923076924, 0.06222222222222222, 0.0...",nd,0
2,5,255,"[0.1574074074074074, 0.05, 0.00992907801418439...",nd,0
3,5,331,"[0.08408163265306122, 0.019245773732119636, 0....",nd,0
4,7,406,"[0.07629164654756157, 0.017488315995778683, 0....",nd,0
...,...,...,...,...,...
3319,439,94,"[0.41223671013039115, 0.159303313508921, 0.038...",nd,0
3320,439,113,"[0.6008403361344538, 0.24205378973105135, 0.06...",nd,0
3321,439,132,"[0.34146341463414637, 0.12367293723225926, 0.0...",nd,0
3322,439,151,"[0.39787485242030696, 0.13925501432664755, 0.0...",nd,0


In [304]:
data = df.append(df2, ignore_index=True)
data

Unnamed: 0,sid,tid,scores,label_multi,label_binary
0,162,595,"[0.6269430051813472, 0.32225063938618925, 0.21...",pd,1
1,162,596,"[0.6453900709219859, 0.3155893536121673, 0.173...",nd,0
2,162,597,"[0.6861924686192469, 0.4491682070240296, 0.343...",pd,1
3,162,598,"[0.9633507853403142, 0.8232323232323232, 0.753...",wd,1
4,163,599,"[0.7659574468085106, 0.4342629482071713, 0.230...",pd,1
...,...,...,...,...,...
4263,439,94,"[0.41223671013039115, 0.159303313508921, 0.038...",nd,0
4264,439,113,"[0.6008403361344538, 0.24205378973105135, 0.06...",nd,0
4265,439,132,"[0.34146341463414637, 0.12367293723225926, 0.0...",nd,0
4266,439,151,"[0.39787485242030696, 0.13925501432664755, 0.0...",nd,0


In [305]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np


In [306]:
svm_params = {
    "loss": ["hinge", "squared_hinge"],
    "C": [.1, 1, 10, 100, 1000],
    "max_iter": [10000]
}

nb_params = {
    "alpha": [0, .0001, .001, .01, .1, 1],
    "fit_prior": [True, False],
    "norm": [True, False],
}

rf_params = {
    "n_estimators": [55, 58, 60, 62, 65, 70],
    "criterion": ["entropy"], #["gini", "entropy"],
    "max_depth": [None], #[10, 25, 50, 100, None],
    "max_features": ["sqrt"],
}

mlp_params = {
    "clf__hidden_layer_sizes": [100, 110, 175, 200, 225, 250, 300, 500],
    "clf__activation": ["relu"],
    "clf__solver" : ["lbfgs"],
    "clf__alpha": [.5, .75, .0, 1, 1.1, 1.25, 1.5, 2], # [6.2, 6.225, 6.25, 6.75, 6.3], 
    "clf__max_iter": [10000],
    "clf__random_state": [1337]
}

ada_params = {
    "n_estimators": [10, 50, 100, 200, 500, 1000],
    "learning_rate": [.0001, .001, .01, .1, 1.0, 10.0],
}

knc_params = {
    "n_neighbors": [1, 5, 10, 20, 50, 100],
    "weights": ["uniform", "distance"],
}

pairs = [
    # (LinearSVC(), svm_params),
    # (ComplementNB(), nb_params),
    (RandomForestClassifier(), rf_params),
    # (Pipeline([("scaler", StandardScaler()), ("clf", MLPClassifier())]), mlp_params),
    # (AdaBoostClassifier(), ada_params),
    # (KNeighborsClassifier(), knc_params),
]

X_train, X_test, y_train, y_test = train_test_split(data.scores, data.label_binary, test_size=0.20, shuffle=True, stratify=data.label_binary, random_state=1337)

gscvs = []
best_models = []

for clf, params in pairs:
    gs = GridSearchCV(clf, params, cv=5, verbose=0, scoring="f1", n_jobs=24)
    gs.fit(list(X_train), list(y_train))
    gscvs.append(gs)
    best_models.append((gs.best_score_, gs.best_estimator_, params))

In [307]:
best_models.sort(key= lambda x: x[0], reverse=True)
best_models

[(0.8717383567650057,
  RandomForestClassifier(criterion='entropy', max_features='sqrt',
                         n_estimators=62),
  {'n_estimators': [55, 58, 60, 62, 65, 70],
   'criterion': ['entropy'],
   'max_depth': [None],
   'max_features': ['sqrt']})]

In [308]:
best_model = best_models[0][1]

best_predicted = best_model.predict(list(X_test))

print("Evalutaion Binary Best Model:\n-------------------------------------------")
print(type(best_model))
print("Precision:", metrics.accuracy_score(y_test, best_predicted))
print("Test Data len:", len(y_test))
print(metrics.classification_report(y_test, best_predicted))
print("Confusion Matrix:\n  n, d")
print(metrics.confusion_matrix(y_test, best_predicted))

Evalutaion Binary Best Model:
-------------------------------------------
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Precision: 0.968384074941452
Test Data len: 854
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       706
           1       0.90      0.93      0.91       148

    accuracy                           0.97       854
   macro avg       0.94      0.95      0.95       854
weighted avg       0.97      0.97      0.97       854

Confusion Matrix:
  n, d
[[690  16]
 [ 11 137]]


In [309]:
for score, model, _ in best_models:
    predicted = model.predict(list(X_test))

    print("Evalutaion Binary Best Model:\n-------------------------------------------")
    print(model)
    print("Score:", score)
    print("Precision:", metrics.accuracy_score(y_test, predicted))
    print("Test Data len:", len(y_test))
    print(metrics.classification_report(y_test, predicted))
    print("Confusion Matrix:\n  n, d")
    print(metrics.confusion_matrix(y_test, predicted))

Evalutaion Binary Best Model:
-------------------------------------------
RandomForestClassifier(criterion='entropy', max_features='sqrt',
                       n_estimators=62)
Score: 0.8717383567650057
Precision: 0.968384074941452
Test Data len: 854
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       706
           1       0.90      0.93      0.91       148

    accuracy                           0.97       854
   macro avg       0.94      0.95      0.95       854
weighted avg       0.97      0.97      0.97       854

Confusion Matrix:
  n, d
[[690  16]
 [ 11 137]]


In [313]:
import pickle

with open("models/meter_best_model_modified_rf.pickle", mode="wb") as f:
    pickle.dump(best_model, f)

In [311]:
best_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 62,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [312]:
raise KeyboardInterrupt

###
# Multiclass
###

svm_params = {
    "loss": ["hinge", "squared_hinge"],
    "C": [.1, 1, 10, 100, 1000],
    "max_iter": [10000]
}

nb_params = {
    "alpha": [0, .0001, .001, .01, .1, 1],
    "fit_prior": [True, False],
    "norm": [True, False],
}

rf_params = {
    "n_estimators": [10, 50, 100, 200, 400, 800, 1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 100, None],
}

mlp_params = {
    "hidden_layer_sizes": [10, 50, 100, 500, 1000],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver" : ["lbfgs", "sgd", "adam"],
    "alpha": [.0001, .001, .01],
    "max_iter": [2000],
    "random_state": [1337]
}

ada_params = {
    "n_estimators": [10, 50, 100, 200, 500, 1000],
    "learning_rate": [.0001, .001, .01, .1, 1.0, 10.0],
}

knc_params = {
    "n_neighbors": [1, 5, 10, 20, 50, 100],
    "weights": ["uniform", "distance"],
}

pairs = [
    (LinearSVC(), svm_params),
    (ComplementNB(), nb_params),
    (RandomForestClassifier(), rf_params),
    (MLPClassifier(), mlp_params),
    (AdaBoostClassifier(), ada_params),
    (KNeighborsClassifier(), knc_params),
]

X_train, X_test, y_train, y_test = train_test_split(data.scores, data.label_multi, test_size=0.30, shuffle=True, stratify=data.label_binary, random_state=1337)

gscvs_multi = []
best_models_multi = []

for clf, params in pairs:
    gs = GridSearchCV(clf, params, cv=5, verbose=1, scoring="f1_macro", n_jobs=22)
    gs.fit(list(X_train), list(y_train))
    gscvs_multi.append(gs)
    best_models_multi.append((gs.best_score_, gs.best_estimator_, params))

KeyboardInterrupt: 

In [None]:
best_models_multi.sort(key= lambda x: x[0], reverse=True)
best_models_multi

[(nan,
  RandomForestClassifier(max_depth=10, n_estimators=10),
  {'n_estimators': [10, 50, 100, 200, 400, 800, 1000],
   'criterion': ['gini', 'entropy'],
   'max_depth': [10, 100, None]}),
 (0.7857167943104744,
  AdaBoostClassifier(learning_rate=0.01, n_estimators=1000),
  {'n_estimators': [10, 50, 100, 200, 500, 1000],
   'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}),
 (0.6890530000229902,
  KNeighborsClassifier(n_neighbors=100, weights='distance'),
  {'n_neighbors': [1, 5, 10, 20, 50, 100],
   'weights': ['uniform', 'distance']}),
 (0.555752497966449,
  MLPClassifier(activation='logistic', hidden_layer_sizes=500, max_iter=2000,
                random_state=1337, solver='lbfgs'),
  {'hidden_layer_sizes': [10, 50, 100, 500, 1000],
   'activation': ['identity', 'logistic', 'tanh', 'relu'],
   'solver': ['lbfgs', 'sgd', 'adam'],
   'alpha': [0.0001, 0.001, 0.01],
   'max_iter': [2000],
   'random_state': [1337]}),
 (0.3669091045605855,
  LinearSVC(C=100, max_iter=10000),
  {

In [None]:
best_model_multi = best_models_multi[0][1]

best_predicted_multi = best_model_multi.predict(list(X_test))

print("Evalutaion Multiclass Best Model:\n-------------------------------------------")
print(type(best_model_multi))
print("Params:", best_models_multi[0][2])
print("Precision:", metrics.accuracy_score(y_test, best_predicted_multi))
print(metrics.classification_report(y_test, best_predicted_multi, labels=['pd', 'wd', 'nd']))
print("Confusion Matrix:\n  'pd', 'wd', 'nd'")
print(metrics.confusion_matrix(y_test, best_predicted_multi))

Evalutaion Multiclass Best Model:
-------------------------------------------
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Params: {'n_estimators': [10, 50, 100, 200, 400, 800, 1000], 'criterion': ['gini', 'entropy'], 'max_depth': [10, 100, None]}
Precision: 0.8896396396396397
              precision    recall  f1-score   support

          pd       0.65      0.62      0.64       130
          wd       0.75      0.72      0.73        92
          nd       0.95      0.97      0.96       666

    accuracy                           0.89       888
   macro avg       0.78      0.77      0.78       888
weighted avg       0.89      0.89      0.89       888

Confusion Matrix:
  'pd', 'wd', 'nd'
[[643  18   5]
 [ 32  81  17]
 [  1  25  66]]


In [None]:
with open("meter_best_model_multi.pickle", mode="wb") as f:
    pickle.dump(best_model_multi, f)