In [1]:
from script.formulation import *
from script.functions import *
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [2]:
# Prepare for paths
filenames = get_selected_files()


def _paths_by_file(kind):
    """Map each selected filename to its path of the given kind ('ds', 'df' or 'log')."""
    return dict(zip(filenames, get_paths(filenames, kind)))


ds_paths = _paths_by_file('ds')
df_paths = _paths_by_file('df')
log_paths = _paths_by_file('log')

# Files whose dataframes are routed to the test set instead of the train set.
test_samples = [
    'i160-314', 'i160-245', 'i160-313', 'i160-242',
    'i160-241', 'i160-244', 'i160-343', 'i160-344',
    'i160-341', 'i160-345', 'i160-342',
]

In [3]:
# Read dataframes
# One dataframe is generated per selected file; the second value returned by
# dataframe_generate is stored per file in `runtimes`.
train_list, test_list = [], []
runtimes = {}
for file in filenames:
    tmp_df, runtime = dataframe_generate(ds_paths[file], log_paths[file])
    runtimes[file] = runtime
    # Files named in test_samples are held out; all others feed the train set.
    bucket = test_list if file in test_samples else train_list
    bucket.append(tmp_df)

# Stack the per-file frames into one train frame and one test frame.
df_train = pd.concat(train_list)
df_test = pd.concat(test_list)

In [4]:
# Save the runtime
# One row per file: the dict key goes to 'Filename', its value to 'Runtime'.
runtime_columns = {
    'Filename' : list(runtimes),
    'Runtime' : list(runtimes.values()),
}
df_runtime = pd.DataFrame(runtime_columns)
df_runtime.to_csv("feature_runtimes.csv")

In [5]:
# Prepare the train, test set for Evaluation 1
# split_x_y is applied to the train frame first, then to the test frame;
# each call yields an (x, y) pair.
(x_train, y_train), (x_test, y_test) = map(split_x_y, (df_train, df_test))

In [6]:
# Candidate classifiers, keyed by display name.
# Every model uses class_weight='balanced' and a pinned random_state so that
# re-running the notebook from a fresh kernel fits the same models.
clfs = {
    # probability=True enables predict_proba, which the threshold sweeps use.
    "Support Vector Machine" : SVC(
        class_weight='balanced', probability=True, random_state=0),
    # random_state added for consistency with the other two estimators;
    # without it the forest differs from run to run.
    "Random Forest" : RandomForestClassifier(
        class_weight='balanced', random_state=0),
    "Logistic Regression" : LogisticRegression(
        class_weight='balanced', random_state=0),
}

In [7]:
# Adjust thresholds for LR classifier
# For each decision threshold, report the false-negative rate and the share of
# test samples predicted as class 0 (printed as "Pruning Rate").
clfs['Logistic Regression'].fit(x_train, y_train)

# The predicted probabilities do not depend on the threshold, so they are
# computed once instead of on every loop iteration.
y_pred_proba = clfs['Logistic Regression'].predict_proba(x_test)

thresholds = np.arange(0,1,0.05)
for threshold in thresholds:
    # Column 1 holds the probability of the second class in classes_
    # (presumably label 1 -- confirm against split_x_y).
    y_pred = (y_pred_proba[:,1] >= threshold).astype('int')
    # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs, so
    # the four-way unpack cannot fail. Assumes y_test is encoded as 0/1, which
    # matches the int 0/1 predictions built above.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    print("Threshold:",np.round(threshold,2), 
        "FN rate:", np.round(fn/(fn+tp), 2),
        "Pruning Rate:", np.round(100*(fn+tn)/len(y_pred),2), '%')

Threshold: 0.0 FN rate: 0.0 Pruning Rate: 0.0 %
Threshold: 0.05 FN rate: 0.0 Pruning Rate: 0.0 %
Threshold: 0.1 FN rate: 0.01 Pruning Rate: 10.53 %
Threshold: 0.15 FN rate: 0.03 Pruning Rate: 57.43 %
Threshold: 0.2 FN rate: 0.04 Pruning Rate: 61.64 %
Threshold: 0.25 FN rate: 0.1 Pruning Rate: 82.11 %
Threshold: 0.3 FN rate: 0.14 Pruning Rate: 85.12 %
Threshold: 0.35 FN rate: 0.19 Pruning Rate: 88.16 %
Threshold: 0.4 FN rate: 0.23 Pruning Rate: 92.65 %
Threshold: 0.45 FN rate: 0.25 Pruning Rate: 93.78 %
Threshold: 0.5 FN rate: 0.3 Pruning Rate: 94.83 %
Threshold: 0.55 FN rate: 0.35 Pruning Rate: 95.54 %
Threshold: 0.6 FN rate: 0.38 Pruning Rate: 96.11 %
Threshold: 0.65 FN rate: 0.43 Pruning Rate: 96.67 %
Threshold: 0.7 FN rate: 0.48 Pruning Rate: 97.26 %
Threshold: 0.75 FN rate: 0.52 Pruning Rate: 97.62 %
Threshold: 0.8 FN rate: 0.56 Pruning Rate: 98.01 %
Threshold: 0.85 FN rate: 0.59 Pruning Rate: 98.31 %
Threshold: 0.9 FN rate: 0.62 Pruning Rate: 98.58 %
Threshold: 0.95 FN rate: 0.72 

In [8]:
# Adjust thresholds for SVM classifier
# For each decision threshold, report the false-negative rate and the share of
# test samples predicted as class 0 (printed as "Pruning Rate").
clfs['Support Vector Machine'].fit(x_train, y_train)

# The predicted probabilities do not depend on the threshold, so they are
# computed once instead of on every loop iteration.
y_pred_proba = clfs['Support Vector Machine'].predict_proba(x_test)

thresholds = np.arange(0,0.01,0.001)
for threshold in thresholds:
    # Column 1 holds the probability of the second class in classes_
    # (presumably label 1 -- confirm against split_x_y).
    y_pred = (y_pred_proba[:,1] >= threshold).astype('int')
    # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs, so
    # the four-way unpack cannot fail. Assumes y_test is encoded as 0/1, which
    # matches the int 0/1 predictions built above.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    # Thresholds step by 0.001, so they are shown with 3 decimals; rounding to
    # 2 would print most rows as 0.0 and make them indistinguishable.
    print("Threshold:",np.round(threshold,3), 
        "FN rate:", np.round(fn/(fn+tp), 2),
        "Pruning Rate:", np.round(100*(fn+tn)/len(y_pred),2), '%')