In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def get_sens_spec(y,pred):
    tp = np.sum(np.logical_and(y,pred==y))
    tn = np.sum(np.logical_and(1-y,pred==y))
    fp = np.sum(np.logical_and(pred,pred!=y))
    fn = np.sum(np.logical_and(1-pred,pred!=y))
    
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    acc = (tp+tn)/(tp+tn+fp+fn)
    prec = tp/(tp+fp)
    return [sens,spec,acc,prec]

In [3]:
X_raw = pd.read_csv("./Data/OHE_Eth.csv").iloc[:,1:17].to_numpy()
y = pd.read_csv("./Data/OHE_Eth.csv")["OverallPoF"].to_numpy()

In [4]:
from scipy.interpolate import CubicSpline
x_sp = np.arange(0,16)
cs=CubicSpline(x_sp,X_raw,axis=1)
X_in = cs(x_sp,1)

In [5]:
inds = (4, 5, 10, 13)
i1 = [9]

X = X_in[:,inds]
X = np.hstack((X,X_raw[:,i1]))

In [6]:
from sklearn.model_selection import train_test_split
rs = 646
tr,t = train_test_split([a for a in range(y.shape[0])],train_size=0.95,random_state=rs)

In [7]:
def rec_ss(curve):
    fpr = curve[0]
    tpr = curve[1]
    if tpr == 0:
        return 0
    if fpr ==0:
        return 0
    sigm = np.sum([1/fpr,1/tpr])
    return 2/sigm

In [8]:
def compare_cv_scores(cv_results_):
    nsplits = 116#len(tr)
    nparams = len(cv_results_['params'])
    
    senames = ["split"+str(a)+"_test_sens" for a in range(nsplits)]
    spnames = ["split"+str(a)+"_test_spec" for a in range(nsplits)]
    vlnames = ["split"+str(a)+"_test_vals" for a in range(nsplits)]
    
    scores = []
    for a in range(nparams):
        in_score = []
        for b in range(nsplits):
            s1 = int(cv_results_[senames[b]][a])
            s2 = int(cv_results_[spnames[b]][a])
            s3 = int(cv_results_[vlnames[b]][a])
            in_score.append([s1,s2,s3])
        scores.append(in_score)
    scores=np.array(scores)
    
    rss = []
    scs = []
    for a in range(len(scores)):
        ps = scores[a][:,2].astype(bool)
        ns = np.invert(ps)
        sens = np.mean(scores[a][ps][:,0])
        spec = np.mean(scores[a][ns][:,1])
        scs.append([sens,spec])
        c_rss = rec_ss([sens,spec])
        rss.append(c_rss)
    ind = np.argmax(rss)
    return ind

In [9]:
from sklearn.metrics import make_scorer

def spec_func(y_true,y_pred):
    s = np.mean(np.invert(y_true.astype(bool)) & np.invert(y_pred.astype(bool)))
    return s
def sens_func(y_true,y_pred):
    s = np.mean(y_true.astype(bool) & y_pred.astype(bool))
    return s
def vals_func(y_true,y_pred):
    return np.mean(y_true)

spec_scorer = make_scorer(spec_func)
sens_scorer = make_scorer(sens_func)
vals_scorer = make_scorer(vals_func)

multi_score={
    'sens':sens_scorer,
    'spec':spec_scorer,
    'vals':vals_scorer
}

params = {
    "max_depth":[1,2,3,4,5,6],
    "max_features":[1,2,3,4,5],
    "class_weight":['balanced'],
    "max_leaf_nodes":[2,3,4,5,6,7,8]
}

In [10]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore",category=UndefinedMetricWarning)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

dtc = DecisionTreeClassifier()
clf = GridSearchCV(dtc,params,scoring=multi_score,cv=LeaveOneOut(),refit=compare_cv_scores)
clf.fit(X_raw[tr],y[tr])
print(get_sens_spec(y[tr],clf.predict(X_raw[tr])))
print(get_sens_spec(y[t],clf.predict(X_raw[t])))

[0.8055555555555556, 0.7643979057591623, 0.7709251101321586, 0.3918918918918919]
[0.5, 0.8, 0.75, 0.3333333333333333]


In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

dtc = DecisionTreeClassifier()
clf = GridSearchCV(dtc,params,scoring=multi_score,cv=LeaveOneOut(),refit=compare_cv_scores)
clf.fit(X[tr],y[tr])
print(get_sens_spec(y[tr],clf.predict(X[tr])))
print(get_sens_spec(y[t],clf.predict(X[t])))

[0.8611111111111112, 0.7905759162303665, 0.801762114537445, 0.43661971830985913]
[0.5, 0.8, 0.75, 0.3333333333333333]
