In [3]:
import argparse
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import time

In [16]:
def accuracy(yHat, yTest):
    """
    Fraction of predictions that equal the true labels.

    Both inputs are converted to flat 1-d arrays before they are
    compared, so the comparison is always element-by-element by
    position. Without this, two plain lists would be compared as whole
    objects and an (n,) vs (n, 1) pair would broadcast to an n x n grid.

    Parameters
    ----------
    yHat : array-like with n elements
        Predicted labels
    yTest : array-like with n elements
        True labels

    Returns
    -------
    acc : float
        Proportion of positions at which yHat equals yTest

    Raises
    ------
    ValueError
        If yHat and yTest do not hold the same number of elements.
    """
    predicted = np.asarray(yHat).ravel()
    actual = np.asarray(yTest).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "yHat and yTest must have the same number of elements, got "
            "%d and %d" % (predicted.size, actual.size))
    return np.mean(predicted == actual)

In [4]:
 def holdout(model, xFeat, y, testSize):
    """
    Score a model with a single random train/validation split.

    The data is partitioned once with train_test_split, the model is
    fit on the training portion, and the AUC is computed on both
    portions from the predicted probability in column 1 of
    predict_proba.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model (refit in place by this call)
    xFeat : nd-array with shape n x d
        Features of the dataset
    y : 1-array with shape n x 1
        Labels of the dataset
    testSize : float
        Portion of the dataset to hold out for validation; passed
        straight through as test_size.

    Returns
    -------
    trainAuc : float
        AUC of the model on the training portion of the split
    testAuc : float
        AUC of the model on the held-out portion of the split
    timeElapsed: float
        Wall-clock seconds spent inside this function
    """
    tic = time.time()

    # one random partition of the rows
    xFit, xHold, yFit, yHold = train_test_split(xFeat, y, test_size=testSize)
    model.fit(xFit, yFit)

    # column 1 of predict_proba is used as the ranking score for the AUC
    trainAuc = roc_auc_score(yFit, model.predict_proba(xFit)[:, 1])
    testAuc = roc_auc_score(yHold, model.predict_proba(xHold)[:, 1])

    return trainAuc, testAuc, time.time() - tic


def kfold_cv(model, xFeat, y, k):
    """
    Split xFeat into k different groups, and then use each of the
    k-folds as a validation set, with the model fitting on the remaining
    k-1 folds. Return the model performance on the training and
    validation (test) set.

    KFold is constructed with only n_splits=k, so no shuffling is
    requested when the folds are formed.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model (refit in place once per fold)
    xFeat : nd-array or pandas DataFrame with shape n x d
        Features of the dataset
    y : nd-array, pandas Series or DataFrame with n rows
        Labels of the dataset
    k : int
        Number of folds or groups (approximately equal size)

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training folds
    testAuc : float
        Average AUC of the model on the validation folds
    timeElapsed: float
        Time it took to run this function
    """
    start = time.time()

    # Convert once, up front. np.asarray accepts pandas objects as well as
    # plain arrays, whereas `.values` only exists on pandas objects; this
    # lets the function take the nd-array inputs its parameters describe.
    xArr = np.asarray(xFeat)
    yArr = np.asarray(y)

    trainAuc = []
    testAuc = []

    kf = KFold(n_splits=k)
    for train_idx, test_idx in kf.split(xArr):
        xTrain, xTest = xArr[train_idx], xArr[test_idx]
        yTrain, yTest = yArr[train_idx], yArr[test_idx]

        model.fit(xTrain, yTrain)

        # column 1 of predict_proba is used as the ranking score for the AUC
        yTrainPred = model.predict_proba(xTrain)[:, 1]
        yTestPred = model.predict_proba(xTest)[:, 1]

        trainAuc.append(roc_auc_score(yTrain, yTrainPred))
        testAuc.append(roc_auc_score(yTest, yTestPred))

    timeElapsed = time.time() - start
    return np.mean(trainAuc), np.mean(testAuc), timeElapsed


def mc_cv(model, xFeat, y, testSize, s):
    """
    Monte Carlo cross validation: repeat a random train/validation
    split s times, refit the model on each training portion, and
    average the AUC obtained on the training and held-out portions.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model (refit in place once per sample)
    xFeat : nd-array with shape n x d
        Features of the dataset
    y : 1-array with shape n x 1
        Labels of the dataset
    testSize : float
        Portion of the dataset to hold out in every split; passed
        straight through as test_size.
    s : int
        Number of random splits to draw

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training portions
    testAuc : float
        Average AUC of the model on the held-out portions
    timeElapsed: float
        Wall-clock seconds spent inside this function
    """
    tic = time.time()

    fitScores, holdScores = [], []

    for _draw in range(s):
        # fresh random partition for every sample
        xFit, xHold, yFit, yHold = train_test_split(xFeat, y, test_size=testSize)
        model.fit(xFit, yFit)

        # column 1 of predict_proba is used as the ranking score for the AUC
        fitProba = model.predict_proba(xFit)[:, 1]
        holdProba = model.predict_proba(xHold)[:, 1]

        fitScores.append(roc_auc_score(yFit, fitProba))
        holdScores.append(roc_auc_score(yHold, holdProba))

    elapsed = time.time() - tic
    return np.mean(fitScores), np.mean(holdScores), elapsed


def sktree_train_test(model, xTrain, yTrain, xTest, yTest, labelCol='IsBadBuy'):
    """
    Given a sklearn tree model, train the model using
    the training dataset, and evaluate the model on the
    test dataset.

    Parameters
    ----------
    model : DecisionTreeClassifier object
        An instance of the decision tree classifier (refit in place)
    xTrain : nd-array with shape nxd
        Training data
    yTrain : 1d array with shape n, or DataFrame containing labelCol
        Labels associated with training data
    xTest : nd-array with shape mxd
        Test data
    yTest : 1d array with shape m, or DataFrame containing labelCol
        Labels associated with test data.
    labelCol : str, optional
        Column holding the labels when yTrain / yTest are DataFrames.
        Defaults to 'IsBadBuy'.

    Returns
    -------
    trainAUC : float
        The AUC of the model evaluated on the training data.
    testAuc : float
        The AUC of the model evaluated on the test data.
    """
    def _labels(y):
        # A DataFrame that carries the label column is reduced to that
        # column; anything else (array, Series, list) is flattened to 1-d
        # so the documented "1d array" inputs work as well.
        if isinstance(y, pd.DataFrame) and labelCol in y.columns:
            y = y[labelCol]
        return np.asarray(y).ravel()

    # fit the data to the training dataset
    model.fit(xTrain, yTrain)
    # predict training and testing probabilities
    yHatTrain = model.predict_proba(xTrain)
    yHatTest = model.predict_proba(xTest)
    # calculate auc for training (column 1 of predict_proba is the score)
    fpr, tpr, _ = metrics.roc_curve(_labels(yTrain), yHatTrain[:, 1])
    trainAuc = metrics.auc(fpr, tpr)
    # calculate auc for test dataset
    fpr, tpr, _ = metrics.roc_curve(_labels(yTest), yHatTest[:, 1])
    testAuc = metrics.auc(fpr, tpr)
    return trainAuc, testAuc

In [5]:
from selection import get_dataset
# Load the four splits from the project-local `selection` module.
# Presumably pandas objects: later cells call `.values` on xTrain/yTrain and
# index yTrain/yTest with ['IsBadBuy'] — verify against get_dataset.
# NOTE(review): the meaning of the second argument (True) is defined inside
# selection.get_dataset and is not visible from this notebook — confirm.
xTrain, xTest, yTrain, yTest = get_dataset("../dataset/training.csv", True)

In [7]:
# decision tree reused by every evaluation below: depth capped at 15 and at
# least 10 samples required per leaf (no random_state is passed)
dtClass = DecisionTreeClassifier(min_samples_leaf=10, max_depth=15)

In [18]:
# Compare the validation strategies on the training split. Every call below
# refits the same dtClass instance in place, and the holdout / MCCV calls draw
# fresh random splits, so the statements are order-dependent.
#
# holdout with testSize=0.70, i.e. 70% of the training rows are held out
# NOTE(review): 0.70 leaves only 30% of the rows for fitting; if a 30%
# validation set was intended, testSize should be 0.30 — confirm.
aucTrain1, aucVal1, time1 = holdout(dtClass, xTrain, yTrain, 0.70)
# use 2-fold validation
aucTrain2, aucVal2, time2 = kfold_cv(dtClass, xTrain, yTrain, 2)
# use 5-fold validation
aucTrain3, aucVal3, time3 = kfold_cv(dtClass, xTrain, yTrain, 5)
# use 10-fold validation
aucTrain4, aucVal4, time4 = kfold_cv(dtClass, xTrain, yTrain, 10)
# use MCCV with 5 samples (same testSize=0.70 as the holdout call)
aucTrain5, aucVal5, time5 = mc_cv(dtClass, xTrain, yTrain, 0.70, 5)
# use MCCV with 10 samples (same testSize=0.70 as the holdout call)
aucTrain6, aucVal6, time6 = mc_cv(dtClass, xTrain, yTrain, 0.70, 10)
# train it using all the training data and score on the separate test split
trainAuc, testAuc = sktree_train_test(dtClass, xTrain, yTrain, xTest, yTest)
# one row per strategy; the 'True Test' row is not timed, so its Time is the literal 0
perfDF = pd.DataFrame([['Holdout', aucTrain1, aucVal1, time1],
                       ['2-fold', aucTrain2, aucVal2, time2],
                       ['5-fold', aucTrain3, aucVal3, time3],
                       ['10-fold', aucTrain4, aucVal4, time4],
                       ['MCCV w/ 5', aucTrain5, aucVal5, time5],
                       ['MCCV w/ 10', aucTrain6, aucVal6, time6],
                       ['True Test', trainAuc, testAuc, 0]],
                       columns=['Strategy', 'TrainAUC', 'ValAUC', 'Time'])
print(perfDF)

     Strategy  TrainAUC    ValAUC       Time
0     Holdout  0.845499  0.575938   3.853552
1      2-fold  0.812997  0.602786   9.313387
2      5-fold  0.815987  0.600676  36.198635
3     10-fold  0.801630  0.609975  93.897713
4   MCCV w/ 5  0.856601  0.572268  15.396957
5  MCCV w/ 10  0.838111  0.580082  35.083905
6   True Test  0.799722  0.616651   0.000000


In [12]:
# refit the shared tree on the whole training split, then predict hard class
# labels (predict, not predict_proba) for the test split
dtClass.fit(xTrain, yTrain)
yhat = dtClass.predict(xTest)

In [17]:
# share of test rows whose predicted label equals the 'IsBadBuy' column of yTest
accuracy(yhat, yTest["IsBadBuy"])

0.8627096349884906