## Step 1: set up controls for evaluation experiments

* (T/F) cross-validation on the training set
* (T/F) reweighting the samples for training set
* $K$, the number of CV folds
* (T/F) process features for training set
* (T/F) run evaluation on an independent test set
* (T/F) process features for test set

## Step 2: load data and train-test split

In [17]:
import scipy.io
import os
import numpy as np
import pandas as pd
import pickle

def get_points(file):
    '''Load the data array stored in a MATLAB-style ``.mat`` file.

    ``scipy.io.loadmat`` returns a dict that can hold metadata entries whose
    names start with ``__`` (e.g. ``__header__``) next to the stored
    variables.  The first non-metadata entry is returned, so the result does
    not depend on how many metadata keys a given file format produces.

    Parameters
    ----------
    file : str or file-like
        Passed unchanged to ``scipy.io.loadmat``.

    Returns
    -------
    The value stored under the first variable found in the file.

    Raises
    ------
    ValueError
        If the file contains no variable at all.
    '''
    mat = scipy.io.loadmat(file)
    for key, value in mat.items():
        # skip loadmat's bookkeeping entries such as __header__/__version__
        if not key.startswith('__'):
            return value
    raise ValueError('no data variable found in %r' % (file,))

def pickle_save(filename, content):
    '''Pickle ``content`` to ``../output/<filename>.pkl``.

    The target folder is created first when it does not exist yet, so the
    function also works before the output folder has been set up by hand.

    Parameters
    ----------
    filename : str
        Base name of the pickle file, without folder or ``.pkl`` extension.
    content : object
        Any picklable Python object.
    '''
    path = '../output/%s.pkl' % filename
    # open() does not create missing parent folders, so do it explicitly
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(content, f)
        
def pickle_open(filename):
    '''Read back the object pickled as ``../output/<filename>.pkl``.

    Parameters
    ----------
    filename : str
        Base name of the pickle file, without folder or ``.pkl`` extension.

    Returns
    -------
    The unpickled Python object.
    '''
    path = '../output/%s.pkl' % filename
    with open(path, 'rb') as handle:
        return pickle.load(handle)
    

In [18]:
# folder holding the MATLAB files with the fiducial points
points_dir = '../../train_set/points'
# sorted so the stacking order is deterministic (os.listdir order is arbitrary)
dir_list = sorted(os.listdir(points_dir))

# np.stack needs a real sequence (list/tuple): handing it a generator is
# deprecated since NumPy 1.16 and rejected by newer releases
fiducial_pt_full = np.stack([get_points(os.path.join(points_dir, filename)) for filename in dir_list])
# NOTE(review): assumes the rows of label.csv follow the same order as the
# sorted file names above -- TODO confirm
label_full = pd.read_csv('../../train_set/label.csv')['label']

# save the data into pickle files, so that we don't need to read raw data every time
pickle_save('fiducial_pt_full', fiducial_pt_full)
pickle_save('label_full', label_full)


### Train-test split

In [49]:
from sklearn.model_selection import train_test_split

# reload the cached fiducial points and labels from their pickle objects
fiducial_pt_full = pickle_open('fiducial_pt_full')
label_full = pickle_open('label_full')

# random split into training and test sets: 20% of the records are held out
# for testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    fiducial_pt_full,
    label_full,
    test_size=0.2,
    random_state=42,
)


## Step 3: construct features and responses

Here, use pairwise distances as features.

In [50]:
from sklearn.metrics import pairwise_distances

def pairwise_distance_features(points):
    '''Build one feature row per record from its pairwise point distances.

    Parameters
    ----------
    points : ndarray of shape (n_records, n_points, n_dims)
        Fiducial points of every record.

    Returns
    -------
    ndarray of shape (n_records, n_points * (n_points - 1) / 2)
        For each record, the strict upper triangle of its distance matrix,
        i.e. every unordered pair of points exactly once.
    '''
    # the index pairs are the same for every record, so compute them once
    upper = np.triu_indices(points.shape[1], k=1)
    # np.stack needs a list/tuple: a generator triggers a FutureWarning on
    # NumPy >= 1.16 and is rejected by newer releases
    return np.stack([pairwise_distances(record)[upper] for record in points])


# extract pairwise distances as features (78*77/2=3003 features for 78 points)
# nrow=number of records of the dataset; ncol=3003
feature_train = pairwise_distance_features(X_train)
feature_test = pairwise_distance_features(X_test)

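### Note on the warning below: it is most likely NumPy's FutureWarning for passing a generator to `np.stack` (one per call); wrapping the arrays in a list avoids it. The distances are calculated correctly either way.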

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


## Step 4: train classification model with training features and responses

### use 5-fold CV to calculate the validation AUC

(Cross-validation is normally used to tune hyperparameters. Here no hyperparameter is tuned, so it only provides a validation estimate of the AUC.)

In [54]:
# per-sample training weights: 9.1 where the label is 1, 0.5 where it is 0;
# any other label value keeps a weight of 0
weights = np.where(y_train == 1, 9.1, np.where(y_train == 0, 0.5, 0.0))

In [68]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import time

# gradient boosting classifier with default hyperparameters, seeded for repeatability
gbt = GradientBoostingClassifier(random_state=0)
# extra keyword arguments handed to the estimator's fit() during cross-validation
fit_params = {'sample_weight':weights}

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time() follows the wall clock and can jump if it is adjusted
time_start_val = time.perf_counter()

# stratified 5-fold CV (a single repeat), scored with ROC AUC on each fold
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
auc = cross_val_score(gbt, feature_train, y_train, fit_params=fit_params, scoring='roc_auc', cv=cv, n_jobs=-1)

time_end_val = time.perf_counter()
print('time cost {:.2f} s'.format(time_end_val-time_start_val))

time cost 175.69 s


In [69]:
# average of the per-fold ROC AUC scores from cross-validation
print('Mean ROC AUC: %.5f' % auc.mean())

Mean ROC AUC: 0.75767


### train the model on whole training set, and save the model to pickle object

In [71]:
### there are lots of parameters to tune here
### could have used cross-validation to perform model selection (e.g. GBT with different parameters)

# train the model on the whole training set, using the per-sample weights;
# perf_counter() is a monotonic clock meant for measuring elapsed time,
# whereas time() follows the wall clock and can jump if it is adjusted
time_start_train = time.perf_counter()

print(gbt.fit(feature_train, y_train, sample_weight = weights))

time_end_train = time.perf_counter()
print('time cost {:.2f} s'.format(time_end_train-time_start_train))

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
time cost 180.61 s


In [72]:
# base name of the pickle file that holds the fitted classifier
model_name = 'gbt_model_1'

# write the fitted model to disk ...
pickle_save(model_name, gbt)

# ... and read it back, so the following cells work from the stored copy
gbt_model_1 = pickle_open(model_name)

## Step 5: run test on test images

In [73]:
# continuous decision scores for the held-out test features
score_test = gbt_model_1.decision_function(feature_test)
# hard class predictions for the same test features
pred_test = gbt_model_1.predict(feature_test)

In [74]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def clf_metrics(y_true, y_pred, y_score):
    '''Print precision, recall and ROC AUC as a one-row table.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels, compared with ``y_true`` for precision and recall.
    y_score : array-like
        Continuous scores, compared with ``y_true`` for the ROC AUC.

    Returns
    -------
    None; the table is only printed.
    '''
    metrics = {
        'precision': [precision_score(y_true, y_pred)],
        'recall': [recall_score(y_true, y_pred)],
        'auc': [roc_auc_score(y_true, y_score)],
    }
    print(pd.DataFrame(metrics))

In [75]:
# report precision / recall / AUC of the stored model on the held-out test set
clf_metrics(y_true=y_test, y_pred=pred_test, y_score=score_test)

   precision    recall       auc
0   0.389262  0.834532  0.794909


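# Is it strange that the AUC on the test set is higher than the cross-validated AUC?

Not necessarily: each CV model is trained on only 4/5 of the training set while the final model is trained on all of it, and both AUC values are estimates subject to sampling variability.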