## Read fiducial points and train-test split

In [1]:
import scipy.io
import os
import numpy as np
import pandas as pd
import pickle

def get_points(file):
    '''Load the data array stored in a MATLAB-style ``.mat`` file.

    The file is read with ``scipy.io.loadmat`` and the first variable whose
    name does not start with ``__`` is returned. Selecting by name rather
    than by position keeps this working when the loader adds a different
    number of metadata entries (``__header__``, ``__version__``,
    ``__globals__``) in front of the stored variables.

    Parameters
    ----------
    file : path or file-like object accepted by ``scipy.io.loadmat``.

    Returns
    -------
    The value stored under the first non-metadata key.

    Raises
    ------
    IndexError
        If the file contains no data variable.
    '''
    mat = scipy.io.loadmat(file)
    # Metadata keys are dunder-named; everything else is user data.
    data_keys = [key for key in mat if not key.startswith('__')]
    if not data_keys:
        raise IndexError('no data variable found in %r' % (file,))
    return mat[data_keys[0]]

def pickle_save(filename, content):
    '''Pickle ``content`` to ``../output/<filename>.pkl``.

    The path is relative to the current working directory. The target
    directory is created when it does not exist yet, so the first run on a
    fresh checkout does not fail with ``FileNotFoundError``. An existing
    file of the same name is overwritten.

    Parameters
    ----------
    filename : str
        File name without the ``.pkl`` extension.
    content : object
        Any picklable object.
    '''
    out_path = '../output/%s.pkl'%filename
    # Make sure the output folder (and any sub-folder in `filename`) exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'wb') as f:
        pickle.dump(content, f)
        
def pickle_open(filename):
    '''Read back the object pickled at ``../output/<filename>.pkl``.

    The path is relative to the current working directory and ``filename``
    is given without the ``.pkl`` extension. The file is unpickled, so this
    should only be pointed at files written by this project.
    '''
    source = '../output/%s.pkl' % filename
    with open(source, 'rb') as handle:
        return pickle.load(handle)
    

In [2]:
# File names of the per-image fiducial-point .mat files, in lexicographic order.
dir_list = sorted(os.listdir('../../train_set/points'))

# np.stack needs a real sequence (list/tuple); handing it a generator is
# deprecated since NumPy 1.16 and rejected by current NumPy releases.
fiducial_pt_full = np.stack([get_points(os.path.join('../../train_set/points', filename))
                             for filename in dir_list])
label_full = pd.read_csv('../../train_set/label.csv')['label']

# NOTE(review): rows of label.csv are assumed to follow the same order as the
# sorted file names above -- confirm against the data set.
assert len(fiducial_pt_full) == len(label_full), 'number of point files and labels differ'

# Cache both objects as pickle files so the raw data does not have to be re-read every time.
pickle_save('fiducial_pt_full', fiducial_pt_full)
pickle_save('label_full', label_full)

  if (await self.run_code(code, result,  async_=asy)):


### Train-test split

In [68]:
from sklearn.model_selection import train_test_split

# Restore the cached points and labels from their pickle files under ../output.
fiducial_pt_full = pickle_open('fiducial_pt_full')
label_full = pickle_open('label_full')

# Random split: 20% of the records are held out as the test set; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    fiducial_pt_full,
    label_full,
    random_state=42,
    test_size=0.2,
)

## Construct features and responses

We use pairwise distances as features.

In [None]:
from sklearn.metrics import pairwise_distances
import time
import warnings
warnings.filterwarnings('ignore')


def pairwise_distance_features(points):
    '''Turn each sample's landmark coordinates into a flat vector of pairwise distances.

    Parameters
    ----------
    points : ndarray of shape (n_samples, n_points, n_dims)
        ``points[i]`` holds the coordinates of the landmarks of sample ``i``.

    Returns
    -------
    ndarray of shape (n_samples, n_points * (n_points - 1) / 2)
        Row ``i`` is the strict upper triangle (``k=1``) of the distance
        matrix of sample ``i``.
    '''
    # The index pattern is the same for every sample, so build it once.
    upper = np.triu_indices(points.shape[1], k=1)
    # A list, not a generator: np.stack only accepts real sequences.
    return np.stack([pairwise_distances(sample)[upper] for sample in points])


# extract pairwise distance as features (78*77/2=3003 features)
# nrow=number of records of the dataset; ncol=3003
start_time = time.time()
feature_train = pairwise_distance_features(X_train)
print('Baseline training feature extraction takes %s seconds.'%round((time.time()-start_time),3))

start_time = time.time()
feature_test = pairwise_distance_features(X_test)
# This timing is for the test set (the message used to say "training").
print('Baseline test feature extraction takes %s seconds.'%round((time.time()-start_time),3))


## Train classification model with training features and responses

In [79]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
import time

def clf_metrics(y_true, y_pred, y_score):
    '''Print a one-row table of classification metrics.

    Columns, in order: accuracy, weighted acc, precision, recall, auc.
    "weighted acc" is the accuracy after re-weighting every sample by the
    inverse frequency of its true class, i.e. an estimate of the accuracy on
    a class-balanced data set.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, same length as ``y_true``.
    y_score : continuous scores passed to ``roc_auc_score``.

    Returns
    -------
    None; the table is printed.
    '''
    accuracy = accuracy_score(y_true, y_pred)

    # Each class receives the same total weight: 0.5 * n / (class count).
    n_obs = len(y_true)
    balance_w = np.zeros(n_obs)
    for cls in np.unique(y_true):
        in_cls = (y_true == cls)
        balance_w[in_cls] = 0.5 * n_obs / np.sum(in_cls)
    hits = (y_pred == y_true)
    weighted_acc = np.sum(balance_w * hits / np.sum(balance_w))

    report = pd.DataFrame({
        'accuracy': [accuracy],
        'weighted acc': [weighted_acc],
        'precision': [precision_score(y_true, y_pred)],
        'recall': [recall_score(y_true, y_pred)],
        'auc': [roc_auc_score(y_true, y_score, average='weighted')],
    })
    print(report)

In [174]:
# Per-sample training weights: label 0 -> 0.5, label 1 -> 2; any other label value gets 0.
weights = np.where(y_train == 1, 2.0, np.where(y_train == 0, 0.5, 0.0))

# Baseline gradient-boosting classifier.
baseline = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=5,
    max_features='sqrt',
    subsample=0.8,
    n_iter_no_change=40,
    ccp_alpha=0.001,
    random_state=23,
)

# Wall-clock time around the weighted fit.
start = time.time()
baseline.fit(feature_train, y_train, sample_weight=weights)
print('Training time cost {:.2f} s'.format(time.time() - start))

Training time cost 3.62 s


In [175]:
# Training split: hard predictions plus decision-function scores, then the metric table.
pred_train, score_train = baseline.predict(feature_train), baseline.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

# Held-out split, reported the same way.
pred_test, score_test = baseline.predict(feature_test), baseline.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

Training set:
   accuracy  weighted acc  precision    recall      auc
0  0.935833      0.932883   0.778793  0.928105  0.98124


Test set:
   accuracy  weighted acc  precision   recall       auc
0     0.795      0.700768   0.561538  0.52518  0.799638




In [176]:
# Display every hyper-parameter of the baseline estimator, including those left at their defaults.
baseline.get_params()

{'ccp_alpha': 0.001,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 5,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': 40,
 'presort': 'deprecated',
 'random_state': 23,
 'subsample': 0.8,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

### Parameter tuning

### 1. max_depth (min_samples_split is left at its default in this search)

In [96]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths: 3, 5 and 7.
param_1 = {'max_depth': range(3, 8, 2)}

# 5-fold cross-validated grid search scored by ROC AUC, using all CPU cores.
tuning1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        n_estimators=150,
        learning_rate=0.1,
        subsample=0.8,
        max_features='sqrt',
        random_state=299,
    ),
    param_grid=param_1,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# NOTE(review): `resampled_features` / `resampled_labels` are not created by any
# cell visible in this notebook -- presumably they come from a resampling step
# that was removed; they must be defined before this cell can run on a fresh kernel.
time_start_gs1 = time.time()
tuning1.fit(resampled_features, resampled_labels)
time_end_gs1 = time.time()
print('Tuning time cost {:.2f} s'.format(time_end_gs1 - time_start_gs1))

Tuning time cost 34.40 s


In [97]:
# Headline: best mean cross-validation score and the parameters that produced it.
print("Best: %f using %s" % (tuning1.best_score_, tuning1.best_params_))

# One line per candidate: mean (std) of the test-fold scores and the parameter dict.
means, stds, params = (tuning1.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

Best: 0.495842 using {'max_depth': 5}
0.488701 (0.008073) with: {'max_depth': 3}
0.495842 (0.014953) with: {'max_depth': 5}
0.491355 (0.015314) with: {'max_depth': 7}


### 2. learning_rate and n_estimators

In [98]:
# Grid over learning rate and number of boosting stages (2 x 4 = 8 candidates).
param_2 = {'learning_rate': [0.1, 0.05], 'n_estimators': [100, 250, 400, 500]}

# Same search set-up as before with max_depth fixed at 5; the learning_rate and
# n_estimators given to the estimator are overridden by the grid values.
tuning2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        n_estimators=150,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        max_features='sqrt',
        random_state=299,
    ),
    param_grid=param_2,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# NOTE(review): `resampled_features` / `resampled_labels` are not created by any
# cell visible in this notebook -- confirm where they come from before re-running.
time_start_gs2 = time.time()
tuning2.fit(resampled_features, resampled_labels)
time_end_gs2 = time.time()
print('Tuning time cost {:.2f} s'.format(time_end_gs2 - time_start_gs2))

Tuning time cost 165.41 s


In [99]:
# Headline: best mean cross-validation score and the parameters that produced it.
print("Best: %f using %s" % (tuning2.best_score_, tuning2.best_params_))

# One line per candidate: mean (std) of the test-fold scores and the parameter dict.
means, stds, params = (tuning2.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

Best: 0.498413 using {'learning_rate': 0.1, 'n_estimators': 100}
0.498413 (0.012143) with: {'learning_rate': 0.1, 'n_estimators': 100}
0.494025 (0.012488) with: {'learning_rate': 0.1, 'n_estimators': 250}
0.493519 (0.012002) with: {'learning_rate': 0.1, 'n_estimators': 400}
0.494142 (0.013354) with: {'learning_rate': 0.1, 'n_estimators': 500}
0.486358 (0.017822) with: {'learning_rate': 0.05, 'n_estimators': 100}
0.489285 (0.018508) with: {'learning_rate': 0.05, 'n_estimators': 250}
0.492288 (0.018314) with: {'learning_rate': 0.05, 'n_estimators': 400}
0.494642 (0.019314) with: {'learning_rate': 0.05, 'n_estimators': 500}


In [100]:
# Keep the best estimator found by the second grid search (tuning2) for evaluation below.
best = tuning2.best_estimator_

In [101]:
# NOTE(review): `resampled_features` / `resampled_labels` are not defined in any
# visible cell -- confirm where they come from before re-running.
# Metrics on the data the grid search was fitted on.
pred_train, score_train = best.predict(resampled_features), best.decision_function(resampled_features)
print('Training set:')
clf_metrics(resampled_labels, pred_train, score_train)
print('\n')

# Metrics on the held-out test features.
pred_test, score_test = best.predict(feature_test), best.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

Training set:
   accuracy  weighted acc  precision    recall       auc
0  0.834621      0.834621   0.834277  0.835137  0.906406


Test set:
   accuracy  weighted acc  precision    recall       auc
0  0.508333      0.484067   0.219424  0.438849  0.494062




### Tune weights

In [47]:
# Candidate per-sample weightings. Label 0 always gets weight 1, label 1 gets
# the up-weight listed on each line, and any other label value gets weight 0.
weights1 = np.where(y_train == 1, 20.0, np.where(y_train == 0, 1.0, 0.0))  # 1 : 20
weights2 = np.where(y_train == 1, 10.0, np.where(y_train == 0, 1.0, 0.0))  # 1 : 10
weights3 = np.where(y_train == 1, 30.0, np.where(y_train == 0, 1.0, 0.0))  # 1 : 30
weights4 = np.where(y_train == 1, 40.0, np.where(y_train == 0, 1.0, 0.0))  # 1 : 40

In [48]:
# Booster fitted with the weights2 candidate as per-sample weights.
tuning3 = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='sqrt',
    subsample=1,
    random_state=299,
)

# Wall-clock time around the weighted fit.
start = time.time()
tuning3.fit(feature_train, y_train, sample_weight=weights2)
print('Training time cost {:.2f} s'.format(time.time() - start))

Training time cost 6.40 s


In [49]:
# Training split: hard predictions plus decision-function scores, then the metric table.
pred_train, score_train = tuning3.predict(feature_train), tuning3.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

# Held-out split, reported the same way.
pred_test, score_test = tuning3.predict(feature_test), tuning3.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

Training set:
   accuracy  weighted acc  precision  recall       auc
0  0.974167      0.984029   0.880998     1.0  0.999893


Test set:
   accuracy  weighted acc  precision    recall       auc
0  0.761667      0.699176   0.487952  0.582734  0.803337




In [50]:
# Booster fitted with the weights3 candidate as per-sample weights.
tuning4 = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='sqrt',
    subsample=1,
    random_state=299,
)

# Wall-clock time around the weighted fit.
start = time.time()
tuning4.fit(feature_train, y_train, sample_weight=weights3)
print('Training time cost {:.2f} s'.format(time.time() - start))

Training time cost 6.58 s


In [51]:
# Training split: hard predictions plus decision-function scores, then the metric table.
pred_train, score_train = tuning4.predict(feature_train), tuning4.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

# Held-out split, reported the same way.
pred_test, score_test = tuning4.predict(feature_test), tuning4.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

Training set:
   accuracy  weighted acc  precision  recall       auc
0  0.831667       0.89593   0.531866     1.0  0.999062


Test set:
   accuracy  weighted acc  precision    recall       auc
0  0.668333      0.681151   0.382812  0.705036  0.762606




In [52]:
# Booster fitted with the weights4 candidate as per-sample weights.
tuning5 = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='sqrt',
    subsample=1,
    random_state=299,
)

# Wall-clock time around the weighted fit.
start = time.time()
tuning5.fit(feature_train, y_train, sample_weight=weights4)
print('Training time cost {:.2f} s'.format(time.time() - start))

Training time cost 6.63 s


In [53]:
# Training split: hard predictions plus decision-function scores, then the metric table.
pred_train, score_train = tuning5.predict(feature_train), tuning5.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

# Held-out split, reported the same way.
pred_test, score_test = tuning5.predict(feature_test), tuning5.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

Training set:
   accuracy  weighted acc  precision  recall      auc
0  0.784167      0.866564   0.469806     1.0  0.99769


Test set:
   accuracy  weighted acc  precision   recall       auc
0  0.616667      0.667629   0.349835  0.76259  0.731706




### After tuning, we choose learning_rate=0.1, n_estimators=150, max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1, max_features='sqrt', random_state=299, and fit the model with a sample-weight ratio of 10 (weight 10 for label 1, weight 1 for label 0).

### Save the model to pickle object:

In [64]:
# Rebuild the chosen weighting here so this cell does not depend on the
# weight-tuning cells: label 0 -> 1, label 1 -> 10, any other label -> 0.
weights2 = np.where(y_train == 1, 10.0, np.where(y_train == 0, 1.0, 0.0))

# NOTE(review): the summary heading above this cell lists subsample=1 (the value
# used when the weight candidates were compared), but this cell trains with
# subsample=0.8 -- confirm which value is intended for the saved model.
gbt_baseline = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='sqrt',
    subsample=0.8,
    random_state=299,
)

# Wall-clock time around the weighted fit.
start = time.time()
gbt_baseline.fit(feature_train, y_train, sample_weight=weights2)
print('Training time cost {:.2f} s'.format(time.time() - start))

Training time cost 4.94 s


In [65]:
# Persist the fitted model with pickle_save (written to ../output/gbt_baseline.pkl)
# so it can be reloaded later without retraining.
pickle_save('gbt_baseline', gbt_baseline)

In [102]:
# Reload the trained model from ../output/gbt_baseline.pkl. pickle_open unpickles
# the file, so only load model files that this project wrote itself.
gbt_baseline = pickle_open('gbt_baseline')

In [103]:
# Performance of the saved gradient-boosting baseline model.
# Training split: hard predictions plus decision-function scores, then the metric table.
pred_train, score_train = gbt_baseline.predict(feature_train), gbt_baseline.decision_function(feature_train)
print('Training set:')
clf_metrics(y_train, pred_train, score_train)
print('\n')

# Held-out split, reported the same way.
pred_test, score_test = gbt_baseline.predict(feature_test), gbt_baseline.decision_function(feature_test)
print('Test set:')
clf_metrics(y_test, pred_test, score_test)
print('\n')

Training set:
   accuracy  weighted acc  precision  recall       auc
0   0.96625      0.979134       0.85     1.0  0.999912


Test set:
   accuracy  weighted acc  precision   recall       auc
0  0.788333      0.714017   0.540541  0.57554  0.796735


