## binarsity results visualisation

All computations were run on the HPC2 cluster. Set `filename` below to select which dataset's results are loaded, then save the rendered notebook as an .html file.

In [1]:
# -*- coding: utf-8 -*-
%reset -f
%matplotlib inline
import os
import numpy as np
import pandas as pd
import seaborn.apionly as sns
import pylab as pl
from prettytable import PrettyTable
from sklearn.metrics import roc_auc_score, roc_curve
from mlpp.preprocessing import FeaturesBinarizer

## Visualize data

In [2]:
filename = 'default_cb'
# Every later cell reads its inputs from paths relative to this directory.
# NOTE: the chdir is relative to the current working directory, so this cell
# is meant to be run once per kernel session (restart before re-running).
os.chdir('./datasets/%s' % filename)
# The raw data is read from a folder in the home directory rather than from
# the project tree (original note: "otherwise our dropbox is full").
data_folder = os.path.expanduser("~/Dev/datasets_binarsity/")
df = pd.read_csv(data_folder + "%s/%s" % (filename, filename))

# continuous features only: drop every column detected as 'discrete'
to_be_dropped = []
for i in range(df.shape[1]):
    # Positional column access; `.iloc` replaces the `.ix` indexer, which was
    # deprecated and later removed from pandas.
    feature_type = FeaturesBinarizer._detect_feature_type(df.iloc[:, i])
    if feature_type == 'discrete':
        to_be_dropped.append(i)
df = df.drop(df.columns[to_be_dropped], axis=1)

print(df.shape)
df.head()

(30000, 25)


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Convergence check (objective must be decreasing)

In [None]:
# One figure per saved convergence trace in ./results/cvg: the first column
# of each array is plotted on the x axis (iterations) and the second on the
# y axis (objective).  Entries whose name starts with '.' are skipped.
for cvg_file in os.listdir('./results/cvg'):
    if cvg_file[0] == '.':
        continue
    history = np.load('./results/cvg/' + str(cvg_file))
    iterations, objective = history[:, 0], history[:, 1]
    # "<prefix>-<model_name>.<ext>" -> "model name"
    title = cvg_file.split('.')[0].split('-')[1].replace('_',' ')
    fig = pl.figure()
    pl.suptitle("Objective convergence, %s" % title,
                fontsize=14, fontweight='bold')
    pl.plot(iterations, objective, '-b')
    pl.xlabel('iterations')
    pl.ylabel('Objective')
    pl.show()

### Beta coefficients

In [None]:
# Stem plot of the coefficient vector stored in each file of ./results/beta.
# Files whose name starts with '.', 'f' or 'b' are not coefficient vectors
# for this loop and are skipped.
for fname in os.listdir("./results/beta"):
    if fname[0] in ('.', 'f', 'b'):
        continue
    coeffs = np.load("./results/beta/%s" % fname)
    # "<num>-<model>.<ext>" -> "<model>"
    model = fname.split('.')[0].split('-')[1]
    fig = pl.figure(figsize=(13,5))
    ax = fig.add_subplot(111)
    if int(fname.split('-')[0]) > 2:
        # Files with a numeric prefix above 2 have a companion
        # blocks_start-<model>.npy; each entry is drawn as a dashed green line.
        for start in np.load("./results/beta/blocks_start-%s.npy" % model):
            ax.axvline(start, color='g', linestyle='--')
    ax.stem(coeffs)
    fig.suptitle("Beta, %s" % model.replace('_',' '),
                 fontsize=14, fontweight="bold")
    ax.set_xlabel("Coefs values", fontsize=12)
    ax.set_ylabel("Beta coeffs", fontsize=12)
    # small margin on both sides so the first/last stems are not on the frame
    ax.set_xlim([-5, len(coeffs)+5])
    pl.show()

# Table of the strongest non-zero coefficients, each expressed as a share
# (in %) of the l1 norm of the coefficient vector.
# NOTE(review): `coeffs` is whatever the plotting loop above loaded last, and
# it is paired here with the names in features_names_bina.npy -- confirm that
# the last file visited is the model this table is meant to describe.
features_names = np.load("./results/beta/features_names_bina.npy")
nb_first_features = 20
idx_selected = [i for i, is_nonzero in enumerate(coeffs != 0.) if is_nonzero]
norm1_coeff = np.linalg.norm(coeffs, ord=1)
val = [100 * abs(coeffs[i])/norm1_coeff for i in idx_selected]
features_selected = pd.DataFrame(
    {'Feature' : [features_names[i] for i in idx_selected],
     'Effect' : val})
# largest effect first, re-indexed 0..n-1
features_selected = (features_selected
                     .sort_values(['Effect'], ascending=False)
                     .reset_index(drop=True))
print("%s first selected features (bina pen bin feat):" % nb_first_features)
t = PrettyTable(['Feature', 'Effect'])
# Exactly nb_first_features rows (the previous `k < nb_first_features + 1`
# test let one extra row through).
top_features = features_selected.head(nb_first_features)
for feature, effect in zip(top_features['Feature'], top_features['Effect']):
    t.add_row([feature, "%g %%" % effect])
print(t)

### Learning curves

In [None]:
# Hyper-parameter learning curves: one figure per file found in
# ./results/learning_curves.  File names look like "<model_num>-<name>.npy"
# and the numeric prefix selects the kind of plot:
#   2     -> AUC against C (1-D curve)
#   3     -> AUC against n_cuts (1-D curve)
#   4, 5  -> heatmaps of AUC over (C, n_cuts), on CV and on the test set
#   6     -> heatmap of CV AUC over (C, gamma)
#
# The model 4/5 branch reuses the C grid read from the model-2 file and the
# n_cuts grid read from the model-3 file.  Files are therefore visited in
# sorted order (single-digit prefixes: 2 < 3 < 4 < 5 < 6) instead of the
# arbitrary order returned by os.listdir.
C_grid_ = None       # full C grid, filled in by the model-2 branch
n_cuts_grid = None   # n_cuts grid, filled in by the model-3 branch
for fname in sorted(os.listdir("./results/learning_curves")):
    if fname[0] == '.':
        continue
    # NOTE(review): the model-6 file is read as (param dict, score) records;
    # recent numpy versions refuse to load such object arrays unless
    # allow_pickle=True is passed -- only enable that for trusted files.
    learning_curves = np.load("./results/learning_curves/" + str(fname))
    model_num = int(fname.split('-')[0])
    fname_ = fname.split('.')[0].split('-')[1].replace('_',' ')

    if model_num == 2:
        fig = pl.figure()
        ax = fig.add_subplot(111)
        # columns: C value, average CV score, test score
        C_grid = learning_curves[:, 0]
        C_grid_ = C_grid
        avg_scores = learning_curves[:, 1]
        score_test = learning_curves[:, 2]
        # the selection rule is encoded in the file name: "...selection_<rule>"
        selection = fname.split('.')[0].split('selection_')[1]
        idx_best = avg_scores.argmax()
        C_best = C_grid[idx_best]
        if selection == 'min':
            C_chosen = C_best
        elif selection == '1st':
            # first grid point whose CV score is within 5% of the score
            # range (max - min) from the best score
            max_ = avg_scores.max()
            min_ = avg_scores.min()
            idx = np.flatnonzero(avg_scores >= max_ - .05 * (max_ - min_))
            idx_chosen = idx[0] if len(idx) > 0 else idx_best
            C_chosen = C_grid[idx_chosen]
        else:
            # previously an unknown rule silently reused a stale C_chosen
            # (or hit a NameError on the first file)
            raise ValueError("unknown selection rule %r in file name %r"
                             % (selection, fname))

        pl.xscale('log')
        ax.plot(C_grid, avg_scores, label="AUC on CV")
        ax.plot(C_grid, score_test , '-r',
                label="AUC on test set")
        # markers are drawn on the bottom edge of the axes
        y_min = ax.get_ylim()[0]
        ax.plot(C_best,y_min,'g^',ms=20, label="best C on CV")
        ax.plot(C_chosen,y_min,'r^',ms=20, label="C chosen")
        pl.suptitle("Learning curves, %s" % fname_,
                    fontsize=14, fontweight="bold")
        pl.xlabel("C")
        pl.ylabel("AUC")
        pl.legend(bbox_to_anchor=(1.15,1), loc=2, borderaxespad=0.,
                  numpoints=1, markerscale=.5)
        pl.show()

    elif model_num == 3:
        fig = pl.figure()
        ax = fig.add_subplot(111)
        # columns: n_cuts value, average CV score, test score
        n_cuts_grid = learning_curves[:, 0]
        avg_scores = learning_curves[:, 1]
        score_test = learning_curves[:, 2]
        idx_best = avg_scores.argmax()
        n_cuts_chosen = n_cuts_grid[idx_best]
        ax.plot(n_cuts_grid, score_test , '-r',
                label = "AUC on test set")
        ax.plot(n_cuts_grid, avg_scores, label="AUC on CV")
        y_min = ax.get_ylim()[0]
        ax.plot(n_cuts_chosen,y_min,'r^',ms=20,
                label="n_cuts chosen")
        pl.suptitle("Learning curves, %s" % fname_,
                    fontsize=14, fontweight="bold")
        pl.xlabel('n_cuts')
        pl.ylabel('AUC')
        pl.legend(bbox_to_anchor=(1.15,1), loc=2, borderaxespad=0.,
                  numpoints=1, markerscale=.5)
        pl.show()

    elif model_num in (4, 5):
        if C_grid_ is None or n_cuts_grid is None:
            raise RuntimeError(
                "model %s heatmaps need the C grid (model 2 file) and the "
                "n_cuts grid (model 3 file) to be loaded first" % model_num)
        # left half of the columns: CV scores, right half: test scores
        n_cuts_grid_size = int(learning_curves.shape[1]/2)
        avg_scores = learning_curves[:, :n_cuts_grid_size]
        score_test = learning_curves[:, n_cuts_grid_size:]

        id_C, id_n_cuts = np.where(avg_scores == avg_scores.max())
        # Row i is mapped to the C grid read backwards.  The lookup always
        # uses the *full* grid: indexing the truncated grid built below (as
        # happened for the second 4/5 file) wraps around to a wrong value
        # when the best row is one of the rows left out of the heatmap.
        C_chosen = C_grid_[len(C_grid_) - id_C[0] - 1]
        n_cuts_chosen = n_cuts_grid[id_n_cuts[0]]

        print("n_cuts_chosen=%s" % n_cuts_chosen)
        print("C_chosen=%.1e" % C_chosen)

        # The last `lines2del` rows of both score matrices are left out of
        # the heatmaps, together with the first `lines2del` grid values.
        lines2del = 5
        avg_scores = avg_scores[:-lines2del,:]
        score_test = score_test[:-lines2del,:]
        C_grid_shown = C_grid_[lines2del:]

        fig = pl.figure(figsize=(12,6))
        panels = [(121, 'CV', avg_scores), (122, 'test set', score_test)]
        for position, add, data in panels:
            ax = fig.add_subplot(position)
            ax = sns.heatmap(data, cmap=pl.cm.Blues, linewidths=.1)
            pl.yticks(rotation=0)
            pl.title("Heatmap on %s, %s" % (add, fname_),
                     fontsize=14, fontweight="bold")
            pl.xlabel('n_cuts')
            pl.ylabel('C', rotation=0)
            ax.set_xticklabels(n_cuts_grid, minor=False)
            ax.set_yticklabels(['%.1e' % C for C in C_grid_shown],
                               minor=False)
            ax.grid(False)
        pl.tight_layout()
        pl.show()

    elif model_num == 6:
        # each record is (parameter dict with 'C' and 'gamma', average score)
        C_values = list()
        gamma_values = list()
        avg_scores = list()
        for val in learning_curves:
            param = val[0]
            gamma_values.append(param['gamma'])
            C_values.append(param['C'])
            avg_scores.append(val[1])

        gamma_values = np.unique(gamma_values)
        C_values = np.unique(C_values)
        # Rows index C and columns index gamma, which is how the np.where
        # unpacking and the axis labels below read the matrix (the shape was
        # previously given as (n_gamma, n_C), which only agrees for square
        # grids).  Assumes the records iterate C in the outer loop and gamma
        # in the inner one -- TODO confirm against the script writing the file.
        avg_scores = np.reshape(avg_scores,
                                (len(C_values), len(gamma_values)))

        id_C, id_gamma = np.where(avg_scores == avg_scores.max())
        C_chosen = C_values[id_C[0]]
        gamma_chosen = gamma_values[id_gamma[0]]
        print("gamma_chosen=%.1e" % gamma_chosen)
        print("C_chosen=%.1e" % C_chosen)

        fig = pl.figure(figsize=(6,6))
        ax = sns.heatmap(avg_scores, cmap=pl.cm.Blues, linewidths=.1)
        pl.yticks(rotation=0)
        pl.title("Heatmap on CV, %s" % fname_,
                 fontsize=14, fontweight="bold")
        pl.xlabel('gamma')
        pl.ylabel('C', rotation=0)
        ax.set_xticklabels(['%.1e' % gamma for gamma in gamma_values],
                           minor=False)
        ax.set_yticklabels(['%.1e' % C for C in reversed(C_values)],
                           minor=False)
        ax.grid(False)
        pl.tight_layout()
        pl.show()

## Final performances comparison

In [None]:
# Print the plain-text summary of the final scores; the context manager
# closes the file handle once it has been read.
with open("./results/results.txt", "r") as results:
    print(results.read())

## ROC curves

In [None]:
# ROC curves of every model on the test set, drawn on a single figure that is
# also saved to ./results/roc_curves.pdf.
y_test = np.load('./results/y_test.npy')
lw = 2
# Prediction files ("<num>-<model>.npy"), hidden files excluded; sorted so
# that the legend order and the colour of each model are reproducible.
pred_files = sorted(f for f in os.listdir('./results/y_pred')
                    if f[0] != '.')
# Same 8 colour samples as before for up to 8 models; the palette grows with
# the number of files so color_list[i] can no longer run out of entries.
color_list = pl.cm.Set1(np.linspace(0, .5, max(8, len(pred_files))))
fig = pl.figure(figsize=(8,8))
ax = fig.add_subplot(1,1,1)
for i, fname in enumerate(pred_files):
    y_pred = np.load('./results/y_pred/' + str(fname))
    model_label = fname.split('.')[0].split('-')[1].replace('_',' ')
    # thresholds returned by roc_curve are not needed for the plot
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    pl.plot(fpr, tpr, color=color_list[i], lw=lw,
            label = '%s (AUC = %0.3f)' % (model_label, roc_auc))

# diagonal = chance level
pl.plot([0, 1], [0, 1], 'k--', lw=lw)
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.05])
pl.tick_params(axis='both', which='major', labelsize=16)
pl.xlabel('False Positive Rate', fontsize=16)
pl.ylabel('True Positive Rate', fontsize=16)
pl.title('ROC curves comparison', fontsize=16)
pl.legend(loc="lower right")
pl.savefig('./results/roc_curves.pdf', bbox_inches='tight')
pl.show()