In [1]:
import os
import smtplib
import numpy as np
import pandas as pd
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import COMMASPACE
from sys import stdout, argv
from prettytable import PrettyTable
from time import time
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from tick.inference import LogisticRegression
from tick.preprocessing import FeaturesBinarizer
from sklearn.utils.validation import indexable
from sklearn.model_selection import check_cv
from sklearn.metrics.scorer import check_scoring
from sklearn.model_selection._validation import _fit_and_score
from sklearn.externals.joblib import Parallel, delayed
import seaborn.apionly as sns
import pylab as pl
from prettytable import PrettyTable
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Name of the dataset; it is used both as the sub-directory of ./datasets and
# as the name of the CSV file inside that directory.
filename = "default_cb" #"ionosphere" default_cb adult
# Forwarded to pd.read_csv as its `header` argument.
header = "infer"  #"infer" None

# NOTE: chdir changes the kernel's working directory, so running this cell a
# second time in the same session resolves './datasets/...' from inside the
# dataset folder.
os.chdir('./datasets/%s' % filename)
df = pd.read_csv('./%s' % filename, header=header)

# Number of cross-validation folds (also the number of learners built per C).
K = 5
# Rule used to pick C from the CV scores; the CV cells handle "min" and "1st".
selection = "min"
# When True, the preprocessing cell truncates the data and shrinks the grids.
test = False

# default
n_cuts_min = 10
n_cuts_max = 80
n_cuts_grid_size = 20

In [3]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
def cross_val_score_(estimators, X, y=None, groups=None, scoring=None,
                     cv=None, n_jobs=1, verbose=0, fit_params=None):
    """Cross-validate using a separate estimator object for every fold.

    Parameters
    ----------
    estimators : indexable collection of estimators
        ``estimators[k]`` is fitted and scored on the k-th split, so it must
        hold at least as many entries as there are splits.
    X, y, groups :
        Data, targets and optional group labels; passed through
        ``indexable`` and then to the splitter.
    scoring :
        Forwarded to ``check_scoring`` for each estimator.
    cv :
        Forwarded to ``check_cv`` (with ``classifier=True``).
    n_jobs : int
        Number of joblib workers.
    verbose : int
        Forwarded to ``_fit_and_score`` only; the joblib ``Parallel`` object
        is always created with ``verbose=0``.
    fit_params :
        Forwarded to ``_fit_and_score``.

    Returns
    -------
    numpy.ndarray
        First entry of each per-fold result (presumably the test score --
        ``_fit_and_score`` is a private scikit-learn helper, confirm against
        the installed version).
    """
    X, y, groups = indexable(X, y, groups)
    splitter = check_cv(cv, y, classifier=True)
    folds = list(splitter.split(X, y, groups))

    runner = Parallel(n_jobs=n_jobs, verbose=0)

    # One delayed job per fold; fold number k is paired with estimators[k].
    jobs = (
        delayed(_fit_and_score)(
            estimators[k], X, y,
            check_scoring(estimators[k], scoring=scoring),
            train_idx, test_idx, verbose, None, fit_params)
        for k, (train_idx, test_idx) in enumerate(folds)
    )
    fold_results = runner(jobs)

    return np.array(fold_results)[:, 0]


def compute_score(clf, X, y, K, verbose=True, fit_params=None):
    """Summarise the K-fold ROC-AUC of a list of estimators.

    ``clf`` is handed to ``cross_val_score_`` as its ``estimators`` argument
    (one estimator per fold), with ``cv=K`` and ``scoring="roc_auc"``.

    Returns
    -------
    tuple
        ``(mean, 2 * std)`` of the per-fold scores. The pair is also printed
        when ``verbose`` is true.
    """
    fold_scores = cross_val_score_(clf, X, y, cv=K, verbose=0,
                                   n_jobs=1, scoring="roc_auc",
                                   fit_params=fit_params)
    summary = (fold_scores.mean(), 2 * fold_scores.std())
    if verbose:
        print("\n AUC: %0.3f (+/- %0.3f)" % summary)
    return summary


In [5]:
with_categorical = False

# drop lines with NaN values
df = df.dropna(axis=0, how='any')

# if dataset churn: drop phone feature
if filename == 'churn':
    df = df.drop(df.columns[[3]], axis=1)

# get label (have to be the last column!)
idx_label_column = -1
labels = df.iloc[:, idx_label_column]
# Encode the labels as -1 / +1: rows sharing the first row's label become -1,
# every other row becomes +1.
labels = 2 * (labels.values != labels.values[0]) - 1
# drop it from df
df = df.drop(df.columns[[idx_label_column]], axis=1)

# shuffle and split training and test sets
X, X_test, y, y_test = train_test_split(
    df, labels, test_size=.33, random_state=0, stratify=labels)

# Release the full frame. The loading cell must be re-run before this cell
# can be executed again.
del df

# Quick-run mode: keep only the first rows and use smaller grids.
if test:
    n_restrict = 200
    C_grid_size = 4
    n_cuts_grid_size = 3
    X = X.iloc[:n_restrict, :]
    y = y[:n_restrict]
    X_test = X_test.iloc[:n_restrict, :]
    y_test = y_test[:n_restrict]
else:
    C_grid_size = 25

# get categorical features index
# (.iloc is positional; the former `.ix` indexer no longer exists in pandas)
cate_feat_idx = [
    i for i in range(X.shape[1])
    if FeaturesBinarizer._detect_feature_type(X.iloc[:, i]) == 'discrete'
]

# Nothing categorical was detected, so categorical handling stays off.
if not cate_feat_idx:
    with_categorical = False

original_feature_names = X.columns

if not with_categorical:
    # names of the columns that were not flagged as discrete
    feature_names_cont = [name
                          for i, name in enumerate(original_feature_names)
                          if i not in cate_feat_idx]
else:
    feature_names_cont = original_feature_names

n_cuts_grid = np.linspace(n_cuts_min, n_cuts_max, n_cuts_grid_size, dtype=int)

# separate continuous and categorical features
# reset_index is chained (not in-place) so that each result is a fresh frame
# rather than a mutated slice of X / X_test.
X_cat = X[X.columns[cate_feat_idx]].reset_index(drop=True)
X_test_cat = X_test[X_test.columns[cate_feat_idx]].reset_index(drop=True)

if with_categorical:
    # The binarizer is fitted on train and test rows stacked together.
    binarizer = FeaturesBinarizer()
    binarizer.fit(pd.concat([X_cat, X_test_cat], axis=0))
    X_cat_bin = pd.DataFrame(binarizer.transform(X_cat).toarray())
    X_test_cat_bin = pd.DataFrame(binarizer.transform(X_test_cat).toarray())

# X_cat / X_test_cat are kept: later cells concatenate them with the
# continuous part when with_categorical is True.

X_cont = X.drop(X.columns[cate_feat_idx], axis=1).reset_index(drop=True)
X_test_cont = X_test.drop(X_test.columns[cate_feat_idx],
                          axis=1).reset_index(drop=True)

print("Training:")
print(X.shape)
print("Test:")
print(X_test.shape)

# Center and reduce continuous data (scaler statistics come from the
# training part only)
standardscaler = StandardScaler()
X_std = pd.DataFrame(standardscaler.fit_transform(X_cont))
X_test_std = pd.DataFrame(standardscaler.transform(X_test_cont))
print("data centered and reduced")

# maximum number of examples used in the cross-validation cells
n_restrict_cv = 8000

Training:
(20100, 24)
Test:
(9900, 24)
data centered and reduced


In [6]:
# logistic regression on binarized features, binarsity penalization
model = "bina_pen_bin_feat"
print("\n launch %s" % model)

# Complete train / test matrices: used only to fit the binarizer below.
if not with_categorical:
    X_final = X_cont
    X_test_final = X_test_cont
else:
    X_final = pd.concat([X_cont, X_cat], axis=1)
    X_test_final = pd.concat([X_test_cont, X_test_cat], axis=1)

# pick a large number of cuts, then cross-validate over C
n_cuts_chosen = 30

# The binarizer is fitted on train and test rows stacked together.
binarizer = FeaturesBinarizer(n_cuts=n_cuts_chosen)
binarizer.fit(pd.concat([X_final, X_test_final], axis=0))

# Only the first n_restrict_cv rows of each set are binarized for the CV.
if not with_categorical:
    X_final = X_cont.iloc[:n_restrict_cv]
    X_test_final = X_test_cont.iloc[:n_restrict_cv]
else:
    X_final = pd.concat(
        [X_cont.iloc[:n_restrict_cv], X_cat.iloc[:n_restrict_cv]],
        axis=1)
    X_test_final = pd.concat(
        [X_test_cont.iloc[:n_restrict_cv], X_test_cat.iloc[:n_restrict_cv]],
        axis=1)

X_bin = binarizer.transform(X_final)
X_test_bin = binarizer.transform(X_test_final)


 launch bina_pen_bin_feat


In [7]:
# logistic regression on binarized features, binarsity penalization
# (repeats the binarization of the previous cell, without resetting `model`)
if not with_categorical:
    X_final = X_cont
    X_test_final = X_test_cont
else:
    X_final = pd.concat([X_cont, X_cat], axis=1)
    X_test_final = pd.concat([X_test_cont, X_test_cat], axis=1)

# pick a large number of cuts, then cross-validate over C
n_cuts_chosen = 30

# Fit on train and test rows stacked together.
binarizer = FeaturesBinarizer(n_cuts=n_cuts_chosen)
binarizer.fit(pd.concat([X_final, X_test_final], axis=0))

# Restrict both sets to their first n_restrict_cv rows before transforming.
if not with_categorical:
    X_final = X_cont.iloc[:n_restrict_cv]
    X_test_final = X_test_cont.iloc[:n_restrict_cv]
else:
    X_final = pd.concat(
        [X_cont.iloc[:n_restrict_cv], X_cat.iloc[:n_restrict_cv]],
        axis=1)
    X_test_final = pd.concat(
        [X_test_cont.iloc[:n_restrict_cv], X_test_cat.iloc[:n_restrict_cv]],
        axis=1)

X_bin = binarizer.transform(X_final)
X_test_bin = binarizer.transform(X_test_final)

## Bina or Group-TV

In [8]:
# Candidate values of C: C_grid_size points, log-spaced from 10**1 to 10**4.
C_grid = np.logspace(start=1, stop=4, num=C_grid_size)

In [9]:
model = "bina_pen_bin_feat"
print("\n launch %s" % model)


def _make_binarsity_learner(C):
    """Return an unfitted binarsity-penalized logistic regression for `C`.

    The block layout is read from the notebook-level `binarizer` at call time.
    """
    return LogisticRegression(penalty='binarsity', solver='svrg', C=C,
                              verbose=False, step=1e-3,
                              blocks_start=binarizer.feature_indices[:-1, ],
                              blocks_length=binarizer.n_values)


# Fail before the expensive loop: with any other value, C_chosen would be left
# undefined (or stale from an earlier run) once the loop has finished.
if selection not in ('min', '1st'):
    raise ValueError("selection must be 'min' or '1st', got %r" % (selection,))

# cross validation on C
cv_auc, score_test = [], []
for i, C_ in enumerate(C_grid):
    print("CV %s: %d%%" % (
        model, (i + 1) * 100 / len(C_grid)))
    stdout.flush()

    # K-fold AUC, with one fresh learner per fold
    learners = [_make_binarsity_learner(C_) for _ in range(K)]
    auc = compute_score(learners, X_bin, y[:n_restrict_cv], K,
                        verbose=False)[0]
    cv_auc.append(max(auc, 1 - auc))

    # AUC on the (restricted) test set for a learner fitted on all CV rows
    learner = _make_binarsity_learner(C_)
    learner.fit(X_bin, y[:n_restrict_cv])
    y_pred = learner.predict_proba(X_test_bin)[:, 1]
    score_test.append(roc_auc_score(y_test[:n_restrict_cv], y_pred))

avg_scores = np.array(cv_auc, dtype=float)

idx_best = avg_scores.argmax()
C_best = C_grid[idx_best]
if selection == 'min':
    # keep the C with the best CV score
    C_chosen = C_best
else:
    # '1st': smallest grid index whose CV score is within 5% of the score
    # range (max - min) below the maximum
    max_ = avg_scores.max()
    min_ = avg_scores.min()
    idx = np.flatnonzero(avg_scores >= max_ - .05 * (max_ - min_))
    idx_chosen = idx[0] if idx.size > 0 else idx_best
    C_chosen = C_grid[idx_chosen]


 launch bina_pen_bin_feat
CV bina_pen_bin_feat: 4%
CV bina_pen_bin_feat: 8%
CV bina_pen_bin_feat: 12%
CV bina_pen_bin_feat: 16%
CV bina_pen_bin_feat: 20%
CV bina_pen_bin_feat: 24%
CV bina_pen_bin_feat: 28%


KeyboardInterrupt: 

In [None]:
# learning curves
learning_curves = np.column_stack((C_grid, avg_scores, score_test))

fig = pl.figure()
ax = fig.add_subplot(111)
# Read the three curves back from the stacked table
# (columns: C, CV score, test score).
C_grid, avg_scores, score_test = learning_curves.T
C_grid_ = C_grid

# Recompute the selected C from the stored CV scores.
idx_best = avg_scores.argmax()
C_best = C_grid[idx_best]
if selection == 'min':
    C_chosen = C_best
if selection == '1st':
    max_ = avg_scores.max()
    min_ = avg_scores.min()
    # indices whose CV score is within 5% of the score range below the maximum
    idx = [pos for pos, score in enumerate(avg_scores)
           if score >= max_ - .05 * (max_ - min_)]
    idx_chosen = min(idx) if idx else idx_best
    C_chosen = C_grid[idx_chosen]

ax.set_xscale('log')
ax.plot(C_grid, avg_scores, label="AUC on CV")
ax.plot(C_grid, score_test, '-r', label="AUC on test set")
# Triangular markers sit on the lower y-limit of the curves.
y_min = ax.get_ylim()[0]
ax.plot(C_best, y_min, 'g^', ms=20, label="best C on CV")
ax.plot(C_chosen, y_min, 'r^', ms=20, label="C chosen")
fig.suptitle("Learning curves bina", fontsize=14, fontweight="bold")
ax.set_xlabel("C")
ax.set_ylabel("AUC")
ax.legend(bbox_to_anchor=(1.15, 1), loc=2, borderaxespad=0.,
          numpoints=1, markerscale=.5)
pl.show()

In [None]:
# Final model: binarize the complete train / test sets and fit with C_chosen.
if not with_categorical:
    X_final, X_test_final = X_cont, X_test_cont
else:
    X_final = pd.concat([X_cont, X_cat], axis=1)
    X_test_final = pd.concat([X_test_cont, X_test_cat], axis=1)

# The binarizer is fitted on train and test rows stacked together.
binarizer = FeaturesBinarizer(n_cuts=n_cuts_chosen)
binarizer.fit(pd.concat([X_final, X_test_final], axis=0))
X_bin = binarizer.transform(X_final)
X_test_bin = binarizer.transform(X_test_final)

# Block layout handed to the penalty: start offsets (all but the last entry
# of feature_indices) and block sizes.
blocks_start = binarizer.feature_indices[:-1]
blocks_length = binarizer.n_values

learner = LogisticRegression(
    penalty='binarsity',
    solver='svrg',
    C=C_chosen,
    verbose=False,
    step=1e-3,
    blocks_start=blocks_start,
    blocks_length=blocks_length,
)
start = time()
learner.fit(X_bin, y)
y_pred = learner.predict_proba(X_test_bin)[:, 1]

# Report whichever of AUC and 1 - AUC is larger.
auc = roc_auc_score(y_test, y_pred)
auc = max(auc, 1 - auc)

print("\n %s done, AUC: %.3f" % (model, auc))

coeffs = learner.weights

In [None]:
# Stem plot of the fitted weights, with a dashed green line at each block start.
fig = pl.figure(figsize=(13, 5))
ax = fig.add_subplot(111)
for block_begin in blocks_start:
    ax.axvline(block_begin, color='g', linestyle='--')
ax.set_xlabel("Coefs values", fontsize=12)
ax.set_ylabel("Beta coeffs", fontsize=12)
ax.stem(coeffs, 'b', markerfmt='ob')

# leave a margin of 5 positions on both sides
ax.set_xlim([-5, len(coeffs) + 5])
pl.show()

## Group-L1

In [None]:
# logistic regression on binarized features, group-L1 penalization
# The label used to be the copy-pasted "bina_pen_bin_feat", so every progress
# and result line printed for the group-L1 runs named the wrong model.
model = "group_l1_pen_bin_feat"
print("\n launch %s" % model)

# Complete train / test matrices: used only to fit the binarizer below.
if with_categorical:
    X_final = pd.concat([X_cont, X_cat], axis=1)
    X_test_final = pd.concat([X_test_cont, X_test_cat], axis=1)
else:
    X_final = X_cont
    X_test_final = X_test_cont

# pick a large number of cuts, then cross-validate over C
n_cuts_chosen = 30

# The binarizer is fitted on train and test rows stacked together.
binarizer = FeaturesBinarizer(n_cuts=n_cuts_chosen)
binarizer.fit(pd.concat([X_final, X_test_final], axis=0))

# Only the first n_restrict_cv rows of each set are binarized for the CV.
if with_categorical:
    X_final = pd.concat([X_cont.iloc[:n_restrict_cv, :],
                         X_cat.iloc[:n_restrict_cv, :]],
                        axis=1)
    X_test_final = pd.concat([X_test_cont.iloc[:n_restrict_cv, :],
                              X_test_cat.iloc[:n_restrict_cv, :]],
                             axis=1)

else:
    X_final = X_cont.iloc[:n_restrict_cv, :]
    X_test_final = X_test_cont.iloc[:n_restrict_cv, :]

X_bin = binarizer.transform(X_final)
X_test_bin = binarizer.transform(X_test_final)

In [None]:
# cross validation on C

C_grid = np.logspace(1, 4, C_grid_size)


def _make_group_l1_learner(C):
    """Return an unfitted group-L1-penalized logistic regression for `C`.

    The block layout is read from the notebook-level `binarizer` at call time.
    """
    return LogisticRegression(penalty='group-L1', solver='svrg', C=C,
                              verbose=False, step=1e-3,
                              blocks_start=binarizer.feature_indices[:-1, ],
                              blocks_length=binarizer.n_values)


# Fail before the expensive loop: with any other value, C_chosen would be left
# undefined (or stale from an earlier run) once the loop has finished.
if selection not in ('min', '1st'):
    raise ValueError("selection must be 'min' or '1st', got %r" % (selection,))

cv_auc, score_test = [], []
for i, C_ in enumerate(C_grid):
    print("CV %s: %d%%" % (
        model, (i + 1) * 100 / len(C_grid)))
    stdout.flush()

    # K-fold AUC, with one fresh learner per fold
    learners = [_make_group_l1_learner(C_) for _ in range(K)]
    auc = compute_score(learners, X_bin, y[:n_restrict_cv], K,
                        verbose=False)[0]
    cv_auc.append(max(auc, 1 - auc))

    # AUC on the (restricted) test set for a learner fitted on all CV rows
    learner = _make_group_l1_learner(C_)
    learner.fit(X_bin, y[:n_restrict_cv])
    y_pred = learner.predict_proba(X_test_bin)[:, 1]
    score_test.append(roc_auc_score(y_test[:n_restrict_cv], y_pred))

avg_scores = np.array(cv_auc, dtype=float)

idx_best = avg_scores.argmax()
C_best = C_grid[idx_best]
if selection == 'min':
    # keep the C with the best CV score
    C_chosen = C_best
else:
    # '1st': smallest grid index whose CV score is within 5% of the score
    # range (max - min) below the maximum
    max_ = avg_scores.max()
    min_ = avg_scores.min()
    idx = np.flatnonzero(avg_scores >= max_ - .05 * (max_ - min_))
    idx_chosen = idx[0] if idx.size > 0 else idx_best
    C_chosen = C_grid[idx_chosen]

In [None]:
# The seaborn / pylab / prettytable / sklearn.metrics imports that used to be
# repeated here are dropped: the notebook's first cell already contains the
# same import lines.

# learning curves
learning_curves = np.column_stack((C_grid, avg_scores, score_test))

fig = pl.figure()
ax = fig.add_subplot(111)
# Read the three curves back from the stacked table
# (columns: C, CV score, test score).
C_grid = learning_curves[:, 0]
C_grid_ = C_grid
avg_scores = learning_curves[:, 1]
score_test = learning_curves[:, 2]

# Recompute the selected C from the stored CV scores.
idx_best = np.unravel_index(avg_scores.argmax(),
                            avg_scores.shape)[0]
C_best = C_grid[idx_best]
if selection == 'min':
    C_chosen = C_best
if selection == '1st':
    max_ = avg_scores.max()
    min_ = avg_scores.min()
    # indices whose CV score is within 5% of the score range below the maximum
    idx = [i for i, is_up in enumerate(
        list(avg_scores >= max_ - .05 * (max_ - min_)))
           if is_up]
    idx_chosen = min(idx) if len(idx) > 0 else idx_best
    C_chosen = C_grid[idx_chosen]

pl.xscale('log')
ax.plot(C_grid, avg_scores, label="AUC on CV")
ax.plot(C_grid, score_test, '-r',
        label="AUC on test set")
# Triangular markers sit on the lower y-limit of the curves.
y_min = ax.get_ylim()[0]
ax.plot(C_best, y_min, 'g^', ms=20, label="best C on CV")
ax.plot(C_chosen, y_min, 'r^', ms=20, label="C chosen")
# Title names the penalty of this section (it used to say "bina").
pl.suptitle("Learning curves group-L1",
            fontsize=14, fontweight="bold")
pl.xlabel("C")
pl.ylabel("AUC")
pl.legend(bbox_to_anchor=(1.15, 1), loc=2, borderaxespad=0.,
          numpoints=1, markerscale=.5)
pl.show()

In [None]:
# Final model: binarize the complete train / test sets and fit with C_chosen.
if not with_categorical:
    X_final, X_test_final = X_cont, X_test_cont
else:
    X_final = pd.concat([X_cont, X_cat], axis=1)
    X_test_final = pd.concat([X_test_cont, X_test_cat], axis=1)

# The binarizer is fitted on train and test rows stacked together.
binarizer = FeaturesBinarizer(n_cuts=n_cuts_chosen)
binarizer.fit(pd.concat([X_final, X_test_final], axis=0))
X_bin = binarizer.transform(X_final)
X_test_bin = binarizer.transform(X_test_final)

# Block layout handed to the penalty: start offsets (all but the last entry
# of feature_indices) and block sizes.
blocks_start = binarizer.feature_indices[:-1]
blocks_length = binarizer.n_values

learner = LogisticRegression(
    penalty='group-L1',
    solver='svrg',
    C=C_chosen,
    verbose=False,
    step=1e-3,
    blocks_start=blocks_start,
    blocks_length=blocks_length,
)
start = time()
learner.fit(X_bin, y)
y_pred = learner.predict_proba(X_test_bin)[:, 1]

# Report whichever of AUC and 1 - AUC is larger.
auc = roc_auc_score(y_test, y_pred)
auc = max(auc, 1 - auc)

print("\n %s done, AUC: %.3f" % (model, auc))

coeffs = learner.weights

In [None]:
# Stem plot of the fitted weights, with a dashed green line at each block start.
fig = pl.figure(figsize=(13, 5))
ax = fig.add_subplot(111)
for block_begin in blocks_start:
    ax.axvline(block_begin, color='g', linestyle='--')
ax.set_xlabel("Coefs values", fontsize=12)
ax.set_ylabel("Beta coeffs", fontsize=12)
ax.stem(coeffs, 'b', markerfmt='ob')

# leave a margin of 5 positions on both sides
ax.set_xlim([-5, len(coeffs) + 5])
pl.show()