In [1]:
import os
import numpy as np
import pandas as pd
from sys import stdout, argv
from prettytable import PrettyTable
from time import time
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from tick.preprocessing import FeaturesBinarizer
from matplotlib.ticker import FormatStrFormatter
import pylab as pl
import seaborn as sns
from pygam import LogisticGAM
import warnings
warnings.filterwarnings('ignore')



In [2]:
# --- Run configuration -------------------------------------------------------
filename = 'susy'      # dataset name: both the sub-directory of ./datasets and the data file in it
test = False           # True: later cells keep only a small subset of rows for a quick run
header = None          # None: the file has no header row; "infer": first row holds column names
directory = 'results'  # results sub-directory (relative to the dataset directory)

# Move into the dataset directory so that the data file and the results paths
# used further down resolve relative to it.
# The chdir is only performed when the dataset directory is reachable from the
# current working directory: once this cell has run, the kernel already sits
# inside it, and an unconditional chdir would fail on re-execution.
dataset_dir = './datasets/%s' % filename
if os.path.isdir(dataset_dir):
    os.chdir(dataset_dir)

df = pd.read_csv('./%s' % filename, header=header)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.972861,0.653855,1.176225,1.157156,-1.739873,-0.874309,0.567765,-0.175,0.810061,-0.252552,1.921887,0.889637,0.410772,1.145621,1.932632,0.994464,1.367815,0.040714,0
1,0.44484,-0.134298,-0.709972,0.451719,-1.613871,-0.768661,1.219918,0.504026,1.831248,-0.431385,0.526283,0.941514,1.587535,2.024308,0.603498,1.562374,1.135454,0.18091,1
2,0.381256,-0.976145,0.693152,0.448959,0.891753,-0.677328,2.03306,1.533041,3.04626,-1.005285,0.569386,1.015211,1.582217,1.551914,0.761215,1.715464,1.492257,0.090719,1
3,0.456398,1.099371,1.512453,0.751772,0.638967,-0.742216,0.322601,1.321054,0.169502,0.359941,0.489256,0.416168,0.754829,0.30375,0.461067,0.345541,0.733242,0.186044,0
4,0.437818,-1.119883,-1.336823,0.50232,-1.717515,1.017067,0.215619,-0.4612,0.323671,0.173626,0.411898,0.370525,0.79826,0.671369,0.38591,0.515522,0.47911,0.029058,0


In [3]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

# The churn dataset has a phone feature at column position 3: remove it.
if filename == 'churn':
    phone_column = df.columns[3]
    df = df.drop([phone_column], axis=1)

# The label is expected to be the last column of the frame.
idx_label_column = -1
raw_labels = df.iloc[:, idx_label_column].values
# Binary encoding: rows sharing the first row's label value become 0,
# every other row becomes 1.
labels = (raw_labels != raw_labels[0]).astype(int)

# From here on df only holds the features.
label_column_name = df.columns[idx_label_column]
df = df.drop([label_column_name], axis=1)

# Seeded, stratified shuffle: 67% of the rows for training, 33% for testing.
split = train_test_split(
    df, labels, test_size=.33, random_state=0, stratify=labels)
X, X_test, y, y_test = split

In [4]:
# Quick-run mode: keep only the first rows of the training and test sets.
if test:
    n_restrict = 200
    X = X.iloc[:n_restrict, :]
    y = y[:n_restrict]
    X_test = X_test.iloc[:n_restrict, :]
    y_test = y_test[:n_restrict]

# Positional indices of the columns that FeaturesBinarizer classifies as
# discrete. The columns are accessed with .iloc because `i` is a position
# (it is used positionally on X.columns below); a label-based lookup would
# select the wrong column, or fail, whenever the integer column labels no
# longer match the positions (e.g. after a column has been dropped).
cate_feat_idx = [
    i for i in range(X.shape[1])
    if FeaturesBinarizer._detect_feature_type(X.iloc[:, i]) == 'discrete'
]

original_feature_names = X.columns

# Names of the features that were not detected as discrete.
feature_names_cont = [
    name for i, name in enumerate(original_feature_names)
    if i not in cate_feat_idx
]

# Separate continuous and categorical features. reset_index returns fresh
# frames with a 0..n-1 index, so X and X_test themselves are left untouched.
X_cat = X[X.columns[cate_feat_idx]].reset_index(drop=True)
X_test_cat = X_test[X_test.columns[cate_feat_idx]].reset_index(drop=True)

X_cont = X.drop(X.columns[cate_feat_idx], axis=1).reset_index(drop=True)
X_test_cont = X_test.drop(
    X_test.columns[cate_feat_idx], axis=1).reset_index(drop=True)

print("Training:")
print(X.shape)
print("Test:")
print(X_test.shape)

# Center and reduce continuous data. The scaler is fitted on the training set
# only and then applied unchanged to the test set.
standardscaler = StandardScaler()
X_std = pd.DataFrame(standardscaler.fit_transform(X_cont))
X_test_std = pd.DataFrame(standardscaler.transform(X_test_cont))
print("data centered and reduced")

Training:
(2638545, 18)
Test:
(1299582, 18)
data centered and reduced


In [11]:
model = "GAM"
lam = 4               # `lam` argument passed to LogisticGAM
n_splines = 5         # `n_splines` argument passed to LogisticGAM
n_restrict = 300000   # number of training rows used to fit the model

t = PrettyTable(['Algos', 'AUC', 'time'])

# Update the stored hyper-parameters of this model with the values used here.
# The file holds a pickled dict written by np.save, hence allow_pickle=True
# (required by recent NumPy versions; only load files produced by this project).
best_params_path = './%s/learning_curves/best_params_%s.npy' % (directory, model)
best_params = np.load(best_params_path, allow_pickle=True).item()
best_params['lam'] = lam
best_params['n_splines'] = n_splines
# Saved back to the very file it was loaded from (same `directory`).
np.save(best_params_path, best_params)

# The reported time covers fitting, prediction, saving the arrays and the AUC
# computation.
start = time()
gam = LogisticGAM(dtype='numerical', lam=lam, n_splines=n_splines)
gam.fit(X_std.iloc[:n_restrict, :], y[:n_restrict])
y_pred = gam.predict_proba(X_test_std)
np.save('./%s/y_pred/9-%s' % (directory, model), y_pred)
np.save('./%s/y_test' % directory, y_test)
auc = roc_auc_score(y_test, y_pred)
# Report the AUC independently of which class was encoded as 1:
# a score below 0.5 is mirrored.
auc = max(auc, 1 - auc)

t.add_row([model, "%g" % auc, "%.3f" % (time() - start)])

# Final performances comparison
print(t)
# The context manager closes the file even if the write fails.
with open("./%s/results.txt" % directory, "w") as results:
    results.write('%s' % t)

  coef = coef[fit_linear:]
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  return dist.levels/(mu*(dist.levels - mu))
  return sp.sparse.diags((self.link.gradient(mu, self.distribution)**2 * self.distribution.V(mu=mu))**-0.5)
  mask = (np.abs(weights) >= np.sqrt(EPS)) * (weights != np.nan)


+-------+----------+--------+
| Algos |   AUC    |  time  |
+-------+----------+--------+
|  GAM  | 0.856431 | 42.543 |
+-------+----------+--------+
