Skip to content

Commit

Permalink
Merge pull request #158 from X-DataInitiative/TICK-361
Browse files Browse the repository at this point in the history
Add ConvSCCS model + refactoring of SCCS-related stuff (preprocessing, simulations, etc.)
  • Loading branch information
MaryanMorel committed Mar 22, 2018
2 parents 730d91b + 1f9c926 commit 088393b
Show file tree
Hide file tree
Showing 27 changed files with 2,334 additions and 615 deletions.
1 change: 1 addition & 0 deletions doc/modules/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ similar API to scikit-learn preprocessors.

LongitudinalFeaturesProduct
LongitudinalFeaturesLagger
LongitudinalSamplesFilter
4 changes: 4 additions & 0 deletions doc/modules/survival.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Kaplan-Meier estimators.
CoxRegression
nelson_aalen
kaplan_meier
ConvSCCS

2. Models
=========
Expand Down Expand Up @@ -69,3 +70,6 @@ Example

.. plot:: ../examples/plot_simulation_coxreg.py
:include-source:

.. plot:: ../examples/plot_conv_sccs_cv_results.py
:include-source:
145 changes: 145 additions & 0 deletions examples/plot_conv_sccs_cv_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""
====================================================================
ConvSCCS cross validation on simulated longitudinal features example
====================================================================
In this example we simulate longitudinal data with preset relative incidence
for each feature. We then perform a cross validation of the ConvSCCS model
and compare the estimated coefficients to the relative incidences used for
the simulation.
"""
from time import time
import numpy as np
from scipy.sparse import csr_matrix, hstack
from matplotlib import cm
import matplotlib.pylab as plt
from tick.survival.simu_sccs import CustomEffects
from tick.survival import SimuSCCS, ConvSCCS
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Simulation parameters
seed = 0
lags = 49
n_samples = 2000
n_intervals = 750
n_corr = 3

# Relative incidence functions used for the simulation
ce = CustomEffects(lags + 1)
null_effect = [ce.constant_effect(1)] * 2
intermediate_effect = ce.bell_shaped_effect(2, 30, 15, 15)
late_effects = ce.increasing_effect(2, curvature_type=4)

sim_effects = [*null_effect, intermediate_effect, late_effects]

n_features = len(sim_effects)
n_lags = np.repeat(lags, n_features).astype('uint64')

coeffs = [np.log(c) for c in sim_effects]

# Time drift (age effect) used for the simulations.
time_drift = lambda t: np.log(8 * np.sin(.01 * t) + 9)

# Simaltion of the features.
sim = SimuSCCS(n_samples, n_intervals, n_features, n_lags,
time_drift=time_drift, coeffs=coeffs, seed=seed,
n_correlations=n_corr, verbose=False)
features, censored_features, labels, censoring, coeffs = sim.simulate()

# Plot the Hawkes kernel matrix used to generate the features.
fig, ax = plt.subplots(figsize=(7, 6))
heatmap = ax.pcolor(sim.hawkes_exp_kernels.adjacency, cmap=cm.Blues)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.5)
fig.colorbar(heatmap, cax=cax)
ax.set_title('Hawkes adjacency matrix used for the simulation');
plt.show()

## Add age_groups features to feature matrices.
agegrps = [0, 125, 250, 375, 500, 625, 750]
n_agegrps = len(agegrps) - 1

feat_agegrp = np.zeros((n_intervals, n_agegrps))
for i in range(n_agegrps):
feat_agegrp[agegrps[i]:agegrps[i + 1], i] = 1

feat_agegrp = csr_matrix(feat_agegrp)
features = [hstack([f, feat_agegrp]).tocsr() for f in features]
censored_features = [hstack([f, feat_agegrp]).tocsr() for f in
censored_features]
n_lags = np.hstack([n_lags, np.zeros(n_agegrps)])

# Learning
# Example code for cross validation
# start = time()
# learner = ConvSCCS(n_lags=n_lags.astype('uint64'),
# penalized_features=np.arange(n_features),
# random_state=42)
# C_TV_range = (1, 4)
# C_L1_range = (2, 5)
# _, cv_track = learner.fit_kfold_cv(features, labels, censoring,
# C_TV_range, C_L1_range,
# confidence_intervals=True,
# n_samples_bootstrap=20, n_cv_iter=50)
# elapsed_time = time() - start
# print("Elapsed time (model training): %.2f seconds \n" % elapsed_time)
# print("Best model hyper parameters: \n")
# print("C_tv : %f \n" % cv_track.best_model['C_tv'])
# print("C_group_l1 : %f \n" % cv_track.best_model['C_group_l1'])
# cv_track.plot_cv_report(35, 45)
# plt.show()
# confidence_intervals = cv_track.best_model['confidence_intervals']

# using the parameters resulting from cross-validation
learner = ConvSCCS(n_lags=n_lags.astype('uint64'),
penalized_features=np.arange(n_features),
random_state=42, C_tv=270.2722840570933,
C_group_l1=5216.472772625124)

_, confidence_intervals = learner.fit(features, labels, censoring,
confidence_intervals=True,
n_samples_bootstrap=20)

# Plot estimated parameters
# get bootstrap confidence intervals
refitted_coeffs = confidence_intervals['refit_coeffs']
lower_bound = confidence_intervals['lower_bound']
upper_bound = confidence_intervals['upper_bound']

n_rows = int(np.ceil(n_features / 2))
remove_last_plot = (n_features % 2 != 0)

fig, axarr = plt.subplots(n_rows, 2, sharex=True, sharey=True, figsize=(10, 6))
y = confidence_intervals['refit_coeffs']
lb = confidence_intervals['lower_bound']
ub = confidence_intervals['upper_bound']
for i, c in enumerate(y[:-6]):
ax = axarr[i // 2][i % 2]
l = n_lags[i]
ax.plot(np.exp(coeffs[i]), label="True RI")
ax.step(np.arange(l + 1), np.exp(c), label="Estimated RI")
ax.fill_between(np.arange(l + 1), np.exp(lb[i]), np.exp(ub[i]), alpha=.5,
color='orange', step='pre', label="95% boostrap CI")
plt.suptitle('Estimated relative risks with 95% confidence bands')
axarr[0][1].legend(loc='best')
[ax[0].set_ylabel('Relative incidence') for ax in axarr]
[ax.set_xlabel('Time after exposure start') for ax in axarr[-1]]
if remove_last_plot:
fig.delaxes(axarr[-1][-1])
plt.show()

normalize = lambda x: x / np.sum(x)
m = np.repeat(np.hstack(refitted_coeffs[-6:]), 125)
lb = np.repeat(np.hstack(lower_bound[-6:]), 125)
ub = np.repeat(np.hstack(upper_bound[-6:]), 125)
plt.figure()
plt.plot(np.arange(n_intervals),
normalize(np.exp(time_drift(np.arange(n_intervals)))))
plt.step(np.arange(n_intervals), normalize(np.exp(m)))
plt.fill_between(np.arange(n_intervals), np.exp(lb) / np.exp(m).sum(),
np.exp(ub) / np.exp(m).sum(), alpha=.5, color='orange',
step='pre')
plt.xlabel('Age')
plt.ylabel('Normalized Age Relative Incidence')
plt.title("Normalized age effect with 95% confidence bands");
plt.show()
67 changes: 43 additions & 24 deletions lib/cpp/preprocessing/longitudinal_features_lagger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,27 @@
#include "tick/preprocessing/longitudinal_features_lagger.h"

LongitudinalFeaturesLagger::LongitudinalFeaturesLagger(
const SBaseArrayDouble2dPtrList1D &features, const ulong n_lags)
const SBaseArrayDouble2dPtrList1D &features,
const SArrayULongPtr n_lags)
: n_intervals(features[0]->n_rows()),
n_lags(n_lags),
n_samples(features.size()),
n_observations(n_samples * n_intervals),
n_features(features[0]->n_cols()),
n_lagged_features(n_features * (n_lags + 1)) {
if (n_lags >= n_intervals) {
TICK_ERROR("n_lags must be between 0 and (n_intervals - 1)");
n_lagged_features(n_lags->sum() + n_lags->size()) {
col_offset = ArrayULong(n_lags->size());
col_offset.init_to_zero();
if (n_features != n_lags->size()) {
TICK_ERROR("Features matrix column number should match n_lags length.");
}
if ((*n_lags)[0] >= n_intervals) {
TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
}
for (ulong i(1); i < n_lags->size(); i++) {
if ((*n_lags)[i] >= n_intervals) {
TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
}
col_offset[i] = col_offset[i - 1] + (*n_lags)[i-1] + 1;
}
}

Expand All @@ -24,22 +36,22 @@ void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features,
ulong censoring) const {
if (out.n_cols() != n_lagged_features) {
TICK_ERROR(
"n_columns of &out is inconsistent with n_features * (n_lags + 1).");
"n_columns of &out should be equal to n_features + sum(n_lags).");
}
if (out.n_rows() != n_intervals) {
TICK_ERROR("n_rows of &out is inconsistent with n_intervals");
}
ulong sample, row, col;
ulong n_cols_feature, row, col, max_col;
double value;
for (ulong feature = 0; feature < n_features; feature++) {
n_cols_feature = (*n_lags)[feature] + 1;
for (ulong j = 0; j < n_intervals; j++) {
row = j;
sample = row / n_intervals;
col = feature * (n_lags + 1);
col = col_offset[feature];
value = features(row, feature);
max_col = col + n_cols_feature;
if (value != 0) {
while (row < censoring && row / n_intervals == sample &&
col / (n_lags + 1) == feature) {
while (row < censoring && col < max_col) {
out[row * n_lagged_features + col] = value;
row++;
col++;
Expand All @@ -49,23 +61,30 @@ void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features,
}
}

void LongitudinalFeaturesLagger::sparse_lag_preprocessor(
ArrayULong &row, ArrayULong &col, ArrayDouble &data, ArrayULong &out_row,
ArrayULong &out_col, ArrayDouble &out_data, ulong censoring) const {
ulong j = 0;
for (ulong i = 0; i < row.size(); i++) {
double value = data[i];
ulong r = row[i];
ulong c = col[i] * (n_lags + 1);
ulong sample = r / n_intervals;
ulong feature = c / (n_lags + 1);
while (r < censoring && r / n_intervals == sample &&
c / (n_lags + 1) == feature) {
void LongitudinalFeaturesLagger::sparse_lag_preprocessor(ArrayULong &row,
ArrayULong &col,
ArrayDouble &data,
ArrayULong &out_row,
ArrayULong &out_col,
ArrayDouble &out_data,
ulong censoring) const {
ulong j(0), r, c, offset, new_col, max_col;
double value;

for (ulong i = 0; i < data.size(); i++) {
value = data[i];
r = row[i];
c = col[i];
offset = col_offset[c];
max_col = offset + (*n_lags)[c] + 1;
new_col = offset;

while (r < censoring && new_col < max_col) {
out_row[j] = r;
out_col[j] = c;
out_col[j] = new_col;
out_data[j] = value;
r++;
c++;
new_col++;
j++;
}
}
Expand Down
46 changes: 25 additions & 21 deletions lib/cpp/survival/model_sccs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,32 @@

ModelSCCS::ModelSCCS(const SBaseArrayDouble2dPtrList1D &features,
const SArrayIntPtrList1D &labels,
const SBaseArrayULongPtr censoring, ulong n_lags)
: n_lags(n_lags), labels(labels), features(features), censoring(censoring) {
if (features.size() == 0) TICK_ERROR("ModelSCCS: features empty");

n_samples = features.size();
n_intervals = features[0]->n_rows();
n_observations = (n_samples * n_intervals);

n_lagged_features = features[0]->n_cols();
n_features =
n_lags > 0 ? n_lagged_features / (n_lags + 1) : n_lagged_features;

if (n_lags >= n_intervals)
TICK_ERROR("ModelSCCS requires n_lags < n_intervals");
const SBaseArrayULongPtr censoring,
const SArrayULongPtr n_lags)
: n_intervals(features[0]->n_rows()),
n_lags(n_lags),
n_samples(features.size()),
n_observations(n_samples * n_intervals),
n_lagged_features(n_lags->sum() + n_lags->size()),
n_features(n_lags->size()),
labels(labels),
features(features),
censoring(censoring) {
if ((*n_lags)[0] >= n_intervals) {
TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
}
col_offset = ArrayULong(n_lags->size());
col_offset.init_to_zero();
for (ulong i(1); i < n_lags->size(); i++) {
if ((*n_lags)[i] >= n_intervals) {
TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
}
col_offset[i] = col_offset[i - 1] + (*n_lags)[i-1] + 1;
}

if (n_samples != labels.size() || n_samples != (*censoring).size())
TICK_ERROR("features, labels and censoring should have equal length.");

if (n_lags > 0 && n_lagged_features % (n_lags + 1) != 0)
TICK_ERROR(
"n_lags should be a divisor of the number of feature matrices "
"columns.");

for (ulong i(0); i < n_samples; i++) {
if (features[i]->n_rows() != n_intervals)
TICK_ERROR("All feature matrices should have " << n_intervals << " rows");
Expand Down Expand Up @@ -62,7 +65,8 @@ double ModelSCCS::loss_i(const ulong i, const ArrayDouble &coeffs) {

for (ulong t = 0; t < max_interval; t++)
inner_prod[t] = get_inner_prod(i, t, coeffs);
for (ulong t = max_interval; t < n_intervals; t++) inner_prod[t] = 0;
if (max_interval < n_intervals)
view(inner_prod, max_interval, n_intervals).fill(0);

softMax(inner_prod, softmax);

Expand Down Expand Up @@ -98,7 +102,7 @@ void ModelSCCS::grad_i(const ulong i, const ArrayDouble &coeffs,
inner_prod[t] = get_inner_prod(i, t, coeffs);

if (max_interval < n_intervals)
view(inner_prod, max_interval, n_intervals).fill(0); // TODO
view(inner_prod, max_interval, n_intervals).fill(0);

double x_max = inner_prod.max();
double sum_exp = sumExpMinusMax(inner_prod, x_max);
Expand Down
6 changes: 4 additions & 2 deletions lib/include/tick/preprocessing/longitudinal_features_lagger.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,16 @@
class LongitudinalFeaturesLagger {
protected:
ulong n_intervals;
ulong n_lags;
SArrayULongPtr n_lags;
ArrayULong col_offset;
ulong n_samples;
ulong n_observations;
ulong n_features;
ulong n_lagged_features;

public:
LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features,
const ulong n_lags);
const SArrayULongPtr n_lags);

void dense_lag_preprocessor(ArrayDouble2d &features, ArrayDouble2d &out,
ulong censoring) const;
Expand All @@ -36,6 +37,7 @@ class LongitudinalFeaturesLagger {
void serialize(Archive &ar) {
ar(CEREAL_NVP(n_intervals));
ar(CEREAL_NVP(n_lags));
ar(CEREAL_NVP(col_offset));
ar(CEREAL_NVP(n_samples));
ar(CEREAL_NVP(n_observations));
ar(CEREAL_NVP(n_features));
Expand Down
6 changes: 4 additions & 2 deletions lib/include/tick/survival/model_sccs.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ class DLL_PUBLIC ModelSCCS : public ModelLipschitz {

protected:
ulong n_intervals;
ulong n_lags;
SArrayULongPtr n_lags;
ArrayULong col_offset;
ulong n_samples;
ulong n_observations;
ulong n_lagged_features;
Expand All @@ -34,7 +35,8 @@ class DLL_PUBLIC ModelSCCS : public ModelLipschitz {
public:
ModelSCCS(const SBaseArrayDouble2dPtrList1D &features,
const SArrayIntPtrList1D &labels,
const SBaseArrayULongPtr censoring, ulong n_lags);
const SBaseArrayULongPtr censoring,
const SArrayULongPtr n_lags);

double loss(const ArrayDouble &coeffs) override;

Expand Down
2 changes: 1 addition & 1 deletion lib/swig/preprocessing/longitudinal_features_lagger.i
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class LongitudinalFeaturesLagger {

public:
LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features,
const ulong n_lags);
const SArrayULongPtr n_lags);

void dense_lag_preprocessor(ArrayDouble2d &features,
ArrayDouble2d &out,
Expand Down

0 comments on commit 088393b

Please sign in to comment.