Bugfix in bootstrap and simplify example

X-DataInitiative · Feb 7, 2018 · 594b316 · 594b316
1 parent 8497968
commit 594b316
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 55 deletions.
diff --git a/examples/plot_conv_sccs_cv_results.py b/examples/plot_conv_sccs_cv_results.py
@@ -37,38 +37,13 @@
 
 coeffs = np.log(np.hstack(sim_effects))
 
-# Plot "true" relative incidences functions
-n_plots = np.sum([1 for l in n_lags if l > 0])
-n_rows = int(np.ceil(n_plots / 2))
-fig, axarr = plt.subplots(n_rows, 2, sharex=True, sharey=True, figsize=(10, 6))
-offset = 0
-c = 0
-for l in n_lags:
-    if l == 0:
-        offset += 1
-        continue
-    end = int(offset + l + 1)
-    ax = axarr[c // 2][c % 2]
-    ax.plot(np.exp(coeffs[offset:end]))
-    offset = end
-    c += 1
-plt.suptitle('Relative risks used for simulations')
-axarr[-1][0].set_ylabel('Relative incidence')
-axarr[-1][1].set_xlabel('Time after exposure start')
-plt.show()
-
 # Time drift (age effect) used for the simulations.
 time_drift = lambda t: np.log(8 * np.sin(.01 * t) + 9)
-plt.plot(np.exp(time_drift(np.arange(750))))
-plt.title('Age effect used for simulations')
-plt.xlabel('Age')
-plt.ylabel('Age Relative Incidence')
-plt.show()
 
 # Simaltion of the features.
 sim = SimuSCCS(n_samples, n_intervals, n_features, n_lags,
                time_drift=time_drift, coeffs=coeffs, seed=seed,
-               n_correlations=n_corr)
+               n_correlations=n_corr, verbose=False)
 features, censored_features, labels, censoring, coeffs = sim.simulate()
 
 # Plot the Hawkes kernel matrix used to generate the features.
@@ -77,7 +52,7 @@
 divider = make_axes_locatable(ax)
 cax = divider.append_axes("right", size="5%", pad=0.5)
 fig.colorbar(heatmap, cax=cax)
-ax.set_title('Hawkes adjacency matrix');
+ax.set_title('Hawkes adjacency matrix used for the simulation');
 plt.show()
 
 ## Add age_groups features to feature matrices.
@@ -95,24 +70,35 @@
 n_lags = np.hstack([n_lags, np.zeros(n_agegrps)])
 
 # Learning
-start = time()
+# Example code for cross validation
+# start = time()
+# learner = ConvSCCS(n_lags=n_lags.astype('uint64'),
+#                    penalized_features=np.arange(n_features),
+#                    random_state=42)
+# strength_TV_range = (-4, -1)
+# strength_L1_range = (-5, -2)
+# _, cv_track = learner.fit_kfold_cv(features, labels, censoring,
+#                            strength_TV_range, strength_L1_range,
+#                            bootstrap=True, bootstrap_rep=50, n_cv_iter=50)
+# elapsed_time = time() - start
+# print("Elapsed time (model training): %.2f seconds \n" % elapsed_time)
+# print("Best model hyper parameters: \n", cv_track.best_model['strength'])
+# cv_track.plot_cv_report(35, 45)
+# plt.show()
+
+# using the parameters resulting from cross-validation
 learner = ConvSCCS(n_lags=n_lags.astype('uint64'),
                    penalized_features=np.arange(n_features),
-                   random_state=42)
-strength_TV_range = (-4, -1)
-strength_L1_range = (-5, -2)
-res = learner.fit_kfold_cv(features, labels, censoring, strength_TV_range,
-                           strength_L1_range, bootstrap=True, bootstrap_rep=50,
-                           n_cv_iter=50)
-elapsed_time = time() - start
-print("Elapsed time (model training): %.2f seconds \n" % elapsed_time)
-cv_track = res[1]
-print("Best model hyper parameters: \n", cv_track.best_model['strength'])
-cv_track.plot_cv_report(35, 45)
-plt.show()
+                   random_state=42, strength_tv=0.0036999724314638062,
+                   strength_group_l1=0.0001917004158917066)
+
+_, bootstrap_ci = learner.fit(features, labels, censoring,
+                              bootstrap=True, bootstrap_rep=20)
 
 # Plot estimated parameters
-bootstrap_ci = cv_track.best_model['bootstrap_ci']
+# when using cross validation output:
+# bootstrap_ci = cv_track.best_model['bootstrap_ci']
+# get bootstrap confidence intervals
 refitted_coeffs = bootstrap_ci['refit_coeffs']
 lower_bound = bootstrap_ci['lower_bound']
 upper_bound = bootstrap_ci['upper_bound']
@@ -131,14 +117,15 @@
     lb = lower_bound[offset:end]
     ub = upper_bound[offset:end]
     ax = axarr[c // 2][c % 2]
-    ax.plot(np.exp(coeffs[offset:end]))
-    ax.plot(np.exp(m))
+    ax.plot(np.exp(coeffs[offset:end]), label="True RI")
+    ax.step(np.arange(l+1), np.exp(m), label="Estimated RI")
     ax.fill_between(np.arange(l + 1), np.exp(lb), np.exp(ub), alpha=.5,
-                    color='orange')
+                    color='orange', step='pre', label="95% boostrap CI")
     offset = end
     c += 1
 
 plt.suptitle('Estimated relative risks with 95% confidence bands')
+axarr[0][1].legend(loc='upper right')
 axarr[0][0].set_ylabel('Relative incidence')
 axarr[1][0].set_ylabel('Relative incidence')
 axarr[-1][0].set_xlabel('Time after exposure start')
@@ -150,11 +137,13 @@
 m = np.repeat(refitted_coeffs[offset:], 125)
 lb = np.repeat(lower_bound[offset:], 125)
 ub = np.repeat(upper_bound[offset:], 125)
+plt.figure()
 plt.plot(np.arange(n_intervals),
          normalize(np.exp(time_drift(np.arange(n_intervals)))))
-plt.plot(np.arange(n_intervals), normalize(np.exp(m)))
+plt.step(np.arange(n_intervals), normalize(np.exp(m)))
 plt.fill_between(np.arange(n_intervals), np.exp(lb) / np.exp(m).sum(),
-                 np.exp(ub) / np.exp(m).sum(), alpha=.5, color='orange')
+                 np.exp(ub) / np.exp(m).sum(), alpha=.5, color='orange',
+                 step='pre')
 plt.xlabel('Age')
 plt.ylabel('Normalized Age Relative Incidence')
 plt.title("Normalized age effect with 95% confidence bands");

diff --git a/lib/cpp/preprocessing/longitudinal_features_lagger.cpp b/lib/cpp/preprocessing/longitudinal_features_lagger.cpp
@@ -23,7 +23,7 @@ LongitudinalFeaturesLagger::LongitudinalFeaturesLagger(
   if ((*n_lags)[0] >= n_intervals) {
     TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
   }
-  for(ulong i(1); i < n_lags->size(); i++){
+  for (ulong i(1); i < n_lags->size(); i++) {
     if ((*n_lags)[i] >= n_intervals) {
       TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
     }

diff --git a/lib/cpp/survival/model_sccs.cpp b/lib/cpp/survival/model_sccs.cpp
@@ -28,7 +28,7 @@ ModelSCCS::ModelSCCS(const SBaseArrayDouble2dPtrList1D &features,
   }
   col_offset = ArrayULong(n_lags->size());
   col_offset.init_to_zero();
-  for(ulong i(1); i < n_lags->size(); i++){
+  for (ulong i(1); i < n_lags->size(); i++) {
     if ((*n_lags)[i] >= n_intervals) {
       TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
     }

diff --git a/tick/survival/convolutional_sccs.py b/tick/survival/convolutional_sccs.py
@@ -167,7 +167,7 @@ def __init__(self, n_lags: np.array,
                                                       verbose, random_state)
 
     # Interface
-    def fit(self, features: np.ndarray, labels: np.array,
+    def fit(self, features: list, labels: list,
             censoring: np.array, bootstrap: bool = False,
             bootstrap_rep: int = 200,
             bootstrap_confidence: float = .95):
@@ -378,7 +378,7 @@ def fit_kfold_cv(self, features, labels, censoring,
 
         cv_tracker.log_best_model(self._strengths, self.coeffs.tolist(),
                                   self.score(),
-                                  self.bootstrap_coeffs._asdict())
+                                  self.bootstrap_coeffs)
 
         return coeffs, cv_tracker
 
@@ -450,7 +450,7 @@ def _postfit(self, p_features, p_labels, p_censoring,
             bootstrap_ci = self._bootstrap(p_features, p_labels, p_censoring,
                                            refit_coeffs, bootstrap_rep,
                                            bootstrap_confidence)
-            self._set('bootstrap_coeffs', bootstrap_ci)
+            self._set('bootstrap_coeffs', bootstrap_ci._asdict())
 
         return self.coeffs, self.bootstrap_coeffs
 
@@ -475,12 +475,12 @@ def _bootstrap(self, p_features, p_labels, p_censoring, coeffs,
 
         bootstrap_coeffs = []
         sim = SimuSCCS(self.n_cases, self.n_intervals, self.n_features,
-                       self.n_lags)
+                       self.n_lags, coeffs=coeffs)
         # TODO later: parallelize bootstrap (everything should be pickable...)
         for k in range(rep):
             y = sim._simulate_multinomial_outcomes(p_features, coeffs)
             self._model_obj.fit(p_features, y, p_censoring)
-            bootstrap_coeffs.append(self._fit(False))
+            bootstrap_coeffs.append(self._fit(True))
 
         bootstrap_coeffs = np.exp(np.array(bootstrap_coeffs))
         bootstrap_coeffs.sort(axis=0)
@@ -534,13 +534,14 @@ def _construct_model_obj(self):
         return ModelSCCS(self.n_intervals, self.n_lags)
 
     def _construct_prox_obj(self, coeffs=None, project=False):
-        # TODO: test this !!!
         n_penalized_features = len(self.penalized_features) \
             if self.penalized_features is not None else 0
 
         if project:
             # project future coeffs on the support of given coeffs
-            if coeffs is not None and any(self.n_lags) > 0:
+            if all(self.n_lags) == 0:
+                proxs = [ProxZero()]
+            elif coeffs is not None and any(self.n_lags) > 0:
                 prox_ranges = self._detect_support(coeffs)
                 proxs = [ProxEquality(0, range=r) for r in prox_ranges]
             else: