NF: First attempt at a Fast-LTS implementation (large-sample LTS).
The fit_large_sample method has been created. It still has to be modified
to take a complete subsampling scheme into account (at which point it would
become a correct Fast-LTS implementation).
The fit_random method has been modified so that we can stop after stage 1
and hence return the best candidates obtained so far without finishing the
run. The same method can now start from an initial set of subsets, given by
their masks (instead of drawing the subsets at random). The way the
candidates are stored has been changed so that it is easier to pool the
results from the different subsets into a larger set.
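For illustration, the new calling pattern could look roughly like this (a
minimal sketch, not part of the commit; y, X and the parameter values are
placeholders):

    from statsmodels.robust.least_trimmed_squares import LTS

    mod = LTS(y, X)
    # stage 1 only: the n_keep best candidate masks and their ssr values
    ssr_keep, best_stage1 = mod.fit_random(k_trimmed=50, max_nstarts=100,
                                           n_keep=10, two_steps=False)
    # feed the candidate masks back in as starting subsets
    res_trimmed, inlier_mask = mod.fit_random(k_trimmed=50, n_keep=10,
                                              iterator=best_stage1)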

A benchmark script has been added in examples. Unfortunately, I was not able
to show a real benefit of using the large-sample LTS version; it is sometimes
even worse.
The parameters of the simple LTS have been adjusted so that they correspond
roughly to what is done in the large-sample version (the n_keep and
max_nstarts parameters have been set to the same values in the example).
VirgileFritsch committed Sep 12, 2012
1 parent e784fae commit 1a1a14c
Showing 2 changed files with 209 additions and 24 deletions.
110 changes: 110 additions & 0 deletions statsmodels/examples/benchmark_fastlts.py
@@ -0,0 +1,110 @@
"""
Try LTS on a large contaminated dataset
"""
# Author: Virgile Fritsch, <virgile.fritsch@inria.fr>, 2012

import time
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.robust.least_trimmed_squares import LTS

# Example settings
n_samples = 1000
n_features = 10
noise_scale = 1.
outliers_scale = 5.
range_cont = np.arange(0.05, 0.50, 0.05)
n_trials = 10

groups = 3

time_lts = np.zeros((len(range_cont), n_trials))
time_lts_ls = np.zeros((len(range_cont), n_trials))
discard_lts = np.zeros((len(range_cont), n_trials))
discard_lts_ls = np.zeros((len(range_cont), n_trials))
params_lts = np.zeros((len(range_cont), n_trials, n_features))
params_lts_ls = np.zeros((len(range_cont), n_trials, n_features))
for i, cont in enumerate(range_cont):
    print cont
    for trial in range(n_trials):
        # Experiment design
        # explanatory variable
        X = 3 * np.random.rand(n_samples, 1)
        # covariables
        Z = 3 * np.random.rand(n_samples, n_features - 1)
        design_matrix = np.hstack((X, Z))
        # noise
        noise = noise_scale * np.random.randn(n_samples)
        # generate outliers
        outliers = np.random.randn(n_samples)
        mask_outliers = np.ones(n_samples, dtype=bool)
        random_selection = np.random.permutation(
            n_samples)[:int((1 - cont) * n_samples)]
        mask_outliers[random_selection] = False
        # mix variables, noise and outliers
        beta = np.asarray([0] + [1] * (n_features - 1))
        Y = np.dot(design_matrix, beta) \
            + (~mask_outliers) * noise \
            + mask_outliers * outliers_scale * outliers

        # Run simple LTS
        t0 = time.time()
        the_fit, support = LTS(Y, design_matrix).fit(
            random_search_options=dict(max_nstarts=500, n_keep=10))
        t1 = time.time()
        time_lts[i, trial] = t1 - t0
        # How many outliers were discarded in fit?
        ground_truth = np.nonzero(mask_outliers)[0]
        lts_outliers = np.where(~support)[0]
        overlap = 0.
        for j in lts_outliers:
            if j in ground_truth:
                overlap += 1
        discard_lts[i, trial] = 100 * overlap / ground_truth.size
        params_lts[i, trial] = the_fit.params

        # Run "large-sample" LTS (~= Fast-LTS algorithm)
        t0 = time.time()
        the_fit2, support2 = LTS(Y, design_matrix).fit_large_sample(groups)
        t1 = time.time()
        time_lts_ls[i, trial] = t1 - t0
        # How many outliers were discarded in fit?
        ground_truth = np.nonzero(mask_outliers)[0]
        lts_outliers2 = np.where(~support2)[0]
        overlap = 0.
        for j in lts_outliers2:
            if j in ground_truth:
                overlap += 1
        discard_lts_ls[i, trial] = 100 * overlap / ground_truth.size
        params_lts_ls[i, trial] = the_fit2.params

plt.figure()
plt.title("Elapsed time (p=%d, n=%d)" % (n_features, n_samples))
plt.plot(range_cont, time_lts.mean(1), color="green", label="LTS")
plt.plot(range_cont, time_lts_ls.mean(1), color="blue", label="LTS_ls")
plt.xlabel("Contamination (%%)")
plt.ylabel("Elapsed time (s)")
plt.legend()

plt.figure()
plt.title("Param estimate (p=%d, n=%d)" % (n_features, n_samples))
plt.errorbar(range_cont, np.sum((params_lts - beta) ** 2, 2).mean(1),
yerr=np.sum((params_lts - beta) ** 2, 2).var(1),
color="green", label="LTS")
plt.errorbar(range_cont, np.sum((params_lts_ls - beta) ** 2, 2).mean(1),
yerr=np.sum((params_lts_ls - beta) ** 2, 2).var(1),
color="blue", label="LTS_ls")
plt.xlabel("Contamination (%%)")
plt.ylabel("Param estimate")
plt.legend()

plt.figure()
plt.title("Outliers discarded (p=%d, n=%d)" % (n_features, n_samples))
plt.plot(range_cont, discard_lts.mean(1), color="green", label="LTS")
plt.plot(range_cont, discard_lts_ls.mean(1), color="blue", label="LTS_ls")
plt.xlabel("Contamination (%%)")
plt.ylabel("Outliers discarded (%%)")
plt.legend()

plt.show()
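As a side note on the script above, the overlap loops could equally be written
with a set intersection; a small sketch using the same array names (not part of
this commit, and with discard_pct as a hypothetical name for the result):

    # equivalent to the explicit loop: count flagged outliers that are true outliers
    overlap = np.intersect1d(lts_outliers, ground_truth).size
    discard_pct = 100. * overlap / ground_truth.size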
123 changes: 99 additions & 24 deletions statsmodels/robust/least_trimmed_squares.py
@@ -216,7 +216,8 @@ def refine(self, iin, k_accept, max_nrefine=2):


def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
max_nrefine_st1=1, max_nrefine_st2=100):
max_nrefine_st1=1, max_nrefine_st2=100,
two_steps=True, iterator=None):
'''find k_trimmed outliers with a 2-stage random search
Parameters
@@ -239,6 +240,10 @@ def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
max_nrefine_st2 : int
maximum number of concentration or refinement steps in second stage.
see Notes.
two_steps : bool
If True, the standard two-stage algorithm is performed.
If False, the algorithm stops after stage 1 and returns the
n_keep best results obtained so far.
Returns
-------
@@ -303,12 +308,13 @@ def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
k_accept = nobs - k_trimmed

#stage 1
best_stage1 = []
ssr_keep = [np.inf] * n_keep
best_stage1 = np.zeros((n_keep, nobs), dtype=bool)
ssr_keep = np.inf * np.ones(n_keep)
#need sorted list that allows inserting and deletes worst
#use: add if it is better than n_keep-worst (i.e. min_keep)

iterator = subsample(nobs, k_start, max_nrep=max_nstarts)
if iterator is None:
iterator = subsample(nobs, k_start, max_nrep=max_nstarts)

for ii in iterator:
iin = ii.copy() #TODO: do I still need a copy
@@ -317,28 +323,33 @@ def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
#if res_trimmed.ssr < ssr_keep[n_keep-1]:
#best_stage1.append((res_trimmed.ssr, ii2))
if ssr_new < ssr_keep[n_keep-1]:
best_stage1.append((ssr_new, ii2))
#update minkeep, shouldn't grow longer than n_keep
#we don't drop extra indices in best_stage1
ssr_keep.append(ssr_new)
ssr_keep.sort() #inplace python sort
del ssr_keep[n_keep:] #remove extra
best_stage1[-1] = ii2
ssr_keep[-1] = ssr_new
sort_aux = np.argsort(ssr_keep)
ssr_keep = ssr_keep[sort_aux]
best_stage1 = best_stage1[sort_aux]

#stage 2 : refine best_stage1 to convergence
ssr_best = np.inf
for (ssr, start_mask) in best_stage1:
if ssr > ssr_keep[n_keep-1]: continue
res = self.refine(start_mask, k_accept, max_nrefine=max_nrefine_st2)
res_trimmed, ii2, ssr_new, converged = res
if not converged:
#warning ?
print "refine step did not converge, max_nrefine limit reached"

ssr_current = getattr(res_trimmed, self.target_attr) #res_trimmed.ssr
if ssr_current < ssr_best:
ssr_best = ssr_current
res_best = (res_trimmed, ii2)

if two_steps:
ssr_best = np.inf
for i in range(n_keep):
ssr = ssr_keep[i]
start_mask = best_stage1[i]
if ssr > ssr_keep[n_keep-1]: continue
res = self.refine(
start_mask, k_accept, max_nrefine=max_nrefine_st2)
res_trimmed, ii2, ssr_new, converged = res
if not converged:
#warning ?
print "refine step did not converge, " + \
"max_nrefine limit reached"
ssr_current = getattr(
res_trimmed, self.target_attr) # res_trimmed.ssr
if ssr_current < ssr_best:
ssr_best = ssr_current
res_best = (res_trimmed, ii2)
else:
return (ssr_keep, best_stage1)
self.temp.best_stage1 = best_stage1
self.temp.ssr_keep = ssr_keep

@@ -420,6 +431,70 @@ def fit(self, k_trimmed=None, max_exact=100, random_search_options=None):
self.options_fit_used = ('exact', options)
return self.fit_random(k_trimmed, **options)

def fit_large_sample(self, n_subsamples, max_nstarts=500, k_trimmed=None,
n_keep=10):
'''find k_trimmed outliers with a Fast-LTS-like algorithm.
Parameters
----------
n_subsamples : int
number of splits the initial dataset will be divided into.
According to Rousseeuw and Van Driessen's paper [1], splits
should be of size 300.
max_nstarts : int
number of random draws of outlier indices used in the subsamples
at the first stage.
k_trimmed : int
number of observations to trim, i.e. number of outliers that are
removed from the data for estimation.
n_keep : int
number of first stage results to use for the second stage, the
concentration or refinement.
Returns
-------
res_trimmed : results instance
The best, lowest ssr, instance of the estimation results class.
Warning: this is just the OLS Results instance with the trimmed
set of observations and does not take the trimming into account.
TODO: adjust results, especially scale
ii2 : ndarray
index of inliers, observations used in the best regression.
Notes
-----
see fit, fit_random
[1] P.J. Rousseeuw and K. Van Driessen, Computing LTS regression for
large data sets, Data Mining and Knowledge Discovery, 2005.
'''
# TODO: version with different levels of pooling
# (i.e. n_subsamples would be a list of sizes)
nobs, k_vars = self.nobs, self.k_vars
if k_trimmed is None:
k_trimmed = nobs - int(np.trunc(nobs + k_vars) // 2)
# split initial sample
sub_size = nobs / n_subsamples
subsamples = np.random.permutation(
nobs)[:(sub_size * n_subsamples)].reshape((n_subsamples, sub_size))
h_sub = int(sub_size * (nobs - k_trimmed) / float(nobs))
# run algorithm in subsets
best_subsets_candidates = []
for sub in subsamples:
res = LTS(self.endog[sub], self.exog[sub]).fit_random(
k_trimmed=k_trimmed / n_subsamples, n_keep=n_keep,
max_nstarts=max_nstarts / n_subsamples, k_start=h_sub,
two_steps=False)
for c in res[1]:
pooled_candidate = np.zeros(nobs, dtype=bool)
pooled_candidate[sub[c]] = True
best_subsets_candidates.append(pooled_candidate)
# pool candidates results to full dataset
return self.fit_random(
k_trimmed=k_trimmed, iterator=best_subsets_candidates,
n_keep=min(n_keep, len(best_subsets_candidates)))


from statsmodels.discrete.discrete_model import DiscreteResults
#DiscreteResults.nllf = lambda : - DiscreteResults.llf

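For completeness, the new fit_large_sample method could be exercised along
these lines (a minimal sketch mirroring the benchmark script above; y, X and
the number of subsamples are placeholders):

    from statsmodels.robust.least_trimmed_squares import LTS
    import numpy as np

    n_subsamples = 3  # split the full sample into 3 subsets, as in the example
    res_trimmed, support = LTS(y, X).fit_large_sample(n_subsamples)
    outliers = np.where(~support)[0]  # observations discarded by the trimmed fit
    print res_trimmed.params          # OLS parameters on the kept observations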