NF: First attempt at a Fast-LTS implementation (large-sample LTS).
The fit_large_sample method has been created. It still has to be modified
to take a complete subsampling scheme into account (at which point it would
become a correct Fast-LTS implementation).
The fit_random method has been modified so that we can stop after stage 1
and hence return the best candidates obtained so far without finishing the
run. The same method can now start from an initial set of subsets, given by
their masks (instead of drawing the subsets at random). The way the
candidates are stored has been changed so that it is easier to pool the
results from the different subsets into a larger set.
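For illustration, the new calling pattern could look roughly like this (a
minimal sketch, not part of the commit; y, X and the parameter values are
placeholders):

    from statsmodels.robust.least_trimmed_squares import LTS

    mod = LTS(y, X)
    # stage 1 only: the n_keep best candidate masks and their ssr values
    ssr_keep, best_stage1 = mod.fit_random(k_trimmed=50, max_nstarts=100,
                                           n_keep=10, two_steps=False)
    # feed the candidate masks back in as starting subsets
    res_trimmed, inlier_mask = mod.fit_random(k_trimmed=50, n_keep=10,
                                              iterator=best_stage1)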

A benchmark script has been added in examples. Unfortunately, I was not able
to show a real benefit of using the large-sample LTS version; it is sometimes
even worse.
The parameters of the simple LTS have been adjusted so that they correspond
roughly to what is done in the large-sample version (the n_keep and
max_nstarts parameters have been set to the same values in the example).
VirgileFritsch committed Sep 12, 2012
1 parent e784fae commit 1a1a14c
Showing 2 changed files with 209 additions and 24 deletions.
110 changes: 110 additions & 0 deletions statsmodels/examples/benchmark_fastlts.py
@@ -0,0 +1,110 @@
"""
Try LTS on a large contaminated dataset
"""
# Author: Virgile Fritsch, <virgile.fritsch@inria.fr>, 2012

import time
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.robust.least_trimmed_squares import LTS

# Example settings
n_samples = 1000
n_features = 10
noise_scale = 1.
outliers_scale = 5.
range_cont = np.arange(0.05, 0.50, 0.05)
n_trials = 10

groups = 3

time_lts = np.zeros((len(range_cont), n_trials))
time_lts_ls = np.zeros((len(range_cont), n_trials))
discard_lts = np.zeros((len(range_cont), n_trials))
discard_lts_ls = np.zeros((len(range_cont), n_trials))
params_lts = np.zeros((len(range_cont), n_trials, n_features))
params_lts_ls = np.zeros((len(range_cont), n_trials, n_features))
for i, cont in enumerate(range_cont):
    print cont
    for trial in range(n_trials):
        # Experiment design
        # explanatory variable
        X = 3 * np.random.rand(n_samples, 1)
        # covariables
        Z = 3 * np.random.rand(n_samples, n_features - 1)
        design_matrix = np.hstack((X, Z))
        # noise
        noise = noise_scale * np.random.randn(n_samples)
        # generate outliers
        outliers = np.random.randn(n_samples)
        mask_outliers = np.ones(n_samples, dtype=bool)
        random_selection = np.random.permutation(
            n_samples)[:int((1 - cont) * n_samples)]
        mask_outliers[random_selection] = False
        # mix variables, noise and outliers
        beta = np.asarray([0] + [1] * (n_features - 1))
        Y = np.dot(design_matrix, beta) \
            + (~mask_outliers) * noise \
            + mask_outliers * outliers_scale * outliers

        # Run simple LTS
        t0 = time.time()
        the_fit, support = LTS(Y, design_matrix).fit(
            random_search_options=dict(max_nstarts=500, n_keep=10))
        t1 = time.time()
        time_lts[i, trial] = t1 - t0
        # How many outliers were discarded in fit?
        ground_truth = np.nonzero(mask_outliers)[0]
        lts_outliers = np.where(~support)[0]
        overlap = 0.
        for j in lts_outliers:
            if j in ground_truth:
                overlap += 1
        discard_lts[i, trial] = 100 * overlap / ground_truth.size
        params_lts[i, trial] = the_fit.params

        # Run "large-sample" LTS (~= Fast-LTS algorithm)
        t0 = time.time()
        the_fit2, support2 = LTS(Y, design_matrix).fit_large_sample(groups)
        t1 = time.time()
        time_lts_ls[i, trial] = t1 - t0
        # How many outliers were discarded in fit?
        ground_truth = np.nonzero(mask_outliers)[0]
        lts_outliers2 = np.where(~support2)[0]
        overlap = 0.
        for j in lts_outliers2:
            if j in ground_truth:
                overlap += 1
        discard_lts_ls[i, trial] = 100 * overlap / ground_truth.size
        params_lts_ls[i, trial] = the_fit2.params

plt.figure()
plt.title("Elapsed time (p=%d, n=%d)" % (n_features, n_samples))
plt.plot(range_cont, time_lts.mean(1), color="green", label="LTS")
plt.plot(range_cont, time_lts_ls.mean(1), color="blue", label="LTS_ls")
plt.xlabel("Contamination (%%)")
plt.ylabel("Elapsed time (s)")
plt.legend()

plt.figure()
plt.title("Param estimate (p=%d, n=%d)" % (n_features, n_samples))
plt.errorbar(range_cont, np.sum((params_lts - beta) ** 2, 2).mean(1),
yerr=np.sum((params_lts - beta) ** 2, 2).var(1),
color="green", label="LTS")
plt.errorbar(range_cont, np.sum((params_lts_ls - beta) ** 2, 2).mean(1),
yerr=np.sum((params_lts_ls - beta) ** 2, 2).var(1),
color="blue", label="LTS_ls")
plt.xlabel("Contamination (%%)")
plt.ylabel("Param estimate")
plt.legend()

plt.figure()
plt.title("Outliers discarded (p=%d, n=%d)" % (n_features, n_samples))
plt.plot(range_cont, discard_lts.mean(1), color="green", label="LTS")
plt.plot(range_cont, discard_lts_ls.mean(1), color="blue", label="LTS_ls")
plt.xlabel("Contamination (%%)")
plt.ylabel("Outliers discarded (%%)")
plt.legend()

plt.show()
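As a side note on the script above, the overlap loops could equally be written
with a set intersection; a small sketch using the same array names (not part of
this commit, and with discard_pct as a hypothetical name for the result):

    # equivalent to the explicit loop: count flagged outliers that are true outliers
    overlap = np.intersect1d(lts_outliers, ground_truth).size
    discard_pct = 100. * overlap / ground_truth.size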
123 changes: 99 additions & 24 deletions statsmodels/robust/least_trimmed_squares.py
@@ -216,7 +216,8 @@ def refine(self, iin, k_accept, max_nrefine=2):


def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
max_nrefine_st1=1, max_nrefine_st2=100):
max_nrefine_st1=1, max_nrefine_st2=100,
two_steps=True, iterator=None):
'''find k_trimmed outliers with a 2-stage random search
Parameters
@@ -239,6 +240,10 @@ def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
max_nrefine_st2 : int
maximum number of concentration or refinement steps in second stage.
see Notes.
two_steps : bool
If True, the standard two-stage algorithm is performed.
If False, the algorithm stops after stage 1 and returns the
n_keep best results obtained so far.
Returns
-------
@@ -303,12 +308,13 @@ def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
k_accept = nobs - k_trimmed

#stage 1
best_stage1 = []
ssr_keep = [np.inf] * n_keep
best_stage1 = np.zeros((n_keep, nobs), dtype=bool)
ssr_keep = np.inf * np.ones(n_keep)
#need sorted list that allows inserting and deletes worst
#use: add if it is better than n_keep-worst (i.e. min_keep)

iterator = subsample(nobs, k_start, max_nrep=max_nstarts)
if iterator is None:
iterator = subsample(nobs, k_start, max_nrep=max_nstarts)

for ii in iterator:
iin = ii.copy() #TODO: do I still need a copy
@@ -317,28 +323,33 @@ def fit_random(self, k_trimmed, max_nstarts=10, k_start=None, n_keep=30,
#if res_trimmed.ssr < ssr_keep[n_keep-1]:
#best_stage1.append((res_trimmed.ssr, ii2))
if ssr_new < ssr_keep[n_keep-1]:
best_stage1.append((ssr_new, ii2))
#update minkeep, shouldn't grow longer than n_keep
#we don't drop extra indices in best_stage1
ssr_keep.append(ssr_new)
ssr_keep.sort() #inplace python sort
del ssr_keep[n_keep:] #remove extra
best_stage1[-1] = ii2
ssr_keep[-1] = ssr_new
sort_aux = np.argsort(ssr_keep)
ssr_keep = ssr_keep[sort_aux]
best_stage1 = best_stage1[sort_aux]

#stage 2 : refine best_stage1 to convergence
ssr_best = np.inf
for (ssr, start_mask) in best_stage1:
if ssr > ssr_keep[n_keep-1]: continue
res = self.refine(start_mask, k_accept, max_nrefine=max_nrefine_st2)
res_trimmed, ii2, ssr_new, converged = res
if not converged:
#warning ?
print "refine step did not converge, max_nrefine limit reached"

ssr_current = getattr(res_trimmed, self.target_attr) #res_trimmed.ssr
if ssr_current < ssr_best:
ssr_best = ssr_current
res_best = (res_trimmed, ii2)

if two_steps:
ssr_best = np.inf
for i in range(n_keep):
ssr = ssr_keep[i]
start_mask = best_stage1[i]
if ssr > ssr_keep[n_keep-1]: continue
res = self.refine(
start_mask, k_accept, max_nrefine=max_nrefine_st2)
res_trimmed, ii2, ssr_new, converged = res
if not converged:
#warning ?
print "refine step did not converge, " + \
"max_nrefine limit reached"
ssr_current = getattr(
res_trimmed, self.target_attr) # res_trimmed.ssr
if ssr_current < ssr_best:
ssr_best = ssr_current
res_best = (res_trimmed, ii2)
else:
return (ssr_keep, best_stage1)
self.temp.best_stage1 = best_stage1
self.temp.ssr_keep = ssr_keep

@@ -420,6 +431,70 @@ def fit(self, k_trimmed=None, max_exact=100, random_search_options=None):
self.options_fit_used = ('exact', options)
return self.fit_random(k_trimmed, **options)

def fit_large_sample(self, n_subsamples, max_nstarts=500, k_trimmed=None,
n_keep=10):
'''find k_trimmed outliers with a Fast-LTS-like algorithm.
Parameters
----------
n_subsamples : int
number of splits the initial dataset will be divided into.
According to Rousseeuw and Van Driessen's paper [1], splits
should be of size 300.
max_nstarts : int
number of random draws of outlier indices used in the subsamples
at the first stage.
k_trimmed : int
number of observations to trim, i.e. number of outliers that are
removed from the data for estimation.
n_keep : int
number of first stage results to use for the second stage, the
concentration or refinement.
Returns
-------
res_trimmed : results instance
The best, lowest ssr, instance of the estimation results class.
Warning: this is just the OLS Results instance with the trimmed
set of observations and does not take the trimming into account.
TODO: adjust results, especially scale
ii2 : ndarray
index of inliers, observations used in the best regression.
Notes
-----
see fit, fit_random
[1] P.J. Rousseeuw and K. Van Driessen, Computing LTS regression for
large data sets, Data Mining and Knowledge Discovery, 2005.
'''
# TODO: version with different levels of pooling
# (i.e. n_subsamples would be a list of sizes)
nobs, k_vars = self.nobs, self.k_vars
if k_trimmed is None:
k_trimmed = nobs - int(np.trunc(nobs + k_vars) // 2)
# split initial sample
sub_size = nobs / n_subsamples
subsamples = np.random.permutation(
nobs)[:(sub_size * n_subsamples)].reshape((n_subsamples, sub_size))
h_sub = int(sub_size * (nobs - k_trimmed) / float(nobs))
# run algorithm in subsets
best_subsets_candidates = []
for sub in subsamples:
res = LTS(self.endog[sub], self.exog[sub]).fit_random(
k_trimmed=k_trimmed / n_subsamples, n_keep=n_keep,
max_nstarts=max_nstarts / n_subsamples, k_start=h_sub,
two_steps=False)
for c in res[1]:
pooled_candidate = np.zeros(nobs, dtype=bool)
pooled_candidate[sub[c]] = True
best_subsets_candidates.append(pooled_candidate)
# pool candidates results to full dataset
return self.fit_random(
k_trimmed=k_trimmed, iterator=best_subsets_candidates,
n_keep=min(n_keep, len(best_subsets_candidates)))


from statsmodels.discrete.discrete_model import DiscreteResults
#DiscreteResults.nllf = lambda : - DiscreteResults.llf

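For completeness, the new fit_large_sample method could be exercised along
these lines (a minimal sketch mirroring the benchmark script above; y, X and
the number of subsamples are placeholders):

    from statsmodels.robust.least_trimmed_squares import LTS
    import numpy as np

    n_subsamples = 3  # split the full sample into 3 subsets, as in the example
    res_trimmed, support = LTS(y, X).fit_large_sample(n_subsamples)
    outliers = np.where(~support)[0]  # observations discarded by the trimmed fit
    print res_trimmed.params          # OLS parameters on the kept observations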