diff --git a/docs/anomaly-detectors/isolation-forest.md b/docs/anomaly-detectors/isolation-forest.md index 21ecd3de7..8216b862d 100644 --- a/docs/anomaly-detectors/isolation-forest.md +++ b/docs/anomaly-detectors/isolation-forest.md @@ -28,4 +28,5 @@ $estimator = new IsolationForest(100); // Default sample size and threshold ### References >- F. T. Liu et al. (2008). Isolation Forest. ->- F. T. Liu et al. (2011). Isolation-based Anomaly Detection. \ No newline at end of file +>- F. T. Liu et al. (2011). Isolation-based Anomaly Detection. +>- M. Garchery et al. (2018). On the influence of categorical features in ranking anomalies using mixed data. \ No newline at end of file diff --git a/src/AnomalyDetectors/IsolationForest.php b/src/AnomalyDetectors/IsolationForest.php index 7a9726699..455c0b442 100644 --- a/src/AnomalyDetectors/IsolationForest.php +++ b/src/AnomalyDetectors/IsolationForest.php @@ -27,6 +27,8 @@ * References: * [1] F. T. Liu et al. (2008). Isolation Forest. * [2] F. T. Liu et al. (2011). Isolation-based Anomaly Detection. + * [3] M. Garchery et al. (2018). On the influence of categorical features in + * ranking anomalies using mixed data. * * @category Machine Learning * @package Rubix/ML @@ -168,7 +170,7 @@ public function train(Dataset $dataset) : void for ($i = 0; $i < $this->estimators; $i++) { $tree = new ITree($maxDepth); - $subset = $dataset->randomize()->head($k); + $subset = $dataset->randomSubset($k); $tree->grow($subset); @@ -236,9 +238,9 @@ protected function isolationScore(array $sample) : float $depth += $node ? $node->depth() : EPSILON; } - $depth /= $this->estimators; + $depth /= $this->estimators * $this->delta; - return 2. ** -($depth / $this->delta); + return 2. ** -$depth; } /** diff --git a/src/Graph/Nodes/Isolator.php b/src/Graph/Nodes/Isolator.php index dd9266aa5..ad1407f5e 100644 --- a/src/Graph/Nodes/Isolator.php +++ b/src/Graph/Nodes/Isolator.php @@ -58,8 +58,8 @@ public static function split(Dataset $dataset) : self $values = $dataset->column($column); if ($dataset->columnType($column) === DataType::CONTINUOUS) { - $min = (int) round(min($values) * PHI); - $max = (int) round(max($values) * PHI); + $min = (int) floor(min($values) * PHI); + $max = (int) ceil(max($values) * PHI); $value = rand($min, $max) / PHI; } else { diff --git a/src/Graph/Trees/ExtraTree.php b/src/Graph/Trees/ExtraTree.php index f9dcddb68..9a9ba57c2 100644 --- a/src/Graph/Trees/ExtraTree.php +++ b/src/Graph/Trees/ExtraTree.php @@ -31,8 +31,8 @@ protected function split(Labeled $dataset) : Comparison $values = $dataset->column($column); if ($dataset->columnType($column) === DataType::CONTINUOUS) { - $min = (int) round(min($values) * PHI); - $max = (int) round(max($values) * PHI); + $min = (int) floor(min($values) * PHI); + $max = (int) ceil(max($values) * PHI); $value = rand($min, $max) / PHI; } else { diff --git a/src/Graph/Trees/ITree.php b/src/Graph/Trees/ITree.php index af1417e3d..54b39caf4 100644 --- a/src/Graph/Trees/ITree.php +++ b/src/Graph/Trees/ITree.php @@ -46,11 +46,16 @@ class ITree implements BST */ public static function c(int $n) : float { - if ($n <= 1) { - return 1.; + switch (true) { + case $n > 2: + return 2. * (log($n - 1.) + M_EULER) - 2. * ($n - 1.) / $n; + + case $n === 2: + return 1.; + + default: + return 0.; } - - return 2. * (log($n - 1) + M_EULER) - 2. * ($n - 1) / $n; } /** diff --git a/tests/AnomalyDetectors/IsolationForestTest.php b/tests/AnomalyDetectors/IsolationForestTest.php index 19f710a83..8d1309e77 100644 --- a/tests/AnomalyDetectors/IsolationForestTest.php +++ b/tests/AnomalyDetectors/IsolationForestTest.php @@ -70,6 +70,8 @@ public function test_train_predict() $this->assertTrue($this->estimator->trained()); + var_dump($this->estimator->rank($testing)); + $predictions = $this->estimator->predict($testing); $score = $this->metric->score($predictions, $testing->labels()); diff --git a/tests/Graph/Nodes/CellTest.php b/tests/Graph/Nodes/CellTest.php index 490221b07..3dbc6fe99 100644 --- a/tests/Graph/Nodes/CellTest.php +++ b/tests/Graph/Nodes/CellTest.php @@ -14,10 +14,11 @@ class CellTest extends TestCase protected const SAMPLES = [ [5., 2., -3], [6., 4., -5], + [-0.01, 0.1, -7], ]; protected const DEPTH = 8; - protected const C = 7.1544313298030655; + protected const C = 8.207392357589622; public function test_build_node() {