Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/accuracy #73

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CHAID/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .tree import Tree
import pandas as pd
import numpy as np

from collections import OrderedDict

def main():
"""Entry point when module is run from command line"""
Expand Down Expand Up @@ -42,6 +42,8 @@ def main():
'input with the value of the dependent variable that '
'the majority of respondents in that node selected')
group.add_argument('--rules', action='store_true')
group.add_argument('--accuracy', action='store_true', help='Find the accuracy '
'of CHAID on this data')


nspace = parser.parse_args()
Expand Down Expand Up @@ -71,7 +73,6 @@ def main():
if nspace.dependent_variable_type:
config['dep_variable_type'] = nspace.dependent_variable_type


ordinal = nspace.ordinal_variables or []
nominal = nspace.nominal_variables or []
independent_variables = nominal + ordinal
Expand All @@ -94,6 +95,8 @@ def main():
print(data.to_csv())
elif nspace.rules:
print('\n'.join(str(x) for x in tree.classification_rules()))
elif nspace.accuracy:
print(tree.accuracy(data[nominal + ordinal].values, data[nspace.dependent_variable[0]].values))
else:
tree.print_tree()

Expand Down
39 changes: 38 additions & 1 deletion CHAID/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ class Column(object):
def __init__(self, arr=None, metadata=None,
             missing_id='<missing>', substitute=True, weights=None):
    """
    Base column wrapper around a numpy array of observations.

    Parameters
    ----------
    arr : array-like
        the raw observation vector for this column
    metadata : dict
        mapping from substituted codes back to the original values
    missing_id : str
        label used to represent missing values
    substitute : bool
        presumably controls whether values are recoded by subclasses;
        unused in this base initialiser — TODO confirm
    weights : array-like or None
        optional per-row weights used by counts()
    """
    self._metadata = dict(metadata or {})
    # keep a reference to the raw input so the original vector can be
    # reconstructed later
    self._original_arr = arr
    self._arr = np.array(arr)
    self._missing_id = missing_id
    self._weights = weights
    # lazy cache for counts()
    self._counts = {}

def __iter__(self):
return iter(self._arr)
Expand Down Expand Up @@ -82,6 +84,33 @@ def metadata(self):
"""
return self._metadata

@property
def original_vector(self):
    """
    Convert back to the original vector by mapping each substituted
    code in the column through the metadata table.
    """
    return np.array([self._metadata[v] for v in self.arr])

def counts(self, substitute_metadata=False):
    """
    Enables the column to determine the most efficient way of
    calculating the frequency of the different variables

    Parameters
    ----------
    substitute_metadata : bool
        when True, the returned dict is keyed by the original
        (metadata) values instead of the substituted codes

    Returns
    -------
    dict mapping category -> frequency (weighted when weights exist)
    """
    # seed every known category with 0 so categories absent from the
    # data still appear in the result
    for member in self._metadata.values():
        self._counts[member] = 0

    if self._weights is None:
        # unweighted: one vectorised pass via np.unique
        counts = np.transpose(np.unique(self._arr, return_counts=True))
    else:
        # weighted: sum the weights of the rows matching each value
        counts = np.array([
            [i, self._weights[self._arr == i].sum()] for i in set(self._arr)
        ])
    if substitute_metadata:
        self._counts.update((self._metadata[k], v) for k, v in counts)
    else:
        # NOTE(review): the zero-seeding above keys by metadata *values*
        # while this branch keys by the substituted codes — the two key
        # spaces may not coincide; confirm the intended behaviour
        self._counts.update((k, v) for k, v in counts)
    return self._counts


class NominalColumn(Column):
"""
Expand Down Expand Up @@ -111,7 +140,7 @@ def substitute_values(self, vect):
metadata to convert back to the original vector.

np.nan is always given -1, all other objects are given integers in
order of apperence.
order of appearance.

Parameters
----------
Expand Down Expand Up @@ -287,6 +316,14 @@ def __setitem__(self, key, value):
self._arr[key] = value
return self

def counts(self, substitute_metadata=False):
    """
    Summary statistics for a continuous column: its mean and standard
    deviation. ``substitute_metadata`` is accepted only for interface
    compatibility with the categorical columns and is ignored.
    """
    if self._counts:
        return self._counts
    self._counts = {
        'mean': self._arr.mean(),
        's.t.d': self._arr.std(),
    }
    return self._counts

@property
def type(self):
"""
Expand Down
28 changes: 6 additions & 22 deletions CHAID/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,26 +72,10 @@ def is_terminal(self):

@property
def members(self):
if self._members is None:
dep_v = self.dep_v
if isinstance(dep_v, ContinuousColumn):
self._members = {
'mean': self.dep_v.arr.mean(),
's.t.d': self.dep_v.arr.std()
}
else:
metadata = dep_v.metadata
self._members = {}
for member in metadata.values():
self._members[member] = 0

if dep_v.weights is None:
counts = np.transpose(np.unique(dep_v.arr, return_counts=True))
else:
counts = np.array([
[i, dep_v.weights[dep_v.arr == i].sum()] for i in set(dep_v.arr)
])

self._members.update((metadata[k], v) for k, v in counts)

if not self._members:
self._members = self.dep_v.counts(True)
return self._members

@property
def predict(self):
    """The dependent-variable category with the greatest member count."""
    member_counts = self.members
    return max(member_counts, key=member_counts.get)
39 changes: 37 additions & 2 deletions CHAID/tree.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import numpy as np
import pandas as pd
from treelib import Tree as TreeLibTree
from .node import Node
from .split import Split
from .column import NominalColumn, OrdinalColumn, ContinuousColumn
from .stats import Stats
from .invalid_split_reason import InvalidSplitReason
from collections import OrderedDict

class Tree(object):
"""
Expand Down Expand Up @@ -111,13 +113,14 @@ def from_pandas_df(df, i_variables, d_variable, alpha_merge=0.05, max_depth=2,
the type of dependent variable. Supported variable types are 'categorical' or
'continuous'
"""
ind_df = df[list(i_variables.keys())]
df_ordered_keys = [x for x in df.columns if x in i_variables.keys()]
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kinda gross. Will need to discuss this with you @xulaus, it's logic that seems to be the best in a given situation, but isn't foolproof. Actually, I don't think this can ever go into prod.

ind_df = df[df_ordered_keys] # preserve df column order
ind_values = ind_df.values
dep_values = df[d_variable].values
weights = df[weight] if weight is not None else None
return Tree(ind_values, dep_values, alpha_merge, max_depth, min_parent_node_size,
min_child_node_size, list(ind_df.columns.values), split_threshold, weights,
list(i_variables.values()), dep_variable_type)
[i_variables[key] for key in df_ordered_keys], dep_variable_type)
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here ☝️


def node(self, rows, ind, dep, depth=0, parent=None, parent_decisions=None):
""" internal method to create a node in the tree """
Expand Down Expand Up @@ -226,6 +229,38 @@ def classification_rules(self, node=None, stack=None):
else:
return self.classification_rules(self.get_node(node.parent), stack)

def tree_predictions(self):
    """
    Calculates the row criteria that give rise to a particular
    terminal node.

    Returns
    -------
    pd.DataFrame indexed by the unique combinations of original
    independent-variable values, with columns ['node_id', 'prediction']
    giving the terminal node each combination falls into and that
    node's predicted dependent value.
    """
    frames = []
    for node in self:
        if not node.is_terminal:
            continue
        # rows of original (unsubstituted) independent values for the
        # observations in this terminal node
        sliced_arr = np.array([x.original_vector for x in self.vectorised_array]).T[node.indices]
        # de-duplicate rows via a set of tuples; materialise as a list
        # first — np.vstack/np.array on a bare set is deprecated
        unique_set = np.array(list({tuple(row) for row in sliced_arr}))
        index = pd.MultiIndex.from_arrays(np.transpose(unique_set))
        frames.append(
            pd.DataFrame([[node.node_id, node.predict]] * len(index), index=index)
        )
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    tree_predictions = pd.concat(frames) if frames else pd.DataFrame()
    tree_predictions.columns = ['node_id', 'prediction']
    # NOTE(review): the dataset may not contain every combination of the
    # other independent variables at a node, so those combinations are
    # missed off — need to retroactively fill missing values
    return tree_predictions

def accuracy(self, ndarr, arr):
    """
    Calculates the accuracy of predicting the dependent variable based
    upon the node predictions.

    Parameters
    ----------
    ndarr : numpy.ndarray
        matrix of independent-variable values, one row per observation
    arr : numpy.ndarray
        the observed dependent values

    Returns
    -------
    float : fraction of observations whose node prediction matches
    """
    predictions = self.tree_predictions()
    row_index = pd.MultiIndex.from_arrays(np.transpose(ndarr))
    observed = pd.Series(arr, index=row_index, name='dep')
    joined = predictions.join(observed)
    correct = (joined['prediction'] == joined['dep']).sum()
    return correct / float(len(arr))

def model_predictions(self):
"""
Determines the highest frequency of
Expand Down
10 changes: 3 additions & 7 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
# Change Log

## [Unreleased](https://github.com/Rambatino/CHAID/tree/HEAD)

[Full Changelog](https://github.com/Rambatino/CHAID/compare/v4.0.0-alpha.2...HEAD)
## [v4.0.0](https://github.com/Rambatino/CHAID/tree/v4.0.0) (2017-06-14)
[Full Changelog](https://github.com/Rambatino/CHAID/compare/v4.0.0-alpha.2...v4.0.0)

**Implemented enhancements:**

- Changed architecture of from\_pandas\_df to align variable types and instance variables into a single parameter [\#70](https://github.com/Rambatino/CHAID/pull/70) ([Rambatino](https://github.com/Rambatino))
- Adding invalid split messages [\#68](https://github.com/Rambatino/CHAID/pull/68) ([Rambatino](https://github.com/Rambatino))

**Closed issues:**

- Creating tree different from README [\#65](https://github.com/Rambatino/CHAID/issues/65)
- User shouldn't have to pass in variables and variable types [\#53](https://github.com/Rambatino/CHAID/issues/53)

**Merged pull requests:**

- Adding invalid split messages [\#68](https://github.com/Rambatino/CHAID/pull/68) ([Rambatino](https://github.com/Rambatino))

## [v4.0.0-alpha.2](https://github.com/Rambatino/CHAID/tree/v4.0.0-alpha.2) (2017-06-07)
[Full Changelog](https://github.com/Rambatino/CHAID/compare/v4.0.0-alpha.1...v4.0.0-alpha.2)

Expand Down
42 changes: 42 additions & 0 deletions tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ def test_node_predictions():
ndarr = np.transpose(np.vstack([gender]))
tree = CHAID.Tree(ndarr, income, alpha_merge=0.9, max_depth=1,
min_child_node_size=1, min_parent_node_size=1)

assert (tree.node_predictions() == np.array([1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 3, 3, 3., 3, 3, 3, 3, 3, 2])).all() == True

class TestTreeGenerated(TestCase):
Expand Down Expand Up @@ -515,3 +516,44 @@ def test_bartlett_significance(self):
tree.build_tree()
assert round(tree.tree_store[0].p, 4) == 0.3681
assert len(tree.tree_store) == 5


def test_accuracy_when_same_data():
    """
    Accuracy should equal the fraction of rows whose dependent value
    matches the prediction of the terminal node containing them
    """
    ind_values = np.array([
        [1], [2], [1], [3], [1], [2], [1], [2], [3], [4],
        [1], [2], [3], [4], [1], [2], [3], [1], [2], [3]
    ])
    dep_values = np.array([
        1, 2, 2, 1, 2, 1, 2, 1, 2, 1,
        2, 2, 1, 1, 2, 1, 2, 1, 1, 2
    ])

    tree = CHAID.Tree(ndarr=ind_values, arr=dep_values)

    null_split = CHAID.Split(None, None, None, None, 0)

    first_indices = np.array([0, 2, 4, 6, 10, 14, 17, 1, 5, 7, 11, 15, 18])
    second_indices = np.array([3, 8, 12, 16, 19, 9, 13])

    node_one = CHAID.Node(
        node_id=2, split=null_split, indices=first_indices,
        dep_v=CHAID.NominalColumn(dep_values[first_indices])
    )
    node_two = CHAID.Node(
        node_id=3, split=null_split, indices=second_indices,
        dep_v=CHAID.NominalColumn(dep_values[second_indices])
    )

    tree._tree_store = [node_one, node_two]

    assert tree.accuracy(ndarr=ind_values, arr=dep_values) == float(11) / 20