Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/accuracy #73

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CHAID/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .tree import Tree
import pandas as pd
import numpy as np

from collections import OrderedDict

def main():
"""Entry point when module is run from command line"""
Expand Down Expand Up @@ -42,6 +42,8 @@ def main():
'input with the value of the dependent variable that '
'the majority of respondents in that node selected')
group.add_argument('--rules', action='store_true')
group.add_argument('--accuracy', action='store_true', help='Find the accuracy '
'of CHAID on this data')


nspace = parser.parse_args()
Expand Down Expand Up @@ -71,7 +73,6 @@ def main():
if nspace.dependent_variable_type:
config['dep_variable_type'] = nspace.dependent_variable_type


ordinal = nspace.ordinal_variables or []
nominal = nspace.nominal_variables or []
independent_variables = nominal + ordinal
Expand All @@ -94,6 +95,8 @@ def main():
print(data.to_csv())
elif nspace.rules:
print('\n'.join(str(x) for x in tree.classification_rules()))
elif nspace.accuracy:
print(tree.accuracy(data[nominal + ordinal].values, data[nspace.dependent_variable[0]].values))
else:
tree.print_tree()

Expand Down
39 changes: 38 additions & 1 deletion CHAID/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ class Column(object):
def __init__(self, arr=None, metadata=None,
             missing_id='<missing>', substitute=True, weights=None):
    """
    Base column wrapper around a numpy array of observations.

    Parameters
    ----------
    arr : array-like
        the raw observation vector for this column
    metadata : dict
        mapping from substituted codes back to the original values
    missing_id : str
        label used to represent missing values
    substitute : bool
        presumably controls whether values are recoded by subclasses;
        unused in this base initialiser — TODO confirm
    weights : array-like or None
        optional per-row weights used by counts()
    """
    self._metadata = dict(metadata or {})
    # keep a reference to the raw input so the original vector can be
    # reconstructed later
    self._original_arr = arr
    self._arr = np.array(arr)
    self._missing_id = missing_id
    self._weights = weights
    # lazy cache for counts()
    self._counts = {}

def __iter__(self):
return iter(self._arr)
Expand Down Expand Up @@ -82,6 +84,33 @@ def metadata(self):
"""
return self._metadata

@property
def original_vector(self):
    """
    Convert back to the original vector by mapping each substituted
    code in the column through the metadata table.
    """
    return np.array([self._metadata[v] for v in self.arr])

def counts(self, substitute_metadata=False):
    """
    Enables the column to determine the most efficient way of
    calculating the frequency of the different variables

    Parameters
    ----------
    substitute_metadata : bool
        when True, the returned dict is keyed by the original
        (metadata) values instead of the substituted codes

    Returns
    -------
    dict mapping category -> frequency (weighted when weights exist)
    """
    # seed every known category with 0 so categories absent from the
    # data still appear in the result
    for member in self._metadata.values():
        self._counts[member] = 0

    if self._weights is None:
        # unweighted: one vectorised pass via np.unique
        counts = np.transpose(np.unique(self._arr, return_counts=True))
    else:
        # weighted: sum the weights of the rows matching each value
        counts = np.array([
            [i, self._weights[self._arr == i].sum()] for i in set(self._arr)
        ])
    if substitute_metadata:
        self._counts.update((self._metadata[k], v) for k, v in counts)
    else:
        # NOTE(review): the zero-seeding above keys by metadata *values*
        # while this branch keys by the substituted codes — the two key
        # spaces may not coincide; confirm the intended behaviour
        self._counts.update((k, v) for k, v in counts)
    return self._counts


class NominalColumn(Column):
"""
Expand Down Expand Up @@ -111,7 +140,7 @@ def substitute_values(self, vect):
metadata to convert back to the original vector.

np.nan is always given -1, all other objects are given integers in
order of apperence.
order of appearance.

Parameters
----------
Expand Down Expand Up @@ -287,6 +316,14 @@ def __setitem__(self, key, value):
self._arr[key] = value
return self

def counts(self, substitute_metadata=False):
    """
    Summary statistics for a continuous column: its mean and standard
    deviation. ``substitute_metadata`` is accepted only for interface
    compatibility with the categorical columns and is ignored.
    """
    if self._counts:
        return self._counts
    self._counts = {
        'mean': self._arr.mean(),
        's.t.d': self._arr.std(),
    }
    return self._counts

@property
def type(self):
"""
Expand Down
28 changes: 6 additions & 22 deletions CHAID/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,26 +72,10 @@ def is_terminal(self):

@property
def members(self):
if self._members is None:
dep_v = self.dep_v
if isinstance(dep_v, ContinuousColumn):
self._members = {
'mean': self.dep_v.arr.mean(),
's.t.d': self.dep_v.arr.std()
}
else:
metadata = dep_v.metadata
self._members = {}
for member in metadata.values():
self._members[member] = 0

if dep_v.weights is None:
counts = np.transpose(np.unique(dep_v.arr, return_counts=True))
else:
counts = np.array([
[i, dep_v.weights[dep_v.arr == i].sum()] for i in set(dep_v.arr)
])

self._members.update((metadata[k], v) for k, v in counts)

if not self._members:
self._members = self.dep_v.counts(True)
return self._members

@property
def predict(self):
    """The dependent-variable category with the greatest member count."""
    member_counts = self.members
    return max(member_counts, key=member_counts.get)
39 changes: 37 additions & 2 deletions CHAID/tree.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import numpy as np
import pandas as pd
from treelib import Tree as TreeLibTree
from .node import Node
from .split import Split
from .column import NominalColumn, OrdinalColumn, ContinuousColumn
from .stats import Stats
from .invalid_split_reason import InvalidSplitReason
from collections import OrderedDict

class Tree(object):
"""
Expand Down Expand Up @@ -111,13 +113,14 @@ def from_pandas_df(df, i_variables, d_variable, alpha_merge=0.05, max_depth=2,
the type of dependent variable. Supported variable types are 'categorical' or
'continuous'
"""
ind_df = df[list(i_variables.keys())]
df_ordered_keys = [x for x in df.columns if x in i_variables.keys()]
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kinda gross. Will need to discuss this with you @xulaus, it's logic that seems to be the best in a given situation, but isn't foolproof. Actually, I don't think this can ever go into prod.

ind_df = df[df_ordered_keys] # preserve df column order
ind_values = ind_df.values
dep_values = df[d_variable].values
weights = df[weight] if weight is not None else None
return Tree(ind_values, dep_values, alpha_merge, max_depth, min_parent_node_size,
min_child_node_size, list(ind_df.columns.values), split_threshold, weights,
list(i_variables.values()), dep_variable_type)
[i_variables[key] for key in df_ordered_keys], dep_variable_type)
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here ☝️


def node(self, rows, ind, dep, depth=0, parent=None, parent_decisions=None):
""" internal method to create a node in the tree """
Expand Down Expand Up @@ -226,6 +229,38 @@ def classification_rules(self, node=None, stack=None):
else:
return self.classification_rules(self.get_node(node.parent), stack)

def tree_predictions(self):
    """
    Calculates the row criteria that give rise to a particular
    terminal node.

    Returns
    -------
    pd.DataFrame indexed by the unique combinations of original
    independent-variable values, with columns ['node_id', 'prediction']
    giving the terminal node each combination falls into and that
    node's predicted dependent value.
    """
    frames = []
    for node in self:
        if not node.is_terminal:
            continue
        # rows of original (unsubstituted) independent values for the
        # observations in this terminal node
        sliced_arr = np.array([x.original_vector for x in self.vectorised_array]).T[node.indices]
        # de-duplicate rows via a set of tuples; materialise as a list
        # first — np.vstack/np.array on a bare set is deprecated
        unique_set = np.array(list({tuple(row) for row in sliced_arr}))
        index = pd.MultiIndex.from_arrays(np.transpose(unique_set))
        frames.append(
            pd.DataFrame([[node.node_id, node.predict]] * len(index), index=index)
        )
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    tree_predictions = pd.concat(frames) if frames else pd.DataFrame()
    tree_predictions.columns = ['node_id', 'prediction']
    # NOTE(review): the dataset may not contain every combination of the
    # other independent variables at a node, so those combinations are
    # missed off — need to retroactively fill missing values
    return tree_predictions

def accuracy(self, ndarr, arr):
    """
    Calculates the accuracy of predicting the dependent variable based
    upon the node predictions.

    Parameters
    ----------
    ndarr : numpy.ndarray
        matrix of independent-variable values, one row per observation
    arr : numpy.ndarray
        the observed dependent values

    Returns
    -------
    float : fraction of observations whose node prediction matches
    """
    predictions = self.tree_predictions()
    row_index = pd.MultiIndex.from_arrays(np.transpose(ndarr))
    observed = pd.Series(arr, index=row_index, name='dep')
    joined = predictions.join(observed)
    correct = (joined['prediction'] == joined['dep']).sum()
    return correct / float(len(arr))

def model_predictions(self):
"""
Determines the highest frequency of
Expand Down
10 changes: 3 additions & 7 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
# Change Log

## [Unreleased](https://github.com/Rambatino/CHAID/tree/HEAD)

[Full Changelog](https://github.com/Rambatino/CHAID/compare/v4.0.0-alpha.2...HEAD)
## [v4.0.0](https://github.com/Rambatino/CHAID/tree/v4.0.0) (2017-06-14)
[Full Changelog](https://github.com/Rambatino/CHAID/compare/v4.0.0-alpha.2...v4.0.0)

**Implemented enhancements:**

- Changed architecture of from\_pandas\_df to align variable types and instance variables into a single parameter [\#70](https://github.com/Rambatino/CHAID/pull/70) ([Rambatino](https://github.com/Rambatino))
- Adding invalid split messages [\#68](https://github.com/Rambatino/CHAID/pull/68) ([Rambatino](https://github.com/Rambatino))

**Closed issues:**

- Creating tree different from README [\#65](https://github.com/Rambatino/CHAID/issues/65)
- User shouldn't have to pass in variables and variable types [\#53](https://github.com/Rambatino/CHAID/issues/53)

**Merged pull requests:**

- Adding invalid split messages [\#68](https://github.com/Rambatino/CHAID/pull/68) ([Rambatino](https://github.com/Rambatino))

## [v4.0.0-alpha.2](https://github.com/Rambatino/CHAID/tree/v4.0.0-alpha.2) (2017-06-07)
[Full Changelog](https://github.com/Rambatino/CHAID/compare/v4.0.0-alpha.1...v4.0.0-alpha.2)

Expand Down
42 changes: 42 additions & 0 deletions tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ def test_node_predictions():
ndarr = np.transpose(np.vstack([gender]))
tree = CHAID.Tree(ndarr, income, alpha_merge=0.9, max_depth=1,
min_child_node_size=1, min_parent_node_size=1)

assert (tree.node_predictions() == np.array([1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 3, 3, 3., 3, 3, 3, 3, 3, 2])).all() == True

class TestTreeGenerated(TestCase):
Expand Down Expand Up @@ -515,3 +516,44 @@ def test_bartlett_significance(self):
tree.build_tree()
assert round(tree.tree_store[0].p, 4) == 0.3681
assert len(tree.tree_store) == 5


def test_accuracy_when_same_data():
    """
    Accuracy should equal the fraction of rows whose dependent value
    matches the prediction of the terminal node containing them
    """
    ind_values = np.array([
        [1], [2], [1], [3], [1], [2], [1], [2], [3], [4],
        [1], [2], [3], [4], [1], [2], [3], [1], [2], [3]
    ])
    dep_values = np.array([
        1, 2, 2, 1, 2, 1, 2, 1, 2, 1,
        2, 2, 1, 1, 2, 1, 2, 1, 1, 2
    ])

    tree = CHAID.Tree(ndarr=ind_values, arr=dep_values)

    null_split = CHAID.Split(None, None, None, None, 0)

    first_indices = np.array([0, 2, 4, 6, 10, 14, 17, 1, 5, 7, 11, 15, 18])
    second_indices = np.array([3, 8, 12, 16, 19, 9, 13])

    node_one = CHAID.Node(
        node_id=2, split=null_split, indices=first_indices,
        dep_v=CHAID.NominalColumn(dep_values[first_indices])
    )
    node_two = CHAID.Node(
        node_id=3, split=null_split, indices=second_indices,
        dep_v=CHAID.NominalColumn(dep_values[second_indices])
    )

    tree._tree_store = [node_one, node_two]

    assert tree.accuracy(ndarr=ind_values, arr=dep_values) == float(11) / 20