Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ordinal variable type to independent variables #45

Merged
merged 19 commits into from
Oct 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHAID/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .split import Split
from .tree import Tree
from .node import Node
from .column import NominalColumn
from .column import NominalColumn, OrdinalColumn

__version__ = "2.1.0"
42 changes: 32 additions & 10 deletions CHAID/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import savReaderWriter as spss
from .tree import Tree
import pandas as pd
import numpy as np


def main():
"""Entry point when module is run from command line"""
Expand All @@ -14,7 +16,14 @@ def main():
' csv/sav file.')
parser.add_argument('file')
parser.add_argument('dependent_variable', nargs=1)
parser.add_argument('independent_variables', nargs='+')

var = parser.add_argument_group('Independent Variable Specification')
var.add_argument('nominal_variables', nargs='*', help='The names of '
'independent variables to use that have no intrinsic '
'order to them')
var.add_argument('--ordinal-variables', type=str, nargs='*',
help='The names of independent variables to use that '
'have an intrinsic order but a finite amount of states')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ooo very nice descriptions 👍

parser.add_argument('--weights', type=str, help='Name of weight column')

parser.add_argument('--max-depth', type=int, help='Max depth of generated '
Expand All @@ -29,16 +38,20 @@ def main():
' input with the node id of the node that that '
'respondent has been placed into')
group.add_argument('--predict', action='store_true', help='Add column to '
'input with the value of the dependent varaible that '
'input with the value of the dependent variable that '
'the majority of respondents in that node selected')
nspace = parser.parse_args()

data = pd.read_csv(nspace.file)

# raw_data = spss.SavReader(nspace.file, returnHeader = True, rawMode=True)
# raw_data_list = list(raw_data)
# data = pd.DataFrame(raw_data_list)
# data = data.rename(columns=data.loc[0]).iloc[1:]
if nspace.file[-4:] == '.csv':
data = pd.read_csv(nspace.file)
elif nspace.file[-4:] == '.sav':
raw_data = spss.SavReader(nspace.file, returnHeader=True)
raw_data_list = list(raw_data)
data = pd.DataFrame(raw_data_list)
data = data.rename(columns=data.loc[0]).iloc[1:]
else:
print('Unknown file type')
exit(1)

config = {}
if nspace.max_depth:
Expand All @@ -51,8 +64,17 @@ def main():
config['min_child_node_size'] = nspace.min_child_node_size
if nspace.weights:
config['weight'] = nspace.weights
tree = Tree.from_pandas_df(data, nspace.independent_variables,
nspace.dependent_variable[0], **config)

ordinal = nspace.ordinal_variables or []
nominal = nspace.nominal_variables or []
independent_variables = nominal + ordinal
types = ['nominal'] * len(nominal) + ['ordinal'] * len(ordinal)
if len(independent_variables) == 0:
print('Need to provide at least one independent variable')
exit(1)
tree = Tree.from_pandas_df(data, independent_variables,
nspace.dependent_variable[0],
variable_types=types, **config)

if nspace.classify:
predictions = pd.Series(tree.node_predictions())
Expand Down
107 changes: 99 additions & 8 deletions CHAID/column.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import numpy as np
import math
from math import isnan
from itertools import combinations
from .mapping_dict import MappingDict


class Column(object):
"""
A numpy array with metadata
Expand All @@ -24,7 +25,6 @@ def __init__(self, arr=None, metadata=None,
self._metadata = dict(metadata or {})
self._arr = np.array(arr)
self._missing_id = missing_id
self._groupings = MappingDict()

def __iter__(self):
return iter(self._arr)
Expand All @@ -38,9 +38,6 @@ def __setitem__(self, key, value):
def possible_groupings(self):
raise NotImplementedError

def combine(self, x, y):
raise NotImplementedError

def deep_copy(self):
"""
Returns a deep copy.
Expand Down Expand Up @@ -70,6 +67,7 @@ def metadata(self):
"""
return self._metadata


class NominalColumn(Column):
"""
A column containing numerical values that are unrelated to
Expand All @@ -80,6 +78,8 @@ def __init__(self, arr=None, metadata=None,
super(self.__class__, self).__init__(arr, metadata, missing_id)
if substitute:
self.substitute_values(arr)

self._groupings = MappingDict()
for x in np.unique(self._arr):
self._groupings[x] = [x]

Expand All @@ -106,7 +106,7 @@ def substitute_values(self, vect):
"""
unique = np.unique(vect)
unique = [
x for x in unique if not isinstance(x, float) or not math.isnan(x)
x for x in unique if not isinstance(x, float) or not isnan(x)
]

arr = np.zeros(len(vect), dtype=int) - 1
Expand All @@ -123,15 +123,106 @@ def __getitem__(self, key):

def __setitem__(self, key, value):
self._arr[key] = value
return NominalColumn(np.array(self._arr), metadata=self.metadata, substitute=False)
return self

def groups(self):
return list(self._groupings.values())

def possible_groupings(self):
return enumerate(combinations(self._groupings.keys(), 2))
return combinations(self._groupings.keys(), 2)

def group(self, x, y):
self._groupings[x] += self._groupings[y]
del self._groupings[y]
self._arr[self._arr == y] = x


class OrdinalColumn(Column):
"""
A column containing integer values that have an order
"""
def __init__(self, arr=None, metadata=None,
missing_id='<missing>', groupings=None, substitute=True):
super(self.__class__, self).__init__(arr, metadata, missing_id)

if substitute:
self._arr, self.orig_type = self.substitute_values(self._arr)

self._groupings = {}
if groupings is None:
for x in np.unique(self._arr):
self._groupings[x] = [x, x + 1, False]
else:
for x in np.unique(self._arr):
self._groupings[x] = list(groupings[x])
self._nan = np.array([np.nan]).astype(int)[0]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xulaus can you remember what this line is doing?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not exactly. I think we were using nan as a sentinel value that can combine with any item in the ordinal, but we wanted the arrays to all be of the same dtype, so we did this force cast.

self._possible_groups = None

def substitute_values(self, vect):
if not np.issubdtype(vect.dtype, np.integer):
uniq = set(vect)
uniq_floats = np.array(list(uniq), dtype=float)
uniq_ints = uniq_floats.astype(int)
nan = self._missing_id
self._metadata = {
new: nan if isnan(as_float) else old
for old, as_float, new in zip(uniq, uniq_floats, uniq_ints)
}
self._arr = self._arr.astype(float)
return self._arr.astype(int), self._arr.dtype.type

def deep_copy(self):
    """
    Returns a deep copy.

    The copy is a new OrdinalColumn constructed from this column's
    array, metadata, missing id and groupings, with value substitution
    switched on.
    """
    clone_options = {
        'metadata': self.metadata,
        'missing_id': self._missing_id,
        'substitute': True,
        'groupings': self._groupings,
    }
    return OrdinalColumn(self._arr, **clone_options)

def __getitem__(self, key):
    """
    Return a new OrdinalColumn built from ``self._arr[key]``.

    The new column is given this column's metadata, missing id and
    groupings, with value substitution switched on.
    """
    selection = self._arr[key]
    return OrdinalColumn(
        selection,
        metadata=self.metadata,
        missing_id=self._missing_id,
        substitute=True,
        groupings=self._groupings
    )

def __setitem__(self, key, value):
self._arr[key] = value
return self

def groups(self):
vals = self._groupings.values()
return [
[x for x in range(minmax[0], minmax[1])] + ([self._nan] if minmax[2] else [])
for minmax in vals
]

def possible_groupings(self):
if self._possible_groups is None:
ranges = sorted(self._groupings.items())
candidates = zip(ranges[0:], ranges[1:])
self._possible_groups = [
(k1, k2) for (k1, minmax1), (k2, minmax2) in candidates
if minmax1[1] == minmax2[0]
]
if self._nan in self._arr:
self._possible_groups += [
(key, self._nan) for key in self._groupings.keys() if key != self._nan
]
return self._possible_groups.__iter__()

def group(self, x, y):
self._possible_groups = None
if y != self._nan:
x = int(x)
y = int(y)
x_max = self._groupings[x][1]
y_min = self._groupings[y][0]
if y_min >= x_max:
self._groupings[x][1] = self._groupings[y][1]
else:
self._groupings[x][0] = y_min
self._groupings[x][2] = self._groupings[x][2] or self._groupings[y][2]
else:
self._groupings[x][2] = True

del self._groupings[y]
self._arr[self._arr == y] = x
1 change: 1 addition & 0 deletions CHAID/node.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .split import Split
import numpy as np


class Node(object):
"""
A node in the CHAID tree
Expand Down
Loading