-
Notifications
You must be signed in to change notification settings - Fork 50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added ordinal variable type to independent variables #45
Changes from all commits
6fabe09
9450af3
451f55a
092b00a
84069f3
c102b17
08371e6
4f83f2a
fdff416
0f807a5
1b0ccb7
4ca04a9
0d52cbb
1d8984a
4ac893a
15c88fd
5902d0e
c03df76
cf4a388
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
from .split import Split | ||
from .tree import Tree | ||
from .node import Node | ||
from .column import NominalColumn | ||
from .column import NominalColumn, OrdinalColumn | ||
|
||
__version__ = "2.1.0" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
import numpy as np | ||
import math | ||
from math import isnan | ||
from itertools import combinations | ||
from .mapping_dict import MappingDict | ||
|
||
|
||
class Column(object): | ||
""" | ||
A numpy array with metadata | ||
|
@@ -24,7 +25,6 @@ def __init__(self, arr=None, metadata=None, | |
self._metadata = dict(metadata or {}) | ||
self._arr = np.array(arr) | ||
self._missing_id = missing_id | ||
self._groupings = MappingDict() | ||
|
||
def __iter__(self): | ||
return iter(self._arr) | ||
|
@@ -38,9 +38,6 @@ def __setitem__(self, key, value): | |
def possible_groupings(self): | ||
raise NotImplementedError | ||
|
||
def combine(self, x, y): | ||
raise NotImplementedError | ||
|
||
def deep_copy(self): | ||
""" | ||
Returns a deep copy. | ||
|
@@ -70,6 +67,7 @@ def metadata(self): | |
""" | ||
return self._metadata | ||
|
||
|
||
class NominalColumn(Column): | ||
""" | ||
A column containing numerical values that are unrelated to | ||
|
@@ -80,6 +78,8 @@ def __init__(self, arr=None, metadata=None, | |
super(self.__class__, self).__init__(arr, metadata, missing_id) | ||
if substitute: | ||
self.substitute_values(arr) | ||
|
||
self._groupings = MappingDict() | ||
for x in np.unique(self._arr): | ||
self._groupings[x] = [x] | ||
|
||
|
@@ -106,7 +106,7 @@ def substitute_values(self, vect): | |
""" | ||
unique = np.unique(vect) | ||
unique = [ | ||
x for x in unique if not isinstance(x, float) or not math.isnan(x) | ||
x for x in unique if not isinstance(x, float) or not isnan(x) | ||
] | ||
|
||
arr = np.zeros(len(vect), dtype=int) - 1 | ||
|
@@ -123,15 +123,106 @@ def __getitem__(self, key): | |
|
||
def __setitem__(self, key, value): | ||
self._arr[key] = value | ||
return NominalColumn(np.array(self._arr), metadata=self.metadata, substitute=False) | ||
return self | ||
|
||
def groups(self): | ||
return list(self._groupings.values()) | ||
|
||
def possible_groupings(self): | ||
return enumerate(combinations(self._groupings.keys(), 2)) | ||
return combinations(self._groupings.keys(), 2) | ||
|
||
def group(self, x, y): | ||
self._groupings[x] += self._groupings[y] | ||
del self._groupings[y] | ||
self._arr[self._arr == y] = x | ||
|
||
|
||
class OrdinalColumn(Column): | ||
""" | ||
A column containing integer values that have an order | ||
""" | ||
def __init__(self, arr=None, metadata=None, | ||
missing_id='<missing>', groupings=None, substitute=True): | ||
super(self.__class__, self).__init__(arr, metadata, missing_id) | ||
|
||
if substitute: | ||
self._arr, self.orig_type = self.substitute_values(self._arr) | ||
|
||
self._groupings = {} | ||
if groupings is None: | ||
for x in np.unique(self._arr): | ||
self._groupings[x] = [x, x + 1, False] | ||
else: | ||
for x in np.unique(self._arr): | ||
self._groupings[x] = list(groupings[x]) | ||
self._nan = np.array([np.nan]).astype(int)[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @xulaus can you remember what this line is doing? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not exactly. I think we were using NaN as a sentinel value that can combine with any item in the ordinal, but we wanted the arrays to all be of the same dtype, so we did this forced cast. |
||
self._possible_groups = None | ||
|
||
def substitute_values(self, vect): | ||
if not np.issubdtype(vect.dtype, np.integer): | ||
uniq = set(vect) | ||
uniq_floats = np.array(list(uniq), dtype=float) | ||
uniq_ints = uniq_floats.astype(int) | ||
nan = self._missing_id | ||
self._metadata = { | ||
new: nan if isnan(as_float) else old | ||
for old, as_float, new in zip(uniq, uniq_floats, uniq_ints) | ||
} | ||
self._arr = self._arr.astype(float) | ||
return self._arr.astype(int), self._arr.dtype.type | ||
|
||
def deep_copy(self): | ||
""" | ||
Returns a deep copy. | ||
""" | ||
return OrdinalColumn(self._arr, metadata=self.metadata, | ||
missing_id=self._missing_id, substitute=True, | ||
groupings=self._groupings) | ||
|
||
def __getitem__(self, key): | ||
return OrdinalColumn(self._arr[key], metadata=self.metadata, | ||
missing_id=self._missing_id, substitute=True, | ||
groupings=self._groupings) | ||
|
||
def __setitem__(self, key, value): | ||
self._arr[key] = value | ||
return self | ||
|
||
def groups(self): | ||
vals = self._groupings.values() | ||
return [ | ||
[x for x in range(minmax[0], minmax[1])] + ([self._nan] if minmax[2] else []) | ||
for minmax in vals | ||
] | ||
|
||
def possible_groupings(self): | ||
if self._possible_groups is None: | ||
ranges = sorted(self._groupings.items()) | ||
candidates = zip(ranges[0:], ranges[1:]) | ||
self._possible_groups = [ | ||
(k1, k2) for (k1, minmax1), (k2, minmax2) in candidates | ||
if minmax1[1] == minmax2[0] | ||
] | ||
if self._nan in self._arr: | ||
self._possible_groups += [ | ||
(key, self._nan) for key in self._groupings.keys() if key != self._nan | ||
] | ||
return self._possible_groups.__iter__() | ||
|
||
def group(self, x, y): | ||
self._possible_groups = None | ||
if y != self._nan: | ||
x = int(x) | ||
y = int(y) | ||
x_max = self._groupings[x][1] | ||
y_min = self._groupings[y][0] | ||
if y_min >= x_max: | ||
self._groupings[x][1] = self._groupings[y][1] | ||
else: | ||
self._groupings[x][0] = y_min | ||
self._groupings[x][2] = self._groupings[x][2] or self._groupings[y][2] | ||
else: | ||
self._groupings[x][2] = True | ||
|
||
del self._groupings[y] | ||
self._arr[self._arr == y] = x |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
from .split import Split | ||
import numpy as np | ||
|
||
|
||
class Node(object): | ||
""" | ||
A node in the CHAID tree | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ooo very nice descriptions 👍