Skip to content

Commit

Permalink
BUG: MultiIndex mangling during parsing (pandas-dev#18062)
Browse files Browse the repository at this point in the history
  • Loading branch information
WillAyd committed Nov 4, 2017
1 parent 27bbea7 commit 41eea13
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Expand Up @@ -89,6 +89,7 @@ Bug Fixes

- Bug in ``pd.read_msgpack()`` when a non-existent file is passed in Python 2 (:issue:`15296`)
- Bug in ``DataFrame.groupby`` where a tuple key in a ``MultiIndex`` was interpreted as a list of keys (:issue:`17979`)
- Bug in :func:`pd.read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)

Conversion
^^^^^^^^^^
Expand Down
28 changes: 24 additions & 4 deletions pandas/io/parsers.py
Expand Up @@ -1106,6 +1106,24 @@ def _is_index_col(col):
return col is not None and col is not False


def _is_potential_multi_index(columns):
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex

    Returns
    -------
    boolean : Whether or not columns could become a MultiIndex
    """
    # Wrap the chain in bool(): a bare ``len(columns) and ...`` expression
    # short-circuits to the integer 0 for empty input rather than False,
    # which contradicts the documented boolean return.
    return bool(len(columns) and not isinstance(columns, MultiIndex) and
                all(isinstance(c, tuple) for c in columns))


def _evaluate_usecols(usecols, names):
"""
Check whether or not the 'usecols' parameter
Expand Down Expand Up @@ -1374,14 +1392,18 @@ def _maybe_dedup_names(self, names):
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names)

for i, col in enumerate(names):
cur_count = counts[col]

while cur_count > 0:
counts[col] = cur_count + 1

col = '%s.%d' % (col, cur_count)
if is_potential_mi:
col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
else:
col = '%s.%d' % (col, cur_count)
cur_count = counts[col]

names[i] = col
Expand All @@ -1391,9 +1413,7 @@ def _maybe_dedup_names(self, names):

def _maybe_make_multi_index_columns(self, columns, col_names=None):
# Build a MultiIndex from `columns` (level names taken from `col_names`)
# when the condition below holds; otherwise hand `columns` back untouched.
# NOTE(review): this excerpt appears to be a diff rendered without +/-
# markers -- the three-line condition looks like the removed version and the
# single `_is_potential_multi_index(columns)` line looks like its
# replacement, which no longer tests `self.tupleize_cols`. Confirm against
# the actual file before relying on either form.
# possibly create a column mi here
if (not self.tupleize_cols and len(columns) and
not isinstance(columns, MultiIndex) and
all([isinstance(c, tuple) for c in columns])):
if _is_potential_multi_index(columns):
columns = MultiIndex.from_tuples(columns, names=col_names)
return columns

Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/io/parser/header.py
Expand Up @@ -290,3 +290,30 @@ def test_singleton_header(self):
df = self.read_csv(StringIO(data), header=[0])
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
tm.assert_frame_equal(df, expected)

def test_mangles_multi_index(self):
    # See GH 18062
    # Each case: raw CSV text with two header rows, the single data row,
    # and the column tuples expected once duplicates have been mangled.
    cases = [
        ("""A,A,A,B\none,one,one,two\n0,40,34,0.1""",
         [0, 40, 34, 0.1],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.2'), ('B', 'two')]),
        ("""A,A,A,B\none,one,one.1,two\n0,40,34,0.1""",
         [0, 40, 34, 0.1],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.1.1'), ('B', 'two')]),
        ("""A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1""",
         [0, 40, 34, 0.1, 0.1],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.1.1'), ('B', 'two'),
          ('B', 'two.1')]),
    ]

    for data, row, column_tuples in cases:
        df = self.read_csv(StringIO(data), header=[0, 1])
        mangled = MultiIndex.from_tuples(column_tuples)
        expected = DataFrame([row], columns=mangled)
        tm.assert_frame_equal(df, expected)

0 comments on commit 41eea13

Please sign in to comment.