BUG: MultiIndex mangling during parsing (pandas-dev#18062)

WillAyd · Nov 4, 2017 · 41eea13 · 41eea13
1 parent 27bbea7
commit 41eea13
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 4 deletions.
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -89,6 +89,7 @@ Bug Fixes
 
 - Bug in ``pd.read_msgpack()`` when a non-existent file is passed in Python 2 (:issue:`15296`)
 - Bug in ``DataFrame.groupby`` where a tuple used as a key in a ``MultiIndex`` was interpreted as a list of keys (:issue:`17979`)
+- Bug in :func:`pd.read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 
 Conversion
 ^^^^^^^^^^

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1106,6 +1106,24 @@ def _is_index_col(col):
     return col is not None and col is not False
 
 
+def _is_potential_multi_index(columns):
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+
+    Returns
+    -------
+    boolean : Whether or not columns could become a MultiIndex
+    """
+    return (len(columns) and not isinstance(columns, MultiIndex) and
+            all([isinstance(c, tuple) for c in columns]))
+
+
 def _evaluate_usecols(usecols, names):
     """
     Check whether or not the 'usecols' parameter
@@ -1374,14 +1392,18 @@ def _maybe_dedup_names(self, names):
         if self.mangle_dupe_cols:
             names = list(names)  # so we can index
             counts = defaultdict(int)
+            is_potential_mi = _is_potential_multi_index(names)
 
             for i, col in enumerate(names):
                 cur_count = counts[col]
 
                 while cur_count > 0:
                     counts[col] = cur_count + 1
 
-                    col = '%s.%d' % (col, cur_count)
+                    if is_potential_mi:
+                        col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
+                    else:
+                        col = '%s.%d' % (col, cur_count)
                     cur_count = counts[col]
 
                 names[i] = col
@@ -1391,9 +1413,7 @@ def _maybe_dedup_names(self, names):
 
     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
-        if (not self.tupleize_cols and len(columns) and
-                not isinstance(columns, MultiIndex) and
-                all([isinstance(c, tuple) for c in columns])):
+        if _is_potential_multi_index(columns):
             columns = MultiIndex.from_tuples(columns, names=col_names)
         return columns
 

diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py
@@ -290,3 +290,30 @@ def test_singleton_header(self):
         df = self.read_csv(StringIO(data), header=[0])
         expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
         tm.assert_frame_equal(df, expected)
+
+    def test_mangles_multi_index(self):
+        # See GH 18062
+        data = """A,A,A,B\none,one,one,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.2'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two'),
+                                  ('B', 'two.1')]))
+        tm.assert_frame_equal(df, expected)