Skip to content

Commit

Permalink
ENH: support decimal option in PythonParser pandas-dev#12933
Browse files Browse the repository at this point in the history
  • Loading branch information
Camilo Cota committed May 15, 2016
1 parent af7bdd3 commit dc7acd1
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 52 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Other enhancements

- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour is still to raise a ``NonExistentTimeError`` (:issue:`13057`)


- The ``decimal`` option is now supported by the Python parser engine (``PythonParser``) (:issue:`12933`)


.. _whatsnew_0182.api:
Expand Down
36 changes: 30 additions & 6 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
'keep_default_na': True,
'thousands': None,
'comment': None,
'decimal': b'.',

# 'engine': 'c',
'parse_dates': False,
Expand Down Expand Up @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'decimal': b'.',
'float_precision': None
}

Expand All @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines',
'warn_bad_lines',
'dtype',
'decimal',
'float_precision',
])

Expand Down Expand Up @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
self.converters = kwds['converters']

self.thousands = kwds['thousands']
self.decimal = kwds['decimal']
self.comment = kwds['comment']
self._comment_lines = []

Expand Down Expand Up @@ -1639,6 +1639,9 @@ def __init__(self, f, **kwds):
else:
self._no_thousands_columns = None

if len(self.decimal) != 1:
raise ValueError('Only length-1 decimal markers supported')

def _set_no_thousands_columns(self):
# Create a set of column ids that are not to be stripped of thousands
# operators.
Expand Down Expand Up @@ -2050,22 +2053,42 @@ def _check_empty(self, lines):
def _check_thousands(self, lines):
if self.thousands is None:
return lines
nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)
nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, self.decimal))
return self._search_replace_num_columns(lines=lines,
search=self.thousands,
replace='',
nonnum=nonnum)

def _search_replace_num_columns(self, lines, search, replace, nonnum):
    """
    Replace ``search`` with ``replace`` in numeric-looking string fields.

    A field is copied through unchanged when it is not a string, does not
    contain ``search``, sits in a column listed in
    ``self._no_thousands_columns``, or contains text matched by
    ``nonnum``.

    Parameters
    ----------
    lines : list of lists
        Rows of field values.
    search : str
        Text to look for in each field.
    replace : str
        Text substituted for every occurrence of ``search``.
    nonnum : compiled regular expression
        Pattern whose ``search`` method finds characters that disqualify
        a field from being rewritten.

    Returns
    -------
    list of lists
        New rows with exactly one output field per input field.
    """
    skip_columns = self._no_thousands_columns
    ret = []
    for row in lines:
        new_row = []
        for i, x in enumerate(row):
            # Only ``search`` is tested here: ``self.thousands`` may be
            # None when this helper is used for the decimal marker.
            if (not isinstance(x, compat.string_types) or
                    search not in x or
                    (skip_columns and i in skip_columns) or
                    nonnum.search(x.strip())):
                new_row.append(x)
            else:
                new_row.append(x.replace(search, replace))
        ret.append(new_row)
    return ret

def _check_decimal(self, lines):
if self.decimal == b'.':
return lines

if self.thousands is None:
nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
else:
nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
self.decimal))
return self._search_replace_num_columns(lines=lines,
search=self.decimal,
replace='.',
nonnum=nonnum)

def _clear_buffer(self):
self.buf = []

Expand Down Expand Up @@ -2249,7 +2272,8 @@ def _get_lines(self, rows=None):
lines = self._check_comments(lines)
if self.skip_blank_lines:
lines = self._check_empty(lines)
return self._check_thousands(lines)
lines = self._check_thousands(lines)
return self._check_decimal(lines)


def _make_date_converter(date_parser=None, dayfirst=False,
Expand Down
45 changes: 0 additions & 45 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
self.assertEqual(result['B'][2], '')

def test_euro_decimal_format(self):
    # Semicolon-delimited input whose numeric columns use ',' as the
    # decimal marker.
    raw = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    frame = self.read_csv(StringIO(raw), sep=';', decimal=',')

    # Every numeric column must come back with a float dtype.
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(frame[column].dtype, float)

def test_custom_lineterminator(self):
data = 'a,b,c~1,2,3~4,5,6'

Expand Down Expand Up @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
data = "\n\n\n"
self.assertRaises(ValueError, self.read_csv, StringIO(data))

def test_1000_sep_with_decimal(self):
    # The same values written with ',' thousands / '.' decimal ...
    comma_thousands = """A|B|C
1|2,334.01|5
10|13|10.
"""
    # ... and with '.' thousands / ',' decimal.
    dot_thousands = """A|B|C
1|2.334,01|5
10|13|10,
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes of the expected frame itself.
    for column, dtype_name in (('A', 'int64'),
                               ('B', 'float'),
                               ('C', 'float')):
        tm.assert_equal(expected[column].dtype, dtype_name)

    # Both readers must produce the expected frame for both conventions.
    cases = [(comma_thousands, ',', '.'), (dot_thousands, '.', ',')]
    for text, thousands, decimal in cases:
        for reader in (self.read_csv, self.read_table):
            result = reader(StringIO(text), sep='|',
                            thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(result, expected)

def test_grow_boundary_at_cap(self):
# See gh-12494
#
Expand Down
45 changes: 45 additions & 0 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
result = self.read_table(f, squeeze=True, header=None)
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
tm.assert_series_equal(result, expected)

def test_1000_sep_with_decimal(self):
    # The same values written with ',' thousands / '.' decimal ...
    comma_thousands = """A|B|C
1|2,334.01|5
10|13|10.
"""
    # ... and with '.' thousands / ',' decimal.
    dot_thousands = """A|B|C
1|2.334,01|5
10|13|10,
"""
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes of the expected frame itself.
    for column, dtype_name in (('A', 'int64'),
                               ('B', 'float'),
                               ('C', 'float')):
        tm.assert_equal(expected[column].dtype, dtype_name)

    # Both readers must produce the expected frame for both conventions.
    cases = [(comma_thousands, ',', '.'), (dot_thousands, '.', ',')]
    for text, thousands, decimal in cases:
        for reader in (self.read_csv, self.read_table):
            result = reader(StringIO(text), sep='|',
                            thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(result, expected)

def test_euro_decimal_format(self):
    # Semicolon-delimited input whose numeric columns use ',' as the
    # decimal marker.
    raw = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    frame = self.read_csv(StringIO(raw), sep=';', decimal=',')

    # Every numeric column must come back with a float dtype.
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(frame[column].dtype, float)

0 comments on commit dc7acd1

Please sign in to comment.