diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 654e5d3bfec0e..7d63d78084270 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -180,19 +180,35 @@ def setup(self, method): raise NotImplementedError win = 100 arr = np.random.rand(100000) - rolling = {'rolling_median': rolling_median, - 'rolling_mean': rolling_mean, - 'rolling_min': rolling_min, - 'rolling_max': rolling_max, - 'rolling_var': rolling_var, - 'rolling_skew': rolling_skew, - 'rolling_kurt': rolling_kurt, - 'rolling_std': rolling_std} - - @test_parallel(num_threads=2) - def parallel_rolling(): - rolling[method](arr, win) - self.parallel_rolling = parallel_rolling + if hasattr(DataFrame, 'rolling'): + rolling = {'rolling_median': 'median', + 'rolling_mean': 'mean', + 'rolling_min': 'min', + 'rolling_max': 'max', + 'rolling_var': 'var', + 'rolling_skew': 'skew', + 'rolling_kurt': 'kurt', + 'rolling_std': 'std'} + df = DataFrame(arr).rolling(win) + + @test_parallel(num_threads=2) + def parallel_rolling(): + getattr(df, rolling[method])() + self.parallel_rolling = parallel_rolling + else: + rolling = {'rolling_median': rolling_median, + 'rolling_mean': rolling_mean, + 'rolling_min': rolling_min, + 'rolling_max': rolling_max, + 'rolling_var': rolling_var, + 'rolling_skew': rolling_skew, + 'rolling_kurt': rolling_kurt, + 'rolling_std': rolling_std} + + @test_parallel(num_threads=2) + def parallel_rolling(): + rolling[method](arr, win) + self.parallel_rolling = parallel_rolling def time_rolling(self, method): self.parallel_rolling() diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 899349cd21f84..45142c53dcd01 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,185 +1,41 @@ -from .pandas_vb_common import * import pandas as pd import numpy as np +from .pandas_vb_common import setup # noqa -class DataframeRolling(object): - goal_time = 0.2 - def setup(self): - self.N = 100000 - self.Ns = 
10000 - self.df = pd.DataFrame({'a': np.random.random(self.N)}) - self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) - self.wins = 10 - self.winl = 1000 +class Methods(object): - def time_rolling_quantile_0(self): - (self.df.rolling(self.wins).quantile(0.0)) + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum', 'corr', 'cov']) + param_names = ['contructor', 'window', 'dtype', 'method'] - def time_rolling_quantile_1(self): - (self.df.rolling(self.wins).quantile(1.0)) + def setup(self, contructor, window, dtype, method): + N = 10**5 + arr = np.random.random(N).astype(dtype) + self.roll = getattr(pd, contructor)(arr).rolling(window) - def time_rolling_quantile_median(self): - (self.df.rolling(self.wins).quantile(0.5)) + def time_rolling(self, contructor, window, dtype, method): + getattr(self.roll, method)() - def time_rolling_median(self): - (self.df.rolling(self.wins).median()) - def time_rolling_mean(self): - (self.df.rolling(self.wins).mean()) +class Quantile(object): - def time_rolling_max(self): - (self.df.rolling(self.wins).max()) + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + [0, 0.5, 1]) + param_names = ['contructor', 'window', 'dtype', 'percentile'] - def time_rolling_min(self): - (self.df.rolling(self.wins).min()) + def setup(self, contructor, window, dtype, percentile): + N = 10**5 + arr = np.random.random(N).astype(dtype) + self.roll = getattr(pd, contructor)(arr).rolling(window) - def time_rolling_std(self): - (self.df.rolling(self.wins).std()) - - def time_rolling_count(self): - (self.df.rolling(self.wins).count()) - - def time_rolling_skew(self): - (self.df.rolling(self.wins).skew()) - - def time_rolling_kurt(self): - (self.df.rolling(self.wins).kurt()) - - def time_rolling_sum(self): - (self.df.rolling(self.wins).sum()) - - def time_rolling_corr(self): - 
(self.dfs.rolling(self.wins).corr()) - - def time_rolling_cov(self): - (self.dfs.rolling(self.wins).cov()) - - def time_rolling_quantile_0_l(self): - (self.df.rolling(self.winl).quantile(0.0)) - - def time_rolling_quantile_1_l(self): - (self.df.rolling(self.winl).quantile(1.0)) - - def time_rolling_quantile_median_l(self): - (self.df.rolling(self.winl).quantile(0.5)) - - def time_rolling_median_l(self): - (self.df.rolling(self.winl).median()) - - def time_rolling_mean_l(self): - (self.df.rolling(self.winl).mean()) - - def time_rolling_max_l(self): - (self.df.rolling(self.winl).max()) - - def time_rolling_min_l(self): - (self.df.rolling(self.winl).min()) - - def time_rolling_std_l(self): - (self.df.rolling(self.wins).std()) - - def time_rolling_count_l(self): - (self.df.rolling(self.wins).count()) - - def time_rolling_skew_l(self): - (self.df.rolling(self.wins).skew()) - - def time_rolling_kurt_l(self): - (self.df.rolling(self.wins).kurt()) - - def time_rolling_sum_l(self): - (self.df.rolling(self.wins).sum()) - - -class SeriesRolling(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.Ns = 10000 - self.df = pd.DataFrame({'a': np.random.random(self.N)}) - self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) - self.sr = self.df.a - self.srs = self.dfs.a - self.wins = 10 - self.winl = 1000 - - def time_rolling_quantile_0(self): - (self.sr.rolling(self.wins).quantile(0.0)) - - def time_rolling_quantile_1(self): - (self.sr.rolling(self.wins).quantile(1.0)) - - def time_rolling_quantile_median(self): - (self.sr.rolling(self.wins).quantile(0.5)) - - def time_rolling_median(self): - (self.sr.rolling(self.wins).median()) - - def time_rolling_mean(self): - (self.sr.rolling(self.wins).mean()) - - def time_rolling_max(self): - (self.sr.rolling(self.wins).max()) - - def time_rolling_min(self): - (self.sr.rolling(self.wins).min()) - - def time_rolling_std(self): - (self.sr.rolling(self.wins).std()) - - def time_rolling_count(self): - 
(self.sr.rolling(self.wins).count()) - - def time_rolling_skew(self): - (self.sr.rolling(self.wins).skew()) - - def time_rolling_kurt(self): - (self.sr.rolling(self.wins).kurt()) - - def time_rolling_sum(self): - (self.sr.rolling(self.wins).sum()) - - def time_rolling_corr(self): - (self.srs.rolling(self.wins).corr()) - - def time_rolling_cov(self): - (self.srs.rolling(self.wins).cov()) - - def time_rolling_quantile_0_l(self): - (self.sr.rolling(self.winl).quantile(0.0)) - - def time_rolling_quantile_1_l(self): - (self.sr.rolling(self.winl).quantile(1.0)) - - def time_rolling_quantile_median_l(self): - (self.sr.rolling(self.winl).quantile(0.5)) - - def time_rolling_median_l(self): - (self.sr.rolling(self.winl).median()) - - def time_rolling_mean_l(self): - (self.sr.rolling(self.winl).mean()) - - def time_rolling_max_l(self): - (self.sr.rolling(self.winl).max()) - - def time_rolling_min_l(self): - (self.sr.rolling(self.winl).min()) - - def time_rolling_std_l(self): - (self.sr.rolling(self.wins).std()) - - def time_rolling_count_l(self): - (self.sr.rolling(self.wins).count()) - - def time_rolling_skew_l(self): - (self.sr.rolling(self.wins).skew()) - - def time_rolling_kurt_l(self): - (self.sr.rolling(self.wins).kurt()) - - def time_rolling_sum_l(self): - (self.sr.rolling(self.wins).sum()) + def time_quantile(self, contructor, window, dtype, percentile): + self.roll.quantile(percentile) diff --git a/doc/plots/stats/moment_plots.py b/doc/plots/stats/moment_plots.py deleted file mode 100644 index 9e3a902592c6b..0000000000000 --- a/doc/plots/stats/moment_plots.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np - -import matplotlib.pyplot as plt -import pandas.util.testing as t -import pandas.stats.moments as m - - -def test_series(n=1000): - t.N = n - s = t.makeTimeSeries() - return s - - -def plot_timeseries(*args, **kwds): - n = len(args) - - fig, axes = plt.subplots(n, 1, figsize=kwds.get('size', (10, 5)), - sharex=True) - titles = kwds.get('titles', None) - - for 
k in range(1, n + 1): - ax = axes[k - 1] - ts = args[k - 1] - ax.plot(ts.index, ts.values) - - if titles: - ax.set_title(titles[k - 1]) - - fig.autofmt_xdate() - fig.subplots_adjust(bottom=0.10, top=0.95) diff --git a/doc/plots/stats/moments_ewma.py b/doc/plots/stats/moments_ewma.py deleted file mode 100644 index 3e521ed60bb8f..0000000000000 --- a/doc/plots/stats/moments_ewma.py +++ /dev/null @@ -1,15 +0,0 @@ -import matplotlib.pyplot as plt -import pandas.util.testing as t -import pandas.stats.moments as m - -t.N = 200 -s = t.makeTimeSeries().cumsum() - -plt.figure(figsize=(10, 5)) -plt.plot(s.index, s.values) -plt.plot(s.index, m.ewma(s, 20, min_periods=1).values) -f = plt.gcf() -f.autofmt_xdate() - -plt.show() -plt.close('all') diff --git a/doc/plots/stats/moments_ewmvol.py b/doc/plots/stats/moments_ewmvol.py deleted file mode 100644 index 093f62868fc4e..0000000000000 --- a/doc/plots/stats/moments_ewmvol.py +++ /dev/null @@ -1,23 +0,0 @@ -import matplotlib.pyplot as plt -import pandas.util.testing as t -import pandas.stats.moments as m - -t.N = 500 -ts = t.makeTimeSeries() -ts[::100] = 20 - -s = ts.cumsum() - - -plt.figure(figsize=(10, 5)) -plt.plot(s.index, m.ewmvol(s, span=50, min_periods=1).values, color='b') -plt.plot(s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') - -plt.title('Exp-weighted std with shocks') -plt.legend(('Exp-weighted', 'Equal-weighted')) - -f = plt.gcf() -f.autofmt_xdate() - -plt.show() -plt.close('all') diff --git a/doc/plots/stats/moments_expw.py b/doc/plots/stats/moments_expw.py deleted file mode 100644 index 5fff419b3a940..0000000000000 --- a/doc/plots/stats/moments_expw.py +++ /dev/null @@ -1,35 +0,0 @@ -from moment_plots import * - -np.random.seed(1) - -ts = test_series(500) * 10 - -# ts[::100] = 20 - -s = ts.cumsum() - -fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True) - -ax0, ax1, ax2 = axes - -ax0.plot(s.index, s.values) -ax0.set_title('time series') - -ax1.plot(s.index, m.ewma(s, span=50, 
min_periods=1).values, color='b') -ax1.plot(s.index, m.rolling_mean(s, 50, min_periods=1).values, color='r') -ax1.set_title('rolling_mean vs. ewma') - -line1 = ax2.plot( - s.index, m.ewmstd(s, span=50, min_periods=1).values, color='b') -line2 = ax2.plot( - s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') -ax2.set_title('rolling_std vs. ewmstd') - -fig.legend((line1, line2), - ('Exp-weighted', 'Equal-weighted'), - loc='upper right') -fig.autofmt_xdate() -fig.subplots_adjust(bottom=0.10, top=0.95) - -plt.show() -plt.close('all') diff --git a/doc/plots/stats/moments_rolling.py b/doc/plots/stats/moments_rolling.py deleted file mode 100644 index 30a6c5f53e20c..0000000000000 --- a/doc/plots/stats/moments_rolling.py +++ /dev/null @@ -1,24 +0,0 @@ -from moment_plots import * - -ts = test_series() -s = ts.cumsum() - -s[20:50] = np.NaN -s[120:150] = np.NaN -plot_timeseries(s, - m.rolling_count(s, 50), - m.rolling_sum(s, 50, min_periods=10), - m.rolling_mean(s, 50, min_periods=10), - m.rolling_std(s, 50, min_periods=10), - m.rolling_skew(s, 50, min_periods=10), - m.rolling_kurt(s, 50, min_periods=10), - size=(10, 12), - titles=('time series', - 'rolling_count', - 'rolling_sum', - 'rolling_mean', - 'rolling_std', - 'rolling_skew', - 'rolling_kurt')) -plt.show() -plt.close('all') diff --git a/doc/plots/stats/moments_rolling_binary.py b/doc/plots/stats/moments_rolling_binary.py deleted file mode 100644 index ab6b7b1c8ff49..0000000000000 --- a/doc/plots/stats/moments_rolling_binary.py +++ /dev/null @@ -1,30 +0,0 @@ -from moment_plots import * - -np.random.seed(1) - -ts = test_series() -s = ts.cumsum() -ts2 = test_series() -s2 = ts2.cumsum() - -s[20:50] = np.NaN -s[120:150] = np.NaN -fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True) - -ax0, ax1, ax2 = axes - -ax0.plot(s.index, s.values) -ax0.plot(s2.index, s2.values) -ax0.set_title('time series') - -ax1.plot(s.index, m.rolling_corr(s, s2, 50, min_periods=1).values) -ax1.set_title('rolling_corr') - 
-ax2.plot(s.index, m.rolling_cov(s, s2, 50, min_periods=1).values) -ax2.set_title('rolling_cov') - -fig.autofmt_xdate() -fig.subplots_adjust(bottom=0.10, top=0.95) - -plt.show() -plt.close('all') diff --git a/doc/source/release.rst b/doc/source/release.rst index a3289b1144863..0298eda2c78ab 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,6 +37,82 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org +pandas 0.21.1 +------------- + +**Release date:** December 12, 2017 + +This is a minor bug-fix release in the 0.21.x series and includes some small +regression fixes, bug fixes and performance improvements. We recommend that all +users upgrade to this version. + +Highlights include: + +- Temporarily restore matplotlib datetime plotting functionality. This should + resolve issues for users who relied implicitly on pandas to plot datetimes + with matplotlib. See :ref:`here `. +- Improvements to the Parquet IO functions introduced in 0.21.0. See + :ref:`here `. + +See the :ref:`v0.21.1 Whatsnew ` overview for an extensive list +of all the changes for 0.21.1. + +Thanks +~~~~~~ + +A total of 46 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. 
+ +Contributors +============ + +* Aaron Critchley + +* Alex Rychyk +* Alexander Buchkovsky + +* Alexander Michael Schade + +* Chris Mazzullo +* Cornelius Riemenschneider + +* Dave Hirschfeld + +* David Fischer + +* David Stansby + +* Dror Atariah + +* Eric Kisslinger + +* Hans + +* Ingolf Becker + +* Jan Werkmann + +* Jeff Reback +* Joris Van den Bossche +* Jörg Döpfert + +* Kevin Kuhl + +* Krzysztof Chomski + +* Leif Walsh +* Licht Takeuchi +* Manraj Singh + +* Matt Braymer-Hayes + +* Michael Waskom + +* Mie~~~ + +* Peter Hoffmann + +* Robert Meyer + +* Sam Cohan + +* Sietse Brouwer + +* Sven + +* Tim Swast +* Tom Augspurger +* Wes Turner +* William Ayd + +* Yee Mey + +* bolkedebruin + +* cgohlke +* derestle-htwg + +* fjdiod + +* gabrielclow + +* gfyoung +* ghasemnaddaf + +* jbrockmendel +* jschendel +* miker985 + +* topper-123 + pandas 0.21.0 ------------- diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 5d7950a667a2f..a7dde5d6ee410 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -1,7 +1,7 @@ .. _whatsnew_0211: -v0.21.1 -------- +v0.21.1 (December 12, 2017) +--------------------------- This is a minor bug-fix release in the 0.21.x series and includes some small regression fixes, bug fixes and performance improvements. @@ -10,8 +10,8 @@ We recommend that all users upgrade to this version. Highlights include: - Temporarily restore matplotlib datetime plotting functionality. This should - resolve issues for users who relied implicitly on pandas to plot datetimes - with matplotlib. See :ref:`here `. + resolve issues for users who implicitly relied on pandas to plot datetimes + with matplotlib. See :ref:`here `. - Improvements to the Parquet IO functions introduced in 0.21.0. See :ref:`here `. @@ -21,7 +21,7 @@ Highlights include: :backlinks: none -.. _whatsnew_0211.special: +.. 
_whatsnew_0211.converters: Restore Matplotlib datetime Converter Registration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -34,8 +34,8 @@ pandas``. In pandas 0.21.0, we required users to explicitly register the converter. This caused problems for some users who relied on those converters being present for regular ``matplotlib.pyplot`` plotting methods, so we're -temporarily reverting that change; pandas will again register the converters on -import. +temporarily reverting that change; pandas 0.21.1 again registers the converters on +import, just like before 0.21.0. We've added a new option to control the converters: ``pd.options.plotting.matplotlib.register_converters``. By default, they are @@ -123,7 +123,7 @@ I/O - Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). 
-- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) +- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the ``numpy.bool_`` datatype (:issue:`18390`) - Bug in :func:`read_json` not decoding when reading line deliminted JSON from S3 (:issue:`17200`) - Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`) - Bug in :func:`to_latex` where repeated multi-index values were not printed even though a higher level index differed from the previous row (:issue:`14484`) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 3958f4719e3bd..1fa4fd79bac78 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -138,8 +138,8 @@ Other Enhancements - :func:`Series` / :func:`DataFrame` tab completion also returns identifiers in the first level of a :func:`MultiIndex`. (:issue:`16326`) - :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`) - :func:``DataFrame.to_json`` and ``Series.to_json`` now accept an ``index`` argument which allows the user to exclude the index from the JSON output (:issue:`17394`) +- ``IntervalIndex.to_tuples()`` has gained the ``na_tuple`` parameter to control whether NA is returned as a tuple of NA, or NA itself (:issue:`18756`) - :func: groupby.is_monotonic_increasing and :func: .is_monotonic_decreasing extend :func: Series.is_monotonic_increasing to groups, returning whether each group is monotonically increasing or decreasing, respectively. (:issue:`17015`) -- .. 
_whatsnew_0220.api_breaking: @@ -265,6 +265,7 @@ Conversion - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) - Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) +- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) Indexing @@ -280,6 +281,8 @@ Indexing - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) - Bug in indexing a datetimelike ``Index`` that raised ``ValueError`` instead of ``IndexError`` (:issue:`18386`). - Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was incorrect (:issue:`17558`) +- :func:`Index.to_series` now accepts ``index`` and ``name`` kwargs (:issue:`18699`) +- :func:`DatetimeIndex.to_series` now accepts ``index`` and ``name`` kwargs (:issue:`18699`) I/O ^^^ @@ -317,7 +320,7 @@ Reshaping - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - +- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) - Numeric @@ -338,4 +341,3 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) -- diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 0e46530e20d1c..8ccc6e036da80 100644 --- 
a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -84,6 +84,11 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, mask = np.isnan(values) {{elif dtype == 'int64'}} mask = values == iNaT + + # create copy in case of iNaT + # values are mutated inplace + if mask.any(): + values = values.copy() {{endif}} # double sort first by mask and then by values to ensure nan values are diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 7b0504388be22..c7035df8ac15c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -220,7 +220,7 @@ def _test_parse_iso8601(object ts): if ts == 'now': return Timestamp.utcnow() elif ts == 'today': - return Timestamp.utcnow().normalize() + return Timestamp.now().normalize() _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) obj.value = dtstruct_to_dt64(&obj.dts) @@ -734,7 +734,7 @@ cdef inline bint _parse_today_now(str val, int64_t* iresult): return True elif val == 'today': # Note: this is *not* the same as Timestamp('today') - iresult[0] = Timestamp.utcnow().normalize().value + iresult[0] = Timestamp.now().normalize().value return True return False diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index a68ecbd2e8629..d7edae865911a 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -33,6 +33,17 @@ cdef int32_t* _month_offset = [ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365, 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366] +# Canonical location for other modules to find name constants +MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] +MONTH_NUMBERS = {name: num for num, name in enumerate(MONTHS)} +MONTH_ALIASES = {(num + 1): name for num, name in enumerate(MONTHS)} +MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)} + +DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] 
+int_to_weekday = {num: name for num, name in enumerate(DAYS)} +weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday} + # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index c12a15b71487b..11e1787cd77da 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -469,7 +469,8 @@ cdef inline void _localize_tso(_TSObject obj, object tz): """ cdef: ndarray[int64_t] trans, deltas - Py_ssize_t delta, posn + int64_t delta + Py_ssize_t posn datetime dt assert obj.tzinfo is None diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 950677b3b53db..18101c834c737 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -139,8 +139,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, int mo_off, dom, doy, dow, ldom _month_offset = np.array( - [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], - [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]], + [[0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365], + [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]], dtype=np.int32) count = len(dtindex) @@ -380,8 +380,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): int mo_off, doy, dow _month_offset = np.array( - [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], - [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]], + [[0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365], + [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]], dtype=np.int32 ) count = len(dtindex) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 29e14103dfe20..933e7ed64b837 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -17,6 +17,7 @@ np.import_array() from util cimport is_string_object, is_integer_object +from 
ccalendar import MONTHS, DAYS from conversion cimport tz_convert_single, pydt_to_i8 from frequencies cimport get_freq_code from nattype cimport NPY_NAT @@ -27,14 +28,9 @@ from np_datetime cimport (pandas_datetimestruct, # --------------------------------------------------------------------- # Constants -# Duplicated in tslib -_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', - 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] -_int_to_month = {(k + 1): v for k, v in enumerate(_MONTHS)} -_month_to_int = {v: k for k, v in _int_to_month.items()} - class WeekDay(object): + # TODO: Remove: This is not used outside of tests MON = 0 TUE = 1 WED = 2 @@ -44,18 +40,6 @@ class WeekDay(object): SUN = 6 -_int_to_weekday = { - WeekDay.MON: 'MON', - WeekDay.TUE: 'TUE', - WeekDay.WED: 'WED', - WeekDay.THU: 'THU', - WeekDay.FRI: 'FRI', - WeekDay.SAT: 'SAT', - WeekDay.SUN: 'SUN'} - -_weekday_to_int = {_int_to_weekday[key]: key for key in _int_to_weekday} - - _offset_to_period_map = { 'WEEKDAY': 'D', 'EOM': 'M', @@ -88,17 +72,16 @@ _offset_to_period_map = { need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS'] for __prefix in need_suffix: - for _m in _MONTHS: + for _m in MONTHS: key = '%s-%s' % (__prefix, _m) _offset_to_period_map[key] = _offset_to_period_map[__prefix] for __prefix in ['A', 'Q']: - for _m in _MONTHS: + for _m in MONTHS: _alias = '%s-%s' % (__prefix, _m) _offset_to_period_map[_alias] = _alias -_days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] -for _d in _days: +for _d in DAYS: _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 8ce1d9cdf2158..a9a5500cd7447 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -41,6 +41,9 @@ from dateutil.relativedelta import relativedelta from dateutil.parser import DEFAULTPARSER from dateutil.parser import parse as du_parse +from ccalendar import MONTH_NUMBERS +from nattype import nat_strings + # 
---------------------------------------------------------------------- # Constants @@ -49,14 +52,8 @@ class DateParseError(ValueError): pass -_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) - _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, second=0, microsecond=0) -_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', - 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] -_MONTH_NUMBERS = {k: i for i, k in enumerate(_MONTHS)} -_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)} cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') @@ -213,7 +210,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, # len(date_string) == 0 # should be NaT??? - if date_string in _nat_strings: + if date_string in nat_strings: return NAT_SENTINEL, NAT_SENTINEL, '' date_string = date_string.upper() @@ -267,7 +264,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, if freq is not None: # hack attack, #1228 try: - mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 + mnum = MONTH_NUMBERS[_get_rule_month(freq)] + 1 except (KeyError, ValueError): msg = ('Unable to retrieve month information from given ' 'freq: {0}').format(freq) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index cf73257caf227..42570e355e2bf 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -32,8 +32,9 @@ from timestamps import Timestamp from timezones cimport is_utc, is_tzlocal, get_utcoffset, get_dst_info from timedeltas cimport delta_to_nanoseconds +from ccalendar import MONTH_NUMBERS from parsing import (parse_time_string, NAT_SENTINEL, - _get_rule_month, _MONTH_NUMBERS) + _get_rule_month) from frequencies cimport get_freq_code from resolution import resolution, Resolution from nattype import nat_strings, NaT, iNaT @@ -1148,7 +1149,7 @@ def _quarter_to_myear(year, quarter, freq): if quarter <= 0 or quarter > 4: raise ValueError('Quarter must be 1 <= q <= 
4') - mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 + mnum = MONTH_NUMBERS[_get_rule_month(freq)] + 1 month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index d2b518c74a1e3..9cb2c450524fb 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -17,13 +17,13 @@ from pandas._libs.khash cimport (khiter_t, from cpython.datetime cimport datetime -from np_datetime cimport (pandas_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct) +from np_datetime cimport pandas_datetimestruct, dt64_to_dtstruct from frequencies cimport get_freq_code from timezones cimport (is_utc, is_tzlocal, maybe_get_tz, get_dst_info, get_utcoffset) from fields import build_field_sarray from conversion import tz_convert +from ccalendar import DAYS, MONTH_ALIASES, int_to_weekday from pandas._libs.properties import cache_readonly from pandas._libs.tslib import Timestamp @@ -50,13 +50,6 @@ _ONE_MINUTE = 60 * _ONE_SECOND _ONE_HOUR = 60 * _ONE_MINUTE _ONE_DAY = 24 * _ONE_HOUR -DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] -_weekday_rule_aliases = {k: v for k, v in enumerate(DAYS)} - -_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', - 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] -_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)} - # ---------------------------------------------------------------------- cpdef resolution(ndarray[int64_t] stamps, tz=None): @@ -354,7 +347,7 @@ class Resolution(object): # Frequency Inference -# TODO: this is non performiant logic here (and duplicative) and this +# TODO: this is non performant logic here (and duplicative) and this # simply should call unique_1d directly # plus no reason to depend on khash directly cdef unique_deltas(ndarray[int64_t] arr): @@ -537,7 +530,7 @@ class _FrequencyInferer(object): annual_rule = self._get_annual_rule() if annual_rule: nyears = self.ydiffs[0] - month = 
_MONTH_ALIASES[self.rep_stamp.month] + month = MONTH_ALIASES[self.rep_stamp.month] alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month) return _maybe_add_count(alias, nyears) @@ -545,7 +538,7 @@ class _FrequencyInferer(object): if quarterly_rule: nquarters = self.mdiffs[0] / 3 mod_dict = {0: 12, 2: 11, 1: 10} - month = _MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] + month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] alias = '{prefix}-{month}'.format(prefix=quarterly_rule, month=month) return _maybe_add_count(alias, nquarters) @@ -558,7 +551,7 @@ class _FrequencyInferer(object): days = self.deltas[0] / _ONE_DAY if days % 7 == 0: # Weekly - day = _weekday_rule_aliases[self.rep_stamp.weekday()] + day = int_to_weekday[self.rep_stamp.weekday()] return _maybe_add_count('W-{day}'.format(day=day), days / 7) else: return _maybe_add_count('D', days) @@ -630,7 +623,7 @@ class _FrequencyInferer(object): # get which week week = week_of_months[0] + 1 - wd = _weekday_rule_aliases[weekdays[0]] + wd = int_to_weekday[weekdays[0]] return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) @@ -642,7 +635,7 @@ class _TimedeltaFrequencyInferer(_FrequencyInferer): days = self.deltas[0] / _ONE_DAY if days % 7 == 0: # Weekly - wd = _weekday_rule_aliases[self.rep_stamp.weekday()] + wd = int_to_weekday[self.rep_stamp.weekday()] alias = 'W-{weekday}'.format(weekday=wd) return _maybe_add_count(alias, days / 7) else: diff --git a/pandas/core/apply.py b/pandas/core/apply.py new file mode 100644 index 0000000000000..2f43087f7dff9 --- /dev/null +++ b/pandas/core/apply.py @@ -0,0 +1,301 @@ +import numpy as np +from pandas import compat +from pandas._libs import lib +from pandas.core.dtypes.common import ( + is_extension_type, + is_sequence) + +from pandas.io.formats.printing import pprint_thing + + +def frame_apply(obj, func, axis=0, broadcast=False, + raw=False, reduce=None, args=(), **kwds): + """ construct and return a row or column based frame apply object """ + + axis 
= obj._get_axis_number(axis) + if axis == 0: + klass = FrameRowApply + elif axis == 1: + klass = FrameColumnApply + + return klass(obj, func, broadcast=broadcast, + raw=raw, reduce=reduce, args=args, kwds=kwds) + + +class FrameApply(object): + + def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): + self.obj = obj + self.broadcast = broadcast + self.raw = raw + self.reduce = reduce + self.args = args + + self.ignore_failures = kwds.pop('ignore_failures', False) + self.kwds = kwds + + # curry if needed + if kwds or args and not isinstance(func, np.ufunc): + def f(x): + return func(x, *args, **kwds) + else: + f = func + + self.f = f + + @property + def columns(self): + return self.obj.columns + + @property + def index(self): + return self.obj.index + + @property + def values(self): + return self.obj.values + + @property + def agg_axis(self): + return self.obj._get_agg_axis(self.axis) + + def get_result(self): + """ compute the results """ + + # all empty + if len(self.columns) == 0 and len(self.index) == 0: + return self.apply_empty_result() + + # string dispatch + if isinstance(self.f, compat.string_types): + if self.axis: + self.kwds['axis'] = self.axis + return getattr(self.obj, self.f)(*self.args, **self.kwds) + + # ufunc + elif isinstance(self.f, np.ufunc): + with np.errstate(all='ignore'): + results = self.f(self.values) + return self.obj._constructor(data=results, index=self.index, + columns=self.columns, copy=False) + + # broadcasting + if self.broadcast: + return self.apply_broadcast() + + # one axis empty + if not all(self.obj.shape): + return self.apply_empty_result() + + # raw + if self.raw and not self.obj._is_mixed_type: + return self.apply_raw() + + return self.apply_standard() + + def apply_empty_result(self): + from pandas import Series + reduce = self.reduce + + if reduce is None: + reduce = False + + EMPTY_SERIES = Series([]) + try: + r = self.f(EMPTY_SERIES, *self.args, **self.kwds) + reduce = not isinstance(r, Series) + except 
Exception: + pass + + if reduce: + return Series(np.nan, index=self.agg_axis) + else: + return self.obj.copy() + + def apply_raw(self): + try: + result = lib.reduce(self.values, self.f, axis=self.axis) + except Exception: + result = np.apply_along_axis(self.f, self.axis, self.values) + + # TODO: mixed type case + from pandas import DataFrame, Series + if result.ndim == 2: + return DataFrame(result, index=self.index, columns=self.columns) + else: + return Series(result, index=self.agg_axis) + + def apply_standard(self): + from pandas import Series + + reduce = self.reduce + if reduce is None: + reduce = True + + # try to reduce first (by default) + # this only matters if the reduction in values is of different dtype + # e.g. if we want to apply to a SparseFrame, then can't directly reduce + if reduce: + values = self.values + + # we cannot reduce using non-numpy dtypes, + # as demonstrated in gh-12244 + if not is_extension_type(values): + + # Create a dummy Series from an empty array + index = self.obj._get_axis(self.axis) + empty_arr = np.empty(len(index), dtype=values.dtype) + + dummy = Series(empty_arr, index=index, dtype=values.dtype) + + try: + labels = self.agg_axis + result = lib.reduce(values, self.f, + axis=self.axis, + dummy=dummy, + labels=labels) + return Series(result, index=labels) + except Exception: + pass + + # compute the result using the series generator + results, res_index, res_columns = self._apply_series_generator() + + # wrap results + return self.wrap_results(results, res_index, res_columns) + + def _apply_series_generator(self): + series_gen = self.series_generator + res_index = self.result_index + res_columns = self.result_columns + + i = None + keys = [] + results = {} + if self.ignore_failures: + successes = [] + for i, v in enumerate(series_gen): + try: + results[i] = self.f(v) + keys.append(v.name) + successes.append(i) + except Exception: + pass + + # so will work with MultiIndex + if len(successes) < len(res_index): + res_index = 
res_index.take(successes) + + else: + try: + for i, v in enumerate(series_gen): + results[i] = self.f(v) + keys.append(v.name) + except Exception as e: + if hasattr(e, 'args'): + + # make sure i is defined + if i is not None: + k = res_index[i] + e.args = e.args + ('occurred at index %s' % + pprint_thing(k), ) + raise + + return results, res_index, res_columns + + def wrap_results(self, results, res_index, res_columns): + from pandas import Series + + if len(results) > 0 and is_sequence(results[0]): + if not isinstance(results[0], Series): + index = res_columns + else: + index = None + + result = self.obj._constructor(data=results, index=index) + result.columns = res_index + + if self.axis == 1: + result = result.T + result = result._convert( + datetime=True, timedelta=True, copy=False) + + else: + + result = Series(results) + result.index = res_index + + return result + + def _apply_broadcast(self, target): + result_values = np.empty_like(target.values) + columns = target.columns + for i, col in enumerate(columns): + result_values[:, i] = self.f(target[col]) + + result = self.obj._constructor(result_values, index=target.index, + columns=target.columns) + return result + + +class FrameRowApply(FrameApply): + axis = 0 + + def get_result(self): + + # dispatch to agg + if isinstance(self.f, (list, dict)): + return self.obj.aggregate(self.f, axis=self.axis, + *self.args, **self.kwds) + + return super(FrameRowApply, self).get_result() + + def apply_broadcast(self): + return self._apply_broadcast(self.obj) + + @property + def series_generator(self): + return (self.obj._ixs(i, axis=1) + for i in range(len(self.columns))) + + @property + def result_index(self): + return self.columns + + @property + def result_columns(self): + return self.index + + +class FrameColumnApply(FrameApply): + axis = 1 + + def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): + super(FrameColumnApply, self).__init__(obj, func, broadcast, + raw, reduce, args, kwds) + + # skip if we are 
mixed datelike and trying reduce across axes + # GH6125 + if self.reduce: + if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type: + self.reduce = False + + def apply_broadcast(self): + return self._apply_broadcast(self.obj.T).T + + @property + def series_generator(self): + from pandas import Series + dtype = object if self.obj._is_mixed_type else None + return (Series._from_array(arr, index=self.columns, name=name, + dtype=dtype) + for i, (arr, name) in enumerate(zip(self.values, + self.index))) + + @property + def result_index(self): + return self.index + + @property + def result_columns(self): + return self.columns diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index e34755e665f8d..356e76df366b4 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -436,9 +436,12 @@ def astype(self, dtype, copy=True): """ if is_categorical_dtype(dtype): - if copy is True: - return self.copy() - return self + # GH 10696/18593 + dtype = self.dtype._update_dtype(dtype) + self = self.copy() if copy else self + if dtype == self.dtype: + return self + return self._set_dtype(dtype) return np.array(self, dtype=dtype, copy=copy) @cache_readonly diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ce6b6148be56..753c623b2de4c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4413,10 +4413,12 @@ def pivot(self, index=None, columns=None, values=None): list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. 
- aggfunc : function or list of functions, default numpy.mean + aggfunc : function, list of functions, dict, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) + If dict is passed, the key is column to aggregate and value + is function or list of functions fill_value : scalar, default None Value to replace missing values with margins : boolean, default False @@ -4452,7 +4454,6 @@ def pivot(self, index=None, columns=None, values=None): >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table - ... # doctest: +NORMALIZE_WHITESPACE C large small A B bar one 4.0 5.0 @@ -4460,6 +4461,28 @@ def pivot(self, index=None, columns=None, values=None): foo one 4.0 1.0 two NaN 6.0 + >>> table = pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc=np.sum) + >>> table + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 + + >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': [min, max, np.mean]}) + >>> table + D E + mean max median min + A C + bar large 5.500000 16 14.5 13 + small 5.500000 15 14.5 14 + foo large 2.000000 10 9.5 9 + small 2.333333 12 11.0 8 + Returns ------- table : DataFrame @@ -4787,8 +4810,7 @@ def aggregate(self, func, axis=0, *args, **kwargs): def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds): - """ - Applies function along input axis of DataFrame. + """Applies function along input axis of DataFrame. Objects passed to functions are Series objects having index either the DataFrame's index (axis=0) or the columns (axis=1). 
@@ -4847,194 +4869,15 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, ------- applied : Series or DataFrame """ - axis = self._get_axis_number(axis) - ignore_failures = kwds.pop('ignore_failures', False) - - # dispatch to agg - if axis == 0 and isinstance(func, (list, dict)): - return self.aggregate(func, axis=axis, *args, **kwds) - - if len(self.columns) == 0 and len(self.index) == 0: - return self._apply_empty_result(func, axis, reduce, *args, **kwds) - - # if we are a string, try to dispatch - if isinstance(func, compat.string_types): - if axis: - kwds['axis'] = axis - return getattr(self, func)(*args, **kwds) - - if kwds or args and not isinstance(func, np.ufunc): - def f(x): - return func(x, *args, **kwds) - else: - f = func - - if isinstance(f, np.ufunc): - with np.errstate(all='ignore'): - results = f(self.values) - return self._constructor(data=results, index=self.index, - columns=self.columns, copy=False) - else: - if not broadcast: - if not all(self.shape): - return self._apply_empty_result(func, axis, reduce, *args, - **kwds) - - if raw and not self._is_mixed_type: - return self._apply_raw(f, axis) - else: - if reduce is None: - reduce = True - return self._apply_standard( - f, axis, - reduce=reduce, - ignore_failures=ignore_failures) - else: - return self._apply_broadcast(f, axis) - - def _apply_empty_result(self, func, axis, reduce, *args, **kwds): - if reduce is None: - reduce = False - try: - reduce = not isinstance(func(_EMPTY_SERIES, *args, **kwds), - Series) - except Exception: - pass - - if reduce: - return Series(np.nan, index=self._get_agg_axis(axis)) - else: - return self.copy() - - def _apply_raw(self, func, axis): - try: - result = lib.reduce(self.values, func, axis=axis) - except Exception: - result = np.apply_along_axis(func, axis, self.values) - - # TODO: mixed type case - if result.ndim == 2: - return DataFrame(result, index=self.index, columns=self.columns) - else: - return Series(result, 
index=self._get_agg_axis(axis)) - - def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): - - # skip if we are mixed datelike and trying reduce across axes - # GH6125 - if (reduce and axis == 1 and self._is_mixed_type and - self._is_datelike_mixed_type): - reduce = False - - # try to reduce first (by default) - # this only matters if the reduction in values is of different dtype - # e.g. if we want to apply to a SparseFrame, then can't directly reduce - if reduce: - values = self.values - - # we cannot reduce using non-numpy dtypes, - # as demonstrated in gh-12244 - if not is_extension_type(values): - # Create a dummy Series from an empty array - index = self._get_axis(axis) - empty_arr = np.empty(len(index), dtype=values.dtype) - dummy = Series(empty_arr, index=self._get_axis(axis), - dtype=values.dtype) - - try: - labels = self._get_agg_axis(axis) - result = lib.reduce(values, func, axis=axis, dummy=dummy, - labels=labels) - return Series(result, index=labels) - except Exception: - pass - - dtype = object if self._is_mixed_type else None - if axis == 0: - series_gen = (self._ixs(i, axis=1) - for i in range(len(self.columns))) - res_index = self.columns - res_columns = self.index - elif axis == 1: - res_index = self.index - res_columns = self.columns - values = self.values - series_gen = (Series._from_array(arr, index=res_columns, name=name, - dtype=dtype) - for i, (arr, name) in enumerate(zip(values, - res_index))) - else: # pragma : no cover - raise AssertionError('Axis must be 0 or 1, got %s' % str(axis)) - - i = None - keys = [] - results = {} - if ignore_failures: - successes = [] - for i, v in enumerate(series_gen): - try: - results[i] = func(v) - keys.append(v.name) - successes.append(i) - except Exception: - pass - # so will work with MultiIndex - if len(successes) < len(res_index): - res_index = res_index.take(successes) - else: - try: - for i, v in enumerate(series_gen): - results[i] = func(v) - keys.append(v.name) - except Exception 
as e: - if hasattr(e, 'args'): - # make sure i is defined - if i is not None: - k = res_index[i] - e.args = e.args + ('occurred at index %s' % - pprint_thing(k), ) - raise - - if len(results) > 0 and is_sequence(results[0]): - if not isinstance(results[0], Series): - index = res_columns - else: - index = None - - result = self._constructor(data=results, index=index) - result.columns = res_index - - if axis == 1: - result = result.T - result = result._convert(datetime=True, timedelta=True, copy=False) - - else: - - result = Series(results) - result.index = res_index - - return result - - def _apply_broadcast(self, func, axis): - if axis == 0: - target = self - elif axis == 1: - target = self.T - else: # pragma: no cover - raise AssertionError('Axis must be 0 or 1, got %s' % axis) - - result_values = np.empty_like(target.values) - columns = target.columns - for i, col in enumerate(columns): - result_values[:, i] = func(target[col]) - - result = self._constructor(result_values, index=target.index, - columns=target.columns) - - if axis == 1: - result = result.T - - return result + from pandas.core.apply import frame_apply + op = frame_apply(self, + func=func, + axis=axis, + broadcast=broadcast, + raw=raw, + reduce=reduce, + args=args, **kwds) + return op.get_result() def applymap(self, func): """ @@ -6166,8 +6009,6 @@ def isin(self, values): ops.add_flex_arithmetic_methods(DataFrame, **ops.frame_flex_funcs) ops.add_special_arithmetic_methods(DataFrame, **ops.frame_special_funcs) -_EMPTY_SERIES = Series([]) - def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9557261e61463..5231dc2deb233 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -983,20 +983,32 @@ def _format_attrs(self): attrs.append(('length', len(self))) return attrs - def to_series(self, **kwargs): + def to_series(self, index=None, name=None): """ Create a Series with both index and 
values equal to the index keys useful with map for returning an indexer based on an index + Parameters + ---------- + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index + Returns ------- Series : dtype will be based on the type of the Index values. """ from pandas import Series - return Series(self._to_embed(), - index=self._shallow_copy(), - name=self.name) + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self._to_embed(), index=index, name=name) def to_frame(self, index=True): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 38e8c24de4bdf..ec5c20d341b50 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -932,7 +932,7 @@ def _get_time_micros(self): values = self._local_timestamps() return fields.get_time_micros(values) - def to_series(self, keep_tz=False): + def to_series(self, keep_tz=False, index=None, name=None): """ Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index @@ -954,15 +954,24 @@ def to_series(self, keep_tz=False): Series will have a datetime64[ns] dtype. TZ aware objects will have the tz removed. + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. 
If None, defaults to name of original + index Returns ------- Series """ from pandas import Series - return Series(self._to_embed(keep_tz), - index=self._shallow_copy(), - name=self.name) + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self._to_embed(keep_tz), index=index, name=name) def _to_embed(self, keep_tz=False, dtype=None): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 292b0f638f821..cb786574909db 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -544,9 +544,31 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): return cls.from_arrays(left, right, closed, name=name, copy=False) - def to_tuples(self): - """Return an Index of tuples of the form (left, right)""" - return Index(_asarray_tuplesafe(zip(self.left, self.right))) + def to_tuples(self, na_tuple=True): + """ + Return an Index of tuples of the form (left, right) + + Parameters + ---------- + na_tuple : boolean, default True + Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA + value itself if False, ``nan``. 
+ + ..versionadded:: 0.22.0 + + Examples + -------- + >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) + >>> idx.to_tuples() + Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') + >>> idx.to_tuples(na_tuple=False) + Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') + """ + tuples = _asarray_tuplesafe(zip(self.left, self.right)) + if not na_tuple: + # GH 18756 + tuples = np.where(~self._isnan, tuples, np.nan) + return Index(tuples) @cache_readonly def _multiindex(self): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4169a001655cb..3a64a0ef84e3d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -54,7 +54,7 @@ import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex -from pandas.core.common import is_null_slice +from pandas.core.common import is_null_slice, _any_not_none import pandas.core.algorithms as algos from pandas.core.index import Index, MultiIndex, _ensure_index @@ -573,7 +573,6 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, raise TypeError(msg) # may need to convert to categorical - # this is only called for non-categoricals if self.is_categorical_astype(dtype): # deprecated 17636 @@ -589,13 +588,16 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, "CategoricalDtype instead", FutureWarning, stacklevel=7) - kwargs = kwargs.copy() - categories = getattr(dtype, 'categories', None) - ordered = getattr(dtype, 'ordered', False) + categories = kwargs.get('categories', None) + ordered = kwargs.get('ordered', None) + if _any_not_none(categories, ordered): + dtype = CategoricalDtype(categories, ordered) - kwargs.setdefault('categories', categories) - kwargs.setdefault('ordered', ordered) - return self.make_block(Categorical(self.values, **kwargs)) + if is_categorical_dtype(self.values): + # GH 10696/18593: update an existing categorical efficiently + return 
self.make_block(self.values.astype(dtype, copy=copy)) + + return self.make_block(Categorical(self.values, dtype=dtype)) # astype processing dtype = np.dtype(dtype) @@ -2427,23 +2429,6 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def _astype(self, dtype, copy=False, errors='raise', values=None, - klass=None, mgr=None): - """ - Coerce to the new type (if copy=True, return a new copy) - raise on an except if raise == True - """ - - if self.is_categorical_astype(dtype): - values = self.values - else: - values = np.asarray(self.values).astype(dtype, copy=False) - - if copy: - values = values.copy() - - return self.make_block(values) - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 36a18d8f8b4a0..05f39a8caa6f6 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -861,11 +861,17 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): new_series, index=self.index, columns=self.columns, default_fill_value=self._default_fill_value, default_kind=self._default_kind).__finalize__(self) - else: - if not broadcast: - return self._apply_standard(func, axis, reduce=reduce) - else: - return self._apply_broadcast(func, axis) + + from pandas.core.apply import frame_apply + op = frame_apply(self, + func=func, + axis=axis, + reduce=reduce) + + if broadcast: + return op.apply_broadcast() + + return op.apply_standard() def applymap(self, func): """ diff --git a/pandas/tests/categorical/test_dtypes.py b/pandas/tests/categorical/test_dtypes.py index 0a41b628bc057..bad2c27026b31 100644 --- a/pandas/tests/categorical/test_dtypes.py +++ b/pandas/tests/categorical/test_dtypes.py @@ -99,10 +99,54 @@ def test_codes_dtypes(self): result = result.remove_categories(['foo%05d' % i for i in range(300)]) assert 
result.codes.dtype == 'int8' - def test_astype_categorical(self): + @pytest.mark.parametrize('ordered', [True, False]) + def test_astype(self, ordered): + # string + cat = Categorical(list('abbaaccc'), ordered=ordered) + result = cat.astype(object) + expected = np.array(cat) + tm.assert_numpy_array_equal(result, expected) + + msg = 'could not convert string to float' + with tm.assert_raises_regex(ValueError, msg): + cat.astype(float) + + # numeric + cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered) + result = cat.astype(object) + expected = np.array(cat, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(int) + expected = np.array(cat, dtype=np.int) + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(float) + expected = np.array(cat, dtype=np.float) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('dtype_ordered', [True, False]) + @pytest.mark.parametrize('cat_ordered', [True, False]) + def test_astype_category(self, dtype_ordered, cat_ordered): + # GH 10696/18593 + data = list('abcaacbab') + cat = Categorical(data, categories=list('bac'), ordered=cat_ordered) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = cat.astype(dtype) + expected = Categorical( + data, categories=cat.categories, ordered=dtype_ordered) + tm.assert_categorical_equal(result, expected) - cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) - tm.assert_categorical_equal(cat, cat.astype('category')) - tm.assert_almost_equal(np.array(cat), cat.astype('object')) + # non-standard categories + dtype = CategoricalDtype(list('adc'), dtype_ordered) + result = cat.astype(dtype) + expected = Categorical(data, dtype=dtype) + tm.assert_categorical_equal(result, expected) - pytest.raises(ValueError, lambda: cat.astype(float)) + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = cat.astype('category') + expected = cat + 
tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4bba6d7601ae8..7014929db4c2d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2214,3 +2214,12 @@ def test_series_broadcasting(self): df_nan.clip_lower(s, axis=0) for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']: getattr(df, op)(s_nan, axis=0) + + def test_series_nat_conversion(self): + # GH 18521 + # Check rank does not mutate DataFrame + df = DataFrame(np.random.randn(10, 3), dtype='float64') + expected = df.copy() + df.rank() + result = df + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index ab2e810d77634..65dd166e1f6a8 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -13,6 +13,7 @@ Timestamp, compat) import pandas as pd from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.apply import frame_apply from pandas.util.testing import (assert_series_equal, assert_frame_equal) import pandas.util.testing as tm @@ -153,8 +154,9 @@ def test_apply_axis1(self): assert tapplied[d] == np.mean(self.frame.xs(d)) def test_apply_ignore_failures(self): - result = self.mixed_frame._apply_standard(np.mean, 0, - ignore_failures=True) + result = frame_apply(self.mixed_frame, + np.mean, 0, + ignore_failures=True).apply_standard() expected = self.mixed_frame._get_numeric_data().apply(np.mean) assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 07e84ad60ef51..8948c5f79900d 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -51,6 +51,25 @@ def test_to_series(self): assert s.index is not idx assert s.name == idx.name + def test_to_series_with_arguments(self): + # GH18699 + + # index kwarg + idx = self.create_index() + s = idx.to_series(index=idx) + + assert s.values is not 
idx.values + assert s.index is idx + assert s.name == idx.name + + # name kwarg + idx = self.create_index() + s = idx.to_series(name='__test') + + assert s.values is not idx.values + assert s.index is not idx + assert s.name != idx.name + def test_to_frame(self): # see gh-15230 idx = self.create_index() diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index bdee67a4ff674..c89e3ddbfc5d0 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -21,7 +21,8 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import is_datetime64_ns_dtype from pandas.util import testing as tm -from pandas.util.testing import assert_series_equal, _skip_if_has_locale +import pandas.util._test_decorators as td +from pandas.util.testing import assert_series_equal from pandas import (isna, to_datetime, Timestamp, Series, DataFrame, Index, DatetimeIndex, NaT, date_range, compat) @@ -143,11 +144,10 @@ def test_to_datetime_format_time(self, cache): for s, format, dt in data: assert to_datetime(s, format=format, cache=cache) == dt + @td.skip_if_has_locale @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_with_non_exact(self, cache): # GH 10834 - tm._skip_if_has_locale() - # 8904 # exact kw if sys.version_info < (2, 7): @@ -187,6 +187,56 @@ def test_to_datetime_format_weeks(self, cache): class TestToDatetime(object): + @td.skip_if_windows # `tm.set_timezone` does not work in windows + def test_to_datetime_now(self): + # See GH#18666 + with tm.set_timezone('US/Eastern'): + npnow = np.datetime64('now').astype('datetime64[ns]') + pdnow = pd.to_datetime('now') + pdnow2 = pd.to_datetime(['now'])[0] + + # These should all be equal with infinite perf; this gives + # a generous margin of 10 seconds + assert abs(pdnow.value - npnow.astype(np.int64)) < 1e10 + assert abs(pdnow2.value - npnow.astype(np.int64)) < 1e10 + + assert 
pdnow.tzinfo is None + assert pdnow2.tzinfo is None + + @td.skip_if_windows # `tm.set_timezone` does not work in windows + def test_to_datetime_today(self): + # See GH#18666 + # Test with one timezone far ahead of UTC and another far behind, so + # one of these will _almost_ alawys be in a different day from UTC. + # Unfortunately this test between 12 and 1 AM Samoa time + # this both of these timezones _and_ UTC will all be in the same day, + # so this test will not detect the regression introduced in #18666. + with tm.set_timezone('Pacific/Auckland'): # 12-13 hours ahead of UTC + nptoday = np.datetime64('today').astype('datetime64[ns]') + pdtoday = pd.to_datetime('today') + pdtoday2 = pd.to_datetime(['today'])[0] + + # These should all be equal with infinite perf; this gives + # a generous margin of 10 seconds + assert abs(pdtoday.value - nptoday.astype(np.int64)) < 1e10 + assert abs(pdtoday2.value - nptoday.astype(np.int64)) < 1e10 + + assert pdtoday.tzinfo is None + assert pdtoday2.tzinfo is None + + with tm.set_timezone('US/Samoa'): # 11 hours behind UTC + nptoday = np.datetime64('today').astype('datetime64[ns]') + pdtoday = pd.to_datetime('today') + pdtoday2 = pd.to_datetime(['today'])[0] + + # These should all be equal with infinite perf; this gives + # a generous margin of 10 seconds + assert abs(pdtoday.value - nptoday.astype(np.int64)) < 1e10 + assert abs(pdtoday2.value - nptoday.astype(np.int64)) < 1e10 + + assert pdtoday.tzinfo is None + assert pdtoday2.tzinfo is None + @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_dt64s(self, cache): in_bound_dts = [ @@ -779,11 +829,10 @@ def test_to_datetime_with_space_in_series(self, cache): result_ignore = to_datetime(s, errors='ignore', cache=cache) tm.assert_series_equal(result_ignore, s) + @td.skip_if_has_locale @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_with_apply(self, cache): # this is only locale tested with US/None locales - tm._skip_if_has_locale() - # GH 
5195 # with a format and coerce a single item to_datetime fails td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3]) @@ -972,9 +1021,9 @@ def test_dayfirst(self, cache): class TestGuessDatetimeFormat(object): + @td.skip_if_not_us_locale @is_dateutil_le_261 def test_guess_datetime_format_for_array(self): - tm._skip_if_not_us_locale() expected_format = '%Y-%m-%d %H:%M:%S.%f' dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) @@ -993,9 +1042,9 @@ def test_guess_datetime_format_for_array(self): [np.nan, np.nan, np.nan], dtype='O')) assert format_for_string_of_nans is None + @td.skip_if_not_us_locale @is_dateutil_gt_261 def test_guess_datetime_format_for_array_gt_261(self): - tm._skip_if_not_us_locale() expected_format = '%Y-%m-%d %H:%M:%S.%f' dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) @@ -1342,9 +1391,9 @@ def test_parsers_timestring(self, cache): assert result4 == exp_now assert result5 == exp_now + @td.skip_if_has_locale def test_parsers_time(self): # GH11818 - _skip_if_has_locale() strings = ["14:15", "1415", "2:15pm", "0215pm", "14:15:00", "141500", "2:15:00pm", "021500pm", time(14, 15)] expected = time(14, 15) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 9cbcfa4f46008..9df23948ae627 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -6,7 +6,8 @@ import pandas.core.indexes.period as period from pandas.compat import lrange from pandas.tseries.frequencies import get_freq -from pandas._libs.tslibs.resolution import _MONTHS as MONTHS + +from pandas._libs.tslibs.ccalendar import MONTHS from pandas._libs.tslibs.period import period_ordinal, period_asfreq from pandas import (PeriodIndex, Period, DatetimeIndex, Timestamp, Series, date_range, to_datetime, period_range) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index ae9e011d76597..543f59013ff12 100644 --- 
a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -411,11 +411,10 @@ def test_astype(self): result = IntervalIndex.from_intervals(result.values) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('copy', [True, False]) @pytest.mark.parametrize('name', [None, 'foo']) @pytest.mark.parametrize('dtype_ordered', [True, False]) @pytest.mark.parametrize('index_ordered', [True, False]) - def test_astype_category(self, copy, name, dtype_ordered, index_ordered): + def test_astype_category(self, name, dtype_ordered, index_ordered): # GH 18630 index = self.create_index(ordered=index_ordered) if name: @@ -423,7 +422,7 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered): # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) - result = index.astype(dtype, copy=copy) + result = index.astype(dtype) expected = CategoricalIndex(index.tolist(), name=name, categories=index.categories, @@ -432,13 +431,13 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered): # non-standard categories dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) - result = index.astype(dtype, copy=copy) + result = index.astype(dtype) expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) tm.assert_index_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once - result = index.astype('category', copy=copy) + result = index.astype('category') expected = index tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index abad930793d7f..c809127a66ab8 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -7,6 +7,7 @@ Interval, IntervalIndex, Index, isna, notna, interval_range, Timestamp, Timedelta, compat, date_range, timedelta_range, DateOffset) from pandas.compat import lzip +from pandas.core.common 
import _asarray_tuplesafe from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree from pandas.tests.indexes.common import Base @@ -1072,6 +1073,45 @@ def test_is_non_overlapping_monotonic(self, closed): idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is True + @pytest.mark.parametrize('tuples', [ + lzip(range(10), range(1, 11)), + lzip(date_range('20170101', periods=10), + date_range('20170101', periods=10)), + lzip(timedelta_range('0 days', periods=10), + timedelta_range('1 day', periods=10))]) + def test_to_tuples(self, tuples): + # GH 18756 + idx = IntervalIndex.from_tuples(tuples) + result = idx.to_tuples() + expected = Index(_asarray_tuplesafe(tuples)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('tuples', [ + lzip(range(10), range(1, 11)) + [np.nan], + lzip(date_range('20170101', periods=10), + date_range('20170101', periods=10)) + [np.nan], + lzip(timedelta_range('0 days', periods=10), + timedelta_range('1 day', periods=10)) + [np.nan]]) + @pytest.mark.parametrize('na_tuple', [True, False]) + def test_to_tuples_na(self, tuples, na_tuple): + # GH 18756 + idx = IntervalIndex.from_tuples(tuples) + result = idx.to_tuples(na_tuple=na_tuple) + + # check the non-NA portion + expected_notna = Index(_asarray_tuplesafe(tuples[:-1])) + result_notna = result[:-1] + tm.assert_index_equal(result_notna, expected_notna) + + # check the NA portion + result_na = result[-1] + if na_tuple: + assert isinstance(result_na, tuple) + assert len(result_na) == 2 + assert all(isna(x) for x in result_na) + else: + assert isna(result_na) + class TestIntervalRange(object): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 752d2deb53304..619a8ca3bf112 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import itertools import pytest import numpy as np @@ 
-13,6 +14,27 @@ ############################################################### +@pytest.fixture(autouse=True, scope='class') +def check_comprehensiveness(request): + # Iterate over combination of dtype, method and klass + # and ensure that each are contained within a collected test + cls = request.cls + combos = itertools.product(cls.klasses, cls.dtypes, [cls.method]) + + def has_test(combo): + klass, dtype, method = combo + cls_funcs = request.node.session.items + return any(klass in x.name and dtype in x.name and + method in x.name for x in cls_funcs) + + for combo in combos: + if not has_test(combo): + msg = 'test method is not defined: {0}, {1}' + raise AssertionError(msg.format(type(cls), combo)) + + yield + + class CoercionBase(object): klasses = ['index', 'series'] @@ -34,15 +56,6 @@ def _assert(self, left, right, dtype): assert left.dtype == dtype assert right.dtype == dtype - def test_has_comprehensive_tests(self): - for klass in self.klasses: - for dtype in self.dtypes: - method_name = 'test_{0}_{1}_{2}'.format(self.method, - klass, dtype) - if not hasattr(self, method_name): - msg = 'test method is not defined: {0}, {1}' - raise AssertionError(msg.format(type(self), method_name)) - class TestSetitemCoercion(CoercionBase): @@ -62,169 +75,124 @@ def _assert_setitem_series_conversion(self, original_series, loc_value, # temp.loc[1] = loc_value # tm.assert_series_equal(temp, expected_series) - def test_setitem_series_object(self): + @pytest.mark.parametrize("val,exp_dtype", [ + (1, np.object), + (1.1, np.object), + (1 + 1j, np.object), + (True, np.object)]) + def test_setitem_series_object(self, val, exp_dtype): obj = pd.Series(list('abcd')) assert obj.dtype == np.object - # object + int -> object - exp = pd.Series(['a', 1, 'c', 'd']) - self._assert_setitem_series_conversion(obj, 1, exp, np.object) - - # object + float -> object - exp = pd.Series(['a', 1.1, 'c', 'd']) - self._assert_setitem_series_conversion(obj, 1.1, exp, np.object) - - # object + complex 
-> object - exp = pd.Series(['a', 1 + 1j, 'c', 'd']) - self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.object) - - # object + bool -> object - exp = pd.Series(['a', True, 'c', 'd']) - self._assert_setitem_series_conversion(obj, True, exp, np.object) + exp = pd.Series(['a', val, 'c', 'd']) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - def test_setitem_series_int64(self): + @pytest.mark.parametrize("val,exp_dtype", [ + (1, np.int64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_setitem_series_int64(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4]) assert obj.dtype == np.int64 - # int + int -> int - exp = pd.Series([1, 1, 3, 4]) - self._assert_setitem_series_conversion(obj, 1, exp, np.int64) + if exp_dtype is np.float64: + exp = pd.Series([1, 1, 3, 4]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.int64) + pytest.xfail("GH12747 The result must be float") - # int + float -> float - # TODO_GH12747 The result must be float - # tm.assert_series_equal(temp, pd.Series([1, 1.1, 3, 4])) - # assert temp.dtype == np.float64 - exp = pd.Series([1, 1, 3, 4]) - self._assert_setitem_series_conversion(obj, 1.1, exp, np.int64) + exp = pd.Series([1, val, 3, 4]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - # int + complex -> complex - exp = pd.Series([1, 1 + 1j, 3, 4]) - self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.complex128) - - # int + bool -> object - exp = pd.Series([1, True, 3, 4]) - self._assert_setitem_series_conversion(obj, True, exp, np.object) - - def test_setitem_series_int8(self): - # integer dtype coercion (no change) + @pytest.mark.parametrize("val,exp_dtype", [ + (np.int32(1), np.int8), + (np.int16(2**9), np.int16)]) + def test_setitem_series_int8(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], dtype=np.int8) assert obj.dtype == np.int8 - exp = pd.Series([1, 1, 3, 4], dtype=np.int8) - self._assert_setitem_series_conversion(obj, np.int32(1), 
exp, np.int8) + if exp_dtype is np.int16: + exp = pd.Series([1, 0, 3, 4], dtype=np.int8) + self._assert_setitem_series_conversion(obj, val, exp, np.int8) + pytest.xfail("BUG: it must be Series([1, 1, 3, 4], dtype=np.int16") - # BUG: it must be Series([1, 1, 3, 4], dtype=np.int16) - exp = pd.Series([1, 0, 3, 4], dtype=np.int8) - self._assert_setitem_series_conversion(obj, np.int16(2**9), exp, - np.int8) + exp = pd.Series([1, val, 3, 4], dtype=np.int8) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - def test_setitem_series_float64(self): + @pytest.mark.parametrize("val,exp_dtype", [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_setitem_series_float64(self, val, exp_dtype): obj = pd.Series([1.1, 2.2, 3.3, 4.4]) assert obj.dtype == np.float64 - # float + int -> float - exp = pd.Series([1.1, 1.0, 3.3, 4.4]) - self._assert_setitem_series_conversion(obj, 1, exp, np.float64) - - # float + float -> float - exp = pd.Series([1.1, 1.1, 3.3, 4.4]) - self._assert_setitem_series_conversion(obj, 1.1, exp, np.float64) - - # float + complex -> complex - exp = pd.Series([1.1, 1 + 1j, 3.3, 4.4]) - self._assert_setitem_series_conversion(obj, 1 + 1j, exp, - np.complex128) - - # float + bool -> object - exp = pd.Series([1.1, True, 3.3, 4.4]) - self._assert_setitem_series_conversion(obj, True, exp, np.object) + exp = pd.Series([1.1, val, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - def test_setitem_series_complex128(self): + @pytest.mark.parametrize("val,exp_dtype", [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_setitem_series_complex128(self, val, exp_dtype): obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 - # complex + int -> complex - exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) - self._assert_setitem_series_conversion(obj, 1, exp, np.complex128) + exp = pd.Series([1 + 1j, val, 3 + 
3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - # complex + float -> complex - exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j]) - self._assert_setitem_series_conversion(obj, 1.1, exp, np.complex128) - - # complex + complex -> complex - exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j]) - self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.complex128) - - # complex + bool -> object - exp = pd.Series([1 + 1j, True, 3 + 3j, 4 + 4j]) - self._assert_setitem_series_conversion(obj, True, exp, np.object) - - def test_setitem_series_bool(self): + @pytest.mark.parametrize("val,exp_dtype", [ + (1, np.int64), + (3, np.int64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.bool)]) + def test_setitem_series_bool(self, val, exp_dtype): obj = pd.Series([True, False, True, False]) assert obj.dtype == np.bool - # bool + int -> int - # TODO_GH12747 The result must be int - # tm.assert_series_equal(temp, pd.Series([1, 1, 1, 0])) - # assert temp.dtype == np.int64 - exp = pd.Series([True, True, True, False]) - self._assert_setitem_series_conversion(obj, 1, exp, np.bool) - - # TODO_GH12747 The result must be int - # assigning int greater than bool - # tm.assert_series_equal(temp, pd.Series([1, 3, 1, 0])) - # assert temp.dtype == np.int64 - exp = pd.Series([True, True, True, False]) - self._assert_setitem_series_conversion(obj, 3, exp, np.bool) - - # bool + float -> float - # TODO_GH12747 The result must be float - # tm.assert_series_equal(temp, pd.Series([1., 1.1, 1., 0.])) - # assert temp.dtype == np.float64 - exp = pd.Series([True, True, True, False]) - self._assert_setitem_series_conversion(obj, 1.1, exp, np.bool) - - # bool + complex -> complex (buggy, results in bool) - # TODO_GH12747 The result must be complex - # tm.assert_series_equal(temp, pd.Series([1, 1 + 1j, 1, 0])) - # assert temp.dtype == np.complex128 - exp = pd.Series([True, True, True, False]) - self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.bool) - - # 
bool + bool -> bool - exp = pd.Series([True, True, True, False]) - self._assert_setitem_series_conversion(obj, True, exp, np.bool) - - def test_setitem_series_datetime64(self): + if exp_dtype is np.int64: + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, np.bool) + pytest.xfail("TODO_GH12747 The result must be int") + elif exp_dtype is np.float64: + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, np.bool) + pytest.xfail("TODO_GH12747 The result must be float") + elif exp_dtype is np.complex128: + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, np.bool) + pytest.xfail("TODO_GH12747 The result must be complex") + + exp = pd.Series([True, val, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize("val,exp_dtype", [ + (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), + (1, np.object), + ('x', np.object)]) + def test_setitem_series_datetime64(self, val, exp_dtype): obj = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-03'), pd.Timestamp('2011-01-04')]) assert obj.dtype == 'datetime64[ns]' - # datetime64 + datetime64 -> datetime64 - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-01'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - self._assert_setitem_series_conversion(obj, pd.Timestamp('2012-01-01'), - exp, 'datetime64[ns]') - - # datetime64 + int -> object - exp = pd.Series([pd.Timestamp('2011-01-01'), - 1, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - self._assert_setitem_series_conversion(obj, 1, exp, 'object') - - # datetime64 + object -> object exp = pd.Series([pd.Timestamp('2011-01-01'), - 'x', + val, pd.Timestamp('2011-01-03'), pd.Timestamp('2011-01-04')]) - self._assert_setitem_series_conversion(obj, 'x', exp, np.object) - - def 
test_setitem_series_datetime64tz(self): + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize("val,exp_dtype", [ + (pd.Timestamp('2012-01-01', tz='US/Eastern'), + 'datetime64[ns, US/Eastern]'), + (pd.Timestamp('2012-01-01', tz='US/Pacific'), np.object), + (pd.Timestamp('2012-01-01'), np.object), + (1, np.object)]) + def test_setitem_series_datetime64tz(self, val, exp_dtype): tz = 'US/Eastern' obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), pd.Timestamp('2011-01-02', tz=tz), @@ -232,71 +200,28 @@ def test_setitem_series_datetime64tz(self): pd.Timestamp('2011-01-04', tz=tz)]) assert obj.dtype == 'datetime64[ns, US/Eastern]' - # datetime64tz + datetime64tz -> datetime64tz - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01', tz=tz) - self._assert_setitem_series_conversion(obj, value, exp, - 'datetime64[ns, US/Eastern]') - - # datetime64tz + datetime64tz (different tz) -> object - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz='US/Pacific'), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01', tz='US/Pacific') - self._assert_setitem_series_conversion(obj, value, exp, np.object) - - # datetime64tz + datetime64 -> object - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01'), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01') - self._assert_setitem_series_conversion(obj, value, exp, np.object) - - # datetime64 + int -> object exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - 1, + val, pd.Timestamp('2011-01-03', tz=tz), pd.Timestamp('2011-01-04', tz=tz)]) - self._assert_setitem_series_conversion(obj, 1, exp, np.object) - - # ToDo: add more tests once the above issue has been fixed + 
self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - def test_setitem_series_timedelta64(self): + @pytest.mark.parametrize("val,exp_dtype", [ + (pd.Timedelta('12 day'), 'timedelta64[ns]'), + (1, np.object), + ('x', np.object)]) + def test_setitem_series_timedelta64(self, val, exp_dtype): obj = pd.Series([pd.Timedelta('1 day'), pd.Timedelta('2 day'), pd.Timedelta('3 day'), pd.Timedelta('4 day')]) assert obj.dtype == 'timedelta64[ns]' - # timedelta64 + timedelta64 -> timedelta64 - exp = pd.Series([pd.Timedelta('1 day'), - pd.Timedelta('12 day'), - pd.Timedelta('3 day'), - pd.Timedelta('4 day')]) - self._assert_setitem_series_conversion(obj, pd.Timedelta('12 day'), - exp, 'timedelta64[ns]') - - # timedelta64 + int -> object - exp = pd.Series([pd.Timedelta('1 day'), - 1, - pd.Timedelta('3 day'), - pd.Timedelta('4 day')]) - self._assert_setitem_series_conversion(obj, 1, exp, np.object) - - # timedelta64 + object -> object exp = pd.Series([pd.Timedelta('1 day'), - 'x', + val, pd.Timedelta('3 day'), pd.Timedelta('4 day')]) - self._assert_setitem_series_conversion(obj, 'x', exp, np.object) - - def test_setitem_series_period(self): - pass + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) def _assert_setitem_index_conversion(self, original_series, loc_key, expected_index, expected_dtype): @@ -315,58 +240,54 @@ def _assert_setitem_index_conversion(self, original_series, loc_key, # check dtype explicitly for sure assert temp.index.dtype == expected_dtype - def test_setitem_index_object(self): + @pytest.mark.parametrize("val,exp_dtype", [ + ('x', np.object), + (5, IndexError), + (1.1, np.object)]) + def test_setitem_index_object(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], index=list('abcd')) assert obj.index.dtype == np.object - # object + object -> object - exp_index = pd.Index(list('abcdx')) - self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) - - # object + int -> IndexError, regarded as location - temp = obj.copy() 
- with pytest.raises(IndexError): - temp[5] = 5 - - # object + float -> object - exp_index = pd.Index(['a', 'b', 'c', 'd', 1.1]) - self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.object) - - def test_setitem_index_int64(self): - # tests setitem with non-existing numeric key + if exp_dtype is IndexError: + temp = obj.copy() + with pytest.raises(exp_dtype): + temp[5] = 5 + else: + exp_index = pd.Index(list('abcd') + [val]) + self._assert_setitem_index_conversion(obj, val, exp_index, + exp_dtype) + + @pytest.mark.parametrize("val,exp_dtype", [ + (5, np.int64), + (1.1, np.float64), + ('x', np.object)]) + def test_setitem_index_int64(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4]) assert obj.index.dtype == np.int64 - # int + int -> int - exp_index = pd.Index([0, 1, 2, 3, 5]) - self._assert_setitem_index_conversion(obj, 5, exp_index, np.int64) - - # int + float -> float - exp_index = pd.Index([0, 1, 2, 3, 1.1]) - self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.float64) - - # int + object -> object - exp_index = pd.Index([0, 1, 2, 3, 'x']) - self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + exp_index = pd.Index([0, 1, 2, 3, val]) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) - def test_setitem_index_float64(self): - # tests setitem with non-existing numeric key + @pytest.mark.parametrize("val,exp_dtype", [ + (5, IndexError), + (5.1, np.float64), + ('x', np.object)]) + def test_setitem_index_float64(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) assert obj.index.dtype == np.float64 - # float + int -> int - temp = obj.copy() - # TODO_GH12747 The result must be float - with pytest.raises(IndexError): - temp[5] = 5 + if exp_dtype is IndexError: + # float + int -> int + temp = obj.copy() + with pytest.raises(exp_dtype): + temp[5] = 5 + pytest.xfail("TODO_GH12747 The result must be float") - # float + float -> float - exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, 5.1]) 
- self._assert_setitem_index_conversion(obj, 5.1, exp_index, np.float64) + exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, val]) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) - # float + object -> object - exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, 'x']) - self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + def test_setitem_series_period(self): + pass def test_setitem_index_complex128(self): pass @@ -400,121 +321,70 @@ def _assert_insert_conversion(self, original, value, tm.assert_index_equal(res, expected) assert res.dtype == expected_dtype - def test_insert_index_object(self): + @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ + (1, 1, np.object), + (1.1, 1.1, np.object), + (False, False, np.object), + ('x', 'x', np.object)]) + def test_insert_index_object(self, insert, coerced_val, coerced_dtype): obj = pd.Index(list('abcd')) assert obj.dtype == np.object - # object + int -> object - exp = pd.Index(['a', 1, 'b', 'c', 'd']) - self._assert_insert_conversion(obj, 1, exp, np.object) - - # object + float -> object - exp = pd.Index(['a', 1.1, 'b', 'c', 'd']) - self._assert_insert_conversion(obj, 1.1, exp, np.object) - - # object + bool -> object - res = obj.insert(1, False) - tm.assert_index_equal(res, pd.Index(['a', False, 'b', 'c', 'd'])) - assert res.dtype == np.object - - # object + object -> object - exp = pd.Index(['a', 'x', 'b', 'c', 'd']) - self._assert_insert_conversion(obj, 'x', exp, np.object) + exp = pd.Index(['a', coerced_val, 'b', 'c', 'd']) + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - def test_insert_index_int64(self): + @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ + (1, 1, np.int64), + (1.1, 1.1, np.float64), + (False, 0, np.int64), + ('x', 'x', np.object)]) + def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): obj = pd.Int64Index([1, 2, 3, 4]) assert obj.dtype == np.int64 - # int + int -> int - exp = pd.Index([1, 1, 2, 3, 4]) - 
self._assert_insert_conversion(obj, 1, exp, np.int64) + exp = pd.Index([1, coerced_val, 2, 3, 4]) + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - # int + float -> float - exp = pd.Index([1, 1.1, 2, 3, 4]) - self._assert_insert_conversion(obj, 1.1, exp, np.float64) - - # int + bool -> int - exp = pd.Index([1, 0, 2, 3, 4]) - self._assert_insert_conversion(obj, False, exp, np.int64) - - # int + object -> object - exp = pd.Index([1, 'x', 2, 3, 4]) - self._assert_insert_conversion(obj, 'x', exp, np.object) - - def test_insert_index_float64(self): + @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ + (1, 1., np.float64), + (1.1, 1.1, np.float64), + (False, 0., np.float64), + ('x', 'x', np.object)]) + def test_insert_index_float64(self, insert, coerced_val, coerced_dtype): obj = pd.Float64Index([1., 2., 3., 4.]) assert obj.dtype == np.float64 - # float + int -> int - exp = pd.Index([1., 1., 2., 3., 4.]) - self._assert_insert_conversion(obj, 1, exp, np.float64) - - # float + float -> float - exp = pd.Index([1., 1.1, 2., 3., 4.]) - self._assert_insert_conversion(obj, 1.1, exp, np.float64) - - # float + bool -> float - exp = pd.Index([1., 0., 2., 3., 4.]) - self._assert_insert_conversion(obj, False, exp, np.float64) + exp = pd.Index([1., coerced_val, 2., 3., 4.]) + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - # float + object -> object - exp = pd.Index([1., 'x', 2., 3., 4.]) - self._assert_insert_conversion(obj, 'x', exp, np.object) - - def test_insert_index_complex128(self): - pass - - def test_insert_index_bool(self): - pass - - def test_insert_index_datetime64(self): + @pytest.mark.parametrize('fill_val,exp_dtype', [ + (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), + (pd.Timestamp('2012-01-01', tz='US/Eastern'), + 'datetime64[ns, US/Eastern]')], + ids=['datetime64', 'datetime64tz']) + def test_insert_index_datetimes(self, fill_val, exp_dtype): obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', - 
'2011-01-04']) - assert obj.dtype == 'datetime64[ns]' + '2011-01-04'], tz=fill_val.tz) + assert obj.dtype == exp_dtype - # datetime64 + datetime64 => datetime64 - exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', '2011-01-02', - '2011-01-03', '2011-01-04']) - self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), - exp, 'datetime64[ns]') + exp = pd.DatetimeIndex(['2011-01-01', fill_val.date(), '2011-01-02', + '2011-01-03', '2011-01-04'], tz=fill_val.tz) + self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) - # ToDo: must coerce to object msg = "Passed item and index have different timezone" - with tm.assert_raises_regex(ValueError, msg): - obj.insert(1, pd.Timestamp('2012-01-01', tz='US/Eastern')) - - # ToDo: must coerce to object - msg = "cannot insert DatetimeIndex with incompatible label" - with tm.assert_raises_regex(TypeError, msg): - obj.insert(1, 1) - - def test_insert_index_datetime64tz(self): - obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', - '2011-01-04'], tz='US/Eastern') - assert obj.dtype == 'datetime64[ns, US/Eastern]' - - # datetime64tz + datetime64tz => datetime64 - exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', '2011-01-02', - '2011-01-03', '2011-01-04'], tz='US/Eastern') - val = pd.Timestamp('2012-01-01', tz='US/Eastern') - self._assert_insert_conversion(obj, val, exp, - 'datetime64[ns, US/Eastern]') + if fill_val.tz: + with tm.assert_raises_regex(ValueError, msg): + obj.insert(1, pd.Timestamp('2012-01-01')) - # ToDo: must coerce to object - msg = "Passed item and index have different timezone" - with tm.assert_raises_regex(ValueError, msg): - obj.insert(1, pd.Timestamp('2012-01-01')) - - # ToDo: must coerce to object - msg = "Passed item and index have different timezone" with tm.assert_raises_regex(ValueError, msg): obj.insert(1, pd.Timestamp('2012-01-01', tz='Asia/Tokyo')) - # ToDo: must coerce to object msg = "cannot insert DatetimeIndex with incompatible label" with 
tm.assert_raises_regex(TypeError, msg): obj.insert(1, 1) + pytest.xfail("ToDo: must coerce to object") + def test_insert_index_timedelta64(self): obj = pd.TimedeltaIndex(['1 day', '2 day', '3 day', '4 day']) assert obj.dtype == 'timedelta64[ns]' @@ -534,41 +404,33 @@ def test_insert_index_timedelta64(self): with tm.assert_raises_regex(TypeError, msg): obj.insert(1, 1) - def test_insert_index_period(self): + @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ + (pd.Period('2012-01', freq='M'), '2012-01', 'period[M]'), + (pd.Timestamp('2012-01-01'), pd.Timestamp('2012-01-01'), np.object), + (1, 1, np.object), + ('x', 'x', np.object)]) + def test_insert_index_period(self, insert, coerced_val, coerced_dtype): obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], freq='M') assert obj.dtype == 'period[M]' - # period + period => period - exp = pd.PeriodIndex(['2011-01', '2012-01', '2011-02', - '2011-03', '2011-04'], freq='M') - self._assert_insert_conversion(obj, pd.Period('2012-01', freq='M'), - exp, 'period[M]') - - # period + datetime64 => object - exp = pd.Index([pd.Period('2011-01', freq='M'), - pd.Timestamp('2012-01-01'), - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-04', freq='M')], freq='M') - self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), - exp, np.object) - - # period + int => object - exp = pd.Index([pd.Period('2011-01', freq='M'), - 1, - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-04', freq='M')], freq='M') - self._assert_insert_conversion(obj, 1, exp, np.object) - - # period + object => object - exp = pd.Index([pd.Period('2011-01', freq='M'), - 'x', - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-04', freq='M')], freq='M') - self._assert_insert_conversion(obj, 'x', exp, np.object) + if isinstance(insert, pd.Period): + index_type = pd.PeriodIndex + else: + index_type = pd.Index + + exp = 
index_type([pd.Period('2011-01', freq='M'), + coerced_val, + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')], freq='M') + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + + def test_insert_index_complex128(self): + pass + + def test_insert_index_bool(self): + pass class TestWhereCoercion(CoercionBase): @@ -582,233 +444,128 @@ def _assert_where_conversion(self, original, cond, values, res = target.where(cond, values) self._assert(res, expected, expected_dtype) - def _where_object_common(self, klass): + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], + ids=['series', 'index']) + @pytest.mark.parametrize("fill_val,exp_dtype", [ + (1, np.object), + (1.1, np.object), + (1 + 1j, np.object), + (True, np.object)]) + def test_where_object(self, klass, fill_val, exp_dtype): obj = klass(list('abcd')) assert obj.dtype == np.object cond = klass([True, False, True, False]) - # object + int -> object - exp = klass(['a', 1, 'c', 1]) - self._assert_where_conversion(obj, cond, 1, exp, np.object) - - values = klass([5, 6, 7, 8]) - exp = klass(['a', 6, 'c', 8]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - - # object + float -> object - exp = klass(['a', 1.1, 'c', 1.1]) - self._assert_where_conversion(obj, cond, 1.1, exp, np.object) - - values = klass([5.5, 6.6, 7.7, 8.8]) - exp = klass(['a', 6.6, 'c', 8.8]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - - # object + complex -> object - exp = klass(['a', 1 + 1j, 'c', 1 + 1j]) - self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.object) - - values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) - exp = klass(['a', 6 + 6j, 'c', 8 + 8j]) - self._assert_where_conversion(obj, cond, values, exp, np.object) + if fill_val is True and klass is pd.Series: + ret_val = 1 + else: + ret_val = fill_val - if klass is pd.Series: - exp = klass(['a', 1, 'c', 1]) - self._assert_where_conversion(obj, cond, True, exp, np.object) + exp 
= klass(['a', ret_val, 'c', ret_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) + if fill_val is True: values = klass([True, False, True, True]) - exp = klass(['a', 0, 'c', 1]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - elif klass is pd.Index: - # object + bool -> object - exp = klass(['a', True, 'c', True]) - self._assert_where_conversion(obj, cond, True, exp, np.object) - - values = klass([True, False, True, True]) - exp = klass(['a', False, 'c', True]) - self._assert_where_conversion(obj, cond, values, exp, np.object) else: - NotImplementedError - - def test_where_series_object(self): - self._where_object_common(pd.Series) - - def test_where_index_object(self): - self._where_object_common(pd.Index) - - def _where_int64_common(self, klass): + values = klass(fill_val * x for x in [5, 6, 7, 8]) + + exp = klass(['a', values[1], 'c', values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], + ids=['series', 'index']) + @pytest.mark.parametrize("fill_val,exp_dtype", [ + (1, np.int64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_where_int64(self, klass, fill_val, exp_dtype): + if klass is pd.Index and exp_dtype is np.complex128: + pytest.skip("Complex Index not supported") obj = klass([1, 2, 3, 4]) assert obj.dtype == np.int64 cond = klass([True, False, True, False]) - # int + int -> int - exp = klass([1, 1, 3, 1]) - self._assert_where_conversion(obj, cond, 1, exp, np.int64) + exp = klass([1, fill_val, 3, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) - values = klass([5, 6, 7, 8]) - exp = klass([1, 6, 3, 8]) - self._assert_where_conversion(obj, cond, values, exp, np.int64) - - # int + float -> float - exp = klass([1, 1.1, 3, 1.1]) - self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) - - values = klass([5.5, 6.6, 7.7, 8.8]) - exp = klass([1, 6.6, 3, 8.8]) 
- self._assert_where_conversion(obj, cond, values, exp, np.float64) - - # int + complex -> complex - if klass is pd.Series: - exp = klass([1, 1 + 1j, 3, 1 + 1j]) - self._assert_where_conversion(obj, cond, 1 + 1j, exp, - np.complex128) - - values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) - exp = klass([1, 6 + 6j, 3, 8 + 8j]) - self._assert_where_conversion(obj, cond, values, exp, - np.complex128) - - # int + bool -> object - exp = klass([1, True, 3, True]) - self._assert_where_conversion(obj, cond, True, exp, np.object) - - values = klass([True, False, True, True]) - exp = klass([1, False, 3, True]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - - def test_where_series_int64(self): - self._where_int64_common(pd.Series) - - def test_where_index_int64(self): - self._where_int64_common(pd.Index) - - def _where_float64_common(self, klass): + if fill_val is True: + values = klass([True, False, True, True]) + else: + values = klass(x * fill_val for x in [5, 6, 7, 8]) + exp = klass([1, values[1], 3, values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], + ids=['series', 'index']) + @pytest.mark.parametrize("fill_val, exp_dtype", [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_where_float64(self, klass, fill_val, exp_dtype): + if klass is pd.Index and exp_dtype is np.complex128: + pytest.skip("Complex Index not supported") obj = klass([1.1, 2.2, 3.3, 4.4]) assert obj.dtype == np.float64 cond = klass([True, False, True, False]) - # float + int -> float - exp = klass([1.1, 1.0, 3.3, 1.0]) - self._assert_where_conversion(obj, cond, 1, exp, np.float64) - - values = klass([5, 6, 7, 8]) - exp = klass([1.1, 6.0, 3.3, 8.0]) - self._assert_where_conversion(obj, cond, values, exp, np.float64) - - # float + float -> float - exp = klass([1.1, 1.1, 3.3, 1.1]) - self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) - - 
values = klass([5.5, 6.6, 7.7, 8.8]) - exp = klass([1.1, 6.6, 3.3, 8.8]) - self._assert_where_conversion(obj, cond, values, exp, np.float64) - - # float + complex -> complex - if klass is pd.Series: - exp = klass([1.1, 1 + 1j, 3.3, 1 + 1j]) - self._assert_where_conversion(obj, cond, 1 + 1j, exp, - np.complex128) - - values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) - exp = klass([1.1, 6 + 6j, 3.3, 8 + 8j]) - self._assert_where_conversion(obj, cond, values, exp, - np.complex128) - - # float + bool -> object - exp = klass([1.1, True, 3.3, True]) - self._assert_where_conversion(obj, cond, True, exp, np.object) + exp = klass([1.1, fill_val, 3.3, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) - values = klass([True, False, True, True]) - exp = klass([1.1, False, 3.3, True]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - - def test_where_series_float64(self): - self._where_float64_common(pd.Series) - - def test_where_index_float64(self): - self._where_float64_common(pd.Index) - - def test_where_series_complex128(self): + if fill_val is True: + values = klass([True, False, True, True]) + else: + values = klass(x * fill_val for x in [5, 6, 7, 8]) + exp = klass([1.1, values[1], 3.3, values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize("fill_val,exp_dtype", [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_where_series_complex128(self, fill_val, exp_dtype): obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 cond = pd.Series([True, False, True, False]) - # complex + int -> complex - exp = pd.Series([1 + 1j, 1, 3 + 3j, 1]) - self._assert_where_conversion(obj, cond, 1, exp, np.complex128) - - values = pd.Series([5, 6, 7, 8]) - exp = pd.Series([1 + 1j, 6.0, 3 + 3j, 8.0]) - self._assert_where_conversion(obj, cond, values, exp, np.complex128) - - # complex + float -> complex - exp = 
pd.Series([1 + 1j, 1.1, 3 + 3j, 1.1]) - self._assert_where_conversion(obj, cond, 1.1, exp, np.complex128) - - values = pd.Series([5.5, 6.6, 7.7, 8.8]) - exp = pd.Series([1 + 1j, 6.6, 3 + 3j, 8.8]) - self._assert_where_conversion(obj, cond, values, exp, np.complex128) - - # complex + complex -> complex - exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 1 + 1j]) - self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.complex128) + exp = pd.Series([1 + 1j, fill_val, 3 + 3j, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) - values = pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) - exp = pd.Series([1 + 1j, 6 + 6j, 3 + 3j, 8 + 8j]) - self._assert_where_conversion(obj, cond, values, exp, np.complex128) - - # complex + bool -> object - exp = pd.Series([1 + 1j, True, 3 + 3j, True]) - self._assert_where_conversion(obj, cond, True, exp, np.object) - - values = pd.Series([True, False, True, True]) - exp = pd.Series([1 + 1j, False, 3 + 3j, True]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - - def test_where_index_complex128(self): - pass + if fill_val is True: + values = pd.Series([True, False, True, True]) + else: + values = pd.Series(x * fill_val for x in [5, 6, 7, 8]) + exp = pd.Series([1 + 1j, values[1], 3 + 3j, values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - def test_where_series_bool(self): + @pytest.mark.parametrize("fill_val,exp_dtype", [ + (1, np.object), + (1.1, np.object), + (1 + 1j, np.object), + (True, np.bool)]) + def test_where_series_bool(self, fill_val, exp_dtype): obj = pd.Series([True, False, True, False]) assert obj.dtype == np.bool cond = pd.Series([True, False, True, False]) - # bool + int -> object - exp = pd.Series([True, 1, True, 1]) - self._assert_where_conversion(obj, cond, 1, exp, np.object) - - values = pd.Series([5, 6, 7, 8]) - exp = pd.Series([True, 6, True, 8]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - - # bool + float -> object - exp = 
pd.Series([True, 1.1, True, 1.1]) - self._assert_where_conversion(obj, cond, 1.1, exp, np.object) - - values = pd.Series([5.5, 6.6, 7.7, 8.8]) - exp = pd.Series([True, 6.6, True, 8.8]) - self._assert_where_conversion(obj, cond, values, exp, np.object) + exp = pd.Series([True, fill_val, True, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) - # bool + complex -> object - exp = pd.Series([True, 1 + 1j, True, 1 + 1j]) - self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.object) - - values = pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) - exp = pd.Series([True, 6 + 6j, True, 8 + 8j]) - self._assert_where_conversion(obj, cond, values, exp, np.object) - - # bool + bool -> bool - exp = pd.Series([True, True, True, True]) - self._assert_where_conversion(obj, cond, True, exp, np.bool) - - values = pd.Series([True, False, True, True]) - exp = pd.Series([True, False, True, True]) - self._assert_where_conversion(obj, cond, values, exp, np.bool) - - def test_where_index_bool(self): - pass - - def test_where_series_datetime64(self): + if fill_val is True: + values = pd.Series([True, False, True, True]) + else: + values = pd.Series(x * fill_val for x in [5, 6, 7, 8]) + exp = pd.Series([True, values[1], True, values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize("fill_val,exp_dtype", [ + (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), + (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)], + ids=['datetime64', 'datetime64tz']) + def test_where_series_datetime64(self, fill_val, exp_dtype): obj = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-03'), @@ -816,46 +573,29 @@ def test_where_series_datetime64(self): assert obj.dtype == 'datetime64[ns]' cond = pd.Series([True, False, True, False]) - # datetime64 + datetime64 -> datetime64 - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-01'), - pd.Timestamp('2011-01-03'), - 
pd.Timestamp('2012-01-01')]) - self._assert_where_conversion(obj, cond, pd.Timestamp('2012-01-01'), - exp, 'datetime64[ns]') - - values = pd.Series([pd.Timestamp('2012-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2012-01-03'), - pd.Timestamp('2012-01-04')]) - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04')]) - self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + exp = pd.Series([pd.Timestamp('2011-01-01'), fill_val, + pd.Timestamp('2011-01-03'), fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) - # datetime64 + datetime64tz -> object - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-01', tz='US/Eastern')]) - self._assert_where_conversion( - obj, cond, - pd.Timestamp('2012-01-01', tz='US/Eastern'), - exp, np.object) - - # ToDo: do not coerce to UTC, must be object - values = pd.Series([pd.Timestamp('2012-01-01', tz='US/Eastern'), - pd.Timestamp('2012-01-02', tz='US/Eastern'), - pd.Timestamp('2012-01-03', tz='US/Eastern'), - pd.Timestamp('2012-01-04', tz='US/Eastern')]) - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02 05:00'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04 05:00')]) - self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') - - def test_where_index_datetime64(self): + values = pd.Series(pd.date_range(fill_val, periods=4)) + if fill_val.tz: + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02 05:00'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04 05:00')]) + self._assert_where_conversion(obj, cond, values, exp, + 'datetime64[ns]') + pytest.xfail("ToDo: do not coerce to UTC, must be object") + + exp = pd.Series([pd.Timestamp('2011-01-01'), values[1], + pd.Timestamp('2011-01-03'), values[3]]) + self._assert_where_conversion(obj, 
cond, values, exp, exp_dtype) + + @pytest.mark.parametrize("fill_val,exp_dtype", [ + (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), + (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)], + ids=['datetime64', 'datetime64tz']) + def test_where_index_datetime(self, fill_val, exp_dtype): obj = pd.Index([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-03'), @@ -863,38 +603,30 @@ def test_where_index_datetime64(self): assert obj.dtype == 'datetime64[ns]' cond = pd.Index([True, False, True, False]) - # datetime64 + datetime64 -> datetime64 - # must support scalar - msg = "cannot coerce a Timestamp with a tz on a naive Block" - with pytest.raises(TypeError): - obj.where(cond, pd.Timestamp('2012-01-01')) - - values = pd.Index([pd.Timestamp('2012-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2012-01-03'), - pd.Timestamp('2012-01-04')]) - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04')]) - self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') - - # ToDo: coerce to object msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " "of some kind") with tm.assert_raises_regex(TypeError, msg): - obj.where(cond, pd.Timestamp('2012-01-01', tz='US/Eastern')) + obj.where(cond, fill_val) - # ToDo: do not ignore timezone, must be object - values = pd.Index([pd.Timestamp('2012-01-01', tz='US/Eastern'), - pd.Timestamp('2012-01-02', tz='US/Eastern'), - pd.Timestamp('2012-01-03', tz='US/Eastern'), - pd.Timestamp('2012-01-04', tz='US/Eastern')]) + values = pd.Index(pd.date_range(fill_val, periods=4)) exp = pd.Index([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-02'), pd.Timestamp('2011-01-03'), pd.Timestamp('2012-01-04')]) - self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + if fill_val.tz: + self._assert_where_conversion(obj, cond, values, exp, + 'datetime64[ns]') + pytest.xfail("ToDo: do not ignore timezone, 
must be object") + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + pytest.xfail("datetime64 + datetime64 -> datetime64 must support" + " scalar") + + def test_where_index_complex128(self): + pass + + def test_where_index_bool(self): + pass def test_where_series_datetime64tz(self): pass @@ -921,6 +653,9 @@ class TestFillnaSeriesCoercion(CoercionBase): method = 'fillna' + def test_has_comprehensive_tests(self): + pass + def _assert_fillna_conversion(self, original, value, expected, expected_dtype): """ test coercion triggered by fillna """ @@ -928,181 +663,105 @@ def _assert_fillna_conversion(self, original, value, res = target.fillna(value) self._assert(res, expected, expected_dtype) - def _fillna_object_common(self, klass): + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], + ids=['series', 'index']) + @pytest.mark.parametrize("fill_val, fill_dtype", [ + (1, np.object), + (1.1, np.object), + (1 + 1j, np.object), + (True, np.object)]) + def test_fillna_object(self, klass, fill_val, fill_dtype): obj = klass(['a', np.nan, 'c', 'd']) assert obj.dtype == np.object - # object + int -> object - exp = klass(['a', 1, 'c', 'd']) - self._assert_fillna_conversion(obj, 1, exp, np.object) - - # object + float -> object - exp = klass(['a', 1.1, 'c', 'd']) - self._assert_fillna_conversion(obj, 1.1, exp, np.object) - - # object + complex -> object - exp = klass(['a', 1 + 1j, 'c', 'd']) - self._assert_fillna_conversion(obj, 1 + 1j, exp, np.object) - - # object + bool -> object - exp = klass(['a', True, 'c', 'd']) - self._assert_fillna_conversion(obj, True, exp, np.object) - - def test_fillna_series_object(self): - self._fillna_object_common(pd.Series) - - def test_fillna_index_object(self): - self._fillna_object_common(pd.Index) - - def test_fillna_series_int64(self): - # int can't hold NaN - pass - - def test_fillna_index_int64(self): - pass - - def _fillna_float64_common(self, klass, complex): + exp = klass(['a', fill_val, 'c', 'd']) + 
self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], + ids=['series', 'index']) + @pytest.mark.parametrize("fill_val,fill_dtype", [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_fillna_float64(self, klass, fill_val, fill_dtype): obj = klass([1.1, np.nan, 3.3, 4.4]) assert obj.dtype == np.float64 - # float + int -> float - exp = klass([1.1, 1.0, 3.3, 4.4]) - self._assert_fillna_conversion(obj, 1, exp, np.float64) - - # float + float -> float - exp = klass([1.1, 1.1, 3.3, 4.4]) - self._assert_fillna_conversion(obj, 1.1, exp, np.float64) - + exp = klass([1.1, fill_val, 3.3, 4.4]) # float + complex -> we don't support a complex Index # complex for Series, # object for Index - exp = klass([1.1, 1 + 1j, 3.3, 4.4]) - self._assert_fillna_conversion(obj, 1 + 1j, exp, complex) - - # float + bool -> object - exp = klass([1.1, True, 3.3, 4.4]) - self._assert_fillna_conversion(obj, True, exp, np.object) - - def test_fillna_series_float64(self): - self._fillna_float64_common(pd.Series, complex=np.complex128) - - def test_fillna_index_float64(self): - self._fillna_float64_common(pd.Index, complex=np.object) - - def test_fillna_series_complex128(self): + if fill_dtype == np.complex128 and klass == pd.Index: + fill_dtype = np.object + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize("fill_val,fill_dtype", [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object)]) + def test_fillna_series_complex128(self, fill_val, fill_dtype): obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 - # complex + int -> complex - exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) - self._assert_fillna_conversion(obj, 1, exp, np.complex128) - - # complex + float -> complex - exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j]) - self._assert_fillna_conversion(obj, 1.1, exp, 
np.complex128) - - # complex + complex -> complex - exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j]) - self._assert_fillna_conversion(obj, 1 + 1j, exp, np.complex128) - - # complex + bool -> object - exp = pd.Series([1 + 1j, True, 3 + 3j, 4 + 4j]) - self._assert_fillna_conversion(obj, True, exp, np.object) - - def test_fillna_index_complex128(self): - self._fillna_float64_common(pd.Index, complex=np.object) - - def test_fillna_series_bool(self): - # bool can't hold NaN - pass - - def test_fillna_index_bool(self): - pass - - def test_fillna_series_datetime64(self): - obj = pd.Series([pd.Timestamp('2011-01-01'), - pd.NaT, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) + exp = pd.Series([1 + 1j, fill_val, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], + ids=['series', 'index']) + @pytest.mark.parametrize("fill_val,fill_dtype", [ + (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), + (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object), + (1, np.object), ('x', np.object)], + ids=['datetime64', 'datetime64tz', 'object', 'object']) + def test_fillna_datetime(self, klass, fill_val, fill_dtype): + obj = klass([pd.Timestamp('2011-01-01'), + pd.NaT, + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) assert obj.dtype == 'datetime64[ns]' - # datetime64 + datetime64 => datetime64 - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-01'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - self._assert_fillna_conversion(obj, pd.Timestamp('2012-01-01'), - exp, 'datetime64[ns]') - - # datetime64 + datetime64tz => object - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - value = pd.Timestamp('2012-01-01', tz='US/Eastern') - self._assert_fillna_conversion(obj, value, exp, np.object) - - # datetime64 + int => object - exp = 
pd.Series([pd.Timestamp('2011-01-01'), - 1, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - self._assert_fillna_conversion(obj, 1, exp, 'object') - - # datetime64 + object => object - exp = pd.Series([pd.Timestamp('2011-01-01'), - 'x', - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - self._assert_fillna_conversion(obj, 'x', exp, np.object) - - def test_fillna_series_datetime64tz(self): + exp = klass([pd.Timestamp('2011-01-01'), + fill_val, + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize("klass", [pd.Series, pd.Index]) + @pytest.mark.parametrize("fill_val,fill_dtype", [ + (pd.Timestamp('2012-01-01', tz='US/Eastern'), + 'datetime64[ns, US/Eastern]'), + (pd.Timestamp('2012-01-01'), np.object), + (pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), np.object), + (1, np.object), + ('x', np.object)]) + def test_fillna_datetime64tz(self, klass, fill_val, fill_dtype): tz = 'US/Eastern' - obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.NaT, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) + obj = klass([pd.Timestamp('2011-01-01', tz=tz), + pd.NaT, + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) assert obj.dtype == 'datetime64[ns, US/Eastern]' - # datetime64tz + datetime64tz => datetime64tz - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01', tz=tz) - self._assert_fillna_conversion(obj, value, exp, - 'datetime64[ns, US/Eastern]') + exp = klass([pd.Timestamp('2011-01-01', tz=tz), + fill_val, + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - # datetime64tz + datetime64 => object - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - 
pd.Timestamp('2012-01-01'), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01') - self._assert_fillna_conversion(obj, value, exp, np.object) + def test_fillna_series_int64(self): + pass - # datetime64tz + datetime64tz(different tz) => object - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo') - self._assert_fillna_conversion(obj, value, exp, np.object) + def test_fillna_index_int64(self): + pass - # datetime64tz + int => object - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - 1, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - self._assert_fillna_conversion(obj, 1, exp, np.object) + def test_fillna_series_bool(self): + pass - # datetime64tz + object => object - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - 'x', - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - self._assert_fillna_conversion(obj, 'x', exp, np.object) + def test_fillna_index_bool(self): + pass def test_fillna_series_timedelta64(self): pass @@ -1110,83 +769,6 @@ def test_fillna_series_timedelta64(self): def test_fillna_series_period(self): pass - def test_fillna_index_datetime64(self): - obj = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', - '2011-01-04']) - assert obj.dtype == 'datetime64[ns]' - - # datetime64 + datetime64 => datetime64 - exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', - '2011-01-03', '2011-01-04']) - self._assert_fillna_conversion(obj, pd.Timestamp('2012-01-01'), - exp, 'datetime64[ns]') - - # datetime64 + datetime64tz => object - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - value = pd.Timestamp('2012-01-01', tz='US/Eastern') - self._assert_fillna_conversion(obj, 
value, exp, np.object) - - # datetime64 + int => object - exp = pd.Index([pd.Timestamp('2011-01-01'), - 1, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - self._assert_fillna_conversion(obj, 1, exp, np.object) - - # datetime64 + object => object - exp = pd.Index([pd.Timestamp('2011-01-01'), - 'x', - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - self._assert_fillna_conversion(obj, 'x', exp, np.object) - - def test_fillna_index_datetime64tz(self): - tz = 'US/Eastern' - - obj = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', - '2011-01-04'], tz=tz) - assert obj.dtype == 'datetime64[ns, US/Eastern]' - - # datetime64tz + datetime64tz => datetime64tz - exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', - '2011-01-03', '2011-01-04'], tz=tz) - value = pd.Timestamp('2012-01-01', tz=tz) - self._assert_fillna_conversion(obj, value, exp, - 'datetime64[ns, US/Eastern]') - - # datetime64tz + datetime64 => object - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01'), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01') - self._assert_fillna_conversion(obj, value, exp, np.object) - - # datetime64tz + datetime64tz(different tz) => object - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo') - self._assert_fillna_conversion(obj, value, exp, np.object) - - # datetime64tz + int => object - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - 1, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - self._assert_fillna_conversion(obj, 1, exp, np.object) - - # datetime64tz + object => object - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - 'x', - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - self._assert_fillna_conversion(obj, 'x', exp, 
np.object) - def test_fillna_index_timedelta64(self): pass @@ -1196,38 +778,49 @@ def test_fillna_index_period(self): class TestReplaceSeriesCoercion(CoercionBase): - # not indexing, but place here for consisntency - klasses = ['series'] method = 'replace' - def setup_method(self, method): - self.rep = {} - self.rep['object'] = ['a', 'b'] - self.rep['int64'] = [4, 5] - self.rep['float64'] = [1.1, 2.2] - self.rep['complex128'] = [1 + 1j, 2 + 2j] - self.rep['bool'] = [True, False] - self.rep['datetime64[ns]'] = [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-03')] - - for tz in ['UTC', 'US/Eastern']: - # to test tz => different tz replacement - key = 'datetime64[ns, {0}]'.format(tz) - self.rep[key] = [pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-03', tz=tz)] - - self.rep['timedelta64[ns]'] = [pd.Timedelta('1 day'), - pd.Timedelta('2 day')] - - def _assert_replace_conversion(self, from_key, to_key, how): + rep = {} + rep['object'] = ['a', 'b'] + rep['int64'] = [4, 5] + rep['float64'] = [1.1, 2.2] + rep['complex128'] = [1 + 1j, 2 + 2j] + rep['bool'] = [True, False] + rep['datetime64[ns]'] = [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-03')] + + for tz in ['UTC', 'US/Eastern']: + # to test tz => different tz replacement + key = 'datetime64[ns, {0}]'.format(tz) + rep[key] = [pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-03', tz=tz)] + + rep['timedelta64[ns]'] = [pd.Timedelta('1 day'), + pd.Timedelta('2 day')] + + @pytest.mark.parametrize('how', ['dict', 'series']) + @pytest.mark.parametrize('to_key', [ + 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]', + 'datetime64[ns, UTC]', 'datetime64[ns, US/Eastern]', 'timedelta64[ns]' + ], ids=['object', 'int64', 'float64', 'complex128', 'bool', + 'datetime64', 'datetime64tz', 'datetime64tz', 'timedelta64']) + @pytest.mark.parametrize('from_key', [ + 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]', + 'datetime64[ns, UTC]', 'datetime64[ns, 
US/Eastern]', 'timedelta64[ns]'] + ) + def test_replace_series(self, how, to_key, from_key): + if from_key == 'bool' and how == 'series' and compat.PY3: + # doesn't work in PY3, though ...dict_from_bool works fine + pytest.skip("doesn't work as in PY3") + index = pd.Index([3, 4], name='xxx') obj = pd.Series(self.rep[from_key], index=index, name='yyy') assert obj.dtype == from_key if (from_key.startswith('datetime') and to_key.startswith('datetime')): - # different tz, currently mask_missing raises SystemError - return + pytest.xfail("different tz, currently mask_missing " + "raises SystemError") if how == 'dict': replacer = dict(zip(self.rep[from_key], self.rep[to_key])) @@ -1242,7 +835,6 @@ def _assert_replace_conversion(self, from_key, to_key, how): (from_key == 'complex128' and to_key in ('int64', 'float64'))): - # buggy on 32-bit / window if compat.is_platform_32bit() or compat.is_platform_windows(): pytest.skip("32-bit platform buggy: {0} -> {1}".format (from_key, to_key)) @@ -1257,77 +849,5 @@ def _assert_replace_conversion(self, from_key, to_key, how): tm.assert_series_equal(result, exp) - def test_replace_series_object(self): - from_key = 'object' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') - - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') - - def test_replace_series_int64(self): - from_key = 'int64' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') - - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') - - def test_replace_series_float64(self): - from_key = 'float64' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') - - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') - - def test_replace_series_complex128(self): - from_key = 'complex128' - for to_key in self.rep: - self._assert_replace_conversion(from_key, 
to_key, how='dict') - - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') - - def test_replace_series_bool(self): - from_key = 'bool' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') - - for to_key in self.rep: - - if compat.PY3: - # doesn't work in PY3, though ...dict_from_bool works fine - pytest.skip("doesn't work as in PY3") - - self._assert_replace_conversion(from_key, to_key, how='series') - - def test_replace_series_datetime64(self): - from_key = 'datetime64[ns]' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') - - from_key = 'datetime64[ns]' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') - - def test_replace_series_datetime64tz(self): - from_key = 'datetime64[ns, US/Eastern]' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') - - from_key = 'datetime64[ns, US/Eastern]' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') - - def test_replace_series_timedelta64(self): - from_key = 'timedelta64[ns]' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') - - from_key = 'timedelta64[ns]' - for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_series_period(self): pass diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 600dd843fcdb4..0a0f39595c7e6 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -400,7 +400,6 @@ def test_to_latex_longtable(self, frame): 1 & 2 & b2 \\ \end{longtable} """ - open("expected.txt", "w").write(withindex_result) assert withindex_result == withindex_expected withoutindex_result = df.to_latex(index=False, longtable=True) diff --git a/pandas/tests/io/sas/test_sas7bdat.py 
b/pandas/tests/io/sas/test_sas7bdat.py index c059f01ecf3f4..5da347e47957c 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,6 +1,7 @@ import pandas as pd from pandas.compat import PY2 import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.errors import EmptyDataError import os import io @@ -71,8 +72,8 @@ def test_from_iterator(self): tm.assert_frame_equal(df, df0.iloc[2:5, :]) rdr.close() + @td.skip_if_no('pathlib') def test_path_pathlib(self): - tm._skip_if_no_pathlib() from pathlib import Path for j in 0, 1: df0 = self.data[j] @@ -82,8 +83,8 @@ def test_path_pathlib(self): df = pd.read_sas(fname, encoding='utf-8') tm.assert_frame_equal(df, df0) + @td.skip_if_no('py.path') def test_path_localpath(self): - tm._skip_if_no_localpath() from py.path import local as LocalPath for j in 0, 1: df0 = self.data[j] diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 707580bfe9601..13a393d9109ae 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -8,6 +8,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.io import common from pandas.compat import is_platform_windows, StringIO, FileNotFoundError @@ -67,17 +68,15 @@ def test_expand_user_normal_path(self): assert expanded_name == filename assert os.path.expanduser(filename) == expanded_name + @td.skip_if_no('pathlib') def test_stringify_path_pathlib(self): - tm._skip_if_no_pathlib() - rel_path = common._stringify_path(Path('.')) assert rel_path == '.' 
redundant_path = common._stringify_path(Path('foo//bar')) assert redundant_path == os.path.join('foo', 'bar') + @td.skip_if_no('py.path') def test_stringify_path_localpath(self): - tm._skip_if_no_localpath() - path = os.path.join('foo', 'bar') abs_path = os.path.abspath(path) lpath = LocalPath(path) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4efeeecf8ee4a..274d60c40e83f 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -13,6 +13,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex from pandas.compat import u, range, map, BytesIO, iteritems from pandas.core.config import set_option, get_option @@ -650,11 +651,10 @@ def test_read_from_file_url(self): tm.assert_frame_equal(url_table, local_table) + @td.skip_if_no('pathlib') def test_read_from_pathlib_path(self): # GH12655 - tm._skip_if_no_pathlib() - from pathlib import Path str_path = os.path.join(self.dirpath, 'test1' + self.ext) @@ -665,11 +665,10 @@ def test_read_from_pathlib_path(self): tm.assert_frame_equal(expected, actual) + @td.skip_if_no('py.path') def test_read_from_py_localpath(self): # GH12655 - tm._skip_if_no_localpath() - from py.path import local as LocalPath str_path = os.path.join(self.dirpath, 'test1' + self.ext) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 6df31b73da9b7..d63764e90d26e 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -5119,11 +5119,10 @@ def test_read_nokey_empty(self): store.close() pytest.raises(ValueError, read_hdf, path) + @td.skip_if_no('pathlib') def test_read_from_pathlib_path(self): # GH11773 - tm._skip_if_no_pathlib() - from pathlib import Path expected = DataFrame(np.random.rand(4, 5), @@ -5137,11 +5136,10 @@ def test_read_from_pathlib_path(self): tm.assert_frame_equal(expected, actual) + @td.skip_if_no('py.path') def 
test_read_from_py_localpath(self): # GH11773 - tm._skip_if_no_localpath() - from py.path import local as LocalPath expected = DataFrame(np.random.rand(4, 5), diff --git a/pandas/tests/scalar/test_parsing.py b/pandas/tests/scalar/test_parsing.py index 70961755ceec9..bff0de649ac5e 100644 --- a/pandas/tests/scalar/test_parsing.py +++ b/pandas/tests/scalar/test_parsing.py @@ -6,10 +6,52 @@ import numpy as np import pytest from dateutil.parser import parse + +import pandas as pd +import pandas.util._test_decorators as td from pandas.conftest import is_dateutil_le_261, is_dateutil_gt_261 from pandas import compat from pandas.util import testing as tm from pandas._libs.tslibs import parsing +from pandas._libs.tslibs.parsing import parse_time_string + + +def test_to_datetime1(): + actual = pd.to_datetime(datetime(2008, 1, 15)) + assert actual == datetime(2008, 1, 15) + + actual = pd.to_datetime('20080115') + assert actual == datetime(2008, 1, 15) + + # unparseable + s = 'Month 1, 1999' + assert pd.to_datetime(s, errors='ignore') == s + + +class TestParseQuarters(object): + + def test_parse_time_string(self): + (date, parsed, reso) = parse_time_string('4Q1984') + (date_lower, parsed_lower, reso_lower) = parse_time_string('4q1984') + assert date == date_lower + assert parsed == parsed_lower + assert reso == reso_lower + + def test_parse_time_quarter_w_dash(self): + # https://github.com/pandas-dev/pandas/issues/9688 + pairs = [('1988-Q2', '1988Q2'), ('2Q-1988', '2Q1988')] + + for dashed, normal in pairs: + (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed) + (date, parsed, reso) = parse_time_string(normal) + + assert date_dash == date + assert parsed_dash == parsed + assert reso_dash == reso + + pytest.raises(parsing.DateParseError, parse_time_string, "-2Q1992") + pytest.raises(parsing.DateParseError, parse_time_string, "2-Q1992") + pytest.raises(parsing.DateParseError, parse_time_string, "4-4Q1992") class TestDatetimeParsingWrappers(object): @@ -66,6 +108,7 @@ def 
def test_parsers_monthfreq(self): class TestGuessDatetimeFormat(object): + @td.skip_if_not_us_locale @is_dateutil_le_261 @pytest.mark.parametrize( "string, format", @@ -79,11 +122,10 @@ class TestGuessDatetimeFormat(object): '%Y-%m-%d %H:%M:%S.%f')]) def test_guess_datetime_format_with_parseable_formats( self, string, format): - tm._skip_if_not_us_locale() - result = parsing._guess_datetime_format(string) assert result == format + @td.skip_if_not_us_locale @is_dateutil_gt_261 @pytest.mark.parametrize( "string", @@ -92,8 +134,6 @@ def test_guess_datetime_format_with_parseable_formats( '2011-12-30 00:00:00.000000']) def test_guess_datetime_format_with_parseable_formats_gt_261( self, string): - tm._skip_if_not_us_locale() - result = parsing._guess_datetime_format(string) assert result is None @@ -118,6 +158,7 @@ def test_guess_datetime_format_with_dayfirst_gt_261(self, dayfirst): ambiguous_string, dayfirst=dayfirst) assert result is None + @td.skip_if_has_locale @is_dateutil_le_261 @pytest.mark.parametrize( "string, format", @@ -127,13 +168,10 @@ def test_guess_datetime_format_with_dayfirst_gt_261(self, dayfirst): ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S')]) def test_guess_datetime_format_with_locale_specific_formats( self, string, format): - # The month names will vary depending on the locale, in which - # case these wont be parsed properly (dateutil can't parse them) - tm._skip_if_has_locale() - result = parsing._guess_datetime_format(string) assert result == format + @td.skip_if_has_locale @is_dateutil_gt_261 @pytest.mark.parametrize( "string", @@ -143,10 +181,6 @@ def test_guess_datetime_format_with_locale_specific_formats( '30/Dec/2011 00:00:00']) def test_guess_datetime_format_with_locale_specific_formats_gt_261( self, string): - # The month names will vary depending on the locale, in which - # case these wont be parsed properly (dateutil can't parse them) - tm._skip_if_has_locale() - result = parsing._guess_datetime_format(string) assert result is None diff 
--git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py index eb6363689cca0..792eb0d49077f 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -12,9 +12,9 @@ from pandas._libs import tslib from pandas._libs.tslibs import period as libperiod +from pandas._libs.tslibs.ccalendar import DAYS, MONTHS from pandas._libs.tslibs.parsing import DateParseError from pandas import Period, Timestamp, offsets -from pandas._libs.tslibs.resolution import DAYS, _MONTHS as MONTHS class TestPeriodProperties(object): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 12d0267005f19..441e811706487 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -322,6 +322,44 @@ def cmp(a, b): lambda x: x.astype('object').astype(Categorical)]: pytest.raises(TypeError, lambda: invalid(s)) + @pytest.mark.parametrize('name', [None, 'foo']) + @pytest.mark.parametrize('dtype_ordered', [True, False]) + @pytest.mark.parametrize('series_ordered', [True, False]) + def test_astype_categorical_to_categorical(self, name, dtype_ordered, + series_ordered): + # GH 10696/18593 + s_data = list('abcaacbab') + s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered) + s = Series(s_data, dtype=s_dtype, name=name) + + # unspecified categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = s.astype(dtype) + exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered) + expected = Series(s_data, name=name, dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = s.astype('category', ordered=dtype_ordered) + tm.assert_series_equal(result, expected) + + # different categories + dtype = CategoricalDtype(list('adc'), dtype_ordered) + result = s.astype(dtype) + expected = Series(s_data, name=name, dtype=dtype) + tm.assert_series_equal(result, expected) + + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = s.astype( + 'category', categories=list('adc'), ordered=dtype_ordered) + tm.assert_series_equal(result, expected) + + if dtype_ordered is False: + # not specifying ordered, so only test once + expected = s + result = s.astype('category') + tm.assert_series_equal(result, expected) + def test_astype_categoricaldtype(self): s = Series(['a', 'b', 'a']) result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 311d14e928caa..bccc46f1e0ca8 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from pandas import compat +from pandas import compat, Timestamp import pytest @@ -368,3 +368,13 @@ def test_rank_object_bug(self): # smoke tests Series([np.nan] * 32).astype(object).rank(ascending=True) Series([np.nan] * 32).astype(object).rank(ascending=False) + + def test_rank_modify_inplace(self): + # GH 18521 + # Check rank does not mutate series + s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) + expected = s.copy() + + s.rank() + result = s + assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index b0d0e2a51b5f4..95410c6ea0105 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -8,6 +8,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas._libs.tslib import iNaT from pandas.compat import lrange, StringIO, product from pandas.core.indexes.timedeltas import TimedeltaIndex @@ -17,7 +18,7 @@ Timestamp, to_datetime, offsets, timedelta_range) from pandas.util.testing import (assert_series_equal, assert_almost_equal, - assert_frame_equal, _skip_if_has_locale) + assert_frame_equal) from pandas.tests.series.common import TestData @@ -738,10 +739,9 @@ def 
test_between_time_types(self): pytest.raises(ValueError, series.between_time, datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + @td.skip_if_has_locale def test_between_time_formats(self): # GH11818 - _skip_if_has_locale() - rng = date_range('1/1/2000', '1/5/2000', freq='5min') ts = DataFrame(np.random.randn(len(rng), 2), index=rng) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 0f0abd8cd3400..6407bee49ad15 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -92,6 +92,7 @@ def test_pandas_datareader(): pandas_datareader.get_data_google('AAPL') +@pytest.mark.xfail(reason="install not working, gh-18780") def test_geopandas(): geopandas = import_module('geopandas') # noqa diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 1fd6befd64f57..f00fa07d868a1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -22,7 +22,7 @@ from pandas.core.base import SpecificationError, AbstractMethodError from pandas.errors import UnsupportedFunctionCall from pandas.core.groupby import DataError -from pandas._libs.tslibs.resolution import DAYS, _MONTHS as MONTHS + from pandas.tseries.frequencies import to_offset from pandas.core.indexes.datetimes import date_range from pandas.tseries.offsets import Minute, BDay @@ -33,6 +33,7 @@ from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, assert_index_equal) from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas._libs.tslibs.ccalendar import DAYS, MONTHS bday = BDay() diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 84e811301ab4b..5b4c2f9d86674 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -29,9 +29,7 @@ QuarterEnd, BusinessMonthEnd, FY5253, Nano, Easter, FY5253Quarter, LastWeekOfMonth) -from pandas.core.tools.datetimes import ( - 
format, ole2datetime, parse_time_string, - to_datetime, DateParseError) +from pandas.core.tools.datetimes import format, ole2datetime import pandas.tseries.offsets as offsets from pandas.io.pickle import read_pickle from pandas._libs.tslibs import timezones @@ -67,18 +65,6 @@ def test_ole2datetime(): ole2datetime(60) -def test_to_datetime1(): - actual = to_datetime(datetime(2008, 1, 15)) - assert actual == datetime(2008, 1, 15) - - actual = to_datetime('20080115') - assert actual == datetime(2008, 1, 15) - - # unparseable - s = 'Month 1, 1999' - assert to_datetime(s, errors='ignore') == s - - def test_normalize_date(): actual = normalize_date(datetime(2007, 10, 1, 1, 12, 5, 10)) assert actual == datetime(2007, 10, 1) @@ -2800,32 +2786,6 @@ def test_get_offset_legacy(): get_offset(name) -class TestParseTimeString(object): - - def test_parse_time_string(self): - (date, parsed, reso) = parse_time_string('4Q1984') - (date_lower, parsed_lower, reso_lower) = parse_time_string('4q1984') - assert date == date_lower - assert parsed == parsed_lower - assert reso == reso_lower - - def test_parse_time_quarter_w_dash(self): - # https://github.com/pandas-dev/pandas/issue/9688 - pairs = [('1988-Q2', '1988Q2'), ('2Q-1988', '2Q1988'), ] - - for dashed, normal in pairs: - (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed) - (date, parsed, reso) = parse_time_string(normal) - - assert date_dash == date - assert parsed_dash == parsed - assert reso_dash == reso - - pytest.raises(DateParseError, parse_time_string, "-2Q1992") - pytest.raises(DateParseError, parse_time_string, "2-Q1992") - pytest.raises(DateParseError, parse_time_string, "4-4Q1992") - - def test_get_standard_freq(): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): fstr = get_standard_freq('W') diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 460ad3f5591fc..f6e3d1f271036 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -25,7 
+25,8 @@ from pandas._libs.tslibs.resolution import (Resolution, _FrequencyInferer, _TimedeltaFrequencyInferer) -from pandas._libs.tslibs.parsing import _get_rule_month, _MONTH_NUMBERS +from pandas._libs.tslibs.parsing import _get_rule_month +from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS from pytz import AmbiguousTimeError @@ -496,8 +497,8 @@ def _is_annual(rule): def _quarter_months_conform(source, target): - snum = _MONTH_NUMBERS[source] - tnum = _MONTH_NUMBERS[target] + snum = MONTH_NUMBERS[source] + tnum = MONTH_NUMBERS[target] return snum % 3 == tnum % 3 diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index dd5f01a36a43e..8b12b2f3ad2ce 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -16,13 +16,13 @@ from pandas._libs import tslib, Timestamp, OutOfBoundsDatetime, Timedelta from pandas.util._decorators import cache_readonly +from pandas._libs.tslibs import ccalendar from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds import pandas._libs.tslibs.offsets as liboffsets from pandas._libs.tslibs.offsets import ( ApplyTypeError, as_datetime, _is_normalized, _get_calendar, _to_dt64, _validate_business_time, - _int_to_weekday, _weekday_to_int, _determine_offset, apply_index_wraps, roll_yearday, @@ -933,7 +933,7 @@ def name(self): if self.isAnchored: return self.rule_code else: - month = liboffsets._int_to_month[self.n] + month = ccalendar.MONTH_ALIASES[self.n] return "{code}-{month}".format(code=self.rule_code, month=month) @@ -1348,7 +1348,8 @@ def onOffset(self, dt): def rule_code(self): suffix = '' if self.weekday is not None: - suffix = '-{weekday}'.format(weekday=_int_to_weekday[self.weekday]) + weekday = ccalendar.int_to_weekday[self.weekday] + suffix = '-{weekday}'.format(weekday=weekday) return self._prefix + suffix @classmethod @@ -1356,7 +1357,7 @@ def _from_name(cls, suffix=None): if not suffix: weekday = None else: - weekday = _weekday_to_int[suffix] + weekday = 
ccalendar.weekday_to_int[suffix] return cls(weekday=weekday) @@ -1430,7 +1431,7 @@ def onOffset(self, dt): @property def rule_code(self): - weekday = _int_to_weekday.get(self.weekday, '') + weekday = ccalendar.int_to_weekday.get(self.weekday, '') return '{prefix}-{week}{weekday}'.format(prefix=self._prefix, week=self.week + 1, weekday=weekday) @@ -1443,7 +1444,7 @@ def _from_name(cls, suffix=None): # TODO: handle n here... # only one digit weeks (1 --> week 0, 2 --> week 1, etc.) week = int(suffix[0]) - 1 - weekday = _weekday_to_int[suffix[1:]] + weekday = ccalendar.weekday_to_int[suffix[1:]] return cls(week=week, weekday=weekday) @@ -1509,7 +1510,7 @@ def onOffset(self, dt): @property def rule_code(self): - weekday = _int_to_weekday.get(self.weekday, '') + weekday = ccalendar.int_to_weekday.get(self.weekday, '') return '{prefix}-{weekday}'.format(prefix=self._prefix, weekday=weekday) @@ -1519,7 +1520,7 @@ def _from_name(cls, suffix=None): raise ValueError("Prefix {prefix!r} requires a suffix." .format(prefix=cls._prefix)) # TODO: handle n here... 
- weekday = _weekday_to_int[suffix] + weekday = ccalendar.weekday_to_int[suffix] return cls(weekday=weekday) # --------------------------------------------------------------------- @@ -1550,7 +1551,7 @@ def isAnchored(self): def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['startingMonth'] = liboffsets._month_to_int[suffix] + kwargs['startingMonth'] = ccalendar.MONTH_TO_CAL_NUM[suffix] else: if cls._from_name_startingMonth is not None: kwargs['startingMonth'] = cls._from_name_startingMonth @@ -1558,7 +1559,7 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - month = liboffsets._int_to_month[self.startingMonth] + month = ccalendar.MONTH_ALIASES[self.startingMonth] return '{prefix}-{month}'.format(prefix=self._prefix, month=month) @apply_wraps @@ -1681,12 +1682,12 @@ def __init__(self, n=1, normalize=False, month=None): def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['month'] = liboffsets._month_to_int[suffix] + kwargs['month'] = ccalendar.MONTH_TO_CAL_NUM[suffix] return cls(**kwargs) @property def rule_code(self): - month = liboffsets._int_to_month[self.month] + month = ccalendar.MONTH_ALIASES[self.month] return '{prefix}-{month}'.format(prefix=self._prefix, month=month) @@ -1906,8 +1907,8 @@ def _get_suffix_prefix(self): def get_rule_code_suffix(self): prefix = self._get_suffix_prefix() - month = liboffsets._int_to_month[self.startingMonth] - weekday = _int_to_weekday[self.weekday] + month = ccalendar.MONTH_ALIASES[self.startingMonth] + weekday = ccalendar.int_to_weekday[self.weekday] return '{prefix}-{month}-{weekday}'.format(prefix=prefix, month=month, weekday=weekday) @@ -1921,8 +1922,8 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): raise ValueError("Unable to parse varion_code: " "{code}".format(code=varion_code)) - startingMonth = liboffsets._month_to_int[startingMonth_code] - weekday = _weekday_to_int[weekday_code] + startingMonth = ccalendar.MONTH_TO_CAL_NUM[startingMonth_code] 
+ weekday = ccalendar.weekday_to_int[weekday_code] return {"weekday": weekday, "startingMonth": startingMonth, diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 6d15f360bcbe8..0b2d50d06a66c 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -25,6 +25,7 @@ def test_foo(): """ import pytest +import locale from distutils.version import LooseVersion from pandas.compat import is_platform_windows, is_platform_32bit, PY3 @@ -81,6 +82,50 @@ def _skip_if_mpl_1_5(): mod.use("Agg", warn=False) +def _skip_if_has_locale(): + lang, _ = locale.getlocale() + if lang is not None: + return True + + +def _skip_if_not_us_locale(): + lang, _ = locale.getlocale() + if lang != 'en_US': + return True + + +def skip_if_no(package, min_version=None): + """ + Generic function to help skip test functions when required packages are not + present on the testing system. + + Intended for use as a decorator, this function will wrap the decorated + function with a pytest ``skip_if`` mark. During a pytest test suite + execution, that mark will attempt to import the specified ``package`` and + optionally ensure it meets the ``min_version``. If the import and version + check are unsuccessful, then the decorated function will be skipped. 
+ + Parameters + ---------- + package: str + The name of the package required by the decorated function + min_version: str or None, default None + Optional minimum version of the package required by the decorated + function + + Returns + ------- + decorated_func: function + The decorated function wrapped within a pytest ``skip_if`` mark + """ + def decorated_func(func): + return pytest.mark.skipif( + not safe_import(package, min_version=min_version), + reason="Could not import '{}'".format(package) + )(func) + return decorated_func + + skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), reason="Missing matplotlib dependency") skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(), @@ -92,3 +137,10 @@ def _skip_if_mpl_1_5(): skip_if_windows_python_3 = pytest.mark.skipif(is_platform_windows() and PY3, reason=("not used on python3/" "win32")) +skip_if_has_locale = pytest.mark.skipif(_skip_if_has_locale(), + reason="Specific locale is set {lang}" + .format(lang=locale.getlocale()[0])) +skip_if_not_us_locale = pytest.mark.skipif(_skip_if_not_us_locale(), + reason="Specific locale is set " + "{lang}".format( + lang=locale.getlocale()[0])) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 81f84ea646c86..2a0a7c9301752 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -359,22 +359,6 @@ def _skip_if_no_xarray(): pytest.skip("xarray version is too low: {version}".format(version=v)) -def _skip_if_no_pathlib(): - try: - from pathlib import Path # noqa - except ImportError: - import pytest - pytest.skip("pathlib not available") - - -def _skip_if_no_localpath(): - try: - from py.path import local as LocalPath # noqa - except ImportError: - import pytest - pytest.skip("py.path not installed") - - def skip_if_no_ne(engine='numexpr'): from pandas.core.computation.expressions import ( _USE_NUMEXPR, @@ -389,22 +373,6 @@ def skip_if_no_ne(engine='numexpr'): installed=_NUMEXPR_INSTALLED)) -def _skip_if_has_locale(): - import locale - lang, _ = 
locale.getlocale() - if lang is not None: - import pytest - pytest.skip("Specific locale is set {lang}".format(lang=lang)) - - -def _skip_if_not_us_locale(): - import locale - lang, _ = locale.getlocale() - if lang != 'en_US': - import pytest - pytest.skip("Specific locale is set {lang}".format(lang=lang)) - - def _skip_if_no_mock(): try: import mock # noqa