# Pandas 2.0
- Testing it out!

In [1]:
import pandas as pd
import numpy as np

print('numpy:', np.__version__)
print('pandas', pd.__version__)

numpy: 1.24.2
pandas 2.0.0rc0


In [2]:
df = pd.read_csv('../046-babynames-popularity/names.csv')

In [3]:
type(df['Count'].values)

numpy.ndarray

## Types by Default are the Same

In [4]:
pd.Series([1, 2, 3, 4])

0    1
1    2
2    3
3    4
dtype: int64

In [5]:
pd.Series(['foo', 'bar', 'foobar'])

0       foo
1       bar
2    foobar
dtype: object

## Change dtypes to use arrow

In [6]:
pd.Series([1, 2, 3, 4], dtype='int64[pyarrow]')

0    1
1    2
2    3
3    4
dtype: int64[pyarrow]

In [9]:
# numpy series will switch this to a float
pd.Series([1, 2, 3, np.nan])

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [10]:
# pyarrow will keep this as a int64
pd.Series([1, 2, 3, np.nan], dtype='int64[pyarrow]')

0       1
1       2
2       3
3    <NA>
dtype: int64[pyarrow]

In [11]:
pd.Series(['foo', 'bar', 'foobar'], dtype='string[pyarrow]')

0       foo
1       bar
2    foobar
dtype: string

# Setting pyarrow as the default backend

In [15]:
# The data we loaded before from a csv
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2052781 entries, 0 to 2052780
Data columns (total 4 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Name    object
 1   Sex     object
 2   Count   int64 
 3   Year    int64 
dtypes: int64(2), object(2)
memory usage: 62.6+ MB


In [17]:
pd.options.mode.dtype_backend = 'pyarrow'
fname = '../046-babynames-popularity/names.csv'
df_arrow = pd.read_csv(fname, engine='pyarrow', use_nullable_dtypes=True)

In [21]:
df_arrow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2052781 entries, 0 to 2052780
Data columns (total 4 columns):
 #   Column  Dtype          
---  ------  -----          
 0   Name    string[pyarrow]
 1   Sex     string[pyarrow]
 2   Count   int64[pyarrow] 
 3   Year    int64[pyarrow] 
dtypes: int64[pyarrow](2), string[pyarrow](2)
memory usage: 61.0 MB


# Speed Comparison

In [26]:
# Old numpy backend
%timeit df['Count'].mean()

2.63 ms ± 39.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
# Old numpy backend
%timeit df_arrow['Count'].mean()

1.48 ms ± 74.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [30]:
%%timeit
# Reading in the data
df = pd.read_csv(fname)

687 ms ± 26.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
%%timeit
# Reading in the data
df_arrow = pd.read_csv(fname, engine='pyarrow', use_nullable_dtypes=True)

35.4 ms ± 1.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
%%timeit
df['Name'].str.startswith('A')

417 ms ± 3.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%%timeit
df_arrow['Name'].str.startswith('A')

13 ms ± 251 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Finding.... pyarrow backend is faster...

In [41]:
import polars as pl
print('polars', pl.__version__)

polars 0.16.11


In [46]:
%%timeit
pl.from_pandas(df)

180 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [47]:
%%timeit
polars_df = pl.from_pandas(df_arrow)

58.6 ms ± 2.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [50]:
polars_df = pl.from_pandas(df_arrow)

In [60]:
polars_agg = polars_df.groupby('Name') \
    .agg(pl.col(['Count']) \
    .sum().alias('totals'))

In [67]:
polars_agg.to_pandas().to_latex('out.tex')

# Polars vs pandas1 vs pandas2

In [69]:
%%timeit
_ = df.groupby('Name')['Count'].sum()

494 ms ± 51.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [74]:
# %%timeit
# _ = df_arrow.groupby('Name')['Count'].sum()

In [75]:
%%timeit
_ = polars_df.groupby('Name') \
    .agg(pl.col(['Count']) \
    .sum().alias('totals'))

213 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Pyarrow datatypes

In [10]:
import datetime
import pyarrow
import pandas


articles = pandas.DataFrame({
    'title': pandas.Series(['pandas 2.0 and the Arrow revolution',
                            'What I did this weekend'],
                           dtype='string[pyarrow]'),
    # 'tags': pandas.Series([['pandas', 'arrow', 'data'],
    #                        ['scuba-diving', 'rock-climbing']],
    #                       dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.string()))),
    'date': pandas.Series([datetime.date(2023, 2, 22),
                           datetime.date(2022, 11, 3)],
                          dtype='date32[pyarrow]')
})

In [25]:
pandas.to_datetime(articles['date']).dt.dayofyear

0     53
1    307
Name: date, dtype: int32

In [26]:
articles['date'].dt.dayofyear

0     53
1    307
Name: date, dtype: int64[pyarrow]