In the past, pandas recommended Series.values or DataFrame.values for extracting the data from a Series or DataFrame. You’ll still find references to these in old code bases and online. Going forward, we recommend avoiding .values and using .array or .to_numpy(). .values has the following drawbacks:

When your Series contains an extension type, it’s unclear whether Series.values returns a NumPy array or the extension array. Series.array will always return an ExtensionArray, and will never copy data. Series.to_numpy() will always return a NumPy array, potentially at the cost of copying / coercing values.

When your DataFrame contains a mixture of data types, DataFrame.values may involve copying data and coercing values to a common dtype, a relatively expensive operation. DataFrame.to_numpy(), being a method, makes it clearer that the returned NumPy array may not be a view on the same data in the DataFrame.

In [1]:
import pandas as pd
import numpy as np
# Two columns, 'a' and 'b', each holding one million uniform random draws
df = pd.DataFrame(
    {label: np.random.rand(1_000_000) for label in ("a", "b")}
)

In [3]:
df.values

array([[0.65281902, 0.38596966],
       [0.81884361, 0.5905908 ],
       [0.73034918, 0.25829396],
       ...,
       [0.66651263, 0.12558555],
       [0.63100811, 0.71387107],
       [0.45298855, 0.91582466]], shape=(1000000, 2))

In [4]:
df.to_numpy()

array([[0.65281902, 0.38596966],
       [0.81884361, 0.5905908 ],
       [0.73034918, 0.25829396],
       ...,
       [0.66651263, 0.12558555],
       [0.63100811, 0.71387107],
       [0.45298855, 0.91582466]], shape=(1000000, 2))

In [5]:
import time
import math

In [6]:
# Create a DataFrame with 1 million rows and one column 'x'
df = pd.DataFrame({'x': np.linspace(0, 10, 1000000)})

# Intervals are measured with time.perf_counter(): it is the clock intended
# for timing short durations, whereas time.time() is wall-clock time and can
# jump if the system clock is adjusted.

# 1. Using vectorized np.sin function (one call over the whole column)
start = time.perf_counter()
df['sin_vectorized'] = np.sin(df['x'])
time_vectorized = time.perf_counter() - start
print("Time using vectorized np.sin:", time_vectorized, "seconds")

# 2. Using apply with math.sin (processing element by element)
start = time.perf_counter()
df['sin_loop'] = df['x'].apply(math.sin)
time_loop = time.perf_counter() - start
print("Time using apply with math.sin:", time_loop, "seconds")

Time using vectorized np.sin: 0.029645919799804688 seconds
Time using apply with math.sin: 0.2880251407623291 seconds


pandas has support for accelerating certain types of binary numerical and boolean operations using the numexpr and bottleneck libraries.

These libraries are especially useful when dealing with large data sets, and provide large speedups. numexpr uses smart chunking, caching, and multiple cores. bottleneck is a set of specialized cython routines that are especially fast when dealing with arrays that have nans.

In [7]:
# Turn on the optional bottleneck / numexpr acceleration paths.
pd.set_option("compute.use_bottleneck", True)
pd.set_option("compute.use_numexpr", True)
# Create a DataFrame with 1 million rows and two columns 'a' and 'b'
df2 = pd.DataFrame({
    'a': np.random.rand(1000000),
    'b': np.random.rand(1000000)
})

# Time a row-by-row sum: with axis=1 the lambda is called once per row.
# NOTE(review): the lambda adds two scalars per row in Python, so the
# numexpr/bottleneck settings presumably have little effect on this timing -
# confirm. A whole-column expression such as df2['a'] + df2['b'] is the kind
# of operation those libraries are described as accelerating.
start = time.time()
df2['sum_loop'] = df2.apply(lambda row: row['a'] + row['b'], axis=1)
time_loop = time.time() - start
print("Time using apply with lambda:", time_loop, "seconds")

Time using apply with lambda: 10.989691734313965 seconds


In [8]:
# Turn off the optional bottleneck / numexpr acceleration paths.
pd.set_option("compute.use_bottleneck", False)
pd.set_option("compute.use_numexpr", False)

# Create a DataFrame with 1 million rows and two columns 'a' and 'b'
df2 = pd.DataFrame({
    'a': np.random.rand(1000000),
    'b': np.random.rand(1000000)
})

# Same row-by-row sum as the cell that enables the options, timed again for
# comparison. The data is freshly drawn, so the two runs do not use
# identical inputs.
start = time.time()
df2['sum_loop'] = df2.apply(lambda row: row['a'] + row['b'], axis=1)
time_loop = time.time() - start
print("Time using apply with lambda:", time_loop, "seconds")

Time using apply with lambda: 12.076786041259766 seconds


Flexible comparison

In [11]:
df.eq(df2)

Unnamed: 0,a,b,sin_loop,sin_vectorized,sum_loop,x
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
999995,False,False,False,False,False,False
999996,False,False,False,False,False,False
999997,False,False,False,False,False,False
999998,False,False,False,False,False,False


In [13]:
df

Unnamed: 0,x,sin_vectorized,sin_loop
0,0.00000,0.000000,0.000000
1,0.00001,0.000010,0.000010
2,0.00002,0.000020,0.000020
3,0.00003,0.000030,0.000030
4,0.00004,0.000040,0.000040
...,...,...,...
999995,9.99996,-0.543988,-0.543988
999996,9.99997,-0.543996,-0.543996
999997,9.99998,-0.544004,-0.544004
999998,9.99999,-0.544013,-0.544013


In [14]:
(df > 0).all()

x                 True
sin_vectorized    True
sin_loop          True
dtype: bool

In [15]:
(df > 0).any()

x                 True
sin_vectorized    True
sin_loop          True
dtype: bool

In [17]:
df3 = pd.DataFrame()

In [18]:
df3.empty

True

In [19]:
(df + df).equals(df * 2)

True

In [23]:
df['sin_loop'].mode()

0        -1.0
1        -1.0
2        -1.0
3        -1.0
4        -1.0
         ... 
999995    1.0
999996    1.0
999997    1.0
999998    1.0
999999    1.0
Name: sin_loop, Length: 1000000, dtype: float64

## Functions

In [2]:
# 10 daily rows starting 2000-01-01, three columns of standard-normal draws
tsdf = pd.DataFrame(
    data=np.random.randn(10, 3),
    index=pd.date_range(start="1/1/2000", periods=10),
    columns=list("ABC"),
)

In [3]:
tsdf.iloc[3:7] = np.nan

In [4]:
tsdf.agg("sum")

A   -3.374537
B   -4.492862
C   -1.443488
dtype: float64

In [5]:
tsdf.agg(["sum"])

Unnamed: 0,A,B,C
sum,-3.374537,-4.492862,-1.443488


In [6]:
tsdf.agg(["sum", "mean"])

Unnamed: 0,A,B,C
sum,-3.374537,-4.492862,-1.443488
mean,-0.562423,-0.74881,-0.240581


In [8]:
def mean_add_std(x):
    """Return ``x.mean()`` plus ``x.std()`` for the given column or Series."""
    center = x.mean()
    spread = x.std()
    return center + spread


tsdf.agg(["sum", mean_add_std])

Unnamed: 0,A,B,C
sum,-3.374537,-4.492862,-1.443488
mean_add_std,0.203731,0.417472,1.050961


In [9]:
from functools import partial

q_25 = partial(pd.Series.quantile, q=0.25)

In [11]:
# Pre-bind q=0.25 on Series.quantile to get a one-argument 25th-percentile
# function.
q_25 = partial(pd.Series.quantile, q=0.25)
# A functools.partial object has no __name__ of its own; assigning one gives
# agg a label for the result row ("25%" in the output below).
q_25.__name__ = "25%"
tsdf.agg([q_25])

Unnamed: 0,A,B,C
25%,-1.062405,-1.003618,-0.853759


In [14]:
# NOTE(review): by operator precedence this evaluates
# x - (x.mean() / x.std()) for each column, not the z-score
# (x - x.mean()) / x.std(). The placement of the outer parentheses suggests
# the z-score may have been intended - confirm before relying on the output.
tsdf.transform(lambda x: (x - x.mean() / x.std()))

Unnamed: 0,A,B,C
2000-01-01,0.245562,-2.20622,-2.166416
2000-01-02,-0.44707,0.293142,0.89177
2000-01-03,-0.639729,0.817045,0.206095
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,0.312248,-0.122517,-0.958677
2000-01-09,0.027936,-0.441252,1.330589
2000-01-10,1.531033,1.019235,0.370798


In [25]:
def f(x):
    """Return the number of characters in the ``str()`` form of ``x``."""
    text = str(x)
    return len(text)



tsdf.map(f)

Unnamed: 0,A,B,C
2000-01-01,20,19,19
2000-01-02,19,18,18
2000-01-03,19,19,20
2000-01-04,3,3,3
2000-01-05,3,3,3
2000-01-06,3,3,3
2000-01-07,3,3,3
2000-01-08,20,19,19
2000-01-09,18,19,18
2000-01-10,18,18,19


## Reindex

In [26]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

In [27]:
s.reindex(["e", "b", "f", "d"])

e    1.406243
b   -0.704249
f         NaN
d    0.416557
dtype: float64

In [28]:
rs = s.reindex(df.index)

In [30]:
# Conform df to row labels a/b/c and column labels one/two. Labels that do
# not exist in the original are filled with NaN; presumably none of these
# labels exist in df here, which would explain the all-empty output below.
df2 = df.reindex(["a", "b", "c"], columns=["one", "two"])

# Subtract each column's mean from that column.
df3 = df2 - df2.mean()

# Reindex df using the row and column labels of df2.
df.reindex_like(df2)

Unnamed: 0,one,two
a,,
b,,
c,,


In [31]:
# Five standard-normal draws labelled a-e, plus two overlapping slices of it
s = pd.Series(np.random.randn(5), index=list("abcde"))

s1 = s.iloc[:4]  # first four entries (labels a-d)

s2 = s.iloc[1:]  # last four entries (labels b-e)

In [33]:
s1, s2

(a    0.234222
 b   -0.590862
 c    0.252886
 d    0.917153
 dtype: float64,
 b   -0.590862
 c    0.252886
 d    0.917153
 e    1.335910
 dtype: float64)

In [32]:
s1.align(s2)

(a    0.234222
 b   -0.590862
 c    0.252886
 d    0.917153
 e         NaN
 dtype: float64,
 a         NaN
 b   -0.590862
 c    0.252886
 d    0.917153
 e    1.335910
 dtype: float64)

In [34]:
s1.align(s2, join='inner')

(b   -0.590862
 c    0.252886
 d    0.917153
 dtype: float64,
 b   -0.590862
 c    0.252886
 d    0.917153
 dtype: float64)

In [35]:
s1.align(s2, join='outer')

(a    0.234222
 b   -0.590862
 c    0.252886
 d    0.917153
 e         NaN
 dtype: float64,
 a         NaN
 b   -0.590862
 c    0.252886
 d    0.917153
 e    1.335910
 dtype: float64)

In [36]:
s1.align(s2, join='left')

(a    0.234222
 b   -0.590862
 c    0.252886
 d    0.917153
 dtype: float64,
 a         NaN
 b   -0.590862
 c    0.252886
 d    0.917153
 dtype: float64)

In [37]:
s1.align(s2, join='right')

(b   -0.590862
 c    0.252886
 d    0.917153
 e         NaN
 dtype: float64,
 b   -0.590862
 c    0.252886
 d    0.917153
 e    1.335910
 dtype: float64)

## Filling with reindex

In [42]:
# Eight consecutive days beginning 2000-01-03
rng = pd.date_range(start="1/3/2000", periods=8)

# One standard-normal draw per day
ts = pd.Series(data=np.random.randn(8), index=rng)

# Keep only the entries at positions 0, 3 and 6
ts2 = ts.iloc[[0, 3, 6]]

In [46]:
ts

2000-01-03   -0.335298
2000-01-04    0.000851
2000-01-05    2.666858
2000-01-06    0.021667
2000-01-07    0.740509
2000-01-08    1.749340
2000-01-09   -2.719069
2000-01-10    0.533897
Freq: D, dtype: float64

In [54]:
ts2.reindex(ts.index)

2000-01-03   -0.335298
2000-01-04         NaN
2000-01-05         NaN
2000-01-06    0.021667
2000-01-07         NaN
2000-01-08         NaN
2000-01-09   -2.719069
2000-01-10         NaN
Freq: D, dtype: float64

In [55]:
ts2.reindex(ts.index, method='ffill')

2000-01-03   -0.335298
2000-01-04   -0.335298
2000-01-05   -0.335298
2000-01-06    0.021667
2000-01-07    0.021667
2000-01-08    0.021667
2000-01-09   -2.719069
2000-01-10   -2.719069
Freq: D, dtype: float64

In [56]:
ts2.reindex(ts.index, method='ffill', tolerance="1 day")

2000-01-03   -0.335298
2000-01-04   -0.335298
2000-01-05         NaN
2000-01-06    0.021667
2000-01-07    0.021667
2000-01-08         NaN
2000-01-09   -2.719069
2000-01-10   -2.719069
Freq: D, dtype: float64

In [58]:
s.rename(str.upper)

A    0.234222
B   -0.590862
C    0.252886
D    0.917153
E    1.335910
dtype: float64

In [63]:
# A Series holding the integers 0-9 in random order.
s = pd.Series(np.random.permutation(10))
# The output below shows only the nlargest result (9, 8, 7); the nsmallest
# result is computed but not displayed because it is not the last expression.
s.nsmallest(3)
s.nlargest(3)

2    9
5    8
3    7
dtype: int64

In [65]:
# Seven-row frame with an integer, a string and a float column (one NaN)
df1 = pd.DataFrame(
    dict(
        a=[-2, -1, 1, 10, 8, 11, -1],
        b=["a", "b", "d", "c", "e", "f", "f"],
        c=[1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0],
    )
)
# Replace the flat labels with a two-level column index
df1.columns = pd.MultiIndex.from_tuples(
    [("a", "one"), ("a", "two"), ("b", "three")]
)


# Order the rows by the ("a", "two") column, addressed by its full tuple key
df1.sort_values(by=("a", "two"))

Unnamed: 0_level_0,a,a,b
Unnamed: 0_level_1,one,two,three
0,-2,a,1.0
1,-1,b,2.0
3,10,c,3.2
2,1,d,4.0
4,8,e,
5,11,f,3.0
6,-1,f,4.0
