In [1]:
# MOTIVATING query() AND eval(): COMPOUND EXPRESSION
import numpy as np
rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)
%timeit x + y

3.72 ms ± 55.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [2]:
# The vectorized addition timed above is much faster than doing the same
# addition element by element via a Python loop or comprehension:
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

406 ms ± 4.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
# But this abstraction can become less efficient when you are computing compound
# expressions. For example, consider the following expression:
mask = (x > 0.5) & (y < 0.5)
# Because NumPy evaluates each subexpression separately, this is roughly
# equivalent to the following:
tmp1 = (x > 0.5)  # full-size boolean array for the first comparison
tmp2 = (y < 0.5)  # full-size boolean array for the second comparison
mask = tmp1 & tmp2  # another full-size array holding the combined result
# In other words, every intermediate step is explicitly allocated in memory.

In [4]:
# The Numexpr library gives you the ability to compute this type of compound
# expression element by element, without the need to allocate full intermediate arrays.
import numexpr
# The expression string refers to the arrays x and y by name.
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
# Both masks are boolean, so check exact element-wise equality rather than
# closeness within a floating-point tolerance.
np.array_equal(mask, mask_numexpr)

# The benefit here is that Numexpr evaluates the expression in a way that does not use
# full-sized temporary arrays, and thus can be much more efficient than NumPy,
# especially for large arrays.

True

In [5]:
# PANDAS.eval() FOR EFFICIENT OPERATIONS
# pd.eval() takes an expression written as a string and uses it to compute
# operations on DataFrames efficiently.
import pandas as pd

nrows, ncols = 100000, 100
rng = np.random.RandomState(42)  # restart the seeded stream for this section
# Four frames of random floats, drawn one after another from rng;
# the loop index itself is never needed.
df1, df2, df3, df4 = [
    pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4)
]

In [6]:
# To compute the sum of all four DataFrames using the typical Pandas approach,
# we can just write the sum:
%timeit df1 + df2 + df3 + df4

104 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# The same sum can be computed via pd.eval() by writing the expression as a string:
%timeit pd.eval('df1 + df2 + df3 + df4')

46.7 ms ± 277 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Check that the pd.eval() result matches the plain Pandas sum
# (up to floating-point tolerance):
np.allclose(df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))

True

In [9]:
# --- Operations supported by pd.eval() ---
# Five small integer frames (100 rows x 3 columns, values drawn from [0, 1000))
# used by the operator examples that follow.
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.randint(0, 1000, size=(100, 3))) for _ in range(5)
]

In [10]:
# Arithmetic operators: the same expression evaluated directly with Pandas
# and through pd.eval(), then compared.
result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)

True

In [11]:
# Comparison operators, including chained comparisons: the pd.eval() string
# uses a Python-style chain, which is checked against the explicit pairwise
# form joined with &.
result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
# The results are boolean, so test for exact equality instead of numeric closeness.
np.array_equal(result1, result2)

True

In [12]:
# Bitwise operators: & binds more tightly than |, so this reads as
# ((df1 < 0.5) & (df2 < 0.5)) | (df3 < df4) in both forms.
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
# The results are boolean, so test for exact equality instead of numeric closeness.
np.array_equal(result1, result2)

True

In [13]:
# Inside a pd.eval() string the literal keywords `and` / `or` can be used
# in place of & / | for Boolean expressions.
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
# Compare against the bitwise-operator result computed earlier; both are
# boolean, so exact equality is the appropriate check.
np.array_equal(result1, result3)

True

In [15]:
# DATAFRAME.eval() for column-wise operations
# Build a frame with labeled columns so that expressions can refer to them by name.
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,0.406939,0.069938,0.069087
1,0.235615,0.154374,0.677945
2,0.433839,0.652324,0.264038
3,0.808055,0.347197,0.589161
4,0.252418,0.557789,0.573154


In [16]:
# With the top-level pd.eval(), columns are reached through the frame (df.A, df.B, ...):
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

In [20]:
# DataFrame.eval() lets the expression use column names directly,
# without the df. prefix:
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)

True

In [17]:
# --- Assignment in DataFrame.eval() ---
# Show the frame as it stands, with columns A, B and C.
df.head()

Unnamed: 0,A,B,C
0,0.406939,0.069938,0.069087
1,0.235615,0.154374,0.677945
2,0.433839,0.652324,0.264038
3,0.808055,0.347197,0.589161
4,0.252418,0.557789,0.573154


In [19]:
# An assignment inside the expression creates column 'D'; inplace=True
# modifies df itself rather than returning a new frame.
df.eval('D = (A + B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.406939,0.069938,0.069087,6.902512
1,0.235615,0.154374,0.677945,0.57525
2,0.433839,0.652324,0.264038,4.113663
3,0.808055,0.347197,0.589161,1.960844
4,0.252418,0.557789,0.573154,1.413594


In [21]:
# The same syntax overwrites an existing column: 'D' is recomputed here.
df.eval('D = (A - B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.406939,0.069938,0.069087,4.877876
1,0.235615,0.154374,0.677945,0.119834
2,0.433839,0.652324,0.264038,-0.827474
3,0.808055,0.347197,0.589161,0.78223
4,0.252418,0.557789,0.573154,-0.53279


In [22]:
# --- Local variables in DataFrame.eval() ---
# axis=1 averages across the columns, giving one value per row.
column_mean = df.mean(axis=1)
# Inside the expression, the @ prefix marks a Python variable (column_mean)
# rather than a column of df.
result2 = df.eval('A + @column_mean')
result1 = df['A'] + column_mean
np.allclose(result1, result2)

True

In [23]:
# DATAFRAME.query() METHOD
# The DataFrame has another method based on evaluated strings, called the query() method.
# First, the row filter written directly and as a pd.eval() string:
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

In [24]:
# query() expresses the same row filter using the column names directly:
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)

True