# High performance pandas

```py
# x and y are columns of the DataFrame df in this example.
# Each comparison needs its own parentheses: `&` binds tighter than `>` and `<`,
# so `x > .5 & (y < .5)` would be parsed as `x > (.5 & (y < .5))`.
mask = (x > .5) & (y < .5)
# Index with the boolean mask directly to keep only the rows where it is True.
df[mask]
```

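Under the hood, each sub-expression is evaluated separately and stored in a temporary array before the results are combined: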

```py
# tmp1 and tmp2 are temporary boolean arrays with one entry per row;
# each is fully allocated in memory before the final `&` combines them.
tmp1 = (x > .5)
tmp2 = (y < .5)
mask = tmp1 & tmp2
```
For higher performance, use:
```py
# pd.eval("...") and df.query("...") evaluate the whole expression element-wise
# with numexpr (short for "numerical expression") when it is installed
```


In [2]:
import numpy as np
import pandas as pd

# Fix the seed so the benchmark frames are identical on every Restart & Run All.
np.random.seed(42)

# Each frame holds 1,000,000 x 100 float64 values (about 800 MB), large enough
# for the difference between plain arithmetic and pd.eval to be measurable.
nrows, ncols = 1_000_000, 100

df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)]

In [4]:
# (rows, columns) of one benchmark frame
df1.shape

(1000000, 100)

In [5]:
# Summarise the index range and each column's non-null count and dtype
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 100 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   0       1000000 non-null  float64
 1   1       1000000 non-null  float64
 2   2       1000000 non-null  float64
 3   3       1000000 non-null  float64
 4   4       1000000 non-null  float64
 5   5       1000000 non-null  float64
 6   6       1000000 non-null  float64
 7   7       1000000 non-null  float64
 8   8       1000000 non-null  float64
 9   9       1000000 non-null  float64
 10  10      1000000 non-null  float64
 11  11      1000000 non-null  float64
 12  12      1000000 non-null  float64
 13  13      1000000 non-null  float64
 14  14      1000000 non-null  float64
 15  15      1000000 non-null  float64
 16  16      1000000 non-null  float64
 17  17      1000000 non-null  float64
 18  18      1000000 non-null  float64
 19  19      1000000 non-null  float64
 20  20      1000000 non-null

In [6]:
# Baseline: plain pandas arithmetic on the four frames
%timeit df1+df2+df3+df4

7.45 s ± 670 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Same sum, passed to pd.eval as a string expression
%timeit pd.eval("df1+df2+df3+df4")

3.94 s ± 907 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Compute the four-frame sum both ways: once through pd.eval, once with
# ordinary operator arithmetic.
sum_eval = pd.eval("df1+df2+df3+df4")
standard = df1 + df2 + df3 + df4

# DataFrame.equals is True only when both results hold the same values,
# confirming that pd.eval changes the speed and not the answer.
sum_eval.equals(standard)

True

In [23]:
# Six rounds of three six-sided dice. np.random.randint excludes the upper
# bound, so it has to be 7 for a roll of 6 to be possible.
rolls = pd.DataFrame(
    np.random.randint(1, 7, size=(6, 3)),
    columns=["Die1", "Die2", "Die3"],
)
rolls


Unnamed: 0,Die1,Die2,Die3
0,1,2,3
1,2,5,3
2,4,5,3
3,5,2,4
4,4,1,5
5,5,5,2


In [24]:
# Add a Sum column computed from the three dice; inplace=True writes it into
# `rolls` itself instead of returning a new frame.
rolls.eval("Sum = Die1 + Die2 + Die3", inplace = True)


In [25]:
# Threshold a roll's Sum must exceed to count as a win. It must be a plain int:
# a trailing comma (`high = 11,`) would create the tuple (11,) instead.
high = 11
# `@high` references the Python variable from inside the expression string.
# Without inplace=True, eval returns a new frame and leaves `rolls` unchanged.
rolls.eval("Winner = Sum > @high")

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
0,1,2,3,6,False
1,2,5,3,10,False
2,4,5,3,12,True
3,5,2,4,11,False
4,4,1,5,10,False
5,5,5,2,12,True


In [27]:
# Classic boolean-mask selection: keep the rows whose Sum exceeds `high`.
rolls.loc[rolls["Sum"] > high]

Unnamed: 0,Die1,Die2,Die3,Sum
2,4,5,3,12
5,5,5,2,12


## Query

- Filter rows with `df.query()`, passing the condition as a string expression.

In [34]:
# Same filter as the boolean mask, written as a query string;
# `@high` reads the Python variable `high`.
rolls.query("Sum > @high")

# Only this last expression is displayed as the cell's output.
# NOTE: no parentheses are needed around the comparisons here because, per the
# DataFrame.query documentation, `&` has the precedence of `and` inside query strings.
rolls.query("Sum > @high & Die3 > 2") # you can use the & symbol as well.

Unnamed: 0,Die1,Die2,Die3,Sum
2,4,5,3,12


In [35]:
# Read the athlete events CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df_os = pd.read_csv("../Data/athlete_events.csv")

