# High performance pandas

```py
mask = (x > .5) & (y < .5)
df[df[mask]]
```

under the hood

```py
tmp1 = (x > .5)
tmp2 = (y < .5)
mask = tmp1 & tmp2
```
use pd.eval("") -> does elementwise operation with numexp

In [1]:
import numpy as np
import pandas as pd

nrows, ncols = 1_000_000, 100

df1,df2,df3,df4 = [pd.DataFrame(np.random.randn(nrows, ncols))for _ in range(4)]
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-1.339727,-0.27168,-0.513045,-0.526242,-1.056389,-1.276358,0.466754,0.823974,-0.303771,0.88641,...,-0.811404,1.179726,-0.494745,0.779461,1.632342,0.721008,1.315705,-0.145323,0.346462,0.780949
1,-1.043291,-0.735183,-1.021236,-0.153658,-1.142626,-0.508312,0.448485,-0.230889,-0.955037,1.125865,...,0.55626,0.469788,0.789528,0.804247,1.499228,-0.023517,-0.017669,-0.804718,0.158853,-0.706924
2,-0.693178,1.744491,0.004286,0.46938,-0.783724,0.15769,-0.864271,0.409577,0.085618,0.26787,...,1.251443,-1.437601,-0.691643,-0.85802,0.917177,0.443429,0.079583,-0.066231,-0.821445,0.01044
3,0.295429,-2.133311,-0.675765,1.613114,0.179281,0.697073,-1.051779,0.759767,-0.319435,0.664738,...,0.352951,-0.580606,-0.367326,-0.690457,-0.579044,1.524929,-1.466006,-0.497796,-0.673597,0.225251
4,0.128728,-0.967757,0.903041,-0.304895,-0.303864,-1.963146,0.138538,0.158673,0.258967,-0.484882,...,0.25433,1.465071,-0.291297,0.595248,-0.058608,-0.628012,1.72388,-0.176084,-0.909589,-0.742877


In [2]:
df1.shape

(1000000, 100)

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 100 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   0       1000000 non-null  float64
 1   1       1000000 non-null  float64
 2   2       1000000 non-null  float64
 3   3       1000000 non-null  float64
 4   4       1000000 non-null  float64
 5   5       1000000 non-null  float64
 6   6       1000000 non-null  float64
 7   7       1000000 non-null  float64
 8   8       1000000 non-null  float64
 9   9       1000000 non-null  float64
 10  10      1000000 non-null  float64
 11  11      1000000 non-null  float64
 12  12      1000000 non-null  float64
 13  13      1000000 non-null  float64
 14  14      1000000 non-null  float64
 15  15      1000000 non-null  float64
 16  16      1000000 non-null  float64
 17  17      1000000 non-null  float64
 18  18      1000000 non-null  float64
 19  19      1000000 non-null  float64
 20  20      1000000 non-null

In [7]:
%timeit df1+df2+df3+df4
%timeit pd.eval("df1+df2+df3+df4")

1.98 s ± 250 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
725 ms ± 19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
standard = df1+df2+df3+df4
sum_eval = pd.eval("df1+df2+df3+df4")

sum_eval.equals(standard)

True

In [11]:
rolls = pd.DataFrame(np.random.randint(1,6, (6,3)), columns = ["Die1", "Die2", "Die3"])
rolls

Unnamed: 0,Die1,Die2,Die3
0,3,3,2
1,4,4,4
2,5,2,3
3,3,3,2
4,5,1,5
5,4,3,5


In [13]:
rolls.eval("Sum = Die1 + Die2 + Die3", inplace=True)
rolls

Unnamed: 0,Die1,Die2,Die3,Sum
0,3,3,2,8
1,4,4,4,12
2,5,2,3,10
3,3,3,2,8
4,5,1,5,11
5,4,3,5,12


In [25]:
high = 9
rolls.eval("Winner = Sum > @high")

Unnamed: 0,Die1,Die2,Die3,Sum,Winner
0,3,3,2,8,False
1,4,4,4,12,True
2,5,2,3,10,True
3,3,3,2,8,False
4,5,1,5,11,True
5,4,3,5,12,True


In [30]:
# traditional way to filter from a dataframe
rolls[rolls["Sum"] > high]

Unnamed: 0,Die1,Die2,Die3,Sum
1,4,4,4,12
2,5,2,3,10
4,5,1,5,11
5,4,3,5,12


# Query
- filter using query

In [31]:
rolls.query("Sum > @high & Die1 == 4")

Unnamed: 0,Die1,Die2,Die3,Sum
1,4,4,4,12
5,4,3,5,12


In [36]:
df_os = pd.read_csv("../Data/athlete_events.csv")
df_os.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [40]:
df_os[df_os["NOC"] == "SWE"].head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
725,414,Arvid berg,M,26.0,,,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Hammer Throw,
726,415,Bjrn Olof Conny berg,M,23.0,181.0,76.0,Sweden,SWE,1992 Winter,1992,Winter,Albertville,Freestyle Skiing,Freestyle Skiing Men's Moguls,
727,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Long Jump,Bronze
728,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Triple Jump,Silver
729,417,Sara Helena berg,F,17.0,190.0,73.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Swimming,Swimming Women's 50 metres Freestyle,


In [38]:
df_os.query("NOC == 'SWE'").head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
725,414,Arvid berg,M,26.0,,,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Hammer Throw,
726,415,Bjrn Olof Conny berg,M,23.0,181.0,76.0,Sweden,SWE,1992 Winter,1992,Winter,Albertville,Freestyle Skiing,Freestyle Skiing Men's Moguls,
727,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Long Jump,Bronze
728,416,Nils Georg berg,M,19.0,181.0,78.0,Sweden,SWE,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's Triple Jump,Silver
729,417,Sara Helena berg,F,17.0,190.0,73.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Swimming,Swimming Women's 50 metres Freestyle,


In [41]:
%timeit df_os[df_os["NOC"] == "SWE"]
%timeit df_os.query("NOC == 'SWE'")

27 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
22.3 ms ± 3.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
%timeit df_os[df_os["Height"] > 180]
%timeit df_os.query("Height > 180")

19.1 ms ± 321 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
28.2 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [44]:
df_os[(df_os["Sex"] == "F") & (df_os["Height"] > 100) & (df_os["NOC"] == "SWE")].head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
729,417,Sara Helena berg,F,17.0,190.0,73.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Swimming,Swimming Women's 50 metres Freestyle,
731,419,Majken berg,F,30.0,170.0,60.0,Sweden,SWE,1948 Summer,1948,Summer,London,Athletics,Athletics Women's Discus Throw,
1631,904,Gun Margareta del (-Nilsson),F,25.0,163.0,52.0,Sweden,SWE,1964 Winter,1964,Winter,Innsbruck,Cross Country Skiing,Cross Country Skiing Women's 5 kilometres,
1632,904,Gun Margareta del (-Nilsson),F,25.0,163.0,52.0,Sweden,SWE,1964 Winter,1964,Winter,Innsbruck,Cross Country Skiing,Cross Country Skiing Women's 10 kilometres,
1742,970,Lena Kristina Adler,F,18.0,162.0,56.0,Sweden,SWE,1960 Summer,1960,Summer,Roma,Gymnastics,Gymnastics Women's Individual All-Around,


In [46]:
df_os.query("Sex == 'F' & Height > 180 & NOC == 'SWE'")

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
729,417,Sara Helena berg,F,17.0,190.0,73.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Swimming,Swimming Women's 50 metres Freestyle,
5175,2940,Jenny Alm,F,27.0,184.0,80.0,Sweden,SWE,2016 Summer,2016,Summer,Rio de Janeiro,Handball,Handball Women's Handball,
7555,4210,Marina Vladimirovna Andrievskaia,F,29.0,182.0,66.0,Sweden,SWE,2004 Summer,2004,Summer,Athina,Badminton,Badminton Women's Singles,
19070,10088,Anna Therese Bengtsson,F,29.0,187.0,83.0,Sweden,SWE,2008 Summer,2008,Summer,Beijing,Handball,Handball Women's Handball,
28221,14643,Maria Helene Brandin,F,25.0,186.0,85.0,Sweden,SWE,1988 Summer,1988,Summer,Seoul,Rowing,Rowing Women's Double Sculls,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242230,121329,Linnea Maria Torstenson,F,33.0,186.0,82.0,Sweden,SWE,2016 Summer,2016,Summer,Rio de Janeiro,Handball,Handball Women's Handball,
259242,129789,Anna Karolina Westberg,F,22.0,184.0,78.0,Sweden,SWE,2000 Summer,2000,Summer,Sydney,Football,Football Women's Football,
259243,129789,Anna Karolina Westberg,F,26.0,184.0,78.0,Sweden,SWE,2004 Summer,2004,Summer,Athina,Football,Football Women's Football,
259934,130126,Johanna Maria Wiberg,F,24.0,184.0,78.0,Sweden,SWE,2008 Summer,2008,Summer,Beijing,Handball,Handball Women's Handball,


In [47]:
%timeit df_os[(df_os["Sex"] == "F") & (df_os["Height"] > 180) & (df_os["NOC"] == "SWE")]
%timeit df_os.query("Sex == 'F' & Height > 180 & NOC == 'SWE'")

50.8 ms ± 1.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
33.9 ms ± 6.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
