# Efficient `pandas`

In [1]:
%pylab inline
plt.style.use("bmh")

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy as np
import pandas as pd

In [3]:
import string

# Data

In [4]:
df = pd.DataFrame(np.arange(2000).reshape((1000,2)),
                  columns=['a', 'b'],
                  index=np.random.choice(list(string.ascii_lowercase), 1000, replace=True))

In [5]:
df.head()

Unnamed: 0,a,b
w,0,1
e,2,3
k,4,5
l,6,7
d,8,9


# Loops

## Naive

In [6]:
def iterate_df(df):
    """Iterate over df in Python loop."""

    result = []
    
    for i in range(df.shape[0]):
        row = df.iloc[i]
        result.append(row['a']/row['b'])
    return pd.Series(result, name="div_result", index=df.index)

In [7]:
%timeit -n 10 -r 5 iterate_df(df)

126 ms ± 12.2 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


## Using `iterrows`

In [8]:
def iterate_df_rows(df):
    """Iterate over df in Python loop."""

    result = []
    
    for ri, row in df.iterrows():
        result.append(row['a']/row['b'])
    return pd.Series(result, name="AgeGroup", index=df.index)

In [9]:
%timeit -n 20 -r 5 iterate_df_rows(df)

88.9 ms ± 5.42 ms per loop (mean ± std. dev. of 5 runs, 20 loops each)


## Using `apply`

In [10]:
%timeit -n 20 -r 5 df.apply(lambda x: x['a']/x['b'], axis=1)
#This is significantly faster than the above two options

15.6 ms ± 1.27 ms per loop (mean ± std. dev. of 5 runs, 20 loops each)


## Using vectorization

In [13]:
%timeit -n 20 -r 5 df['a']/df['b']
#This is almost 100 times faster

264 µs ± 108 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [12]:
df['a']/df['b']

w    0.000000
e    0.666667
k    0.800000
l    0.857143
d    0.888889
       ...   
j    0.999498
i    0.999498
t    0.999499
q    0.999499
o    0.999500
Length: 1000, dtype: float64

In [53]:
%timeit -n 20 -r 5 df['a'].values/df['b'].values
#And this was almost instant, utilizing numpy-> In numpy, a and b are both in the same array, if they are the same dtype.
#If they weren't the same dtype, it may return the same speed as the pandas vectorization.

The slowest run took 7.53 times longer than the fastest. This could mean that an intermediate result is being cached.
29 µs ± 25.3 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [18]:
df['a'].values/df['b'].values

array([0.        , 0.66666667, 0.8       , 0.85714286, 0.88888889,
       0.90909091, 0.92307692, 0.93333333, 0.94117647, 0.94736842,
       0.95238095, 0.95652174, 0.96      , 0.96296296, 0.96551724,
       0.96774194, 0.96969697, 0.97142857, 0.97297297, 0.97435897,
       0.97560976, 0.97674419, 0.97777778, 0.9787234 , 0.97959184,
       0.98039216, 0.98113208, 0.98181818, 0.98245614, 0.98305085,
       0.98360656, 0.98412698, 0.98461538, 0.98507463, 0.98550725,
       0.98591549, 0.98630137, 0.98666667, 0.98701299, 0.98734177,
       0.98765432, 0.98795181, 0.98823529, 0.98850575, 0.98876404,
       0.98901099, 0.98924731, 0.98947368, 0.98969072, 0.98989899,
       0.99009901, 0.99029126, 0.99047619, 0.99065421, 0.99082569,
       0.99099099, 0.99115044, 0.99130435, 0.99145299, 0.99159664,
       0.99173554, 0.99186992, 0.992     , 0.99212598, 0.99224806,
       0.99236641, 0.9924812 , 0.99259259, 0.99270073, 0.99280576,
       0.9929078 , 0.99300699, 0.99310345, 0.99319728, 0.99328

Moral of the story-- Never use loops.

# Memory

In [54]:
titanic_train = pd.read_csv("train.csv", index_col="PassengerId")
titanic_test = pd.read_csv("test.csv", index_col="PassengerId")
titanic = pd.concat([titanic_train, titanic_test], sort=False)

titanic.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [55]:
titanic.dtypes #Alof of inneficiency comes from the data types used to store our data. For example, no reason Pclass neads to be an int64

Survived    float64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [56]:
titanic.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 462.8 KB


Our memory is small, but so is our data. With larger data sets, we may need to cut down this memory usage

In [57]:
titanic["Pclass"] = pd.to_numeric(titanic["Pclass"], downcast="unsigned") #Downcast as much as possible, and make it unsigned (never going to be negative)

In [58]:
titanic.dtypes

Survived    float64
Pclass        uint8
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [59]:
titanic.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   uint8  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(2), object(5), uint8(1)
memory usage: 453.9 KB


Didn't make such a big dent in our memory

In [60]:
titanic["SibSp"] = pd.to_numeric(titanic["SibSp"], downcast="unsigned")
titanic["Parch"] = pd.to_numeric(titanic["Parch"], downcast="unsigned")

In [61]:
titanic.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   uint8  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   uint8  
 6   Parch     1309 non-null   uint8  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), object(5), uint8(3)
memory usage: 436.0 KB


## How to read it with correct `dtype`s right away?

In [62]:
titanic_train_trunc = pd.read_csv("train.csv", index_col="PassengerId",
                                  dtype={"Pclass":np.uint8,
                                         "SibSp":np.uint8,
                                         "Parch":np.uint8,
                                         "Survived":np.float32,
                                         "Age":np.float32,
                                         "Fare":np.float32},
                                  converters={"Sex": lambda x: (np.uint8(1)
                                                                if x=="female" else np.uint8(0)),
                                              "Embarked": lambda x: 0 if x=="S" else (1 if x=="C" else 2)})
titanic_test_trunc = pd.read_csv("test.csv", index_col="PassengerId",
                                 dtype={"Pclass":np.uint8,
                                        "SibSp":np.uint8,
                                        "Parch":np.uint8,
                                        "Survived":np.float32,
                                        "Age":np.float32,
                                        "Fare":np.float32},
                                 converters={"Sex": lambda x: np.uint8(1) if x=="female" else np.uint8(0),
                                             "Embarked": lambda x: 0 if x=="S" else (1 if x=="C" else 2)})
titanic_trunc = pd.concat([titanic_train_trunc, titanic_test_trunc], sort=False)

titanic_trunc.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float32
 1   Pclass    1309 non-null   uint8  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   int64  
 4   Age       1046 non-null   float32
 5   SibSp     1309 non-null   uint8  
 6   Parch     1309 non-null   uint8  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float32
 9   Cabin     295 non-null    object 
 10  Embarked  1309 non-null   int64  
dtypes: float32(3), int64(2), object(3), uint8(3)
memory usage: 288.1 KB


THe memory is almost half the size that it was before

In [63]:
titanic_trunc.select_dtypes(np.uint8).head()

Unnamed: 0_level_0,Pclass,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,1,0
2,1,1,0
3,3,0,0
4,1,1,0
5,3,0,0


In [64]:
titanic_trunc

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.250000,,0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.283302,C85,1
3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925000,,0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.099998,C123,0
5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.050000,,0
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",0,,0,0,A.5. 3236,8.050000,,0
1306,,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.900002,C105,1
1307,,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.250000,,0
1308,,3,"Ware, Mr. Frederick",0,,0,0,359309,8.050000,,0
