# Efficient Memory Use in Pandas

In [2]:
import pandas as pd
import numpy as np

## Create our Data

In [3]:
def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with ``size`` rows for the memory/timing demos.

    Columns, in order:
        position -- one of 'left', 'middle', 'right'
        age      -- integers from 1 to 49 (the upper bound 50 is exclusive)
        team     -- one of 'red', 'blue', 'yellow', 'green'
        win      -- 'yes' or 'no'
        prob     -- floats drawn uniformly between 0 and 1

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all draws come from a private ``np.random.RandomState(seed)``,
        so the same seed always yields the same frame and NumPy's global random
        state is left untouched. When None (the default), draws come from the
        global ``np.random`` state.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``size`` rows and the five columns listed above.
    """
    # RandomState exposes the same choice/randint/uniform API as the np.random
    # module, so both paths share the code below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Dict entries are evaluated top to bottom, so the draw order is
    # position, age, team, win, prob.
    return pd.DataFrame({
        'position': rng.choice(['left', 'middle', 'right'], size),
        'age': rng.randint(1, 50, size),
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'prob': rng.uniform(0, 1, size),
    })

In [4]:
# Generate one million rows with pandas' default dtypes and inspect them.
df = get_dataset(size=1_000_000)
# With object columns present, info() reports a shallow lower bound (the "+"
# suffix); memory_usage='deep' would report the full footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   position  1000000 non-null  object 
 1   age       1000000 non-null  int64  
 2   team      1000000 non-null  object 
 3   win       1000000 non-null  object 
 4   prob      1000000 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 38.1+ MB


In [5]:
# Baseline timings on the frame with default dtypes (the string group keys are
# object dtype here). Each timed statement also stores the computed rank as a
# new column on df.
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

397 ms ± 9.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
441 ms ± 9.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
510 ms ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Start from a fresh frame and store the two low-cardinality string columns
# as categoricals in a single astype call.
df = get_dataset(1_000_000).astype({'position': 'category', 'team': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int64   
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 24.8+ MB


## Downcasting Ints
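Value ranges of the signed integer dtypes:
- int8 can store integers from -128 to 127.
- int16 can store integers from -32768 to 32767.
- int32 can store integers from -2147483648 to 2147483647.
- int64 can store integers from -9223372036854775808 to 9223372036854775807.

The `age` column only holds values from 1 to 49, so it fits comfortably in int8.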

In [7]:
# Ages are generated in the range 1-49, so one byte per value is enough.
df['age'] = df['age'].astype(np.int8)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int8(1), object(1)
memory usage: 18.1+ MB


## Downcasting Floats

In [8]:
# Halve the storage of the probability column by going from 64-bit to
# 32-bit floats.
df['prob'] = df['prob'].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float32 
dtypes: category(2), float32(1), int8(1), object(1)
memory usage: 14.3+ MB


## Casting to bool (True/False)

In [9]:
def set_dtypes(df):
    """Cast the demo columns to compact dtypes and return the same frame.

    The frame is modified in place: 'position' and 'team' become categoricals,
    'age' becomes int8, 'prob' becomes float32, and the 'yes'/'no' strings in
    'win' are mapped to True/False. Series.map turns values missing from the
    mapping into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'position', 'team', 'age', 'prob', 'win'.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, after the casts.
    """
    win_to_bool = {'yes': True, 'no': False}
    # (column, target dtype) pairs, applied in this order.
    casts = (
        ('position', 'category'),
        ('team', 'category'),
        ('age', 'int8'),
        ('prob', 'float32'),
    )
    for column, target in casts:
        df[column] = df[column].astype(target)
    df['win'] = df['win'].map(win_to_bool)
    return df

## Time Comparison before/after casting dtypes

In [10]:
# "Before" measurement: a fresh dataset with default dtypes, no casting applied.
df = get_dataset(1_000_000)
# Each timed statement also writes the rank column back onto df.
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

359 ms ± 24.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
459 ms ± 31.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
530 ms ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
df = get_dataset(1_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

215 ms ± 2.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
324 ms ± 5.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
349 ms ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Larger Data

In [None]:
# Repeat the "before" benchmark on ten million rows with default dtypes.
df = get_dataset(10_000_000)
# Each timed statement also writes the rank column back onto df.
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

In [None]:
df = get_dataset(10_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()