# Ideas

In [1]:
import pandas as pd
import numpy as np

## I1. Speed up Pandas Code
- Reduce Memory Usage

In [2]:
def get_dataset(size, seed=None):
    """Build a fake DataFrame with ``size`` rows for the benchmarks below.

    Columns, in order:
        size -- one of 'big', 'medium', 'small'
        age  -- integer in [1, 50)
        team -- one of 'red', 'blue', 'yellow', 'green'
        dq   -- 'yes' or 'no'
        date -- a day between 2020-01-01 and 2022-12-31 (inclusive)
        prob -- float drawn uniformly from [0, 1)

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        Seed for the random generator. With the default ``None`` every call
        returns different data; pass an integer to get a reproducible frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with a default RangeIndex and the six columns above.
    """
    rng = np.random.default_rng(seed)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    # The yes/no column must be called 'dq': the dtype-conversion cells and
    # set_dtypes() all read df['dq'], so any other name raises KeyError there.
    return pd.DataFrame({
        'size': rng.choice(['big', 'medium', 'small'], size),
        'age': rng.integers(1, 50, size),
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'dq': rng.choice(['yes', 'no'], size),
        'date': rng.choice(dates.to_numpy(), size),
        'prob': rng.uniform(0, 1, size),
    })

In [3]:
# Baseline at 1,000 rows: time a grouped rank of the integer 'age' column and
# of the datetime 'date' column, grouping by ('team', 'size').
df = get_dataset(1_000)
%timeit df['age_rank'] = df.groupby(['team','size'])['age'].rank()
%timeit df['date_rank'] = df.groupby(['team','size'])['date'].rank()

1.56 ms ± 26.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.6 ms ± 20.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [4]:
# Same grouped-rank benchmark at 1,000,000 rows with the dtypes exactly as
# get_dataset() returns them; the float 'prob' column is ranked as well.
df = get_dataset(1_000_000)
%timeit df['age_rank'] = df.groupby(['team','size'])['age'].rank()
%timeit df['date_rank'] = df.groupby(['team','size'])['date'].rank()
%timeit df['prob_rank'] = df.groupby(['team','size'])['prob'].rank()

492 ms ± 8.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
551 ms ± 15.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
592 ms ± 5.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Convert only the 'size' group key to category, then repeat the timings.
df = get_dataset(1_000_000)
df['size'] = df['size'].astype('category')
%timeit df['age_rank'] = df.groupby(['team','size'])['age'].rank()
%timeit df['date_rank'] = df.groupby(['team','size'])['date'].rank()
%timeit df['prob_rank'] = df.groupby(['team','size'])['prob'].rank()

363 ms ± 6.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
433 ms ± 8.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
493 ms ± 3.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Convert both group keys ('size' and 'team') to category, then repeat the
# timings.
df = get_dataset(1_000_000)
df['size'] = df['size'].astype('category')
df['team'] = df['team'].astype('category')
%timeit df['age_rank'] = df.groupby(['team','size'])['age'].rank()
%timeit df['date_rank'] = df.groupby(['team','size'])['date'].rank()
%timeit df['prob_rank'] = df.groupby(['team','size'])['prob'].rank()

288 ms ± 7.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
350 ms ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
393 ms ± 6.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Show per-column dtypes and the memory footprint of a fresh 1,000,000-row
# frame before any dtype conversion.
df = get_dataset(1_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   dq      1000000 non-null  object        
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 45.8+ MB


In [8]:
# Category group keys plus 'age' stored as int16 (values are below 50, so they
# fit). verbose=False / show_counts=False print only the summary lines.
df = get_dataset(1_000_000)
df['size'] = df['size'].astype('category')
df['team'] = df['team'].astype('category')
df['age'] = df['age'].astype('int16')
df.info(verbose=False, show_counts=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 6 entries, size to prob
dtypes: category(2), datetime64[ns](1), float64(1), int16(1), object(1)
memory usage: 26.7+ MB


In [9]:
# Convert every column except 'date': category keys, int16 age, 'dq' mapped
# from 'yes'/'no' to True/False, and 'prob' stored as float16.
# NOTE: float16 trades precision for size, so 'prob' values are rounded.
df = get_dataset(1_000_000)
df['size'] = df['size'].astype('category')
df['team'] = df['team'].astype('category')
df['age'] = df['age'].astype('int16')
df['dq'] = df['dq'].map({'yes':True, 'no': False})
df['prob'] = df['prob'].astype('float16')
df.info(verbose=False, show_counts=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 6 entries, size to prob
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int16(1)
memory usage: 14.3 MB


In [10]:
# Grouped-rank timings with category group keys and 'age' stored as int16.
df = get_dataset(1_000_000)
df['size'] = df['size'].astype('category')
df['team'] = df['team'].astype('category')
df['age'] = df['age'].astype('int16')
%timeit df['age_rank'] = df.groupby(['team','size'])['age'].rank()
%timeit df['date_rank'] = df.groupby(['team','size'])['date'].rank()

298 ms ± 8.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
334 ms ± 3.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Grouped-rank timings with all dtype conversions applied (category keys,
# int16 age, boolean 'dq', float16 'prob').
df = get_dataset(1_000_000)
df['size'] = df['size'].astype('category')
df['team'] = df['team'].astype('category')
df['age'] = df['age'].astype('int16')
df['dq'] = df['dq'].map({'yes':True, 'no': False})
df['prob'] = df['prob'].astype('float16')
%timeit df['age_rank'] = df.groupby(['team','size'])['age'].rank()
%timeit df['date_rank'] = df.groupby(['team','size'])['date'].rank()
%timeit df['prob_rank'] = df.groupby(['team','size'])['prob'].rank()

282 ms ± 6.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
350 ms ± 4.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
360 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Data Format Comparison

In [12]:
def set_dtypes(df):
    """Convert the benchmark frame's columns to compact dtypes, in place.

    'size' and 'team' become category, 'age' becomes int16, 'dq' is mapped
    from 'yes'/'no' to True/False, and 'prob' becomes float16. The frame
    passed in is modified and the same object is returned.
    """
    simple_casts = (
        ('size', 'category'),
        ('team', 'category'),
        ('age', 'int16'),
    )
    for column, dtype in simple_casts:
        df[column] = df[column].astype(dtype)

    yes_no_to_bool = {'yes': True, 'no': False}
    df['dq'] = df['dq'].map(yes_no_to_bool)

    df['prob'] = df['prob'].astype('float16')
    return df

In [13]:
# CSV round trip at 1,000 rows: time the write, then the read.
# to_csv() also writes the index as the first column; this read_csv() call
# does not pass index_col, so that column comes back as ordinary data.
df = get_dataset(1_000)
%timeit df.to_csv('test.csv')
%timeit df_csv = pd.read_csv('test.csv')

14.8 ms ± 479 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.73 ms ± 38.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# On-disk size of the CSV file (51K in the recorded output).
!ls -GFlash test.csv

52K -rw-r--r-- 1 root 51K Mar 26 17:13 test.csv


In [15]:
# Pickle round trip at 1,000 rows: time the write, then the read.
df = get_dataset(1_000)
%timeit df.to_pickle('test.pickle')
%timeit df_pickle = pd.read_pickle('test.pickle')

973 µs ± 289 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
867 µs ± 8.84 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
# On-disk size of the pickle file (46K in the recorded output).
!ls -GFlash test.pickle

48K -rw-r--r-- 1 root 46K Mar 26 17:13 test.pickle


In [17]:
# Parquet round trip at 1,000 rows: time the write, then the read.
df = get_dataset(1_000)
%timeit df.to_parquet('test.parquet')
%timeit df_parquet = pd.read_parquet('test.parquet')

3.88 ms ± 207 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.64 ms ± 92.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
# On-disk size of the parquet file (21K in the recorded output).
!ls -GFlash test.parquet

24K -rw-r--r-- 1 root 21K Mar 26 17:13 test.parquet


In [19]:
# Feather round trip at 1,000 rows: time the write, then the read.
df = get_dataset(1_000)
%timeit df.to_feather('test.feather')
%timeit df_feather = pd.read_feather('test.feather')

3.17 ms ± 66.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.34 ms ± 138 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# On-disk size of the feather file (36K in the recorded output).
!ls -GFlash test.feather

36K -rw-r--r-- 1 root 36K Mar 26 17:13 test.feather


In [21]:
# Reload each file written above so the resulting dtypes can be compared.
# index_col=[0] tells read_csv to use the first CSV column (the index that
# to_csv wrote) as the index instead of a data column.
df_csv = pd.read_csv('test.csv', index_col=[0])
df_pickle = pd.read_pickle('test.pickle')
df_feather = pd.read_feather('test.feather')
df_parquet = pd.read_parquet('test.parquet')

In [22]:
# Compare dtypes after the round trip. In the recorded output the CSV frame
# has 'date' as object, while the pickle frame keeps datetime64[ns].
df_csv.info()
df_pickle.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    1000 non-null   object 
 1   age     1000 non-null   int64  
 2   team    1000 non-null   object 
 3   dq      1000 non-null   object 
 4   date    1000 non-null   object 
 5   prob    1000 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 54.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   size    1000 non-null   object        
 1   age     1000 non-null   int64         
 2   team    1000 non-null   object        
 3   dq      1000 non-null   object        
 4   date    1000 non-null   datetime64[ns]
 5   prob    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 47.0+ KB

In [23]:
# Dtypes after the feather round trip; the recorded output shows 'date' kept
# as datetime64[ns].
df_feather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   size    1000 non-null   object        
 1   age     1000 non-null   int64         
 2   team    1000 non-null   object        
 3   dq      1000 non-null   object        
 4   date    1000 non-null   datetime64[ns]
 5   prob    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 47.0+ KB


In [24]:
# Dtypes after the parquet round trip. df_pickle was already inspected in an
# earlier cell, whereas the parquet frame loaded alongside it was never shown.
df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   size    1000 non-null   object        
 1   age     1000 non-null   int64         
 2   team    1000 non-null   object        
 3   dq      1000 non-null   object        
 4   date    1000 non-null   datetime64[ns]
 5   prob    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 47.0+ KB


# 10 Pandas DataFrame Functions to Know

In [25]:
# Start this section from a fresh 1,000-row frame with the default dtypes.
df = get_dataset(1_000)

In [26]:
# Display the frame; the recorded output elides the middle rows with "...".
df

Unnamed: 0,size,age,team,dq,date,prob
0,medium,14,yellow,yes,2022-12-22,0.374009
1,medium,25,green,no,2020-03-08,0.865554
2,small,42,red,yes,2021-11-02,0.427245
3,big,31,blue,no,2021-07-18,0.867769
4,small,47,blue,yes,2020-04-24,0.673837
...,...,...,...,...,...,...
995,medium,31,yellow,yes,2021-11-25,0.122868
996,medium,18,green,yes,2020-10-06,0.289478
997,big,6,red,yes,2020-10-05,0.831830
998,big,15,green,yes,2021-12-01,0.557518
