In [5]:
import pandas as pd
import numpy as np

# Create our dataset

In [56]:
def get_dataset(size):
    """Build a synthetic DataFrame with ``size`` rows for the I/O benchmarks.

    Columns
    -------
    size : one of 'big', 'medium', 'small'
    age  : random integer in [1, 50)
    team : one of 'red', 'blue', 'yellow', 'green'
    win  : 'yes' or 'no', drawn independently for every row
    date : random day between 2020-01-01 and 2022-12-31
    prob : uniform float in [0, 1)

    Parameters
    ----------
    size : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        A new frame with the six columns above, in that order.
    """
    dates = pd.date_range('2020-01-01', '2022-12-31')
    return pd.DataFrame({
        'size': np.random.choice(['big', 'medium', 'small'], size),
        'age': np.random.randint(1, 50, size),
        'team': np.random.choice(['red', 'blue', 'yellow', 'green'], size),
        # `size` is the sample count (second argument). Placing it inside the
        # list of choices draws one scalar that is broadcast to every row.
        'win': np.random.choice(['yes', 'no'], size),
        'date': np.random.choice(dates, size),
        'prob': np.random.uniform(0, 1, size),
    })

def set_dtypes(df):
    """Convert the benchmark columns to compact dtypes and return the frame.

    The frame passed in is modified in place and is also the return value.

    * ``size`` and ``team`` become ``category``
    * ``age`` becomes ``int16``
    * ``prob`` becomes ``float16``
    * ``win`` is mapped from 'yes'/'no' to True/False
    """
    compact_types = {
        'size': 'category',
        'team': 'category',
        'age': 'int16',
        'prob': 'float16',
    }
    for column, dtype in compact_types.items():
        df[column] = df[column].astype(dtype)

    yes_no_to_bool = {'yes' : True, 'no' : False}
    df['win'] = df['win'].map(yes_no_to_bool)
    return df


# CSV
- 46 MB
- 3.23s to save
- 0.6s to read

In [67]:
%%timeit
df = get_dataset(1_000_000)
df.to_csv('test_csv.csv', index=True)

3.23 s ± 34.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [50]:
%%timeit
# Time reading the CSV back; column 0 is used as the index, matching the
# index written by to_csv(..., index=True).
df = pd.read_csv('test_csv.csv', index_col=[0])

564 ms ± 13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [51]:
# Preview the first rows of the current `df`.
# NOTE(review): the cells above only assign `df` inside %%timeit bodies;
# confirm `df` is actually defined here after Restart & Run All.
df.head()

Unnamed: 0,size,age,team,win,date,prob
0,big,36,blue,yes,2022-05-12,0.040263
1,medium,41,yellow,yes,2021-10-05,0.320839
2,big,41,yellow,yes,2022-10-07,0.726127
3,medium,12,red,yes,2022-08-31,0.356284
4,big,39,green,yes,2022-09-03,0.353293


In [57]:
# Regenerate the data, convert it to compact dtypes, and inspect the
# resulting column types and memory footprint.
df = set_dtypes(get_dataset(1_000_000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int16(1)
memory usage: 14.3 MB


In [58]:
# Round-trip the typed frame through CSV (no index column this time) and
# rebind `df` to what read_csv returns with its default dtype inference.
df.to_csv('test_csv.csv', index=False)
df = pd.read_csv('test_csv.csv')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   size    1000000 non-null  object 
 1   age     1000000 non-null  int64  
 2   team    1000000 non-null  object 
 3   win     1000000 non-null  bool   
 4   date    1000000 non-null  object 
 5   prob    1000000 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 39.1+ MB


In [63]:
# Re-read the CSV with an explicit dtype mapping to get the compact types back.
# `win` and `date` are not in the mapping, so pandas infers them; `date` comes
# back as object in the recorded output.
# NOTE(review): parse_dates=['date'] would presumably be needed to restore
# datetime64 - confirm.
df = pd.read_csv('test_csv.csv',
                 dtype={'size':'category',
                        'age':'int16',
                        'team':'category',
                        'prob':'float16',})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   size    1000000 non-null  category
 1   age     1000000 non-null  int16   
 2   team    1000000 non-null  category
 3   win     1000000 non-null  bool    
 4   date    1000000 non-null  object  
 5   prob    1000000 non-null  float16 
dtypes: bool(1), category(2), float16(1), int16(1), object(1)
memory usage: 14.3+ MB


# Pickle
- 0.5s to write
- 0.2s to read

In [72]:
# Pickle round-trip of the frame as generated (set_dtypes is not applied).
# The frame is built outside the timed statements, so only I/O is measured.
df = get_dataset(1_000_000)
%timeit df.to_pickle('test.pickle')
%timeit df_pickle = pd.read_pickle('test.pickle')

538 ms ± 21.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
241 ms ± 17.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
# Same pickle round-trip, this time after set_dtypes() has converted the
# columns, to compare against the run on the unconverted frame.
df = get_dataset(1_000_000)
df = set_dtypes(df)
%timeit df.to_pickle('test.pickle')
%timeit df_pickle = pd.read_pickle('test.pickle')

The slowest run took 10.24 times longer than the fastest. This could mean that an intermediate result is being cached.
12.7 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.57 ms ± 297 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
# Show the column dtypes and memory usage of the frame that was just pickled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  category      
 1   age     1000000 non-null  int16         
 2   team    1000000 non-null  category      
 3   win     1000000 non-null  bool          
 4   date    1000000 non-null  datetime64[ns]
 5   prob    1000000 non-null  float16       
dtypes: bool(1), category(2), datetime64[ns](1), float16(1), int16(1)
memory usage: 14.3 MB


# Parquet
- 0.4s write
- 0.2s read

Requires a parquet engine:
```
!pip install pyarrow
!pip install fastparquet
```


In [76]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-14.0.0-cp311-cp311-win_amd64.whl (24.6 MB)
                                              0.0/24.6 MB ? eta -:--:--
                                              0.0/24.6 MB ? eta -:--:--
                                              0.1/24.6 MB 1.2 MB/s eta 0:00:22
                                              0.4/24.6 MB 3.2 MB/s eta 0:00:08
     -                                        0.6/24.6 MB 3.9 MB/s eta 0:00:07
     -                                        0.9/24.6 MB 4.3 MB/s eta 0:00:06
     -                                        1.2/24.6 MB 4.6 MB/s eta 0:00:06
     --                                       1.5/24.6 MB 4.9 MB/s eta 0:00:05
     --                                       1.7/24.6 MB 5.0 MB/s eta 0:00:05
     ---                                      2.0/24.6 MB 5.1 MB/s eta 0:00:05
     ---                                      2.2/24.6 MB 5.1 MB/s eta 0:00:05
     ----                                     2.5/24.6 MB 5.1


[notice] A new release of pip is available: 23.1 -> 23.3.1
[notice] To update, run: C:\Users\Dominc\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [78]:
# Parquet round-trip of the frame as generated (set_dtypes is not applied).
# The frame is built outside the timed statements, so only I/O is measured.
df = get_dataset(1_000_000)
%timeit df.to_parquet('test.parquet')
%timeit df_parquet = pd.read_parquet('test.parquet')

377 ms ± 5.08 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
156 ms ± 751 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [82]:
# Load only the `date` and `win` columns from the parquet file instead of
# the whole table.
df_example = pd.read_parquet('test.parquet', columns=['date', 'win'])

# Feather
- 0.2s write
- 0.1s read

In [83]:
# Feather round-trip of the frame as generated (set_dtypes is not applied).
# The frame is built outside the timed statements, so only I/O is measured.
df = get_dataset(1_000_000)
%timeit df.to_feather('test.feather')
%timeit df_feather = pd.read_feather('test.feather')

196 ms ± 6.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
119 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
