In [1]:
import gzip

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
def get_dataset(size, seed=None):
    """Create a fake dataset of ``size`` rows for I/O benchmarking.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, the rows are drawn from a private ``RandomState`` seeded
        with this value, so repeated calls return identical frames.  When
        omitted, NumPy's global generator is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``size`` ('big'/'medium'/'small'), ``age``
        (integers from 1 to 49), ``team`` ('red'/'blue'/'yellow'/'green'),
        ``win`` ('yes'/'no'), ``date`` (days from 2020-01-01 through
        2022-12-31) and ``prob`` (floats drawn uniformly from [0, 1)).
    """
    # The np.random module and a RandomState instance expose the same
    # choice/randint/uniform methods, so either can serve as the source.
    rng = np.random if seed is None else np.random.RandomState(seed)
    dates = pd.date_range('2020-01-01', '2022-12-31')
    # Dict entries are evaluated top to bottom, so the random draws happen in
    # the same order as the column order of the resulting frame.
    return pd.DataFrame({
        'size': rng.choice(['big', 'medium', 'small'], size),
        'age': rng.randint(1, 50, size),  # upper bound 50 is exclusive
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'date': rng.choice(dates, size),
        'prob': rng.uniform(0, 1, size),
    })

In [3]:
def set_dtypes(df):
    """Cast the benchmark columns to compact dtypes, in place.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``prob`` becomes ``float32`` and the 'yes'/'no' strings in ``win`` are
    mapped to ``True``/``False``.

    The passed frame itself is modified and also returned, so both
    ``set_dtypes(df)`` and ``df = set_dtypes(df)`` work.  The function is safe
    to call more than once on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``size``, ``team``, ``age``, ``win`` and ``prob`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int16')
    # Map only while the column still holds 'yes'/'no'.  Mapping an
    # already-boolean column finds no matching keys and turns every value
    # into NaN, which is what a second call used to do.
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes': True, 'no': False})
    df['prob'] = df['prob'].astype('float32')
    return df

In [4]:
# Build the 100,000,000-row fake frame; %time measures only this generation call.
%time df = get_dataset(100_000_000)
# Apply the dtype casts from set_dtypes (not included in the timing above).
df = set_dtypes(df)
# Preview the first rows as the cell's displayed output.
df.head()

CPU times: total: 43.2 s
Wall time: 43.5 s


Unnamed: 0,size,age,team,win,date,prob
0,medium,33,red,False,2020-07-08,0.799731
1,small,34,green,True,2022-05-06,0.514919
2,medium,24,green,False,2022-11-11,0.731323
3,small,6,yellow,True,2020-10-30,0.840817
4,small,29,yellow,True,2020-11-17,0.962547


In [7]:
print('writing CSV')
%time df.to_csv('test.csv')

writing CSV
CPU times: total: 4min 22s
Wall time: 4min 23s


In [9]:
# Time loading test.csv back into a DataFrame with read_csv's default parsing.
# NOTE(review): read_csv infers dtypes from text, so the category / int16 / float32 /
# datetime dtypes of `df` are not restored unless dtype= / parse_dates= are passed.
print('Reading CSV')
%time df_csv = pd.read_csv('test.csv')

Reading CSV
CPU times: total: 48 s
Wall time: 48 s


In [10]:
# Time serialising the frame to test.pickle with pandas' pickle writer.
print('writing Pickle')
%time df.to_pickle('test.pickle')

writing Pickle
CPU times: total: 1.03 s
Wall time: 1min 1s


In [11]:
# Time loading test.pickle (written by this notebook) back into a DataFrame.
# Unpickling can execute arbitrary code, so only do this with files you trust.
print('Reading Pickle')
%time df_pickle = pd.read_pickle('test.pickle')

Reading Pickle
CPU times: total: 422 ms
Wall time: 427 ms


In [12]:
!pip install pyarrow
!pip install fastparquet



In [12]:
# Time writing the frame to test.parquet using to_parquet's default settings.
print('writing Parquet')
%time df.to_parquet('test.parquet')

writing Parquet
CPU times: total: 10.7 s
Wall time: 10.1 s


In [29]:
# Time loading test.parquet back into a DataFrame.
print('Reading Parquet')
%time df_parquet = pd.read_parquet('test.parquet')
# To inspect the loaded frame, end the cell with the bare expression `df_parquet`.

Reading Parquet
CPU times: total: 10.9 s
Wall time: 3.97 s


Unnamed: 0,size,age,team,win,date,prob
0,medium,25,green,True,2021-04-22,0.342737
1,big,6,blue,True,2021-12-29,0.575536
2,medium,46,blue,True,2020-11-07,0.514036
3,big,9,yellow,False,2021-10-17,0.266284
4,small,30,red,True,2022-02-04,0.770092
...,...,...,...,...,...,...
99999995,big,16,green,True,2021-09-30,0.032252
99999996,medium,8,red,False,2022-12-30,0.659766
99999997,small,48,blue,True,2020-09-05,0.582637
99999998,medium,12,green,True,2022-11-05,0.088702


In [14]:
# Time writing the frame to Parquet with gzip compression requested explicitly.
print('writing Parquet with gzip')
%time df.to_parquet('test.parquet.gz', compression='gzip')

writing Parquet with gzip
CPU times: total: 34.9 s
Wall time: 35.9 s


In [28]:
print('Reading Parquet with gzip')
%time table = pa.parquet.read_table('test.parquet.gz')
# table.to_pandas()

Reading Parquet with gzip
CPU times: total: 6 s
Wall time: 1.17 s


Unnamed: 0,size,age,team,win,date,prob
0,medium,25,green,True,2021-04-22,0.342737
1,big,6,blue,True,2021-12-29,0.575536
2,medium,46,blue,True,2020-11-07,0.514036
3,big,9,yellow,False,2021-10-17,0.266284
4,small,30,red,True,2022-02-04,0.770092
...,...,...,...,...,...,...
99999995,big,16,green,True,2021-09-30,0.032252
99999996,medium,8,red,False,2022-12-30,0.659766
99999997,small,48,blue,True,2020-09-05,0.582637
99999998,medium,12,green,True,2022-11-05,0.088702


In [15]:
# Time writing the frame to test.feather in Feather format.
print('writing Feather')
%time df.to_feather('test.feather')

writing Feather
CPU times: total: 2.22 s
Wall time: 1.08 s


In [16]:
# Time loading test.feather back into a DataFrame.
print('Reading Feather')
%time df_feather = pd.read_feather('test.feather')

Reading Feather
CPU times: total: 1.59 s
Wall time: 589 ms
