In [1]:
import pandas as pd
import os
import ast
import glob
import datetime
import datatable as dt

In [2]:
import fastparquet
# import feather # only for Python 3.8 and above

In [3]:
output_path = 'outputs/'

In [24]:
def to_df(file_name: str) -> pd.DataFrame:
    '''
    Read single file generated by the API script
    '''
    with open(file_name, 'r') as file:
        lines = file.readlines()
    return pd.json_normalize(ast.literal_eval(lines[2::4][0])['result'])

In [25]:
df = pd.concat([to_df(str(file)) for file in glob.glob('*txt')], ignore_index = True)

In [26]:
df

Unnamed: 0,Lines,Lon,VehicleNumber,Time,Lat,Brigade
0,11,20.925463,1185+1184,2022-07-20 15:00:06,52.260548,3
1,28,20.934261,1239+1240,2022-07-20 15:00:10,52.271328,2
2,27,20.979286,1243,2022-07-20 15:00:05,52.236553,07
3,22,20.931639,1247+1248,2022-07-20 15:00:08,52.279114,011
4,33,20.945185,1255+1256,2022-07-20 14:32:45,52.271680,4
...,...,...,...,...,...,...
106213,24,21.119200,4101,2022-07-20 21:00:12,52.238300,3
106214,2,20.929500,4201,2022-07-20 21:00:10,52.291900,010
106215,17,20.941900,4205,2022-07-20 21:00:13,52.334500,10
106216,2,20.965800,4206,2022-07-20 21:00:11,52.313200,4


In [27]:
def convert_dataset(df: pd.DataFrame) -> pd.DataFrame:
    df['Lat'] = (df['Lat'] * 1_000_000).astype('int32')
    df['Lon'] = (df['Lon'] * 1_000_000).astype('int32')

    df['Time'] = pd.to_datetime(df['Time'], format = '%Y-%m-%d %H:%M:%S')
    df['Time'] = (pd.Timestamp('2022-01-01 00:00:00') - df['Time']).dt.total_seconds().astype('int')
    
    df['VehicleNumber'] =  df['VehicleNumber'].map(lambda x: x.replace('+', ''))
    df['Brigade'] = df['Brigade'].apply(lambda x: '-' + f'{x}'[1:] if x.startswith('0') else x)

    df = df.astype({'Lines': 'int8', 'Brigade': 'int16', 'VehicleNumber': 'int32'})

    for column in df:
        if (df[column].dtype == 'int8') or (df[column].dtype == 'int16') or (df[column].dtype == 'int32'):
            df[column]=pd.to_numeric(df[column], downcast='integer')

    return df

In [29]:
def restore_dataset(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    df['Lat'] = df['Lat'] / 1_000_000
    df['Lon'] = df['Lon'] / 1_000_000
    df['Time'] = [pd.to_datetime('01/01/2022') - pd.DateOffset(seconds = x) for x in df['Time']]
    df['Time'] = pd.to_datetime(df['Time'], format = '%Y-%m-%d %H:%M:%S')

    df['VehicleNumber'] = df['VehicleNumber'].astype('str')
    df['VehicleNumber'] = df['VehicleNumber'].map(lambda x: x[0:4] + '+' + x[4:] if len(x) > 5 else x)
    

    df['Brigade'] = df['Brigade'].astype('str') 
    df['Brigade'] = df['Brigade'].apply(lambda x: '0' + f'{x}'[1:] if x.startswith('-') else x)
    return df

In [30]:
df_results = pd.DataFrame(columns = ['method', 'index', 'compressed', 'converted','file_size', 'write_time', 'read_time', 'restore_time', 'total_time'])

In [31]:
def add_row(dataframe, method, index, compressed, converted, file_size, write_time, read_time, restore_time, total_time):
    row = {
        'method': method,
        'index': index,
        'compressed': compressed,
        'converted': converted,
        'file_size': file_size,
        'write_time': write_time,
        'read_time': read_time,
        'restore_time': restore_time,
        'total_time': total_time,
    }
    return dataframe.append(pd.Series(row), ignore_index = True)

### Raw data ###

Raw CSV

In [32]:
method_name = 'Raw CSV'
file_name = f'{output_path}test.csv'

index = 1
compressed = 0
converted = 0

write_time = %timeit -n5 -r5 -o df.to_csv(file_name)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name)
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

386 ms ± 5.19 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
85.4 ms ± 3.45 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [None]:
# sizes in kilobites ,times in miliseconds 

In [33]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484


Raw CSV (no index)

In [34]:
method_name = 'Raw CSV (no index)'
file_name = f'{output_path}test.csv'

index = 0
compressed = 0
converted = 0

write_time = %timeit -n5 -r5 -o df.to_csv(file_name, index= False)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name)
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

432 ms ± 54.1 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
89.7 ms ± 4.79 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [35]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564


CSV (no index, GZIP compression)

In [36]:
method_name = 'CSV (no index, GZIP compression)'
file_name = f'{output_path}test.gzip'

index = 0
compressed = 1
converted = 0

write_time = %timeit -n5 -r5 -o df.to_csv(file_name, index= False, compression='gzip')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name, compression='gzip')
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

417 ms ± 25.3 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
78 ms ± 576 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [37]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364


CSV (no index, BZ2 compression)

In [38]:
method_name = 'CSV (no index, BZ2 compression)'
file_name = f'{output_path}test.bz2'

index = 0
compressed = 1
converted = 0

write_time = %timeit -n5 -r5 -o df.to_csv(file_name, index= False, compression='bz2')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name, compression='bz2')
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

1.93 s ± 16.3 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
212 ms ± 4.56 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [39]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276


Pickle

In [40]:
method_name = 'Pickle'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 0
converted = 0

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name)
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

89.6 ms ± 4.12 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
47.3 ms ± 1.7 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [41]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392


Pickle (GZIP compression)

In [42]:
method_name = 'Pickle (GZIP compression)'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 0

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='gzip')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='gzip')
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

138 ms ± 3.11 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
55.2 ms ± 2.79 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [43]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0,55.15808


Pickle (BZ2 compression)

In [44]:
method_name = 'Pickle (BZ2 compression)'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 0

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='bz2')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='bz2')
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

1.53 s ± 224 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
188 ms ± 30 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [45]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0,188.449248


Pickle (ZIP compression)

In [46]:
method_name = 'Pickle (ZIP compression)'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 0

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='zip')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='zip')
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

133 ms ± 1.82 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
55.9 ms ± 2.3 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [47]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0,55.85654


Pickle (XZ compression)

In [48]:
method_name = 'Pickle (XZ compression)'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 0

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='xz')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='xz')
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

993 ms ± 25.5 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
69.1 ms ± 3.91 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [49]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0,69.091756


HDF

In [50]:
method_name = 'HDF'
file_name = f'{output_path}test.h5'

index = 1
compressed = 0
converted = 0

write_time = %timeit -n5 -r5 -o df.to_hdf(file_name, key='key', mode = 'w') 
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_hdf(file_name, key = 'key', mode='r')
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

210 ms ± 83.9 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
125 ms ± 5.42 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [51]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0,124.874748


Parquet

In [52]:
method_name = 'Parquet'
file_name = f'{output_path}test.parquet'

index = 1
compressed = 0
converted = 0

write_time = %timeit -n5 -r5 -o df.to_parquet(file_name, engine='fastparquet')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_parquet(file_name)
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

261 ms ± 15.4 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
36.6 ms ± 23 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [53]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0,124.874748


Jay

In [54]:
method_name = 'Jay'
file_name = f'{output_path}test.jay'

index = 1
compressed = 0
converted = 0

write_time = %timeit -n5 -r5 -o dt.Frame(df).to_jay(file_name)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o dt.fread(file_name).to_pandas()
restore_time = 0
total_time = read_time.average*1000 + restore_time

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time, total_time)

105 ms ± 4.75 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
94.5 ms ± 2.11 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [55]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0,124.874748


### Converted data ###

Raw CSV

In [56]:
df_t = convert_dataset(df)

In [57]:
df_t

Unnamed: 0,Lines,Lon,VehicleNumber,Time,Lat,Brigade
0,11,20925463,11851184,-17334006,52260548,3
1,28,20934261,12391240,-17334010,52271328,2
2,27,20979286,1243,-17334005,52236553,-7
3,22,20931639,12471248,-17334008,52279114,-11
4,33,20945185,12551256,-17332365,52271680,4
...,...,...,...,...,...,...
106213,24,21119200,4101,-17355612,52238300,3
106214,2,20929500,4201,-17355610,52291900,-10
106215,17,20941900,4205,-17355613,52334500,10
106216,2,20965800,4206,-17355611,52313200,4


In [58]:
method_name = 'Raw CSV - CONVERTED'
file_name = f'{output_path}test.csv'

index = 1
compressed = 0
converted = 1

write_time = %timeit -n5 -r5 -o df_t.to_csv(file_name)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name)
df_t = pd.read_csv(file_name)
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

277 ms ± 13 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
76.8 ms ± 5.53 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
14 s ± 1.18 s per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [60]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Raw CSV (no index)

In [61]:
method_name = 'Raw CSV (no index) - CONVERTED'
file_name = f'{output_path}test.csv'

index = 0
compressed = 0
converted = 1

write_time = %timeit -n5 -r5 -o df.to_csv(file_name, index= False)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name)
df_t = pd.read_csv(file_name)
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

219 ms ± 3.57 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
71.8 ms ± 4.1 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
14 s ± 346 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [62]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


CSV (no index, GZIP compression)

In [63]:
method_name = 'CSV (no index, GZIP compression) - CONVERTED'
file_name = f'{output_path}test.gzip'

index = 0
compressed = 1
converted = 1

write_time = %timeit -n5 -r5 -o df.to_csv(file_name, index= False, compression='gzip')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name, compression='gzip')
df_t = pd.read_csv(file_name, compression='gzip')
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

291 ms ± 5.43 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
78.4 ms ± 6.58 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
14.1 s ± 377 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [64]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


CSV (no index, BZ2 compression)

In [65]:
method_name = 'CSV (no index, BZ2 compression) - CONVERTED'
file_name = f'{output_path}test.bz2'

index = 0
compressed = 1
converted = 1

write_time = %timeit -n5 -r5 -o df.to_csv(file_name, index= False, compression='bz2')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_csv(file_name, compression='bz2')
df_t = pd.read_csv(file_name, compression='bz2')
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

1.86 s ± 128 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
197 ms ± 5.41 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
14.1 s ± 830 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [66]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Pickle

In [67]:
method_name = 'Pickle - CONVERTED'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 0
converted = 1

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name)
df_t = pd.read_pickle(file_name)
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

74 ms ± 2.25 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
34.5 ms ± 1.82 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
13.8 s ± 484 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [68]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Pickle (GZIP compression)

In [69]:
method_name = 'Pickle (GZIP compression) - CONVERTED'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 1

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='gzip')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='gzip')
df_t = pd.read_pickle(file_name, compression='gzip')
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

89.7 ms ± 4.16 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
33.4 ms ± 1.45 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
13 s ± 447 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [70]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Pickle (BZ2 compression)

In [71]:
method_name = 'Pickle (BZ2 compression) - CONVERTED'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 1

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='bz2')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='bz2')
df_t = pd.read_pickle(file_name, compression='bz2')
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)


823 ms ± 20.9 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
87.5 ms ± 2.79 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
13.6 s ± 797 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [72]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Pickle (ZIP compression)

In [73]:
method_name = 'Pickle (ZIP compression) - CONVERTED'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 1

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='zip')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='zip')
df_t = pd.read_pickle(file_name, compression='zip')
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)


91.3 ms ± 5.65 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
38.6 ms ± 5.12 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
13.4 s ± 151 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [74]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Pickle (XZ compression)

Unnamed: 0,Lines,Lon,VehicleNumber,Time,Lat,Brigade
0,11,20925463,11851184,-17334006,52260548,3
1,28,20934261,12391240,-17334010,52271328,2
2,27,20979286,1243,-17334005,52236553,-7
3,22,20931639,12471248,-17334008,52279114,-11
4,33,20945185,12551256,-17332365,52271680,4
...,...,...,...,...,...,...
106213,24,21119200,4101,-17355612,52238300,3
106214,2,20929500,4201,-17355610,52291900,-10
106215,17,20941900,4205,-17355613,52334500,10
106216,2,20965800,4206,-17355611,52313200,4


In [75]:
method_name = 'Pickle (XZ compression) - CONVERTED'
file_name = f'{output_path}test.pkl'

index = 1
compressed = 1
converted = 1

write_time = %timeit -n5 -r5 -o df.to_pickle(file_name, compression='xz')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_pickle(file_name, compression='xz')
df_t = pd.read_pickle(file_name, compression='xz')
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

466 ms ± 7.57 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
40.7 ms ± 1.02 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
13 s ± 335 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [76]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


HDF

In [77]:
method_name = 'HDF - CONVERTED'
file_name = f'{output_path}test.h5'

index = 1
compressed = 0
converted = 1

write_time = %timeit -n5 -r5 -o df.to_hdf(file_name, key='key', mode = 'w') 
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_hdf(file_name, key = 'key', mode='r')
df_t = pd.read_hdf(file_name, key = 'key', mode='r')
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

124 ms ± 9.16 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
101 ms ± 4.17 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
12.4 s ± 300 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [78]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Parquet

In [79]:
method_name = 'Parquet - CONVERTED'
file_name = f'{output_path}test.parquet'

index = 1
compressed = 0
converted = 1

write_time = %timeit -n5 -r5 -o df.to_parquet(file_name, engine='fastparquet')
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o pd.read_parquet(file_name)
df_t = pd.read_parquet(file_name)
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

190 ms ± 7.61 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
23.5 ms ± 12.7 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
12.6 s ± 715 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [80]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865.236282,386.332532,85.383484,0.0,85.383484
1,Raw CSV (no index),0,0,0,5262.116432,432.03302,89.725564,0.0,89.725564
2,"CSV (no index, GZIP compression)",0,1,0,65.775871,417.352308,78.016364,0.0,78.016364
3,"CSV (no index, BZ2 compression)",0,1,0,102.049828,1929.781728,211.910276,0.0,211.910276
4,Pickle,1,0,0,5497.30587,89.566628,47.291392,0.0,47.291392
5,Pickle (GZIP compression),1,1,0,72.402954,138.379676,55.15808,0.0,55.15808
6,Pickle (BZ2 compression),1,1,0,72.124481,1531.731052,188.449248,0.0,188.449248
7,Pickle (ZIP compression),1,1,0,90.713501,133.486824,55.85654,0.0,55.85654
8,Pickle (XZ compression),1,1,0,31.707764,992.55286,69.091756,0.0,69.091756
9,HDF,1,0,0,7280.578613,210.16126,124.874748,0.0,124.874748


Jay

In [81]:
method_name = 'Jay - CONVERTED'
file_name = f'{output_path}test.jay'

index = 1
compressed = 0
converted = 1

write_time = %timeit -n5 -r5 -o dt.Frame(df).to_jay(file_name)
file_size = os.path.getsize(file_name) / 1024**2
read_time = %timeit -n5 -r5 -o dt.fread(file_name).to_pandas()
df_t = dt.fread(file_name).to_pandas()
restore_time = %timeit -n5 -r5 -o restore_dataset(df_t)
total_time = read_time.average*1000 + restore_time.average*1000

df_results = add_row(df_results, method_name, index, compressed, converted, file_size*1000, write_time.average*1000, read_time.average*1000, restore_time.average*1000, total_time)

88.3 ms ± 582 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)
79.1 ms ± 1.5 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
12.3 s ± 380 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [84]:
df_results

Unnamed: 0,method,index,compressed,converted,file_size,write_time,read_time,restore_time,total_time
0,Raw CSV,1,0,0,5865,386,85,0,85
1,Raw CSV (no index),0,0,0,5262,432,89,0,89
2,"CSV (no index, GZIP compression)",0,1,0,65,417,78,0,78
3,"CSV (no index, BZ2 compression)",0,1,0,102,1929,211,0,211
4,Pickle,1,0,0,5497,89,47,0,47
5,Pickle (GZIP compression),1,1,0,72,138,55,0,55
6,Pickle (BZ2 compression),1,1,0,72,1531,188,0,188
7,Pickle (ZIP compression),1,1,0,90,133,55,0,55
8,Pickle (XZ compression),1,1,0,31,992,69,0,69
9,HDF,1,0,0,7280,210,124,0,124


In [83]:
df_results = df_results.astype({'file_size': 'int', 'write_time': 'int', 'read_time': 'int', 'restore_time': 'int', 'total_time': 'int'})

In [209]:
df_results.to_csv(f'{output_path}memory_disc_test_results_14d.csv')