In [1]:
import pandas as pd
import time
from typing import Literal
from functools import partial
from typing import Optional
import os
from tqdm import tqdm

In [2]:
# Benchmark input: the frame that every format/codec below is written from and read back into.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer="./data/data_02.pkl.gz")
df

Unnamed: 0,text,uuid,int_0,int_1,int_2,int_3,int_4,int_5,int_6,int_7,...,float_10,float_11,float_12,float_13,float_14,float_15,float_16,float_17,float_18,float_19
0,"Write an article based on this ""A man has been...",227f72fe-fecc-4a8b-83fd-4c0bf79044b5,-1848,-7448,7011,-9271,-818,7723,7408,2765,...,1070.510375,-2632.194811,-1511.181933,3382.551305,2358.230930,6346.687782,7923.668912,7232.114931,-266.226245,-9490.335888
1,Answer the following question: - number is 54 ...,43f035e4-7905-4d4e-9b78-9ce1dbe3f15e,9485,3322,-210,4539,-5712,-6642,-6763,5408,...,9887.030489,1584.941985,1049.882522,-7057.736693,3311.705920,9250.504381,-5306.747252,4346.091141,3705.083462,586.077705
2,Produce a long descriptive sentence that uses ...,7d6cf004-4fc9-44b2-abbb-6aa6bb9e111f,-6568,-1429,7863,-5897,-4845,-9337,-4177,4800,...,122.453601,-6977.767998,-1533.505752,8333.095625,8700.795720,668.340118,4788.639416,-5222.894166,189.314977,-749.222275
3,Write a title for this article:\n\nArbitration...,1391998a-7260-4037-a886-fdba49282e8f,9467,-7638,5174,9196,-2378,9352,-4298,7949,...,5982.354362,-8063.945081,1541.905625,-9395.554755,1492.364621,7765.053363,6506.765269,2367.205007,-785.401680,-5978.024988
4,Read the following paragraph and determine if ...,5d1a24d4-e1d3-4e94-aabd-e58af037cfb3,-9609,-2289,-5844,3732,9806,-9914,3537,5583,...,4007.461801,-1500.406799,2546.834503,-3985.334523,8679.221751,6246.762728,1169.103626,-6382.974071,-2610.357519,4656.015420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363486,"Q: Context: Dragon Ball Z (ドラゴンボール ゼット, Dorago...",e28c91e5-2115-473b-b31e-dd144a23ff5c,2524,-6655,7295,-8549,9114,-3688,-1763,5185,...,1708.789279,-9009.858048,7385.617783,-706.417619,-7424.619217,-2887.129320,-7133.300432,-9886.948402,520.655812,1172.371076
363487,My question is: At the beginning of an academi...,a520be4b-773c-4147-b06a-47a74687060d,-8329,-9144,-8182,-6010,-6693,-2829,-1539,-7076,...,9303.174804,-6311.456145,9012.214267,3875.721583,3114.162623,2359.850872,5780.628130,7284.598954,5170.040989,927.500877
363488,"Leo: Given the sentence ""A small child in wate...",4fde38d0-d92e-40f4-8dd1-d0499f1f04b0,1880,-7913,-8065,1328,-4772,7595,-993,3088,...,-5330.552098,3927.901481,-112.163506,-8667.423113,6856.692759,-1924.300261,-7216.503573,6365.602925,1535.116783,-8458.252345
363489,Explain simply why yes is the correct answer t...,fc592a4e-986b-4502-be87-39beee10c6a9,-9594,-3164,-306,9538,5491,9416,-2350,6787,...,-5342.596101,3930.096931,4469.888351,4613.697146,1460.793708,3206.175178,529.310585,-1578.960446,-8725.192612,-5633.208819


In [3]:
# In-memory footprint of the frame in bytes, index excluded.
# deep=True asks pandas to introspect object-dtype columns rather than use a shallow estimate.
# see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.memory_usage.html
column_bytes = df.memory_usage(index=False, deep=True)
mem_usage = column_bytes.sum()
mem_usage

np.int64(823608914)

In [4]:
# Same footprint expressed in MiB (bytes / 1024 / 1024), index still excluded.
mem_size_mb = mem_usage / (1024 * 1024)
mem_size_mb

np.float64(785.454668045044)

In [5]:
def evaluate_performance(
        df,
        method: Literal["csv", "parquet", "feather"],
        compression: Optional[str] = None,
        compression_level: Optional[int] = None,
        n_reads: int = 10,
        out_dir: str = "./test_data",
    ) -> dict:
    """Benchmark one on-disk format/codec combination for ``df``.

    The frame is written once to ``<out_dir>/data.<method>``, read back
    ``n_reads`` times, the file size is recorded, and the file is deleted
    again (also when the write or a read fails).

    Args:
        df: DataFrame to serialize.
        method: Target format: ``"csv"``, ``"parquet"`` or ``"feather"``.
        compression: Codec name forwarded to the pandas writer. ``None``
            omits the argument, so the writer's own default applies.
        compression_level: Codec level forwarded to the parquet/feather
            writer. Not accepted for csv.
        n_reads: Number of timed read repetitions; the reported read time is
            their mean.
        out_dir: Directory for the temporary file; created if missing.

    Returns:
        dict with ``"method"``, ``"save_time"`` (seconds for the single
        write), ``"read_time"`` (mean seconds per read), ``"file_size"``
        (MiB), plus ``"compression"`` / ``"compression_level"`` when those
        were given.

    Raises:
        ValueError: unknown ``method``, ``compression_level`` combined with
            csv, or ``n_reads`` < 1.
    """
    if n_reads < 1:
        raise ValueError(f"n_reads must be >= 1, got {n_reads}")

    # Only options the caller actually set are forwarded to pandas and echoed
    # back in the result, so unset options keep the writer defaults.
    codec_params = {}
    if compression is not None:
        codec_params["compression"] = compression
    if compression_level is not None:
        if method == "csv":
            raise ValueError("compression_level is not supported for method='csv'")
        codec_params["compression_level"] = compression_level

    filename = os.path.join(out_dir, f"data.{method}")

    if method == "csv":
        save_fn = partial(df.to_csv, index=False, path_or_buf=filename, **codec_params)
        # The file name carries no codec suffix, so the reader is told the
        # codec explicitly instead of relying on extension-based inference.
        read_fn = partial(pd.read_csv, filepath_or_buffer=filename, **codec_params)
    elif method == "parquet":
        save_fn = partial(df.to_parquet, index=False, engine="pyarrow", path=filename, **codec_params)
        read_fn = partial(pd.read_parquet, path=filename, engine="pyarrow")
    elif method == "feather":
        save_fn = partial(df.to_feather, path=filename, **codec_params)
        read_fn = partial(pd.read_feather, path=filename)
    else:
        raise ValueError(f"unsupported method: {method!r}")

    os.makedirs(out_dir, exist_ok=True)

    try:
        # write (perf_counter is monotonic, unlike the wall clock)
        started = time.perf_counter()
        save_fn()
        save_time = time.perf_counter() - started

        # read: mean over n_reads repetitions
        started = time.perf_counter()
        for _ in range(n_reads):
            read_fn()
        read_time = (time.perf_counter() - started) / n_reads

        # size on disk in MiB
        file_size = os.path.getsize(filename) / 1024 / 1024
    finally:
        # never leave the benchmark artifact behind, even on failure
        if os.path.exists(filename):
            os.remove(filename)

    result = {
        "method": method,
        "save_time": save_time,
        "read_time": read_time,
        "file_size": file_size,
    }
    result.update(codec_params)
    return result


In [6]:
# accumulator for the result dicts returned by each evaluate_performance run
eval_results = list()

In [7]:
# parquet + zstd: one benchmark run per compression level 1..20
for level in tqdm(range(1, 21)):
    run = evaluate_performance(df, "parquet", compression="zstd", compression_level=level)
    print(run)
    eval_results.append(run)

  5%|▌         | 1/20 [00:12<04:05, 12.94s/it]

{'method': 'parquet', 'save_time': 2.312278985977173, 'read_time': 1.0616801977157593, 'file_size': 276.7370548248291, 'compression': 'zstd', 'compression_level': 1}


 10%|█         | 2/20 [00:25<03:53, 12.97s/it]

{'method': 'parquet', 'save_time': 2.5550568103790283, 'read_time': 1.043575406074524, 'file_size': 262.0364189147949, 'compression': 'zstd', 'compression_level': 2}


 15%|█▌        | 3/20 [00:39<03:41, 13.03s/it]

{'method': 'parquet', 'save_time': 2.830312967300415, 'read_time': 1.0259203910827637, 'file_size': 253.57188892364502, 'compression': 'zstd', 'compression_level': 3}


 20%|██        | 4/20 [00:51<03:27, 12.99s/it]

{'method': 'parquet', 'save_time': 2.7992680072784424, 'read_time': 1.0134236097335816, 'file_size': 251.29093742370605, 'compression': 'zstd', 'compression_level': 4}


 25%|██▌       | 5/20 [01:06<03:23, 13.54s/it]

{'method': 'parquet', 'save_time': 4.2797229290008545, 'read_time': 1.0230851650238038, 'file_size': 245.69420337677002, 'compression': 'zstd', 'compression_level': 5}


 30%|███       | 6/20 [01:22<03:20, 14.34s/it]

{'method': 'parquet', 'save_time': 5.905365943908691, 'read_time': 0.9991514921188355, 'file_size': 240.56724643707275, 'compression': 'zstd', 'compression_level': 6}


 35%|███▌      | 7/20 [01:38<03:16, 15.09s/it]

{'method': 'parquet', 'save_time': 6.737414836883545, 'read_time': 0.987249779701233, 'file_size': 238.34781455993652, 'compression': 'zstd', 'compression_level': 7}


 40%|████      | 8/20 [01:57<03:12, 16.08s/it]

{'method': 'parquet', 'save_time': 8.506093263626099, 'read_time': 0.9684303045272827, 'file_size': 236.48412799835205, 'compression': 'zstd', 'compression_level': 8}


 45%|████▌     | 9/20 [02:15<03:04, 16.77s/it]

{'method': 'parquet', 'save_time': 8.595574855804443, 'read_time': 0.968661093711853, 'file_size': 236.07024574279785, 'compression': 'zstd', 'compression_level': 9}


 50%|█████     | 10/20 [02:35<02:59, 17.91s/it]

{'method': 'parquet', 'save_time': 11.042462348937988, 'read_time': 0.9419200897216797, 'file_size': 234.6721591949463, 'compression': 'zstd', 'compression_level': 10}


 55%|█████▌    | 11/20 [02:59<02:56, 19.58s/it]

{'method': 'parquet', 'save_time': 13.962411642074585, 'read_time': 0.9412861108779907, 'file_size': 233.91981601715088, 'compression': 'zstd', 'compression_level': 11}


 60%|██████    | 12/20 [03:22<02:45, 20.72s/it]

{'method': 'parquet', 'save_time': 13.933927774429321, 'read_time': 0.9396738052368164, 'file_size': 233.9198350906372, 'compression': 'zstd', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:06<03:13, 27.65s/it]

{'method': 'parquet', 'save_time': 34.09358263015747, 'read_time': 0.9474082946777344, 'file_size': 232.2176866531372, 'compression': 'zstd', 'compression_level': 13}


 70%|███████   | 14/20 [04:54<03:23, 33.89s/it]

{'method': 'parquet', 'save_time': 38.85608983039856, 'read_time': 0.9449542045593262, 'file_size': 231.64834594726562, 'compression': 'zstd', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:44<03:14, 38.83s/it]

{'method': 'parquet', 'save_time': 40.654706954956055, 'read_time': 0.9637369871139526, 'file_size': 231.57147598266602, 'compression': 'zstd', 'compression_level': 15}


 80%|████████  | 16/20 [06:53<03:11, 47.90s/it]

{'method': 'parquet', 'save_time': 59.31754112243652, 'read_time': 0.9645037889480591, 'file_size': 225.10124588012695, 'compression': 'zstd', 'compression_level': 16}


 85%|████████▌ | 17/20 [08:12<02:51, 57.03s/it]

{'method': 'parquet', 'save_time': 68.43108606338501, 'read_time': 0.982734203338623, 'file_size': 224.3366470336914, 'compression': 'zstd', 'compression_level': 17}


 90%|█████████ | 18/20 [09:47<02:17, 68.67s/it]

{'method': 'parquet', 'save_time': 85.78946304321289, 'read_time': 0.9966506004333496, 'file_size': 223.1301155090332, 'compression': 'zstd', 'compression_level': 18}


 95%|█████████▌| 19/20 [11:30<01:18, 78.98s/it]

{'method': 'parquet', 'save_time': 93.06958389282227, 'read_time': 0.9918389081954956, 'file_size': 222.9604148864746, 'compression': 'zstd', 'compression_level': 19}


100%|██████████| 20/20 [13:10<00:00, 39.53s/it]

{'method': 'parquet', 'save_time': 89.84644603729248, 'read_time': 0.9848781108856202, 'file_size': 222.9604148864746, 'compression': 'zstd', 'compression_level': 20}





In [8]:
# parquet + lz4: one benchmark run per compression level 1..20
# NOTE(review): the recorded output shows identical file sizes for levels 1-2 and for
# levels 12-20, so part of this sweep presumably adds no information - confirm the
# effective lz4 level range before re-running.
for level in tqdm(range(1, 21)):
    run = evaluate_performance(df, "parquet", compression="lz4", compression_level=level)
    print(run)
    eval_results.append(run)

  5%|▌         | 1/20 [00:08<02:38,  8.34s/it]

{'method': 'parquet', 'save_time': 1.536238193511963, 'read_time': 0.6799933910369873, 'file_size': 375.51332664489746, 'compression': 'lz4', 'compression_level': 1}


 10%|█         | 2/20 [00:16<02:26,  8.15s/it]

{'method': 'parquet', 'save_time': 1.4375081062316895, 'read_time': 0.6575443029403687, 'file_size': 375.51332664489746, 'compression': 'lz4', 'compression_level': 2}


 15%|█▌        | 3/20 [00:29<02:59, 10.55s/it]

{'method': 'parquet', 'save_time': 6.640911102294922, 'read_time': 0.6747983694076538, 'file_size': 305.42485523223877, 'compression': 'lz4', 'compression_level': 3}


 20%|██        | 4/20 [00:44<03:16, 12.28s/it]

{'method': 'parquet', 'save_time': 8.237220764160156, 'read_time': 0.670818829536438, 'file_size': 300.04003524780273, 'compression': 'lz4', 'compression_level': 4}


 25%|██▌       | 5/20 [01:01<03:28, 13.90s/it]

{'method': 'parquet', 'save_time': 10.054103136062622, 'read_time': 0.6717385053634644, 'file_size': 297.17571926116943, 'compression': 'lz4', 'compression_level': 5}


 30%|███       | 6/20 [01:20<03:37, 15.53s/it]

{'method': 'parquet', 'save_time': 12.036086082458496, 'read_time': 0.6647548198699951, 'file_size': 295.8850908279419, 'compression': 'lz4', 'compression_level': 6}


 35%|███▌      | 7/20 [01:40<03:42, 17.14s/it]

{'method': 'parquet', 'save_time': 13.852581024169922, 'read_time': 0.6602885961532593, 'file_size': 295.331880569458, 'compression': 'lz4', 'compression_level': 7}


 40%|████      | 8/20 [02:02<03:43, 18.64s/it]

{'method': 'parquet', 'save_time': 15.337045907974243, 'read_time': 0.6516093015670776, 'file_size': 295.1030435562134, 'compression': 'lz4', 'compression_level': 8}


 45%|████▌     | 9/20 [02:25<03:40, 20.03s/it]

{'method': 'parquet', 'save_time': 16.652570009231567, 'read_time': 0.6417562246322632, 'file_size': 294.99389934539795, 'compression': 'lz4', 'compression_level': 9}


 50%|█████     | 10/20 [02:55<03:50, 23.04s/it]

{'method': 'parquet', 'save_time': 23.113133192062378, 'read_time': 0.6678667068481445, 'file_size': 293.82403659820557, 'compression': 'lz4', 'compression_level': 10}


 55%|█████▌    | 11/20 [03:31<04:03, 27.01s/it]

{'method': 'parquet', 'save_time': 29.203999996185303, 'read_time': 0.6806789875030518, 'file_size': 293.4069423675537, 'compression': 'lz4', 'compression_level': 11}


 60%|██████    | 12/20 [04:11<04:07, 30.88s/it]

{'method': 'parquet', 'save_time': 33.34485602378845, 'read_time': 0.6359492301940918, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:50<03:54, 33.57s/it]

{'method': 'parquet', 'save_time': 33.37185215950012, 'read_time': 0.6393235921859741, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 13}


 70%|███████   | 14/20 [05:30<03:32, 35.49s/it]

{'method': 'parquet', 'save_time': 33.322794914245605, 'read_time': 0.6590963840484619, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 14}


 75%|███████▌  | 15/20 [06:10<03:04, 36.81s/it]

{'method': 'parquet', 'save_time': 33.37194490432739, 'read_time': 0.6485883235931397, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 15}


 80%|████████  | 16/20 [06:50<02:31, 37.84s/it]

{'method': 'parquet', 'save_time': 33.40215802192688, 'read_time': 0.6823335886001587, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 16}


 85%|████████▌ | 17/20 [07:30<01:55, 38.50s/it]

{'method': 'parquet', 'save_time': 33.22955894470215, 'read_time': 0.6824392318725586, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 17}


 90%|█████████ | 18/20 [08:11<01:17, 38.98s/it]

{'method': 'parquet', 'save_time': 33.36378717422485, 'read_time': 0.6724496126174927, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 18}


 95%|█████████▌| 19/20 [08:51<00:39, 39.36s/it]

{'method': 'parquet', 'save_time': 33.410090923309326, 'read_time': 0.6825837135314942, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 19}


100%|██████████| 20/20 [09:32<00:00, 28.63s/it]

{'method': 'parquet', 'save_time': 34.173468828201294, 'read_time': 0.7123449087142945, 'file_size': 293.3767786026001, 'compression': 'lz4', 'compression_level': 20}





In [9]:
# feather + zstd: one benchmark run per compression level 1..20
for level in tqdm(range(1, 21)):
    run = evaluate_performance(df, "feather", compression="zstd", compression_level=level)
    print(run)
    eval_results.append(run)

  5%|▌         | 1/20 [00:09<03:00,  9.50s/it]

{'method': 'feather', 'save_time': 1.6337261199951172, 'read_time': 0.7863580226898194, 'file_size': 274.08423042297363, 'compression': 'zstd', 'compression_level': 1}


 10%|█         | 2/20 [00:20<03:02, 10.14s/it]

{'method': 'feather', 'save_time': 2.02420973777771, 'read_time': 0.8554614067077637, 'file_size': 258.8376941680908, 'compression': 'zstd', 'compression_level': 2}


 15%|█▌        | 3/20 [00:30<02:58, 10.48s/it]

{'method': 'feather', 'save_time': 2.2185800075531006, 'read_time': 0.8664433002471924, 'file_size': 249.05484199523926, 'compression': 'zstd', 'compression_level': 3}


 20%|██        | 4/20 [00:41<02:51, 10.69s/it]

{'method': 'feather', 'save_time': 2.320780038833618, 'read_time': 0.8685598134994507, 'file_size': 245.37567329406738, 'compression': 'zstd', 'compression_level': 4}


 25%|██▌       | 5/20 [00:54<02:50, 11.37s/it]

{'method': 'feather', 'save_time': 3.8632969856262207, 'read_time': 0.8724118232727051, 'file_size': 239.80018043518066, 'compression': 'zstd', 'compression_level': 5}


 30%|███       | 6/20 [01:08<02:51, 12.22s/it]

{'method': 'feather', 'save_time': 5.458178758621216, 'read_time': 0.838856291770935, 'file_size': 234.57203102111816, 'compression': 'zstd', 'compression_level': 6}


 35%|███▌      | 7/20 [01:23<02:49, 13.03s/it]

{'method': 'feather', 'save_time': 6.479353904724121, 'read_time': 0.8212389707565307, 'file_size': 230.76948738098145, 'compression': 'zstd', 'compression_level': 7}


 40%|████      | 8/20 [01:39<02:48, 14.08s/it]

{'method': 'feather', 'save_time': 8.300963163375854, 'read_time': 0.8029086828231812, 'file_size': 228.068941116333, 'compression': 'zstd', 'compression_level': 8}


 45%|████▌     | 9/20 [01:55<02:43, 14.83s/it]

{'method': 'feather', 'save_time': 8.390407085418701, 'read_time': 0.8096047163009643, 'file_size': 224.30322456359863, 'compression': 'zstd', 'compression_level': 9}


 50%|█████     | 10/20 [02:15<02:41, 16.19s/it]

{'method': 'feather', 'save_time': 11.370028972625732, 'read_time': 0.7869773149490357, 'file_size': 221.17950630187988, 'compression': 'zstd', 'compression_level': 10}


 55%|█████▌    | 11/20 [02:39<02:47, 18.57s/it]

{'method': 'feather', 'save_time': 16.065247058868408, 'read_time': 0.789002799987793, 'file_size': 219.3731174468994, 'compression': 'zstd', 'compression_level': 11}


 60%|██████    | 12/20 [03:04<02:45, 20.65s/it]

{'method': 'feather', 'save_time': 17.576188325881958, 'read_time': 0.7830511808395386, 'file_size': 218.901216506958, 'compression': 'zstd', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:00<03:39, 31.42s/it]

{'method': 'feather', 'save_time': 48.42420196533203, 'read_time': 0.7761694908142089, 'file_size': 216.753267288208, 'compression': 'zstd', 'compression_level': 13}


 70%|███████   | 14/20 [05:10<04:17, 42.92s/it]

{'method': 'feather', 'save_time': 61.80457401275635, 'read_time': 0.768505597114563, 'file_size': 213.89065742492676, 'compression': 'zstd', 'compression_level': 14}


 75%|███████▌  | 15/20 [06:58<05:13, 62.76s/it]

{'method': 'feather', 'save_time': 101.14088892936707, 'read_time': 0.7600722312927246, 'file_size': 211.61963081359863, 'compression': 'zstd', 'compression_level': 15}


 80%|████████  | 16/20 [08:33<04:49, 72.42s/it]

{'method': 'feather', 'save_time': 87.12003111839294, 'read_time': 0.7743613004684449, 'file_size': 207.89100074768066, 'compression': 'zstd', 'compression_level': 16}


 85%|████████▌ | 17/20 [10:56<04:40, 93.51s/it]

{'method': 'feather', 'save_time': 134.67507195472717, 'read_time': 0.7857311010360718, 'file_size': 202.5795383453369, 'compression': 'zstd', 'compression_level': 17}


 90%|█████████ | 18/20 [13:40<03:49, 114.64s/it]

{'method': 'feather', 'save_time': 155.8468301296234, 'read_time': 0.7980259895324707, 'file_size': 200.37107276916504, 'compression': 'zstd', 'compression_level': 18}


 95%|█████████▌| 19/20 [16:59<02:20, 140.14s/it]

{'method': 'feather', 'save_time': 191.88540387153625, 'read_time': 0.7650203227996826, 'file_size': 199.14063453674316, 'compression': 'zstd', 'compression_level': 19}


100%|██████████| 20/20 [20:59<00:00, 62.97s/it] 

{'method': 'feather', 'save_time': 232.0722999572754, 'read_time': 0.7568066120147705, 'file_size': 190.44285774230957, 'compression': 'zstd', 'compression_level': 20}





In [10]:
# feather + lz4: one benchmark run per compression level 1..20
# NOTE(review): the recorded output shows identical file sizes for levels 1-2 and for
# levels 12-20, so part of this sweep presumably adds no information - confirm the
# effective lz4 level range before re-running.
for level in tqdm(range(1, 21)):
    run = evaluate_performance(df, "feather", compression="lz4", compression_level=level)
    print(run)
    eval_results.append(run)

  5%|▌         | 1/20 [00:06<02:06,  6.68s/it]

{'method': 'feather', 'save_time': 1.290541172027588, 'read_time': 0.5380644083023072, 'file_size': 382.2165927886963, 'compression': 'lz4', 'compression_level': 1}


 10%|█         | 2/20 [00:13<01:57,  6.55s/it]

{'method': 'feather', 'save_time': 1.2576570510864258, 'read_time': 0.5195846080780029, 'file_size': 382.2165927886963, 'compression': 'lz4', 'compression_level': 2}


 15%|█▌        | 3/20 [00:23<02:22,  8.36s/it]

{'method': 'feather', 'save_time': 5.197656154632568, 'read_time': 0.5323812007904053, 'file_size': 308.3122501373291, 'compression': 'lz4', 'compression_level': 3}


 20%|██        | 4/20 [00:35<02:36,  9.81s/it]

{'method': 'feather', 'save_time': 6.713475227355957, 'read_time': 0.5307584047317505, 'file_size': 302.7686405181885, 'compression': 'lz4', 'compression_level': 4}


 25%|██▌       | 5/20 [00:49<02:49, 11.31s/it]

{'method': 'feather', 'save_time': 8.655227899551392, 'read_time': 0.5320410966873169, 'file_size': 299.7300205230713, 'compression': 'lz4', 'compression_level': 5}


 30%|███       | 6/20 [01:05<03:00, 12.91s/it]

{'method': 'feather', 'save_time': 10.621734142303467, 'read_time': 0.5369843244552612, 'file_size': 298.2436695098877, 'compression': 'lz4', 'compression_level': 6}


 35%|███▌      | 7/20 [01:23<03:08, 14.53s/it]

{'method': 'feather', 'save_time': 12.492452144622803, 'read_time': 0.5383381843566895, 'file_size': 297.3937244415283, 'compression': 'lz4', 'compression_level': 7}


 40%|████      | 8/20 [01:42<03:12, 16.08s/it]

{'method': 'feather', 'save_time': 13.951219320297241, 'read_time': 0.5421907901763916, 'file_size': 296.850923538208, 'compression': 'lz4', 'compression_level': 8}


 45%|████▌     | 9/20 [02:03<03:12, 17.53s/it]

{'method': 'feather', 'save_time': 15.313683032989502, 'read_time': 0.5396970987319947, 'file_size': 296.74151039123535, 'compression': 'lz4', 'compression_level': 9}


 50%|█████     | 10/20 [02:30<03:22, 20.29s/it]

{'method': 'feather', 'save_time': 21.218377828598022, 'read_time': 0.5263140678405762, 'file_size': 297.1527271270752, 'compression': 'lz4', 'compression_level': 10}


 55%|█████▌    | 11/20 [03:02<03:34, 23.85s/it]

{'method': 'feather', 'save_time': 26.610891103744507, 'read_time': 0.528836989402771, 'file_size': 296.0005741119385, 'compression': 'lz4', 'compression_level': 11}


 60%|██████    | 12/20 [03:38<03:40, 27.59s/it]

{'method': 'feather', 'save_time': 30.65232515335083, 'read_time': 0.5497368812561035, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:14<03:31, 30.15s/it]

{'method': 'feather', 'save_time': 30.72233510017395, 'read_time': 0.5294882774353027, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 13}


 70%|███████   | 14/20 [04:50<03:11, 31.95s/it]

{'method': 'feather', 'save_time': 30.714147090911865, 'read_time': 0.5392313241958618, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:26<02:45, 33.18s/it]

{'method': 'feather', 'save_time': 30.66402006149292, 'read_time': 0.5357027053833008, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 15}


 80%|████████  | 16/20 [06:02<02:16, 34.08s/it]

{'method': 'feather', 'save_time': 30.73835802078247, 'read_time': 0.5429946899414062, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 16}


 85%|████████▌ | 17/20 [06:38<01:43, 34.66s/it]

{'method': 'feather', 'save_time': 30.627331018447876, 'read_time': 0.5377061128616333, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 17}


 90%|█████████ | 18/20 [07:14<01:10, 35.07s/it]

{'method': 'feather', 'save_time': 30.710247039794922, 'read_time': 0.5314020156860352, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 18}


 95%|█████████▌| 19/20 [07:50<00:35, 35.36s/it]

{'method': 'feather', 'save_time': 30.61126208305359, 'read_time': 0.541713285446167, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 19}


100%|██████████| 20/20 [08:26<00:00, 25.34s/it]

{'method': 'feather', 'save_time': 30.8820641040802, 'read_time': 0.5315488815307617, 'file_size': 295.0932788848877, 'compression': 'lz4', 'compression_level': 20}





In [11]:
# parquet: every codec without an explicit level; None leaves the compression argument unset
for codec in ("gzip", "snappy", "lz4", "zstd", "brotli", None):
    run = evaluate_performance(df, "parquet", compression=codec)
    print(run)
    eval_results.append(run)

{'method': 'parquet', 'save_time': 21.936084985733032, 'read_time': 1.857405996322632, 'file_size': 261.4524908065796, 'compression': 'gzip'}
{'method': 'parquet', 'save_time': 1.4769291877746582, 'read_time': 0.756915807723999, 'file_size': 372.26979064941406, 'compression': 'snappy'}
{'method': 'parquet', 'save_time': 1.436607837677002, 'read_time': 0.6520306825637817, 'file_size': 375.51332664489746, 'compression': 'lz4'}
{'method': 'parquet', 'save_time': 1.6624550819396973, 'read_time': 0.8908697843551636, 'file_size': 276.7370548248291, 'compression': 'zstd'}
{'method': 'parquet', 'save_time': 18.984238862991333, 'read_time': 1.5739156007766724, 'file_size': 230.74532318115234, 'compression': 'brotli'}
{'method': 'parquet', 'save_time': 1.4593620300292969, 'read_time': 0.7634199619293213, 'file_size': 372.26979064941406}


In [12]:
# feather: every codec without an explicit level; None leaves the compression argument unset
for codec in ("lz4", "zstd", "uncompressed", None):
    run = evaluate_performance(df, "feather", compression=codec)
    print(run)
    eval_results.append(run)

{'method': 'feather', 'save_time': 1.2862539291381836, 'read_time': 0.5309521913528442, 'file_size': 382.2165927886963, 'compression': 'lz4'}
{'method': 'feather', 'save_time': 1.38962721824646, 'read_time': 0.7580457925796509, 'file_size': 274.08423042297363, 'compression': 'zstd'}
{'method': 'feather', 'save_time': 0.34214329719543457, 'read_time': 0.38784117698669435, 'file_size': 634.9402027130127, 'compression': 'uncompressed'}
{'method': 'feather', 'save_time': 1.252026081085205, 'read_time': 0.5269888877868653, 'file_size': 382.2165927886963}


In [13]:
# csv: every codec (csv takes no compression level); None writes a plain csv file
for codec in ('zip', 'gzip', 'bz2', 'zstd', 'xz', 'tar', None):
    run = evaluate_performance(df, "csv", compression=codec)
    print(run)
    eval_results.append(run)

{'method': 'csv', 'save_time': 32.36922287940979, 'read_time': 6.146654200553894, 'file_size': 298.97631645202637, 'compression': 'zip'}
{'method': 'csv', 'save_time': 36.42905068397522, 'read_time': 8.302969312667846, 'file_size': 298.6697826385498, 'compression': 'gzip'}
{'method': 'csv', 'save_time': 49.01720690727234, 'read_time': 20.00485780239105, 'file_size': 230.6348533630371, 'compression': 'bz2'}
{'method': 'csv', 'save_time': 14.664268970489502, 'read_time': 6.1076120138168335, 'file_size': 289.5586061477661, 'compression': 'zstd'}
{'method': 'csv', 'save_time': 331.15426301956177, 'read_time': 15.681923604011535, 'file_size': 212.4040298461914, 'compression': 'xz'}
{'method': 'csv', 'save_time': 11.371416807174683, 'read_time': 5.131657218933105, 'file_size': 691.07421875, 'compression': 'tar'}
{'method': 'csv', 'save_time': 11.353543043136597, 'read_time': 5.117451095581055, 'file_size': 691.0666332244873}


In [14]:
# Collect the per-run result dicts into one table, one row per benchmark run.
# NOTE(review): this rebinds `df`, which until here held the benchmarked dataset; re-running
# an earlier benchmark cell after this one would measure the results table instead.
# Consider a separate name for the results frame.
df = pd.DataFrame(eval_results)
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level
0,parquet,2.312279,1.061680,276.737055,zstd,1.0
1,parquet,2.555057,1.043575,262.036419,zstd,2.0
2,parquet,2.830313,1.025920,253.571889,zstd,3.0
3,parquet,2.799268,1.013424,251.290937,zstd,4.0
4,parquet,4.279723,1.023085,245.694203,zstd,5.0
...,...,...,...,...,...,...
92,csv,49.017207,20.004858,230.634853,bz2,
93,csv,14.664269,6.107612,289.558606,zstd,
94,csv,331.154263,15.681924,212.404030,xz,
95,csv,11.371417,5.131657,691.074219,tar,


In [15]:
# Compression ratio = in-memory size of the source frame (MiB) / on-disk file size (MiB);
# values above 1 mean the file is smaller than the frame was in memory.
df["comp_ratio"] = mem_size_mb / df["file_size"]
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio
0,parquet,2.312279,1.061680,276.737055,zstd,1.0,2.838271
1,parquet,2.555057,1.043575,262.036419,zstd,2.0,2.997502
2,parquet,2.830313,1.025920,253.571889,zstd,3.0,3.097562
3,parquet,2.799268,1.013424,251.290937,zstd,4.0,3.125678
4,parquet,4.279723,1.023085,245.694203,zstd,5.0,3.196879
...,...,...,...,...,...,...,...
92,csv,49.017207,20.004858,230.634853,bz2,,3.405620
93,csv,14.664269,6.107612,289.558606,zstd,,2.712593
94,csv,331.154263,15.681924,212.404030,xz,,3.697927
95,csv,11.371417,5.131657,691.074219,tar,,1.136571


In [16]:
# Persist the results table (df is the results frame at this point) as a gzip-compressed pickle.
df.to_pickle("./data/test_results_03.pkl.gz", compression="gzip")