In [1]:
import pandas as pd
import time
from typing import Literal
from functools import partial
from typing import Optional
import os
from tqdm import tqdm

In [2]:
# Load the benchmark input frame from the gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle(filepath_or_buffer="./data/data.pkl.gz")
df

Unnamed: 0,text,uuid,int_0,int_1,int_2,int_3,int_4,int_5,int_6,int_7,...,float_10,float_11,float_12,float_13,float_14,float_15,float_16,float_17,float_18,float_19
0,"Write an article based on this ""A man has been...",9b193ab1-ab92-4d74-9aac-876f21c302a0,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
1,Answer the following question: - number is 54 ...,dc0a6127-3977-4ec2-8ecc-60de9df14571,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
2,Produce a long descriptive sentence that uses ...,b6189a7f-9bd2-4360-b381-6e8bd1056109,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
3,Write a title for this article:\n\nArbitration...,c64c0f95-661e-4ad5-ad61-ab99ecf30f42,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
4,Read the following paragraph and determine if ...,f516250e-57b9-46a4-9397-adbf0c1f5326,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363486,"Q: Context: Dragon Ball Z (ドラゴンボール ゼット, Dorago...",8ba7f794-07f2-451c-90c5-24f7c4e7734b,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
363487,My question is: At the beginning of an academi...,3c338e45-3525-4dd4-88fe-1cc31ccc59ef,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
363488,"Leo: Given the sentence ""A small child in wate...",28a3dbab-0e47-4a40-acc2-05a92f02abb5,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414
363489,Explain simply why yes is the correct answer t...,314b76cb-9390-4e76-8a6a-6d5034765345,3474,-8309,-1183,-6862,-2121,-6685,-6140,-5582,...,-1704.235408,8690.18989,7636.103984,-5305.07615,-4549.317322,-3380.911654,-4222.91801,1709.491323,1727.529697,-6372.93414


In [3]:
# In-memory footprint of the frame in bytes, summed over all columns (index excluded).
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.memory_usage.html
mem_usage = df.memory_usage(deep=True, index=False).sum()
mem_usage

np.int64(823608914)

In [4]:
# The same footprint expressed in MiB (1 MiB = 1024 * 1024 bytes), index still excluded.
mem_size_mb = mem_usage / 1024 ** 2
mem_size_mb

np.float64(785.454668045044)

In [5]:
def evaluate_performance(
        df,
        method: Literal["csv", "parquet", "feather"],
        compression: Optional[str] = None,
        compression_level: Optional[int] = None,
        n_reads: int = 10,
    ) -> dict:
    """Benchmark writing and reading ``df`` in one on-disk format.

    The frame is written once to ``./test_data/data.<method>``, read back
    ``n_reads`` times, measured, and the file is deleted again (also when the
    write or a read fails).

    Args:
        df: DataFrame to serialize.
        method: Storage format: ``"csv"``, ``"parquet"`` (pyarrow engine) or
            ``"feather"``.
        compression: Codec name forwarded to the writer (and, for csv, to the
            reader). ``None`` means the argument is not forwarded at all, so
            the writer's own default applies, which is not necessarily
            "no compression".
        compression_level: Codec level forwarded to the parquet/feather
            writer. ``None`` means it is not forwarded. Not supported for csv.
        n_reads: Number of read repetitions the read time is averaged over.

    Returns:
        A dict with ``method``, ``save_time`` (seconds for the single write),
        ``read_time`` (mean seconds per read), ``file_size`` (MiB on disk),
        plus ``compression`` / ``compression_level`` when they were given.

    Raises:
        ValueError: If ``method`` is unknown, ``compression_level`` is given
            for csv, or ``n_reads`` is smaller than 1.
    """
    if n_reads < 1:
        raise ValueError(f"n_reads must be >= 1, got {n_reads}")

    filename = f"./test_data/data.{method}"

    # Only explicitly requested options are forwarded; they are also echoed
    # into the result so each row records the settings that were asked for.
    additional_save_params = {}
    if compression is not None:
        additional_save_params["compression"] = compression

    if method == "csv":
        if compression_level is not None:
            raise ValueError("compression_level is not supported for method='csv'")
        save_fn = partial(df.to_csv, index=False, path_or_buf=filename, **additional_save_params)
        # The reader needs the codec too: the file name always ends in ".csv",
        # so it cannot be inferred from the extension.
        read_fn = partial(pd.read_csv, filepath_or_buffer=filename, **additional_save_params)
    elif method == "parquet":
        if compression_level is not None:
            additional_save_params["compression_level"] = compression_level
        save_fn = partial(df.to_parquet, index=False, engine="pyarrow", path=filename, **additional_save_params)
        read_fn = partial(pd.read_parquet, path=filename, engine="pyarrow")
    elif method == "feather":
        if compression_level is not None:
            additional_save_params["compression_level"] = compression_level
        save_fn = partial(df.to_feather, path=filename, **additional_save_params)
        read_fn = partial(pd.read_feather, path=filename)
    else:
        raise ValueError(f"unknown method: {method!r}")

    # Make sure the scratch directory exists before the writer needs it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    try:
        # write (perf_counter is monotonic, unlike the wall clock)
        start_time = time.perf_counter()
        save_fn()
        save_time = time.perf_counter() - start_time

        # read: repeat and average to smooth out run-to-run noise
        start_time = time.perf_counter()
        for _ in range(n_reads):
            read_fn()
        read_time = (time.perf_counter() - start_time) / n_reads

        # size on disk in MiB
        file_size = os.path.getsize(filename) / 1024 / 1024
    finally:
        # Always clean up, so a failed run cannot leave a stale file behind.
        if os.path.exists(filename):
            os.remove(filename)

    result = {
        "method": method,
        "save_time": save_time,
        "read_time": read_time,
        "file_size": file_size,
    }
    result.update(additional_save_params)

    return result


In [6]:
# Accumulates one result dict per benchmark run across the cells below.
eval_results = list()

In [7]:
# Parquet + zstd: one benchmark run per compression level, 1 through 20.
for level in tqdm(range(1, 21)):
    outcome = evaluate_performance(
        df,
        "parquet",
        compression="zstd",
        compression_level=level,
    )
    print(outcome)
    eval_results.append(outcome)

  5%|▌         | 1/20 [00:11<03:34, 11.31s/it]

{'method': 'parquet', 'save_time': 1.7168431282043457, 'read_time': 0.9589344024658203, 'file_size': 205.0951271057129, 'compression': 'zstd', 'compression_level': 1}


 10%|█         | 2/20 [00:23<03:32, 11.80s/it]

{'method': 'parquet', 'save_time': 2.1638808250427246, 'read_time': 0.9968081951141358, 'file_size': 190.3500156402588, 'compression': 'zstd', 'compression_level': 2}


 15%|█▌        | 3/20 [00:36<03:26, 12.15s/it]

{'method': 'parquet', 'save_time': 2.4616479873657227, 'read_time': 1.0109827041625976, 'file_size': 181.8889560699463, 'compression': 'zstd', 'compression_level': 3}


 20%|██        | 4/20 [00:48<03:17, 12.34s/it]

{'method': 'parquet', 'save_time': 2.55195689201355, 'read_time': 1.0079713106155395, 'file_size': 179.64045810699463, 'compression': 'zstd', 'compression_level': 4}


 25%|██▌       | 5/20 [01:03<03:16, 13.07s/it]

{'method': 'parquet', 'save_time': 4.067563056945801, 'read_time': 1.0285160064697265, 'file_size': 174.0382194519043, 'compression': 'zstd', 'compression_level': 5}


 30%|███       | 6/20 [01:18<03:13, 13.84s/it]

{'method': 'parquet', 'save_time': 5.7080299854278564, 'read_time': 0.961374306678772, 'file_size': 168.9164524078369, 'compression': 'zstd', 'compression_level': 6}


 35%|███▌      | 7/20 [01:34<03:08, 14.46s/it]

{'method': 'parquet', 'save_time': 6.3573222160339355, 'read_time': 0.9390930891036987, 'file_size': 166.69308376312256, 'compression': 'zstd', 'compression_level': 7}


 40%|████      | 8/20 [01:51<03:03, 15.29s/it]

{'method': 'parquet', 'save_time': 8.056328058242798, 'read_time': 0.8995661973953247, 'file_size': 164.8323574066162, 'compression': 'zstd', 'compression_level': 8}


 45%|████▌     | 9/20 [02:08<02:53, 15.81s/it]

{'method': 'parquet', 'save_time': 8.054328203201294, 'read_time': 0.8899359941482544, 'file_size': 164.41628742218018, 'compression': 'zstd', 'compression_level': 9}


 50%|█████     | 10/20 [02:27<02:48, 16.84s/it]

{'method': 'parquet', 'save_time': 10.362239837646484, 'read_time': 0.8775696992874146, 'file_size': 163.0141191482544, 'compression': 'zstd', 'compression_level': 10}


 55%|█████▌    | 11/20 [02:49<02:45, 18.35s/it]

{'method': 'parquet', 'save_time': 13.053364992141724, 'read_time': 0.874109411239624, 'file_size': 162.2701644897461, 'compression': 'zstd', 'compression_level': 11}


 60%|██████    | 12/20 [03:10<02:35, 19.40s/it]

{'method': 'parquet', 'save_time': 13.024783849716187, 'read_time': 0.8779834985733033, 'file_size': 162.2701644897461, 'compression': 'zstd', 'compression_level': 12}


 65%|██████▌   | 13/20 [03:50<02:59, 25.59s/it]

{'method': 'parquet', 'save_time': 30.98591899871826, 'read_time': 0.8818485975265503, 'file_size': 160.5430212020874, 'compression': 'zstd', 'compression_level': 13}


 70%|███████   | 14/20 [04:34<03:07, 31.23s/it]

{'method': 'parquet', 'save_time': 35.568851947784424, 'read_time': 0.8690489053726196, 'file_size': 159.9736909866333, 'compression': 'zstd', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:20<02:58, 35.66s/it]

{'method': 'parquet', 'save_time': 37.22284293174744, 'read_time': 0.8725491046905518, 'file_size': 159.89682865142822, 'compression': 'zstd', 'compression_level': 15}


 80%|████████  | 16/20 [06:22<02:53, 43.44s/it]

{'method': 'parquet', 'save_time': 52.612979888916016, 'read_time': 0.8881033897399903, 'file_size': 153.43690872192383, 'compression': 'zstd', 'compression_level': 16}


 85%|████████▌ | 17/20 [07:33<02:35, 51.90s/it]

{'method': 'parquet', 'save_time': 62.5625102519989, 'read_time': 0.9025753974914551, 'file_size': 152.65603065490723, 'compression': 'zstd', 'compression_level': 17}


 90%|█████████ | 18/20 [09:01<02:04, 62.49s/it]

{'method': 'parquet', 'save_time': 77.91794109344482, 'read_time': 0.9199609756469727, 'file_size': 151.67846584320068, 'compression': 'zstd', 'compression_level': 18}


 95%|█████████▌| 19/20 [10:36<01:12, 72.46s/it]

{'method': 'parquet', 'save_time': 86.38836717605591, 'read_time': 0.9317858934402465, 'file_size': 151.54985809326172, 'compression': 'zstd', 'compression_level': 19}


100%|██████████| 20/20 [12:12<00:00, 36.64s/it]

{'method': 'parquet', 'save_time': 86.35913395881653, 'read_time': 0.9622390031814575, 'file_size': 151.54985809326172, 'compression': 'zstd', 'compression_level': 20}





In [8]:
# Parquet + lz4: one benchmark run per compression level, 1 through 20.
# In the recorded output the file size no longer changes from level 12 upwards.
for level in tqdm(range(1, 21)):
    outcome = evaluate_performance(
        df,
        "parquet",
        compression="lz4",
        compression_level=level,
    )
    print(outcome)
    eval_results.append(outcome)

  5%|▌         | 1/20 [00:08<02:32,  8.02s/it]

{'method': 'parquet', 'save_time': 1.338258981704712, 'read_time': 0.6679298162460328, 'file_size': 299.8812093734741, 'compression': 'lz4', 'compression_level': 1}


 10%|█         | 2/20 [00:15<02:21,  7.84s/it]

{'method': 'parquet', 'save_time': 1.3117198944091797, 'read_time': 0.6393234252929687, 'file_size': 299.8812093734741, 'compression': 'lz4', 'compression_level': 2}


 15%|█▌        | 3/20 [00:27<02:43,  9.64s/it]

{'method': 'parquet', 'save_time': 5.2876200675964355, 'read_time': 0.6485427141189575, 'file_size': 229.84929656982422, 'compression': 'lz4', 'compression_level': 3}


 20%|██        | 4/20 [00:40<02:57, 11.07s/it]

{'method': 'parquet', 'save_time': 6.83267879486084, 'read_time': 0.6433796167373658, 'file_size': 224.4717779159546, 'compression': 'lz4', 'compression_level': 4}


 25%|██▌       | 5/20 [00:55<03:08, 12.54s/it]

{'method': 'parquet', 'save_time': 8.747347116470337, 'read_time': 0.6408979892730713, 'file_size': 221.61809635162354, 'compression': 'lz4', 'compression_level': 5}


 30%|███       | 6/20 [01:12<03:16, 14.05s/it]

{'method': 'parquet', 'save_time': 10.597584247589111, 'read_time': 0.6371343851089477, 'file_size': 220.34614181518555, 'compression': 'lz4', 'compression_level': 6}


 35%|███▌      | 7/20 [01:31<03:23, 15.63s/it]

{'method': 'parquet', 'save_time': 12.45890498161316, 'read_time': 0.6426115036010742, 'file_size': 219.82338619232178, 'compression': 'lz4', 'compression_level': 7}


 40%|████      | 8/20 [01:52<03:25, 17.11s/it]

{'method': 'parquet', 'save_time': 13.872910976409912, 'read_time': 0.6405241250991821, 'file_size': 219.62268161773682, 'compression': 'lz4', 'compression_level': 8}


 45%|████▌     | 9/20 [02:13<03:23, 18.50s/it]

{'method': 'parquet', 'save_time': 15.134925842285156, 'read_time': 0.6417816162109375, 'file_size': 219.5131492614746, 'compression': 'lz4', 'compression_level': 9}


 50%|█████     | 10/20 [02:41<03:32, 21.25s/it]

{'method': 'parquet', 'save_time': 20.99379825592041, 'read_time': 0.6420857906341553, 'file_size': 218.33648586273193, 'compression': 'lz4', 'compression_level': 10}


 55%|█████▌    | 11/20 [03:12<03:39, 24.40s/it]

{'method': 'parquet', 'save_time': 25.107905864715576, 'read_time': 0.6419222831726075, 'file_size': 217.9263620376587, 'compression': 'lz4', 'compression_level': 11}


 60%|██████    | 12/20 [03:47<03:41, 27.71s/it]

{'method': 'parquet', 'save_time': 28.843078136444092, 'read_time': 0.644201683998108, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:23<03:30, 30.04s/it]

{'method': 'parquet', 'save_time': 28.960288047790527, 'read_time': 0.6441783905029297, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 13}


 70%|███████   | 14/20 [04:58<03:09, 31.61s/it]

{'method': 'parquet', 'save_time': 28.83184289932251, 'read_time': 0.6413195848464965, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:33<02:43, 32.78s/it]

{'method': 'parquet', 'save_time': 29.021782159805298, 'read_time': 0.6449590921401978, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 15}


 80%|████████  | 16/20 [06:09<02:14, 33.57s/it]

{'method': 'parquet', 'save_time': 28.96356725692749, 'read_time': 0.6445579051971435, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 16}


 85%|████████▌ | 17/20 [06:44<01:42, 34.12s/it]

{'method': 'parquet', 'save_time': 28.96080207824707, 'read_time': 0.6425777912139893, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 17}


 90%|█████████ | 18/20 [07:20<01:09, 34.50s/it]

{'method': 'parquet', 'save_time': 28.956976175308228, 'read_time': 0.6442868947982788, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 18}


 95%|█████████▌| 19/20 [07:55<00:34, 34.74s/it]

{'method': 'parquet', 'save_time': 28.874755859375, 'read_time': 0.6422530174255371, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 19}


100%|██████████| 20/20 [08:30<00:00, 25.54s/it]

{'method': 'parquet', 'save_time': 28.945737838745117, 'read_time': 0.6439600944519043, 'file_size': 217.89891529083252, 'compression': 'lz4', 'compression_level': 20}





In [9]:
# Feather + zstd: one benchmark run per compression level, 1 through 20.
for level in tqdm(range(1, 21)):
    outcome = evaluate_performance(
        df,
        "feather",
        compression="zstd",
        compression_level=level,
    )
    print(outcome)
    eval_results.append(outcome)

  5%|▌         | 1/20 [00:08<02:50,  8.95s/it]

{'method': 'feather', 'save_time': 1.3659017086029053, 'read_time': 0.758134388923645, 'file_size': 203.28154182434082, 'compression': 'zstd', 'compression_level': 1}


 10%|█         | 2/20 [00:19<02:54,  9.71s/it]

{'method': 'feather', 'save_time': 1.9289109706878662, 'read_time': 0.8319602966308594, 'file_size': 188.03924751281738, 'compression': 'zstd', 'compression_level': 2}


 15%|█▌        | 3/20 [00:29<02:51, 10.08s/it]

{'method': 'feather', 'save_time': 2.118134021759033, 'read_time': 0.8396064043045044, 'file_size': 178.33611488342285, 'compression': 'zstd', 'compression_level': 3}


 20%|██        | 4/20 [00:40<02:44, 10.27s/it]

{'method': 'feather', 'save_time': 2.1852591037750244, 'read_time': 0.8379509687423706, 'file_size': 174.79282569885254, 'compression': 'zstd', 'compression_level': 4}


 25%|██▌       | 5/20 [00:52<02:44, 10.96s/it]

{'method': 'feather', 'save_time': 3.6861913204193115, 'read_time': 0.8503822803497314, 'file_size': 169.25403022766113, 'compression': 'zstd', 'compression_level': 5}


 30%|███       | 6/20 [01:05<02:45, 11.82s/it]

{'method': 'feather', 'save_time': 5.286766052246094, 'read_time': 0.8181221961975098, 'file_size': 163.3628101348877, 'compression': 'zstd', 'compression_level': 6}


 35%|███▌      | 7/20 [01:20<02:43, 12.62s/it]

{'method': 'feather', 'save_time': 6.230818033218384, 'read_time': 0.8027096033096314, 'file_size': 159.58596992492676, 'compression': 'zstd', 'compression_level': 7}


 40%|████      | 8/20 [01:36<02:43, 13.65s/it]

{'method': 'feather', 'save_time': 8.04234528541565, 'read_time': 0.7806519746780396, 'file_size': 157.31350135803223, 'compression': 'zstd', 'compression_level': 8}


 45%|████▌     | 9/20 [01:51<02:37, 14.31s/it]

{'method': 'feather', 'save_time': 8.042502164840698, 'read_time': 0.7712942838668824, 'file_size': 153.5480670928955, 'compression': 'zstd', 'compression_level': 9}


 50%|█████     | 10/20 [02:10<02:36, 15.60s/it]

{'method': 'feather', 'save_time': 10.937083721160889, 'read_time': 0.7562822103500366, 'file_size': 150.4430332183838, 'compression': 'zstd', 'compression_level': 10}


 55%|█████▌    | 11/20 [02:32<02:39, 17.76s/it]

{'method': 'feather', 'save_time': 15.215730905532837, 'read_time': 0.7433353900909424, 'file_size': 148.66246223449707, 'compression': 'zstd', 'compression_level': 11}


 60%|██████    | 12/20 [02:57<02:37, 19.69s/it]

{'method': 'feather', 'save_time': 16.657838821411133, 'read_time': 0.744694185256958, 'file_size': 148.19056129455566, 'compression': 'zstd', 'compression_level': 12}


 65%|██████▌   | 13/20 [03:47<03:23, 29.14s/it]

{'method': 'feather', 'save_time': 43.371659994125366, 'read_time': 0.7497128963470459, 'file_size': 146.080171585083, 'compression': 'zstd', 'compression_level': 13}


 70%|███████   | 14/20 [04:50<03:55, 39.19s/it]

{'method': 'feather', 'save_time': 55.01982116699219, 'read_time': 0.7415153026580811, 'file_size': 144.4615421295166, 'compression': 'zstd', 'compression_level': 14}


 75%|███████▌  | 15/20 [06:29<04:46, 57.23s/it]

{'method': 'feather', 'save_time': 91.66012907028198, 'read_time': 0.7364465951919555, 'file_size': 142.21407508850098, 'compression': 'zstd', 'compression_level': 15}


 80%|████████  | 16/20 [07:56<04:25, 66.36s/it]

{'method': 'feather', 'save_time': 80.07639598846436, 'read_time': 0.7488458156585693, 'file_size': 137.54537391662598, 'compression': 'zstd', 'compression_level': 16}


 85%|████████▌ | 17/20 [10:09<04:19, 86.35s/it]

{'method': 'feather', 'save_time': 125.38843894004822, 'read_time': 0.7451626062393188, 'file_size': 132.2216968536377, 'compression': 'zstd', 'compression_level': 17}


 90%|█████████ | 18/20 [12:43<03:33, 106.71s/it]

{'method': 'feather', 'save_time': 146.68037581443787, 'read_time': 0.74114511013031, 'file_size': 129.81019020080566, 'compression': 'zstd', 'compression_level': 18}


 95%|█████████▌| 19/20 [15:53<02:11, 131.66s/it]

{'method': 'feather', 'save_time': 182.27556467056274, 'read_time': 0.750898814201355, 'file_size': 128.0803165435791, 'compression': 'zstd', 'compression_level': 19}


100%|██████████| 20/20 [19:48<00:00, 59.42s/it] 

{'method': 'feather', 'save_time': 227.16688799858093, 'read_time': 0.7431152105331421, 'file_size': 119.38253974914551, 'compression': 'zstd', 'compression_level': 20}





In [10]:
# Feather + lz4: one benchmark run per compression level, 1 through 20.
# In the recorded output the file size no longer changes from level 12 upwards.
for level in tqdm(range(1, 21)):
    outcome = evaluate_performance(
        df,
        "feather",
        compression="lz4",
        compression_level=level,
    )
    print(outcome)
    eval_results.append(outcome)

  5%|▌         | 1/20 [00:06<02:02,  6.42s/it]

{'method': 'feather', 'save_time': 1.2404112815856934, 'read_time': 0.5180318117141723, 'file_size': 299.81385231018066, 'compression': 'lz4', 'compression_level': 1}


 10%|█         | 2/20 [00:12<01:54,  6.38s/it]

{'method': 'feather', 'save_time': 1.2343418598175049, 'read_time': 0.5106603860855102, 'file_size': 299.81385231018066, 'compression': 'lz4', 'compression_level': 2}


 15%|█▌        | 3/20 [00:23<02:19,  8.19s/it]

{'method': 'feather', 'save_time': 5.111583232879639, 'read_time': 0.5237424850463868, 'file_size': 229.60533332824707, 'compression': 'lz4', 'compression_level': 3}


 20%|██        | 4/20 [00:35<02:34,  9.67s/it]

{'method': 'feather', 'save_time': 6.697415113449097, 'read_time': 0.5230430126190185, 'file_size': 224.1416187286377, 'compression': 'lz4', 'compression_level': 4}


 25%|██▌       | 5/20 [00:48<02:47, 11.17s/it]

{'method': 'feather', 'save_time': 8.645272016525269, 'read_time': 0.517979907989502, 'file_size': 221.22650337219238, 'compression': 'lz4', 'compression_level': 5}


 30%|███       | 6/20 [01:04<02:58, 12.77s/it]

{'method': 'feather', 'save_time': 10.630670070648193, 'read_time': 0.524041223526001, 'file_size': 219.92561531066895, 'compression': 'lz4', 'compression_level': 6}


 35%|███▌      | 7/20 [01:22<03:07, 14.40s/it]

{'method': 'feather', 'save_time': 12.552356719970703, 'read_time': 0.5198963165283204, 'file_size': 219.39285469055176, 'compression': 'lz4', 'compression_level': 7}


 40%|████      | 8/20 [01:41<03:11, 15.93s/it]

{'method': 'feather', 'save_time': 14.0329430103302, 'read_time': 0.5181215047836304, 'file_size': 219.1904010772705, 'compression': 'lz4', 'compression_level': 8}


 45%|████▌     | 9/20 [02:02<03:11, 17.38s/it]

{'method': 'feather', 'save_time': 15.353935956954956, 'read_time': 0.5202113866806031, 'file_size': 219.07815742492676, 'compression': 'lz4', 'compression_level': 9}


 50%|█████     | 10/20 [02:28<03:20, 20.09s/it]

{'method': 'feather', 'save_time': 21.00790500640869, 'read_time': 0.5163185834884644, 'file_size': 217.88507270812988, 'compression': 'lz4', 'compression_level': 10}


 55%|█████▌    | 11/20 [02:59<03:29, 23.30s/it]

{'method': 'feather', 'save_time': 25.374151945114136, 'read_time': 0.5185709238052368, 'file_size': 217.48253059387207, 'compression': 'lz4', 'compression_level': 11}


 60%|██████    | 12/20 [03:35<03:37, 27.21s/it]

{'method': 'feather', 'save_time': 30.92774486541748, 'read_time': 0.5222226142883301, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:11<03:29, 29.90s/it]

{'method': 'feather', 'save_time': 30.962579011917114, 'read_time': 0.5129292011260986, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 13}


 70%|███████   | 14/20 [04:47<03:10, 31.74s/it]

{'method': 'feather', 'save_time': 30.85590386390686, 'read_time': 0.5128146886825562, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:23<02:45, 33.04s/it]

{'method': 'feather', 'save_time': 30.91698718070984, 'read_time': 0.5137624740600586, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 15}


 80%|████████  | 16/20 [05:59<02:15, 33.91s/it]

{'method': 'feather', 'save_time': 30.788990020751953, 'read_time': 0.512905502319336, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 16}


 85%|████████▌ | 17/20 [06:35<01:43, 34.53s/it]

{'method': 'feather', 'save_time': 30.855582237243652, 'read_time': 0.5129684925079345, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 17}


 90%|█████████ | 18/20 [07:11<01:09, 34.97s/it]

{'method': 'feather', 'save_time': 30.854987144470215, 'read_time': 0.5146545886993408, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 18}


 95%|█████████▌| 19/20 [07:47<00:35, 35.28s/it]

{'method': 'feather', 'save_time': 30.856679916381836, 'read_time': 0.5138850927352905, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 19}


100%|██████████| 20/20 [08:23<00:00, 25.16s/it]

{'method': 'feather', 'save_time': 30.903788089752197, 'read_time': 0.5150339126586914, 'file_size': 217.457124710083, 'compression': 'lz4', 'compression_level': 20}





In [11]:
# Parquet: one run per codec, without an explicit compression level.
# None means no `compression` argument is forwarded to the writer; in the recorded
# output that run produced the same file size as the "snappy" run.
for codec in ("gzip", "snappy", "lz4", "zstd", "brotli", None):
    outcome = evaluate_performance(df, "parquet", compression=codec)
    print(outcome)
    eval_results.append(outcome)

{'method': 'parquet', 'save_time': 19.627086877822876, 'read_time': 1.7965444087982179, 'file_size': 189.7540044784546, 'compression': 'gzip'}
{'method': 'parquet', 'save_time': 1.3376078605651855, 'read_time': 0.7331516027450562, 'file_size': 296.9213275909424, 'compression': 'snappy'}
{'method': 'parquet', 'save_time': 1.3070762157440186, 'read_time': 0.6384718894958497, 'file_size': 299.8812093734741, 'compression': 'lz4'}
{'method': 'parquet', 'save_time': 1.483922004699707, 'read_time': 0.8763885021209716, 'file_size': 205.0951271057129, 'compression': 'zstd'}
{'method': 'parquet', 'save_time': 18.214473962783813, 'read_time': 1.545258593559265, 'file_size': 159.6326036453247, 'compression': 'brotli'}
{'method': 'parquet', 'save_time': 1.3652260303497314, 'read_time': 0.7113039016723632, 'file_size': 296.9213275909424}


In [12]:
# Feather: one run per codec, without an explicit compression level.
# None means no `compression` argument is forwarded to the writer; in the recorded
# output that run produced the same file size as the "lz4" run.
for codec in ("lz4", "zstd", "uncompressed", None):
    outcome = evaluate_performance(df, "feather", compression=codec)
    print(outcome)
    eval_results.append(outcome)

{'method': 'feather', 'save_time': 1.2270021438598633, 'read_time': 0.5096915006637573, 'file_size': 299.81385231018066, 'compression': 'lz4'}
{'method': 'feather', 'save_time': 1.355456829071045, 'read_time': 0.7567508935928344, 'file_size': 203.28154182434082, 'compression': 'zstd'}
{'method': 'feather', 'save_time': 0.33922886848449707, 'read_time': 0.38898358345031736, 'file_size': 634.9402027130127, 'compression': 'uncompressed'}
{'method': 'feather', 'save_time': 1.2628107070922852, 'read_time': 0.5181000232696533, 'file_size': 299.81385231018066}


In [13]:
# CSV: one run per codec supported by to_csv/read_csv; None writes a plain file.
# In the recorded output "tar" yields practically the same size as the plain file.
for codec in ("zip", "gzip", "bz2", "zstd", "xz", "tar", None):
    outcome = evaluate_performance(df, "csv", compression=codec)
    print(outcome)
    eval_results.append(outcome)

{'method': 'csv', 'save_time': 27.699738264083862, 'read_time': 5.631162214279175, 'file_size': 199.11504650115967, 'compression': 'zip'}
{'method': 'csv', 'save_time': 31.161946773529053, 'read_time': 7.01388680934906, 'file_size': 198.55400371551514, 'compression': 'gzip'}
{'method': 'csv', 'save_time': 62.240323066711426, 'read_time': 16.672200083732605, 'file_size': 153.71932125091553, 'compression': 'bz2'}
{'method': 'csv', 'save_time': 12.84618878364563, 'read_time': 5.495090508460999, 'file_size': 184.9435920715332, 'compression': 'zstd'}
{'method': 'csv', 'save_time': 211.0541648864746, 'read_time': 11.23839750289917, 'file_size': 131.52967834472656, 'compression': 'xz'}
{'method': 'csv', 'save_time': 10.89088487625122, 'read_time': 4.963031196594239, 'file_size': 691.806640625, 'compression': 'tar'}
{'method': 'csv', 'save_time': 10.828938961029053, 'read_time': 4.969729208946228, 'file_size': 691.7978973388672}


In [14]:
# Collect all benchmark results into one table (one row per run).
# NOTE: this rebinds `df`, so the benchmark input frame is no longer reachable under
# that name; re-running an earlier benchmark cell after this one would measure the
# results table instead of the original data.
df = pd.DataFrame(data=eval_results)
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level
0,parquet,1.716843,0.958934,205.095127,zstd,1.0
1,parquet,2.163881,0.996808,190.350016,zstd,2.0
2,parquet,2.461648,1.010983,181.888956,zstd,3.0
3,parquet,2.551957,1.007971,179.640458,zstd,4.0
4,parquet,4.067563,1.028516,174.038219,zstd,5.0
...,...,...,...,...,...,...
92,csv,62.240323,16.672200,153.719321,bz2,
93,csv,12.846189,5.495091,184.943592,zstd,
94,csv,211.054165,11.238398,131.529678,xz,
95,csv,10.890885,4.963031,691.806641,tar,


In [15]:
# Compression ratio: in-memory size (MiB) divided by the on-disk size (MiB) of each run.
df["comp_ratio"] = df["file_size"].rdiv(mem_size_mb)
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio
0,parquet,1.716843,0.958934,205.095127,zstd,1.0,3.829709
1,parquet,2.163881,0.996808,190.350016,zstd,2.0,4.126370
2,parquet,2.461648,1.010983,181.888956,zstd,3.0,4.318320
3,parquet,2.551957,1.007971,179.640458,zstd,4.0,4.372371
4,parquet,4.067563,1.028516,174.038219,zstd,5.0,4.513116
...,...,...,...,...,...,...,...
92,csv,62.240323,16.672200,153.719321,bz2,,5.109668
93,csv,12.846189,5.495091,184.943592,zstd,,4.246996
94,csv,211.054165,11.238398,131.529678,xz,,5.971692
95,csv,10.890885,4.963031,691.806641,tar,,1.135367


In [16]:
# Persist the results table as a gzip-compressed pickle.
df.to_pickle(path="./data/test_results_02.pkl.gz", compression="gzip")