In [1]:
import pandas as pd
import time
from typing import Literal
from functools import partial
from typing import Optional
import os
from tqdm import tqdm

In [2]:
# Load the dataset that every benchmark below writes and reads back.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
df = pd.read_pickle("./data/data_category.pkl.gz")
# Bare last expression: render the frame as this cell's output.
df

Unnamed: 0,text,uuid,category_0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,...,category_90,category_91,category_92,category_93,category_94,category_95,category_96,category_97,category_98,category_99
0,"Write an article based on this ""A man has been...",d2e7c46f-cd54-459d-824a-b7beac0da1a3,y,r,w,v,s,m,j,o,...,h,j,c,s,m,h,x,w,w,v
1,Answer the following question: - number is 54 ...,dafabcf7-caa7-4491-8f00-1fb817dbd4b4,s,e,g,q,t,m,n,r,...,m,b,v,t,u,o,e,o,f,a
2,Produce a long descriptive sentence that uses ...,dd1e9b91-feea-472c-9f65-2eda2d42c4c1,a,r,c,g,v,l,d,f,...,l,o,m,q,y,h,b,h,v,u
3,Write a title for this article:\n\nArbitration...,740923fb-5624-4f69-bcdc-874bb0800634,v,u,c,x,u,g,d,e,...,x,u,m,y,l,t,a,r,j,k
4,Read the following paragraph and determine if ...,fa18b94b-5cd7-443c-9394-83bc2c8f8051,k,w,d,b,m,d,i,w,...,i,b,l,z,g,a,j,e,w,x
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363486,"Q: Context: Dragon Ball Z (ドラゴンボール ゼット, Dorago...",29dc880d-b8cf-481e-9916-84d475c32876,n,r,a,r,c,p,u,f,...,c,v,q,r,w,p,m,h,x,e
363487,My question is: At the beginning of an academi...,8b23b7df-fa3c-4e41-9fdf-5991583442f9,s,n,c,g,j,q,i,r,...,f,n,b,k,b,f,r,q,m,r
363488,"Leo: Given the sentence ""A small child in wate...",a1e1482f-a178-497b-b9d5-04f978de6feb,o,m,r,t,j,g,d,n,...,a,g,j,j,h,s,q,x,o,v
363489,Explain simply why yes is the correct answer t...,e233b119-9dec-4916-b3bf-bed41ec2272b,y,e,u,a,e,a,k,f,...,y,p,j,n,l,d,p,b,s,a


In [3]:
# Total in-memory size of the dataset in bytes, summed over all columns and excluding the index.
# deep=True is requested so object (string) columns are measured by their actual contents;
# see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.memory_usage.html
mem_usage = df.memory_usage(index=False, deep=True).sum()
mem_usage

np.int64(743791694)

In [4]:
# Same in-memory size (index excluded) converted from bytes to MiB (bytes / 1024 / 1024).
# Used later as the numerator of the compression ratio.
mem_size_mb = mem_usage / 1024 / 1024
mem_size_mb

np.float64(709.3350353240967)

In [5]:
def evaluate_performance(
        df,
        method: Literal["csv", "parquet", "feather"],
        compression: Optional[str] = None,
        compression_level: Optional[int] = None,
        n_reads: int = 10,
    ) -> dict:
    """Benchmark writing ``df`` to disk and reading it back in one file format.

    The frame is written once to ``./test_data/data.<method>``, read back
    ``n_reads`` times, measured, and the file is removed again (also when the
    write or read raises).

    Args:
        df: DataFrame to serialise.
        method: Target format: ``"csv"``, ``"parquet"`` or ``"feather"``.
        compression: Codec name forwarded to the pandas writer. ``None`` means
            the keyword is not forwarded at all, so the writer's own default
            applies (this is not the same as forcing "no compression").
        compression_level: Codec level forwarded to the parquet / feather
            writer. Must be ``None`` for ``"csv"``.
        n_reads: Number of read repetitions the read time is averaged over.

    Returns:
        A dict with ``method``, ``save_time`` (seconds for the single write),
        ``read_time`` (mean seconds per read) and ``file_size`` (MiB), plus
        ``compression`` / ``compression_level`` when they were given.

    Raises:
        ValueError: If ``method`` is unknown, ``compression_level`` is combined
            with ``"csv"``, or ``n_reads`` is smaller than 1.
    """
    # Validate up front with real exceptions; asserts vanish under ``python -O``.
    if method not in ("csv", "parquet", "feather"):
        raise ValueError(f"unsupported method: {method!r}")
    if method == "csv" and compression_level is not None:
        raise ValueError("compression_level is not supported for method 'csv'")
    if n_reads < 1:
        raise ValueError("n_reads must be at least 1")

    filename = f"./test_data/data.{method}"
    # The scratch directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Only forward the options the caller actually set, so library defaults stay in effect otherwise.
    additional_save_params = {}
    if compression is not None:
        additional_save_params["compression"] = compression
    if compression_level is not None:
        additional_save_params["compression_level"] = compression_level

    # set save_fn and read_fn
    if method == "csv":
        save_fn = partial(df.to_csv, index=False, path_or_buf=filename, **additional_save_params)
        # The file name carries no compression suffix, so the reader must be told the codec explicitly.
        read_fn = partial(pd.read_csv, filepath_or_buffer=filename, **additional_save_params)
    elif method == "parquet":
        save_fn = partial(df.to_parquet, index=False, engine="pyarrow", path=filename, **additional_save_params)
        read_fn = partial(pd.read_parquet, path=filename, engine="pyarrow")
    else:  # feather
        save_fn = partial(df.to_feather, path=filename, **additional_save_params)
        read_fn = partial(pd.read_feather, path=filename)

    try:
        # write (perf_counter is monotonic and high-resolution, unlike wall-clock time.time)
        start_time = time.perf_counter()
        save_fn()
        save_time = time.perf_counter() - start_time

        # read: average over several repetitions to smooth out noise
        start_time = time.perf_counter()
        for _ in range(n_reads):
            read_fn()
        read_time = (time.perf_counter() - start_time) / n_reads

        # get size in MiB
        file_size = os.path.getsize(filename) / 1024 / 1024
    finally:
        # delete file, even if the benchmark failed half-way
        if os.path.exists(filename):
            os.remove(filename)

    result = {
        "method": method,
        "save_time": save_time,
        "read_time": read_time,
        "file_size": file_size,
    }
    # Record which writer options were used for this run.
    result.update(additional_save_params)

    return result


In [6]:
# Accumulator for the result dicts returned by evaluate_performance; the benchmark cells below append to it.
# NOTE(review): re-running a benchmark cell without re-running this one appends duplicate rows.
eval_results = []

In [7]:
# Parquet with zstd: sweep compression levels 1..20, one benchmark run per level.
for level in tqdm(range(1, 21)):
    result = evaluate_performance(df, "parquet", compression="zstd", compression_level=level)
    print(result)
    eval_results.append(result)

  5%|▌         | 1/20 [00:11<03:46, 11.93s/it]

{'method': 'parquet', 'save_time': 2.0900628566741943, 'read_time': 0.984036922454834, 'file_size': 226.8893756866455, 'compression': 'zstd', 'compression_level': 1}


 10%|█         | 2/20 [00:24<03:43, 12.44s/it]

{'method': 'parquet', 'save_time': 2.398170232772827, 'read_time': 1.0397315740585327, 'file_size': 212.13928413391113, 'compression': 'zstd', 'compression_level': 2}


 15%|█▌        | 3/20 [00:37<03:37, 12.80s/it]

{'method': 'parquet', 'save_time': 2.7862768173217773, 'read_time': 1.0443446159362793, 'file_size': 203.67618083953857, 'compression': 'zstd', 'compression_level': 3}


 20%|██        | 4/20 [00:51<03:27, 12.99s/it]

{'method': 'parquet', 'save_time': 2.8023130893707275, 'read_time': 1.0460953950881957, 'file_size': 201.42714405059814, 'compression': 'zstd', 'compression_level': 4}


 25%|██▌       | 5/20 [01:06<03:25, 13.72s/it]

{'method': 'parquet', 'save_time': 4.344430923461914, 'read_time': 1.0678874731063843, 'file_size': 195.82324886322021, 'compression': 'zstd', 'compression_level': 5}


 30%|███       | 6/20 [01:22<03:24, 14.61s/it]

{'method': 'parquet', 'save_time': 5.930858850479126, 'read_time': 1.0391349792480469, 'file_size': 190.7016887664795, 'compression': 'zstd', 'compression_level': 6}


 35%|███▌      | 7/20 [01:39<03:21, 15.49s/it]

{'method': 'parquet', 'save_time': 6.786940097808838, 'read_time': 1.0508110046386718, 'file_size': 188.47851753234863, 'compression': 'zstd', 'compression_level': 7}


 40%|████      | 8/20 [01:58<03:17, 16.48s/it]

{'method': 'parquet', 'save_time': 8.63676404953003, 'read_time': 0.9980435132980346, 'file_size': 186.61782932281494, 'compression': 'zstd', 'compression_level': 8}


 45%|████▌     | 9/20 [02:16<03:07, 17.08s/it]

{'method': 'parquet', 'save_time': 8.488038778305054, 'read_time': 0.9911326885223388, 'file_size': 186.20174884796143, 'compression': 'zstd', 'compression_level': 9}


 50%|█████     | 10/20 [02:37<03:01, 18.20s/it]

{'method': 'parquet', 'save_time': 10.920305252075195, 'read_time': 0.9763767719268799, 'file_size': 184.79971313476562, 'compression': 'zstd', 'compression_level': 10}


 55%|█████▌    | 11/20 [03:01<02:58, 19.86s/it]

{'method': 'parquet', 'save_time': 13.724972009658813, 'read_time': 0.9916614055633545, 'file_size': 184.05713081359863, 'compression': 'zstd', 'compression_level': 11}


 60%|██████    | 12/20 [03:24<02:48, 21.00s/it]

{'method': 'parquet', 'save_time': 13.789183139801025, 'read_time': 0.9810190916061401, 'file_size': 184.05713081359863, 'compression': 'zstd', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:08<03:14, 27.71s/it]

{'method': 'parquet', 'save_time': 33.28212308883667, 'read_time': 0.9874844074249267, 'file_size': 182.33034420013428, 'compression': 'zstd', 'compression_level': 13}


 70%|███████   | 14/20 [04:55<03:22, 33.79s/it]

{'method': 'parquet', 'save_time': 38.096182107925415, 'read_time': 0.9736217975616455, 'file_size': 181.7610321044922, 'compression': 'zstd', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:45<03:12, 38.55s/it]

{'method': 'parquet', 'save_time': 39.6819109916687, 'read_time': 0.9875153779983521, 'file_size': 181.68418884277344, 'compression': 'zstd', 'compression_level': 15}


 80%|████████  | 16/20 [06:53<03:09, 47.34s/it]

{'method': 'parquet', 'save_time': 56.57456922531128, 'read_time': 1.1199987888336183, 'file_size': 175.22446155548096, 'compression': 'zstd', 'compression_level': 16}


 85%|████████▌ | 17/20 [08:11<02:49, 56.66s/it]

{'method': 'parquet', 'save_time': 67.73674273490906, 'read_time': 1.0570938110351562, 'file_size': 174.4441385269165, 'compression': 'zstd', 'compression_level': 17}


 90%|█████████ | 18/20 [09:44<02:15, 67.54s/it]

{'method': 'parquet', 'save_time': 82.09143495559692, 'read_time': 1.076111912727356, 'file_size': 173.46569061279297, 'compression': 'zstd', 'compression_level': 18}


 95%|█████████▌| 19/20 [11:25<01:17, 77.60s/it]

{'method': 'parquet', 'save_time': 90.7236659526825, 'read_time': 1.0299107074737548, 'file_size': 173.33709907531738, 'compression': 'zstd', 'compression_level': 19}


100%|██████████| 20/20 [13:04<00:00, 39.24s/it]

{'method': 'parquet', 'save_time': 88.99277687072754, 'read_time': 1.0331578254699707, 'file_size': 173.33709907531738, 'compression': 'zstd', 'compression_level': 20}





In [8]:
# Parquet with lz4: sweep compression levels 1..20, one benchmark run per level.
for level in tqdm(range(1, 21)):
    result = evaluate_performance(df, "parquet", compression="lz4", compression_level=level)
    print(result)
    eval_results.append(result)

  5%|▌         | 1/20 [00:09<02:52,  9.10s/it]

{'method': 'parquet', 'save_time': 1.7393920421600342, 'read_time': 0.735357689857483, 'file_size': 321.7554168701172, 'compression': 'lz4', 'compression_level': 1}


 10%|█         | 2/20 [00:17<02:36,  8.71s/it]

{'method': 'parquet', 'save_time': 1.5570900440216064, 'read_time': 0.6881155967712402, 'file_size': 321.7554168701172, 'compression': 'lz4', 'compression_level': 2}


 15%|█▌        | 3/20 [00:30<02:58, 10.49s/it]

{'method': 'parquet', 'save_time': 5.852352142333984, 'read_time': 0.6748246908187866, 'file_size': 251.7232542037964, 'compression': 'lz4', 'compression_level': 3}


 20%|██        | 4/20 [00:44<03:14, 12.13s/it]

{'method': 'parquet', 'save_time': 7.4448230266571045, 'read_time': 0.7179853916168213, 'file_size': 246.3456153869629, 'compression': 'lz4', 'compression_level': 4}


 25%|██▌       | 5/20 [01:01<03:25, 13.71s/it]

{'method': 'parquet', 'save_time': 9.448970079421997, 'read_time': 0.7053354978561401, 'file_size': 243.49229907989502, 'compression': 'lz4', 'compression_level': 5}


 30%|███       | 6/20 [01:19<03:33, 15.22s/it]

{'method': 'parquet', 'save_time': 11.215007781982422, 'read_time': 0.6931030988693238, 'file_size': 242.22041606903076, 'compression': 'lz4', 'compression_level': 6}


 35%|███▌      | 7/20 [01:39<03:38, 16.81s/it]

{'method': 'parquet', 'save_time': 12.96272897720337, 'read_time': 0.7105220079421997, 'file_size': 241.69765663146973, 'compression': 'lz4', 'compression_level': 7}


 40%|████      | 8/20 [02:01<03:39, 18.33s/it]

{'method': 'parquet', 'save_time': 14.592042922973633, 'read_time': 0.6989505052566528, 'file_size': 241.49703121185303, 'compression': 'lz4', 'compression_level': 8}


 45%|████▌     | 9/20 [02:23<03:36, 19.70s/it]

{'method': 'parquet', 'save_time': 15.710811138153076, 'read_time': 0.701621413230896, 'file_size': 241.3874912261963, 'compression': 'lz4', 'compression_level': 9}


 50%|█████     | 10/20 [02:52<03:44, 22.49s/it]

{'method': 'parquet', 'save_time': 21.542015075683594, 'read_time': 0.7178672790527344, 'file_size': 240.21079921722412, 'compression': 'lz4', 'compression_level': 10}


 55%|█████▌    | 11/20 [03:25<03:51, 25.70s/it]

{'method': 'parquet', 'save_time': 25.960198163986206, 'read_time': 0.702301287651062, 'file_size': 239.8005027770996, 'compression': 'lz4', 'compression_level': 11}


 60%|██████    | 12/20 [04:02<03:53, 29.17s/it]

{'method': 'parquet', 'save_time': 29.575100898742676, 'read_time': 0.7510624170303345, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:39<03:39, 31.42s/it]

{'method': 'parquet', 'save_time': 29.63607621192932, 'read_time': 0.6959657907485962, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 13}


 70%|███████   | 14/20 [05:15<03:17, 32.98s/it]

{'method': 'parquet', 'save_time': 29.535764932632446, 'read_time': 0.7042984008789063, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:52<02:50, 34.08s/it]

{'method': 'parquet', 'save_time': 29.560651779174805, 'read_time': 0.707607102394104, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 15}


 80%|████████  | 16/20 [06:29<02:19, 34.85s/it]

{'method': 'parquet', 'save_time': 29.65842294692993, 'read_time': 0.6959420919418335, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 16}


 85%|████████▌ | 17/20 [07:06<01:46, 35.50s/it]

{'method': 'parquet', 'save_time': 29.579817056655884, 'read_time': 0.7416252851486206, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 17}


 90%|█████████ | 18/20 [07:42<01:11, 35.86s/it]

{'method': 'parquet', 'save_time': 29.65169930458069, 'read_time': 0.703622579574585, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 18}


 95%|█████████▌| 19/20 [08:19<00:36, 36.15s/it]

{'method': 'parquet', 'save_time': 29.547273874282837, 'read_time': 0.7279452800750732, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 19}


100%|██████████| 20/20 [08:56<00:00, 26.82s/it]

{'method': 'parquet', 'save_time': 29.654156923294067, 'read_time': 0.7017038106918335, 'file_size': 239.77301216125488, 'compression': 'lz4', 'compression_level': 20}





In [9]:
# Feather with zstd: sweep compression levels 1..20, one benchmark run per level.
for level in tqdm(range(1, 21)):
    result = evaluate_performance(df, "feather", compression="zstd", compression_level=level)
    print(result)
    eval_results.append(result)

  5%|▌         | 1/20 [00:08<02:49,  8.90s/it]

{'method': 'feather', 'save_time': 1.42557692527771, 'read_time': 0.7475739240646362, 'file_size': 224.01304054260254, 'compression': 'zstd', 'compression_level': 1}


 10%|█         | 2/20 [00:19<02:56,  9.80s/it]

{'method': 'feather', 'save_time': 1.9998857975006104, 'read_time': 0.8416632890701294, 'file_size': 208.77051734924316, 'compression': 'zstd', 'compression_level': 2}


 15%|█▌        | 3/20 [00:29<02:52, 10.14s/it]

{'method': 'feather', 'save_time': 2.1534488201141357, 'read_time': 0.8398640155792236, 'file_size': 199.06656074523926, 'compression': 'zstd', 'compression_level': 3}


 20%|██        | 4/20 [00:40<02:45, 10.37s/it]

{'method': 'feather', 'save_time': 2.249929904937744, 'read_time': 0.8472741842269897, 'file_size': 195.82847785949707, 'compression': 'zstd', 'compression_level': 4}


 25%|██▌       | 5/20 [00:52<02:46, 11.10s/it]

{'method': 'feather', 'save_time': 3.745828151702881, 'read_time': 0.8624408006668091, 'file_size': 190.4362735748291, 'compression': 'zstd', 'compression_level': 5}


 30%|███       | 6/20 [01:06<02:47, 11.96s/it]

{'method': 'feather', 'save_time': 5.374945878982544, 'read_time': 0.8255513191223145, 'file_size': 184.54472541809082, 'compression': 'zstd', 'compression_level': 6}


 35%|███▌      | 7/20 [01:21<02:46, 12.80s/it]

{'method': 'feather', 'save_time': 6.2978668212890625, 'read_time': 0.8235860347747803, 'file_size': 180.76762580871582, 'compression': 'zstd', 'compression_level': 7}


 40%|████      | 8/20 [01:37<02:46, 13.87s/it]

{'method': 'feather', 'save_time': 8.094537019729614, 'read_time': 0.8043187856674194, 'file_size': 178.49475288391113, 'compression': 'zstd', 'compression_level': 8}


 45%|████▌     | 9/20 [01:53<02:40, 14.55s/it]

{'method': 'feather', 'save_time': 8.095047950744629, 'read_time': 0.7954463005065918, 'file_size': 174.7295093536377, 'compression': 'zstd', 'compression_level': 9}


 50%|█████     | 10/20 [02:12<02:38, 15.87s/it]

{'method': 'feather', 'save_time': 11.006395101547241, 'read_time': 0.7807962894439697, 'file_size': 171.62451362609863, 'compression': 'zstd', 'compression_level': 10}


 55%|█████▌    | 11/20 [02:35<02:42, 18.07s/it]

{'method': 'feather', 'save_time': 15.334919929504395, 'read_time': 0.7732085943222046, 'file_size': 169.85757637023926, 'compression': 'zstd', 'compression_level': 11}


 60%|██████    | 12/20 [02:59<02:40, 20.01s/it]

{'method': 'feather', 'save_time': 16.838697910308838, 'read_time': 0.7611279964447022, 'file_size': 169.38566780090332, 'compression': 'zstd', 'compression_level': 12}


 65%|██████▌   | 13/20 [03:51<03:26, 29.50s/it]

{'method': 'feather', 'save_time': 43.71610498428345, 'read_time': 0.7621742010116577, 'file_size': 166.85210609436035, 'compression': 'zstd', 'compression_level': 13}


 70%|███████   | 14/20 [04:54<03:57, 39.64s/it]

{'method': 'feather', 'save_time': 55.471288204193115, 'read_time': 0.7572419166564941, 'file_size': 165.236421585083, 'compression': 'zstd', 'compression_level': 14}


 75%|███████▌  | 15/20 [06:33<04:47, 57.54s/it]

{'method': 'feather', 'save_time': 91.51796913146973, 'read_time': 0.7495280981063843, 'file_size': 162.98893928527832, 'compression': 'zstd', 'compression_level': 15}


 80%|████████  | 16/20 [08:01<04:26, 66.68s/it]

{'method': 'feather', 'save_time': 80.24370288848877, 'read_time': 0.7650845050811768, 'file_size': 158.30799293518066, 'compression': 'zstd', 'compression_level': 16}


 85%|████████▌ | 17/20 [10:14<04:20, 86.70s/it]

{'method': 'feather', 'save_time': 125.5472321510315, 'read_time': 0.7708266019821167, 'file_size': 152.97683906555176, 'compression': 'zstd', 'compression_level': 17}


 90%|█████████ | 18/20 [12:47<03:33, 106.59s/it]

{'method': 'feather', 'save_time': 145.36742210388184, 'read_time': 0.7521837711334228, 'file_size': 150.5761890411377, 'compression': 'zstd', 'compression_level': 18}


 95%|█████████▌| 19/20 [15:56<02:11, 131.41s/it]

{'method': 'feather', 'save_time': 181.56097602844238, 'read_time': 0.7660135984420776, 'file_size': 148.84340858459473, 'compression': 'zstd', 'compression_level': 19}


100%|██████████| 20/20 [19:51<00:00, 59.56s/it] 

{'method': 'feather', 'save_time': 227.24811697006226, 'read_time': 0.7567049026489258, 'file_size': 140.14563179016113, 'compression': 'zstd', 'compression_level': 20}





In [10]:
# Feather with lz4: sweep compression levels 1..20, one benchmark run per level.
for level in tqdm(range(1, 21)):
    result = evaluate_performance(df, "feather", compression="lz4", compression_level=level)
    print(result)
    eval_results.append(result)

  5%|▌         | 1/20 [00:07<02:13,  7.02s/it]

{'method': 'feather', 'save_time': 1.2804429531097412, 'read_time': 0.5732852935791015, 'file_size': 334.1135501861572, 'compression': 'lz4', 'compression_level': 1}


 10%|█         | 2/20 [00:13<02:04,  6.94s/it]

{'method': 'feather', 'save_time': 1.4805629253387451, 'read_time': 0.5388987064361572, 'file_size': 334.1135501861572, 'compression': 'lz4', 'compression_level': 2}


 15%|█▌        | 3/20 [00:24<02:26,  8.62s/it]

{'method': 'feather', 'save_time': 5.169255018234253, 'read_time': 0.5446156024932861, 'file_size': 262.65227699279785, 'compression': 'lz4', 'compression_level': 3}


 20%|██        | 4/20 [00:36<02:39,  9.99s/it]

{'method': 'feather', 'save_time': 6.728070259094238, 'read_time': 0.535172700881958, 'file_size': 257.167688369751, 'compression': 'lz4', 'compression_level': 4}


 25%|██▌       | 5/20 [00:50<02:51, 11.46s/it]

{'method': 'feather', 'save_time': 8.654846906661987, 'read_time': 0.5401119947433471, 'file_size': 254.25290870666504, 'compression': 'lz4', 'compression_level': 5}


 30%|███       | 6/20 [01:06<03:02, 13.06s/it]

{'method': 'feather', 'save_time': 10.706410884857178, 'read_time': 0.5450427055358886, 'file_size': 252.95208168029785, 'compression': 'lz4', 'compression_level': 6}


 35%|███▌      | 7/20 [01:24<03:11, 14.71s/it]

{'method': 'feather', 'save_time': 12.628813028335571, 'read_time': 0.5471781253814697, 'file_size': 252.41932106018066, 'compression': 'lz4', 'compression_level': 7}


 40%|████      | 8/20 [01:44<03:14, 16.22s/it]

{'method': 'feather', 'save_time': 14.057636976242065, 'read_time': 0.538452410697937, 'file_size': 252.2168674468994, 'compression': 'lz4', 'compression_level': 8}


 45%|████▌     | 9/20 [02:05<03:14, 17.66s/it]

{'method': 'feather', 'save_time': 15.438199043273926, 'read_time': 0.5396882772445679, 'file_size': 252.10462379455566, 'compression': 'lz4', 'compression_level': 9}


 50%|█████     | 10/20 [02:31<03:23, 20.39s/it]

{'method': 'feather', 'save_time': 21.008697986602783, 'read_time': 0.5488072872161865, 'file_size': 250.90396308898926, 'compression': 'lz4', 'compression_level': 10}


 55%|█████▌    | 11/20 [03:02<03:32, 23.56s/it]

{'method': 'feather', 'save_time': 25.305948972702026, 'read_time': 0.5443089962005615, 'file_size': 250.50142097473145, 'compression': 'lz4', 'compression_level': 11}


 60%|██████    | 12/20 [03:36<03:35, 26.89s/it]

{'method': 'feather', 'save_time': 29.06900119781494, 'read_time': 0.5437150001525879, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 12}


 65%|██████▌   | 13/20 [04:11<03:24, 29.27s/it]

{'method': 'feather', 'save_time': 29.12043595314026, 'read_time': 0.5600850105285644, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 13}


 70%|███████   | 14/20 [04:46<03:05, 30.91s/it]

{'method': 'feather', 'save_time': 29.194515228271484, 'read_time': 0.5501016855239869, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 14}


 75%|███████▌  | 15/20 [05:20<02:40, 32.01s/it]

{'method': 'feather', 'save_time': 29.07554316520691, 'read_time': 0.5465172290802002, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 15}


 80%|████████  | 16/20 [05:55<02:10, 32.75s/it]

{'method': 'feather', 'save_time': 29.085169076919556, 'read_time': 0.5379619836807251, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 16}


 85%|████████▌ | 17/20 [06:29<01:39, 33.27s/it]

{'method': 'feather', 'save_time': 29.031739950180054, 'read_time': 0.5430895090103149, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 17}


 90%|█████████ | 18/20 [07:04<01:07, 33.61s/it]

{'method': 'feather', 'save_time': 29.050039052963257, 'read_time': 0.5340919017791748, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 18}


 95%|█████████▌| 19/20 [07:38<00:33, 33.87s/it]

{'method': 'feather', 'save_time': 29.07599902153015, 'read_time': 0.5415361881256103, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 19}


100%|██████████| 20/20 [08:13<00:00, 24.66s/it]

{'method': 'feather', 'save_time': 29.03005599975586, 'read_time': 0.5439633131027222, 'file_size': 250.46934700012207, 'compression': 'lz4', 'compression_level': 20}





In [11]:
# Parquet: compare the available codecs without an explicit compression level.
# The trailing None entry runs with no `compression` argument forwarded to the writer.
for codec in ["gzip", "snappy", "lz4", "zstd", "brotli", None]:
    result = evaluate_performance(df, "parquet", compression=codec)
    print(result)
    eval_results.append(result)

{'method': 'parquet', 'save_time': 20.333938121795654, 'read_time': 1.8917057037353515, 'file_size': 211.33220672607422, 'compression': 'gzip'}
{'method': 'parquet', 'save_time': 1.710979700088501, 'read_time': 0.8100784063339234, 'file_size': 318.714994430542, 'compression': 'snappy'}
{'method': 'parquet', 'save_time': 1.5627169609069824, 'read_time': 0.7078850030899048, 'file_size': 321.7554168701172, 'compression': 'lz4'}
{'method': 'parquet', 'save_time': 1.723479986190796, 'read_time': 0.9531194925308227, 'file_size': 226.8893756866455, 'compression': 'zstd'}
{'method': 'parquet', 'save_time': 18.56906819343567, 'read_time': 1.6577537775039672, 'file_size': 181.17698860168457, 'compression': 'brotli'}
{'method': 'parquet', 'save_time': 1.6790261268615723, 'read_time': 0.8445393800735473, 'file_size': 318.714994430542}


In [12]:
# Feather: compare the available codecs without an explicit compression level.
# The trailing None entry runs with no `compression` argument forwarded to the writer.
for codec in ["lz4", "zstd", "uncompressed", None]:
    result = evaluate_performance(df, "feather", compression=codec)
    print(result)
    eval_results.append(result)

{'method': 'feather', 'save_time': 1.3364830017089844, 'read_time': 0.5182675123214722, 'file_size': 334.1135501861572, 'compression': 'lz4'}
{'method': 'feather', 'save_time': 1.3837168216705322, 'read_time': 0.773916506767273, 'file_size': 224.01304054260254, 'compression': 'zstd'}
{'method': 'feather', 'save_time': 0.38083815574645996, 'read_time': 0.3962541103363037, 'file_size': 558.7613697052002, 'compression': 'uncompressed'}
{'method': 'feather', 'save_time': 1.3422012329101562, 'read_time': 0.5551566839218139, 'file_size': 334.1135501861572}


In [13]:
# CSV: compare the compression containers pandas can write, plus a plain run (None).
for codec in ['zip', 'gzip', 'bz2', 'zstd', 'xz', 'tar', None]:
    result = evaluate_performance(df, "csv", compression=codec)
    print(result)
    eval_results.append(result)

{'method': 'csv', 'save_time': 28.594516038894653, 'read_time': 5.931642293930054, 'file_size': 229.09690284729004, 'compression': 'zip'}
{'method': 'csv', 'save_time': 32.593398094177246, 'read_time': 7.220234298706055, 'file_size': 228.72325706481934, 'compression': 'gzip'}
{'method': 'csv', 'save_time': 39.74804401397705, 'read_time': 16.60398178100586, 'file_size': 172.7035903930664, 'compression': 'bz2'}
{'method': 'csv', 'save_time': 10.806816816329956, 'read_time': 5.758484935760498, 'file_size': 219.93363094329834, 'compression': 'zstd'}
{'method': 'csv', 'save_time': 251.98728919029236, 'read_time': 12.825430107116699, 'file_size': 153.80280303955078, 'compression': 'xz'}
{'method': 'csv', 'save_time': 9.18571400642395, 'read_time': 5.357082390785218, 'file_size': 593.701171875, 'compression': 'tar'}
{'method': 'csv', 'save_time': 8.582108736038208, 'read_time': 5.344513487815857, 'file_size': 593.6962213516235}


In [14]:
# Turn the collected result dicts into one table, one row per benchmark run.
# NOTE(review): this rebinds `df`, which until here held the dataset being benchmarked;
# re-running any benchmark cell after this one would measure the results table instead.
df = pd.DataFrame(eval_results)
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level
0,parquet,2.090063,0.984037,226.889376,zstd,1.0
1,parquet,2.398170,1.039732,212.139284,zstd,2.0
2,parquet,2.786277,1.044345,203.676181,zstd,3.0
3,parquet,2.802313,1.046095,201.427144,zstd,4.0
4,parquet,4.344431,1.067887,195.823249,zstd,5.0
...,...,...,...,...,...,...
92,csv,39.748044,16.603982,172.703590,bz2,
93,csv,10.806817,5.758485,219.933631,zstd,
94,csv,251.987289,12.825430,153.802803,xz,
95,csv,9.185714,5.357082,593.701172,tar,


In [15]:
# Compression ratio: in-memory size of the dataset (mem_size_mb, MiB) divided by each run's file size (MiB).
# Adds the column to the results table in place.
df["comp_ratio"] = mem_size_mb / df["file_size"]
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio
0,parquet,2.090063,0.984037,226.889376,zstd,1.0,3.126348
1,parquet,2.398170,1.039732,212.139284,zstd,2.0,3.343723
2,parquet,2.786277,1.044345,203.676181,zstd,3.0,3.482661
3,parquet,2.802313,1.046095,201.427144,zstd,4.0,3.521546
4,parquet,4.344431,1.067887,195.823249,zstd,5.0,3.622323
...,...,...,...,...,...,...,...
92,csv,39.748044,16.603982,172.703590,bz2,,4.107240
93,csv,10.806817,5.758485,219.933631,zstd,,3.225223
94,csv,251.987289,12.825430,153.802803,xz,,4.611977
95,csv,9.185714,5.357082,593.701172,tar,,1.194768


In [16]:
# Persist the results table (including comp_ratio) as a gzip-compressed pickle.
df.to_pickle("./data/test_results_category_01.pkl.gz", compression="gzip")