In [1]:
import pandas as pd
import plotly.express as px

In [2]:
df = pd.read_pickle("./data/test_results_category_01.pkl.gz")
df["compression_level"] = df["compression_level"].fillna(0)
df["compression"] = df["compression"].fillna("default")
df["type"] = df["method"] + "_" + df["compression"]

df["save_time"] = df["save_time"].round(2)
df["read_time"] = df["read_time"].round(2)
df["comp_ratio"] = df["comp_ratio"].round(3)

# change type of column "compression_level" to int
df["compression_level"] = df["compression_level"].astype(int)

df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type
0,parquet,2.09,0.98,226.889376,zstd,1,3.126,parquet_zstd
1,parquet,2.40,1.04,212.139284,zstd,2,3.344,parquet_zstd
2,parquet,2.79,1.04,203.676181,zstd,3,3.483,parquet_zstd
3,parquet,2.80,1.05,201.427144,zstd,4,3.522,parquet_zstd
4,parquet,4.34,1.07,195.823249,zstd,5,3.622,parquet_zstd
...,...,...,...,...,...,...,...,...
92,csv,39.75,16.60,172.703590,bz2,0,4.107,csv_bz2
93,csv,10.81,5.76,219.933631,zstd,0,3.225,csv_zstd
94,csv,251.99,12.83,153.802803,xz,0,4.612,csv_xz
95,csv,9.19,5.36,593.701172,tar,0,1.195,csv_tar


In [3]:
def method_compression_with_level(row) -> str:
    level = row["compression_level"] if row["compression_level"] > 0 else "default"
    return f"{row['method']}_{row['compression']}_{level}"

df["method_compression_with_level"] = df.apply(method_compression_with_level, axis=1)
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,2.09,0.98,226.889376,zstd,1,3.126,parquet_zstd,parquet_zstd_1
1,parquet,2.40,1.04,212.139284,zstd,2,3.344,parquet_zstd,parquet_zstd_2
2,parquet,2.79,1.04,203.676181,zstd,3,3.483,parquet_zstd,parquet_zstd_3
3,parquet,2.80,1.05,201.427144,zstd,4,3.522,parquet_zstd,parquet_zstd_4
4,parquet,4.34,1.07,195.823249,zstd,5,3.622,parquet_zstd,parquet_zstd_5
...,...,...,...,...,...,...,...,...,...
92,csv,39.75,16.60,172.703590,bz2,0,4.107,csv_bz2,csv_bz2_default
93,csv,10.81,5.76,219.933631,zstd,0,3.225,csv_zstd,csv_zstd_default
94,csv,251.99,12.83,153.802803,xz,0,4.612,csv_xz,csv_xz_default
95,csv,9.19,5.36,593.701172,tar,0,1.195,csv_tar,csv_tar_default


In [4]:
df_default_cl = df[df["compression_level"] == 0].copy()

# oder df_default_cl by method_compression_with_level
df_default_cl.sort_values("method_compression_with_level", ascending=True, inplace=True)

df_default_cl

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
92,csv,39.75,16.6,172.70359,bz2,0,4.107,csv_bz2,csv_bz2_default
96,csv,8.58,5.34,593.696221,default,0,1.195,csv_default,csv_default_default
91,csv,32.59,7.22,228.723257,gzip,0,3.101,csv_gzip,csv_gzip_default
95,csv,9.19,5.36,593.701172,tar,0,1.195,csv_tar,csv_tar_default
94,csv,251.99,12.83,153.802803,xz,0,4.612,csv_xz,csv_xz_default
90,csv,28.59,5.93,229.096903,zip,0,3.096,csv_zip,csv_zip_default
93,csv,10.81,5.76,219.933631,zstd,0,3.225,csv_zstd,csv_zstd_default
89,feather,1.34,0.56,334.11355,default,0,2.123,feather_default,feather_default_default
86,feather,1.34,0.52,334.11355,lz4,0,2.123,feather_lz4,feather_lz4_default
88,feather,0.38,0.4,558.76137,uncompressed,0,1.269,feather_uncompressed,feather_uncompressed_default


In [5]:
fig = px.bar(df_default_cl, x="method_compression_with_level", y="comp_ratio", text_auto=True,
             labels={"comp_ratio": "compression ratio",
                     "method_compression_with_level": "format_compression_level"},)
fig.show()
fig.write_image("./plots_category/pandas_ff_bar_comp_ratio.svg")


FileNotFoundError: [Errno 2] No such file or directory: 'plots_category/pandas_ff_bar_comp_ratio.svg'

## Insights

- best compression has
  - bz2
  - xz
  - brotli

## Next Steps

- check save times
- check read times

In [None]:
fig = px.bar(df_default_cl, x="method_compression_with_level", y="save_time", text_auto=True,
             labels={"save_time": "time to save data in seconds",
                     "method_compression_with_level": "format_compression_level"},)
fig.show()
fig.write_image("./plots_category/pandas_ff_bar_save_time.svg")


## Insights

- xz has a very long save time
- xz save time is so much longer than bz2 that it does not justify the slightly better compression that bz2
- xz is out
- bz2 also has a long save time
- brotli save time seems to be ok but not fastest

## Next Steps

- also check read times

In [None]:
fig = px.bar(df_default_cl, x="method_compression_with_level", y="read_time", text_auto=True,
             labels={"read_time": "time to read data in seconds",
                     "method_compression_with_level": "format_compression_level"},)
fig.show()
fig.write_image("./plots_category/pandas_ff_bar_read_time.svg")

## Insights

- bz2 has a very long read time
- xz also has a long read time
- brotli read time seems to be ok but not fastest

## Next Steps

- what about the comprssion level?

In [None]:
interesting_method_compression_with_level = [
    "csv_bz2_default",
    "csv_xz_default",
    "parquet_brotli_default",
    "parquet_zstd_1",
    "parquet_zstd_5",
    "parquet_zstd_10",
    "parquet_zstd_18",
    "parquet_lz4_1",
    "parquet_lz4_5",
    "parquet_lz4_10",
    "parquet_lz4_18",
    "feather_zstd_1",
    "feather_zstd_5",
    "feather_zstd_10",
    "feather_zstd_18",
    "feather_lz4_1",
    "feather_lz4_5",
    "feather_lz4_10",
    "feather_lz4_18",
]

df_interesting_levels = df[df["method_compression_with_level"].isin(interesting_method_compression_with_level)].copy()
#df_interesting_levels.sort_values("method_compression_with_level", ascending=True, inplace=True)
df_interesting_levels

In [None]:
fig = px.bar(df_interesting_levels, x="method_compression_with_level", y="comp_ratio", text_auto=True,
             labels={"comp_ratio": "compression ratio",
                     "method_compression_with_level": "format_compression_level"},)
fig.show()
fig.write_image("./plots_category/pandas_ff_bar_comp_ratio_feather.svg")


In [None]:
fig = px.bar(df_interesting_levels, x="method_compression_with_level", y="save_time", text_auto=True,
             labels={"save_time": "time to save data in seconds",
                     "method_compression_with_level": "format_compression_level"},)
fig.show()
fig.write_image("./plots_category/pandas_ff_bar_save_time_feather.svg")

In [None]:
fig = px.bar(df_interesting_levels, x="method_compression_with_level", y="read_time", text_auto=True,
             labels={"read_time": "time to read data in seconds",
                     "method_compression_with_level": "format_compression_level"},)
fig.show()
fig.write_image("./plots_category/pandas_ff_bar_read_time_feather.svg")

In [None]:
feather_df = df[(df["method"].isin(["feather", "parquet"])) & (df["compression_level"] > 0) &
                (df["compression"] == "zstd")].copy()
#feather_df = df[df["method"].isin(["feather", "parquet"])].copy()

feather_df

In [None]:
fig = px.scatter(feather_df, x="compression_level", y="comp_ratio", color='type',
                 hover_data=['save_time', "read_time"],
                 symbol="type",
                 labels={"comp_ratio": "compression ratio",
                         "compression_level": "level of compression"})
fig.update_traces(marker_size=8)
fig.show()
fig.write_image("./plots_category/pandas_ff_scatter_comp_ratio_feather.svg")

In [None]:
fig = px.scatter(feather_df, x="compression_level", y="save_time", color='type', hover_data=['comp_ratio', "read_time"],
                 symbol="type",
                 labels={"save_time": "time to save data in seconds",
                     "compression_level": "level of compression"})
fig.update_traces(marker_size=8)
fig.show()
fig.write_image("./plots_category/pandas_ff_scatter_save_time_feather.svg")

In [None]:
fig = px.scatter(feather_df, x="compression_level", y="read_time", color='type',
                 hover_data=['comp_ratio', "save_time"],
                 symbol="type",
                 labels={"read_time": "time to read data in seconds",
                     "compression_level": "level of compression"})
fig.update_traces(marker_size=8)
fig.show()
fig.write_image("./plots_category/pandas_ff_scatter_read_time_feather.svg")