In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load the pickled results table.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle("./data/test_results_date_01.pkl.gz")

# Rows with no recorded compression / level are labelled "default" / 0.
level_filled = df["compression_level"].fillna(0)
df["compression"] = df["compression"].fillna("default")

# Combined "<method>_<compression>" key, used later to colour/group the plots.
df["type"] = df["method"] + "_" + df["compression"]

# Trim the measurements to a readable number of decimals.
for column, digits in (("save_time", 2), ("read_time", 2), ("comp_ratio", 3)):
    df[column] = df[column].round(digits)

# With the gaps filled, the level can be stored as an integer column.
df["compression_level"] = level_filled.astype(int)

df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type
0,parquet,9.79,1.11,454.351955,zstd,1,5.757,parquet_zstd
1,parquet,9.50,1.13,438.708549,zstd,2,5.962,parquet_zstd
2,parquet,10.22,1.13,427.473406,zstd,3,6.119,parquet_zstd
3,parquet,10.76,1.13,422.640075,zstd,4,6.189,parquet_zstd
4,parquet,13.00,1.14,413.288786,zstd,5,6.329,parquet_zstd
...,...,...,...,...,...,...,...,...
92,csv,95.70,34.35,253.524595,bz2,0,10.318,csv_bz2
93,csv,37.43,15.71,362.189476,zstd,0,7.222,csv_zstd
94,csv,564.32,26.31,257.165257,xz,0,10.172,csv_xz
95,csv,34.24,15.28,1217.675781,tar,0,2.148,csv_tar


In [3]:
def method_compression_with_level(row) -> str:
    """Build the ``<method>_<compression>_<level>`` label for one result row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            the keys ``"method"``, ``"compression"`` and ``"compression_level"``.

    Returns:
        The three parts joined by underscores. A level that is not greater
        than zero is rendered as the word ``"default"``.
    """
    level = row["compression_level"]
    if not level > 0:
        # Non-positive levels stand for "no explicit level was set".
        level = "default"
    return "{}_{}_{}".format(row["method"], row["compression"], level)

# Label every row with "<method>_<compression>_<level>" (row-wise apply).
level_labels = df.apply(method_compression_with_level, axis="columns")
df["method_compression_with_level"] = level_labels
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,9.79,1.11,454.351955,zstd,1,5.757,parquet_zstd,parquet_zstd_1
1,parquet,9.50,1.13,438.708549,zstd,2,5.962,parquet_zstd,parquet_zstd_2
2,parquet,10.22,1.13,427.473406,zstd,3,6.119,parquet_zstd,parquet_zstd_3
3,parquet,10.76,1.13,422.640075,zstd,4,6.189,parquet_zstd,parquet_zstd_4
4,parquet,13.00,1.14,413.288786,zstd,5,6.329,parquet_zstd,parquet_zstd_5
...,...,...,...,...,...,...,...,...,...
92,csv,95.70,34.35,253.524595,bz2,0,10.318,csv_bz2,csv_bz2_default
93,csv,37.43,15.71,362.189476,zstd,0,7.222,csv_zstd,csv_zstd_default
94,csv,564.32,26.31,257.165257,xz,0,10.172,csv_xz,csv_xz_default
95,csv,34.24,15.28,1217.675781,tar,0,2.148,csv_tar,csv_tar_default


In [4]:
# Level 0 is the placeholder given to rows without an explicit compression level.
uses_default_level = df["compression_level"].eq(0)

# Order df_default_cl by method_compression_with_level so the bars are grouped by format.
df_default_cl = (
    df.loc[uses_default_level]
    .copy()
    .sort_values("method_compression_with_level", ascending=True)
)

df_default_cl

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
92,csv,95.7,34.35,253.524595,bz2,0,10.318,csv_bz2,csv_bz2_default
96,csv,33.82,14.74,1217.669496,default,0,2.148,csv_default,csv_default_default
91,csv,108.02,17.66,368.608737,gzip,0,7.096,csv_gzip,csv_gzip_default
95,csv,34.24,15.28,1217.675781,tar,0,2.148,csv_tar,csv_tar_default
94,csv,564.32,26.31,257.165257,xz,0,10.172,csv_xz,csv_xz_default
90,csv,72.98,15.97,370.082006,zip,0,7.068,csv_zip,csv_zip_default
93,csv,37.43,15.71,362.189476,zstd,0,7.222,csv_zstd,csv_zstd_default
89,feather,7.16,0.62,576.733133,default,0,4.536,feather_default,feather_default_default
86,feather,7.6,0.64,576.733133,lz4,0,4.536,feather_lz4,feather_lz4_default
88,feather,6.69,0.52,801.368258,uncompressed,0,3.264,feather_uncompressed,feather_uncompressed_default


In [5]:
# Compression ratio of every default-level run, one bar per format/compression.
axis_labels = {
    "comp_ratio": "compression ratio",
    "method_compression_with_level": "format_compression_level",
}
fig = px.bar(
    data_frame=df_default_cl,
    x="method_compression_with_level",
    y="comp_ratio",
    labels=axis_labels,
    text_auto=True,  # print the value on each bar
)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_bar_comp_ratio.svg")


## Insights

- the best compression ratios are achieved by
  - bz2
  - xz
  - brotli

## Next Steps

- check save times
- check read times

In [6]:
# Save time of every default-level run, one bar per format/compression.
axis_labels = {
    "save_time": "time to save data in seconds",
    "method_compression_with_level": "format_compression_level",
}
fig = px.bar(
    data_frame=df_default_cl,
    x="method_compression_with_level",
    y="save_time",
    labels=axis_labels,
    text_auto=True,  # print the value on each bar
)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_bar_save_time.svg")


## Insights

- xz has a very long save time
- xz's save time is so much longer than bz2's that it cannot be justified, especially since xz does not even compress better than bz2 (compression ratio 10.172 vs. 10.318)
- xz is out
- bz2 also has a long save time
- brotli save time seems to be ok but not fastest

## Next Steps

- also check read times

In [7]:
# Read time of every default-level run, one bar per format/compression.
axis_labels = {
    "read_time": "time to read data in seconds",
    "method_compression_with_level": "format_compression_level",
}
fig = px.bar(
    data_frame=df_default_cl,
    x="method_compression_with_level",
    y="read_time",
    labels=axis_labels,
    text_auto=True,  # print the value on each bar
)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_bar_read_time.svg")

## Insights

- bz2 has a very long read time
- xz also has a long read time
- brotli read time seems to be ok but not fastest

## Next Steps

- what about the compression level?

In [8]:
# Labels to compare across compression levels: the strongest default-level
# candidates plus levels 1, 5, 10 and 18 of zstd and lz4 for parquet and feather.
interesting_method_compression_with_level = [
    # default-level candidates
    "csv_bz2_default", "csv_xz_default", "parquet_brotli_default",
    # parquet
    "parquet_zstd_1", "parquet_zstd_5", "parquet_zstd_10", "parquet_zstd_18",
    "parquet_lz4_1", "parquet_lz4_5", "parquet_lz4_10", "parquet_lz4_18",
    # feather
    "feather_zstd_1", "feather_zstd_5", "feather_zstd_10", "feather_zstd_18",
    "feather_lz4_1", "feather_lz4_5", "feather_lz4_10", "feather_lz4_18",
]

# Rows are deliberately left in their original order (no sorting by label).
is_interesting = df["method_compression_with_level"].isin(
    interesting_method_compression_with_level
)
df_interesting_levels = df.loc[is_interesting].copy()
df_interesting_levels

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,9.79,1.11,454.351955,zstd,1,5.757,parquet_zstd,parquet_zstd_1
4,parquet,13.0,1.14,413.288786,zstd,5,6.329,parquet_zstd,parquet_zstd_5
9,parquet,20.76,1.1,400.22748,zstd,10,6.536,parquet_zstd,parquet_zstd_10
17,parquet,104.99,1.1,370.356761,zstd,18,7.063,parquet_zstd,parquet_zstd_18
20,parquet,8.3,0.82,590.422794,lz4,1,4.43,parquet_lz4,parquet_lz4_1
24,parquet,21.31,0.8,475.681746,lz4,5,5.499,parquet_lz4,parquet_lz4_5
29,parquet,36.41,0.8,472.393462,lz4,10,5.537,parquet_lz4,parquet_lz4_10
37,parquet,45.05,0.8,471.955799,lz4,18,5.542,parquet_lz4,parquet_lz4_18
40,feather,7.81,0.87,437.126841,zstd,1,5.984,feather_zstd,feather_zstd_1
44,feather,10.41,0.97,394.070826,zstd,5,6.638,feather_zstd,feather_zstd_5


In [9]:
# Compression ratio for the hand-picked format/compression/level combinations.
axis_labels = {
    "comp_ratio": "compression ratio",
    "method_compression_with_level": "format_compression_level",
}
fig = px.bar(
    data_frame=df_interesting_levels,
    x="method_compression_with_level",
    y="comp_ratio",
    labels=axis_labels,
    text_auto=True,  # print the value on each bar
)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_bar_comp_ratio_feather.svg")


In [10]:
# Save time for the hand-picked format/compression/level combinations.
axis_labels = {
    "save_time": "time to save data in seconds",
    "method_compression_with_level": "format_compression_level",
}
fig = px.bar(
    data_frame=df_interesting_levels,
    x="method_compression_with_level",
    y="save_time",
    labels=axis_labels,
    text_auto=True,  # print the value on each bar
)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_bar_save_time_feather.svg")

In [11]:
# Read time for the hand-picked format/compression/level combinations.
axis_labels = {
    "read_time": "time to read data in seconds",
    "method_compression_with_level": "format_compression_level",
}
fig = px.bar(
    data_frame=df_interesting_levels,
    x="method_compression_with_level",
    y="read_time",
    labels=axis_labels,
    text_auto=True,  # print the value on each bar
)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_bar_read_time_feather.svg")

In [12]:
# zstd runs with an explicit (positive) compression level, for feather and parquet.
# Note: despite its name, feather_df holds the parquet rows as well.
is_feather_or_parquet = df["method"].isin(["feather", "parquet"])
has_explicit_level = df["compression_level"].gt(0)
uses_zstd = df["compression"].eq("zstd")

feather_df = df.loc[is_feather_or_parquet & has_explicit_level & uses_zstd].copy()

feather_df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,9.79,1.11,454.351955,zstd,1,5.757,parquet_zstd,parquet_zstd_1
1,parquet,9.5,1.13,438.708549,zstd,2,5.962,parquet_zstd,parquet_zstd_2
2,parquet,10.22,1.13,427.473406,zstd,3,6.119,parquet_zstd,parquet_zstd_3
3,parquet,10.76,1.13,422.640075,zstd,4,6.189,parquet_zstd,parquet_zstd_4
4,parquet,13.0,1.14,413.288786,zstd,5,6.329,parquet_zstd,parquet_zstd_5
5,parquet,14.69,1.13,407.768621,zstd,6,6.415,parquet_zstd,parquet_zstd_6
6,parquet,15.58,1.11,404.410295,zstd,7,6.468,parquet_zstd,parquet_zstd_7
7,parquet,17.48,1.09,402.135481,zstd,8,6.505,parquet_zstd,parquet_zstd_8
8,parquet,17.27,1.09,401.631297,zstd,9,6.513,parquet_zstd,parquet_zstd_9
9,parquet,20.76,1.1,400.22748,zstd,10,6.536,parquet_zstd,parquet_zstd_10


In [13]:
# Compression ratio against compression level; colour and marker symbol both encode "type".
axis_labels = {
    "comp_ratio": "compression ratio",
    "compression_level": "level of compression",
}
fig = px.scatter(
    data_frame=feather_df,
    x="compression_level",
    y="comp_ratio",
    color="type",
    symbol="type",
    hover_data=["save_time", "read_time"],  # show the timings in the tooltip
    labels=axis_labels,
)
fig.update_traces(marker_size=8)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_scatter_comp_ratio_feather.svg")

In [14]:
# Save time against compression level; colour and marker symbol both encode "type".
axis_labels = {
    "save_time": "time to save data in seconds",
    "compression_level": "level of compression",
}
fig = px.scatter(
    data_frame=feather_df,
    x="compression_level",
    y="save_time",
    color="type",
    symbol="type",
    hover_data=["comp_ratio", "read_time"],  # show the other metrics in the tooltip
    labels=axis_labels,
)
fig.update_traces(marker_size=8)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_scatter_save_time_feather.svg")

In [15]:
# Read time against compression level; colour and marker symbol both encode "type".
axis_labels = {
    "read_time": "time to read data in seconds",
    "compression_level": "level of compression",
}
fig = px.scatter(
    data_frame=feather_df,
    x="compression_level",
    y="read_time",
    color="type",
    symbol="type",
    hover_data=["comp_ratio", "save_time"],  # show the other metrics in the tooltip
    labels=axis_labels,
)
fig.update_traces(marker_size=8)
fig.show()
# Keep a static SVG copy of the chart next to the other plots.
fig.write_image("./plots_date/pandas_ff_scatter_read_time_feather.svg")