In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load the benchmark results.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle("./data/test_results_02.pkl.gz")

# Normalise the columns in a single pass:
#   * a missing compression level becomes 0 and the column is cast to int
#   * a missing compression name becomes "default"
#   * "type" combines storage method and compression name
#   * timings are rounded to 2 decimals, the compression ratio to 3
df = df.assign(
    compression_level=lambda d: d["compression_level"].fillna(0).astype(int),
    compression=lambda d: d["compression"].fillna("default"),
    type=lambda d: d["method"] + "_" + d["compression"],
    save_time=lambda d: d["save_time"].round(2),
    read_time=lambda d: d["read_time"].round(2),
    comp_ratio=lambda d: d["comp_ratio"].round(3),
)

df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type
0,parquet,1.72,0.96,205.095127,zstd,1,3.830,parquet_zstd
1,parquet,2.16,1.00,190.350016,zstd,2,4.126,parquet_zstd
2,parquet,2.46,1.01,181.888956,zstd,3,4.318,parquet_zstd
3,parquet,2.55,1.01,179.640458,zstd,4,4.372,parquet_zstd
4,parquet,4.07,1.03,174.038219,zstd,5,4.513,parquet_zstd
...,...,...,...,...,...,...,...,...
92,csv,62.24,16.67,153.719321,bz2,0,5.110,csv_bz2
93,csv,12.85,5.50,184.943592,zstd,0,4.247,csv_zstd
94,csv,211.05,11.24,131.529678,xz,0,5.972,csv_xz
95,csv,10.89,4.96,691.806641,tar,0,1.135,csv_tar


In [3]:
def method_compression_with_level(row) -> str:
    """Build the label ``<method>_<compression>_<level>`` for one result row.

    Args:
        row: Mapping-like record providing the keys ``method``,
            ``compression`` and ``compression_level``.

    Returns:
        The three parts joined by underscores. A compression level that is
        not greater than 0 is rendered as the literal ``default``.
    """
    level = row["compression_level"]
    if level > 0:
        suffix = level
    else:
        # Level 0 (or anything not above 0) stands for "no explicit level".
        suffix = "default"
    return f"{row['method']}_{row['compression']}_{suffix}"

# Attach a per-row label that also encodes the compression level.
df = df.assign(
    method_compression_with_level=df.apply(method_compression_with_level, axis=1)
)
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,1.72,0.96,205.095127,zstd,1,3.830,parquet_zstd,parquet_zstd_1
1,parquet,2.16,1.00,190.350016,zstd,2,4.126,parquet_zstd,parquet_zstd_2
2,parquet,2.46,1.01,181.888956,zstd,3,4.318,parquet_zstd,parquet_zstd_3
3,parquet,2.55,1.01,179.640458,zstd,4,4.372,parquet_zstd,parquet_zstd_4
4,parquet,4.07,1.03,174.038219,zstd,5,4.513,parquet_zstd,parquet_zstd_5
...,...,...,...,...,...,...,...,...,...
92,csv,62.24,16.67,153.719321,bz2,0,5.110,csv_bz2,csv_bz2_default
93,csv,12.85,5.50,184.943592,zstd,0,4.247,csv_zstd,csv_zstd_default
94,csv,211.05,11.24,131.529678,xz,0,5.972,csv_xz,csv_xz_default
95,csv,10.89,4.96,691.806641,tar,0,1.135,csv_tar,csv_tar_default


In [4]:
# Select the runs whose compression level is 0 (i.e. no explicit level)
# and order them alphabetically by method_compression_with_level.
df_default_cl = (
    df.loc[df["compression_level"] == 0]
    .copy()
    .sort_values("method_compression_with_level", ascending=True)
)

df_default_cl

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
92,csv,62.24,16.67,153.719321,bz2,0,5.11,csv_bz2,csv_bz2_default
96,csv,10.83,4.97,691.797897,default,0,1.135,csv_default,csv_default_default
91,csv,31.16,7.01,198.554004,gzip,0,3.956,csv_gzip,csv_gzip_default
95,csv,10.89,4.96,691.806641,tar,0,1.135,csv_tar,csv_tar_default
94,csv,211.05,11.24,131.529678,xz,0,5.972,csv_xz,csv_xz_default
90,csv,27.7,5.63,199.115047,zip,0,3.945,csv_zip,csv_zip_default
93,csv,12.85,5.5,184.943592,zstd,0,4.247,csv_zstd,csv_zstd_default
89,feather,1.26,0.52,299.813852,default,0,2.62,feather_default,feather_default_default
86,feather,1.23,0.51,299.813852,lz4,0,2.62,feather_lz4,feather_lz4_default
88,feather,0.34,0.39,634.940203,uncompressed,0,1.237,feather_uncompressed,feather_uncompressed_default


In [5]:
# Bar chart: compression ratio of every format/compression pair at level 0.
fig = px.bar(
    df_default_cl,
    x="method_compression_with_level",
    y="comp_ratio",
    labels={
        "method_compression_with_level": "format_compression_level",
        "comp_ratio": "compression ratio",
    },
    text_auto=True,
)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_bar_comp_ratio.svg")


## Insights

- the best compression ratios are achieved by
  - bz2
  - xz
  - brotli

## Next Steps

- check save times
- check read times

In [6]:
# Bar chart: save time of every format/compression pair at level 0.
fig = px.bar(
    df_default_cl,
    x="method_compression_with_level",
    y="save_time",
    labels={
        "method_compression_with_level": "format_compression_level",
        "save_time": "time to save data in seconds",
    },
    text_auto=True,
)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_bar_save_time.svg")


## Insights

- xz has a very long save time
- xz save time is so much longer than that of bz2 that the slightly better compression compared to bz2 does not justify it
- xz is out
- bz2 also has a long save time
- brotli save time seems to be ok but not fastest

## Next Steps

- also check read times

In [7]:
# Bar chart: read time of every format/compression pair at level 0.
fig = px.bar(
    df_default_cl,
    x="method_compression_with_level",
    y="read_time",
    labels={
        "method_compression_with_level": "format_compression_level",
        "read_time": "time to read data in seconds",
    },
    text_auto=True,
)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_bar_read_time.svg")

## Insights

- bz2 has a very long read time
- xz also has a long read time
- brotli read time seems to be ok but not fastest

## Next Steps

- what about the compression level?

In [8]:
# Labels to compare: the strongest CSV compressors and parquet/brotli at their
# default level, plus zstd and lz4 at levels 1, 5, 10 and 18 for both parquet
# and feather (generated as parquet_zstd_*, parquet_lz4_*, feather_zstd_*,
# feather_lz4_*).
interesting_method_compression_with_level = [
    "csv_bz2_default",
    "csv_xz_default",
    "parquet_brotli_default",
    *(
        f"{file_format}_{codec}_{level}"
        for file_format in ("parquet", "feather")
        for codec in ("zstd", "lz4")
        for level in (1, 5, 10, 18)
    ),
]

# Rows are deliberately left unsorted, so they keep the order they have in df.
df_interesting_levels = df.loc[
    df["method_compression_with_level"].isin(interesting_method_compression_with_level)
].copy()
df_interesting_levels

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,1.72,0.96,205.095127,zstd,1,3.83,parquet_zstd,parquet_zstd_1
4,parquet,4.07,1.03,174.038219,zstd,5,4.513,parquet_zstd,parquet_zstd_5
9,parquet,10.36,0.88,163.014119,zstd,10,4.818,parquet_zstd,parquet_zstd_10
17,parquet,77.92,0.92,151.678466,zstd,18,5.178,parquet_zstd,parquet_zstd_18
20,parquet,1.34,0.67,299.881209,lz4,1,2.619,parquet_lz4,parquet_lz4_1
24,parquet,8.75,0.64,221.618096,lz4,5,3.544,parquet_lz4,parquet_lz4_5
29,parquet,20.99,0.64,218.336486,lz4,10,3.597,parquet_lz4,parquet_lz4_10
37,parquet,28.96,0.64,217.898915,lz4,18,3.605,parquet_lz4,parquet_lz4_18
40,feather,1.37,0.76,203.281542,zstd,1,3.864,feather_zstd,feather_zstd_1
44,feather,3.69,0.85,169.25403,zstd,5,4.641,feather_zstd,feather_zstd_5


In [9]:
# Bar chart: compression ratio for the hand-picked format/compression/level labels.
fig = px.bar(
    df_interesting_levels,
    x="method_compression_with_level",
    y="comp_ratio",
    labels={
        "method_compression_with_level": "format_compression_level",
        "comp_ratio": "compression ratio",
    },
    text_auto=True,
)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_bar_comp_ratio_feather.svg")


In [10]:
# Bar chart: save time for the hand-picked format/compression/level labels.
fig = px.bar(
    df_interesting_levels,
    x="method_compression_with_level",
    y="save_time",
    labels={
        "method_compression_with_level": "format_compression_level",
        "save_time": "time to save data in seconds",
    },
    text_auto=True,
)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_bar_save_time_feather.svg")

In [11]:
# Bar chart: read time for the hand-picked format/compression/level labels.
fig = px.bar(
    df_interesting_levels,
    x="method_compression_with_level",
    y="read_time",
    labels={
        "method_compression_with_level": "format_compression_level",
        "read_time": "time to read data in seconds",
    },
    text_auto=True,
)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_bar_read_time_feather.svg")

In [12]:
# Restrict to feather and parquet runs that use zstd with an explicit
# (positive) compression level. Dropping the last two conditions would
# compare all feather and parquet runs instead.
feather_df = df.loc[
    df["method"].isin(["feather", "parquet"])
    & (df["compression_level"] > 0)
    & (df["compression"] == "zstd")
].copy()

feather_df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,1.72,0.96,205.095127,zstd,1,3.83,parquet_zstd,parquet_zstd_1
1,parquet,2.16,1.0,190.350016,zstd,2,4.126,parquet_zstd,parquet_zstd_2
2,parquet,2.46,1.01,181.888956,zstd,3,4.318,parquet_zstd,parquet_zstd_3
3,parquet,2.55,1.01,179.640458,zstd,4,4.372,parquet_zstd,parquet_zstd_4
4,parquet,4.07,1.03,174.038219,zstd,5,4.513,parquet_zstd,parquet_zstd_5
5,parquet,5.71,0.96,168.916452,zstd,6,4.65,parquet_zstd,parquet_zstd_6
6,parquet,6.36,0.94,166.693084,zstd,7,4.712,parquet_zstd,parquet_zstd_7
7,parquet,8.06,0.9,164.832357,zstd,8,4.765,parquet_zstd,parquet_zstd_8
8,parquet,8.05,0.89,164.416287,zstd,9,4.777,parquet_zstd,parquet_zstd_9
9,parquet,10.36,0.88,163.014119,zstd,10,4.818,parquet_zstd,parquet_zstd_10


In [13]:
# Scatter plot: compression ratio against compression level, one colour and
# marker symbol per "type"; the timings are available on hover.
fig = px.scatter(
    feather_df,
    x="compression_level",
    y="comp_ratio",
    color="type",
    symbol="type",
    hover_data=["save_time", "read_time"],
    labels={
        "compression_level": "level of compression",
        "comp_ratio": "compression ratio",
    },
)
fig.update_traces(marker_size=8)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_scatter_comp_ratio_feather.svg")

In [14]:
# Scatter plot: save time against compression level, one colour and marker
# symbol per "type"; compression ratio and read time are available on hover.
fig = px.scatter(
    feather_df,
    x="compression_level",
    y="save_time",
    color="type",
    symbol="type",
    hover_data=["comp_ratio", "read_time"],
    labels={
        "compression_level": "level of compression",
        "save_time": "time to save data in seconds",
    },
)
fig.update_traces(marker_size=8)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_scatter_save_time_feather.svg")

In [15]:
# Scatter plot: read time against compression level, one colour and marker
# symbol per "type"; compression ratio and save time are available on hover.
fig = px.scatter(
    feather_df,
    x="compression_level",
    y="read_time",
    color="type",
    symbol="type",
    hover_data=["comp_ratio", "save_time"],
    labels={
        "compression_level": "level of compression",
        "read_time": "time to read data in seconds",
    },
)
fig.update_traces(marker_size=8)
fig.show()
# Also store the figure as an SVG file.
fig.write_image("./plots/pandas_ff_scatter_read_time_feather.svg")