In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load the benchmark results and normalise them in one pass.
# NOTE(review): pickle files can execute arbitrary code when loaded; only read
# files from a trusted source.
df = (
    pd.read_pickle("./data/test_results_03.pkl.gz")
    # Runs without an explicit codec / level get the placeholders "default" / 0.
    .fillna({"compression_level": 0, "compression": "default"})
    # "type" combines the file format with the codec, e.g. "parquet_zstd".
    .assign(type=lambda frame: frame["method"] + "_" + frame["compression"])
    # Round the measurements for display.
    .round({"save_time": 2, "read_time": 2, "comp_ratio": 3})
    # With the missing values filled, the level can be stored as an integer.
    .astype({"compression_level": int})
)

df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type
0,parquet,2.31,1.06,276.737055,zstd,1,2.838,parquet_zstd
1,parquet,2.56,1.04,262.036419,zstd,2,2.998,parquet_zstd
2,parquet,2.83,1.03,253.571889,zstd,3,3.098,parquet_zstd
3,parquet,2.80,1.01,251.290937,zstd,4,3.126,parquet_zstd
4,parquet,4.28,1.02,245.694203,zstd,5,3.197,parquet_zstd
...,...,...,...,...,...,...,...,...
92,csv,49.02,20.00,230.634853,bz2,0,3.406,csv_bz2
93,csv,14.66,6.11,289.558606,zstd,0,2.713,csv_zstd
94,csv,331.15,15.68,212.404030,xz,0,3.698,csv_xz
95,csv,11.37,5.13,691.074219,tar,0,1.137,csv_tar


In [3]:
def method_compression_with_level(row) -> str:
    """Build the label ``<method>_<compression>_<level>`` for one result row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            "method", "compression" and "compression_level".

    Returns:
        The combined label. When "compression_level" is not greater than 0,
        the text "default" is used in place of the level.
    """
    level = row["compression_level"]
    if level > 0:
        return f"{row['method']}_{row['compression']}_{level}"
    return f"{row['method']}_{row['compression']}_default"

# Label every row with its format, codec and level (used as the x axis of the bar charts).
df["method_compression_with_level"] = df.apply(
    method_compression_with_level,
    axis="columns",
)
df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,2.31,1.06,276.737055,zstd,1,2.838,parquet_zstd,parquet_zstd_1
1,parquet,2.56,1.04,262.036419,zstd,2,2.998,parquet_zstd,parquet_zstd_2
2,parquet,2.83,1.03,253.571889,zstd,3,3.098,parquet_zstd,parquet_zstd_3
3,parquet,2.80,1.01,251.290937,zstd,4,3.126,parquet_zstd,parquet_zstd_4
4,parquet,4.28,1.02,245.694203,zstd,5,3.197,parquet_zstd,parquet_zstd_5
...,...,...,...,...,...,...,...,...,...
92,csv,49.02,20.00,230.634853,bz2,0,3.406,csv_bz2,csv_bz2_default
93,csv,14.66,6.11,289.558606,zstd,0,2.713,csv_zstd,csv_zstd_default
94,csv,331.15,15.68,212.404030,xz,0,3.698,csv_xz,csv_xz_default
95,csv,11.37,5.13,691.074219,tar,0,1.137,csv_tar,csv_tar_default


In [4]:
# Keep only the rows whose compression level is 0 and order them by their
# format/codec label so the bars appear alphabetically.
df_default_cl = (
    df.loc[df["compression_level"].eq(0)]
    .sort_values(by="method_compression_with_level")
)

df_default_cl

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
92,csv,49.02,20.0,230.634853,bz2,0,3.406,csv_bz2,csv_bz2_default
96,csv,11.35,5.12,691.066633,default,0,1.137,csv_default,csv_default_default
91,csv,36.43,8.3,298.669783,gzip,0,2.63,csv_gzip,csv_gzip_default
95,csv,11.37,5.13,691.074219,tar,0,1.137,csv_tar,csv_tar_default
94,csv,331.15,15.68,212.40403,xz,0,3.698,csv_xz,csv_xz_default
90,csv,32.37,6.15,298.976316,zip,0,2.627,csv_zip,csv_zip_default
93,csv,14.66,6.11,289.558606,zstd,0,2.713,csv_zstd,csv_zstd_default
89,feather,1.25,0.53,382.216593,default,0,2.055,feather_default,feather_default_default
86,feather,1.29,0.53,382.216593,lz4,0,2.055,feather_lz4,feather_lz4_default
88,feather,0.34,0.39,634.940203,uncompressed,0,1.237,feather_uncompressed,feather_uncompressed_default


In [5]:
# Compression ratio per format/codec for the level-0 rows; also exported as SVG.
fig = px.bar(
    df_default_cl,
    x="method_compression_with_level",
    y="comp_ratio",
    labels=dict(
        comp_ratio="compression ratio",
        method_compression_with_level="format_compression_level",
    ),
    text_auto=True,
)
fig.show()
fig.write_image("./plots/pandas_ff_bar_comp_ratio.svg")


## Insights

- the best compression ratios are achieved by
  - bz2
  - xz
  - brotli

## Next Steps

- check save times
- check read times

In [6]:
# Save time per format/codec for the level-0 rows; also exported as SVG.
fig = px.bar(
    df_default_cl,
    x="method_compression_with_level",
    y="save_time",
    labels=dict(
        save_time="time to save data in seconds",
        method_compression_with_level="format_compression_level",
    ),
    text_auto=True,
)
fig.show()
fig.write_image("./plots/pandas_ff_bar_save_time.svg")


## Insights

- xz has a very long save time
- xz's save time is so much longer than bz2's that it does not justify its slightly better compression compared to bz2
- xz is out
- bz2 also has a long save time
- brotli save time seems to be ok but not fastest

## Next Steps

- also check read times

In [7]:
# Read time per format/codec for the level-0 rows; also exported as SVG.
fig = px.bar(
    df_default_cl,
    x="method_compression_with_level",
    y="read_time",
    labels=dict(
        read_time="time to read data in seconds",
        method_compression_with_level="format_compression_level",
    ),
    text_auto=True,
)
fig.show()
fig.write_image("./plots/pandas_ff_bar_read_time.svg")

## Insights

- bz2 has a very long read time
- xz also has a long read time
- brotli read time seems to be ok but not fastest

## Next Steps

- what about the compression level?

In [8]:
# Hand-picked configurations to compare: the strongest CSV codecs, parquet with
# brotli, and zstd / lz4 at four levels for both parquet and feather.
interesting_method_compression_with_level = [
    "csv_bz2_default",
    "csv_xz_default",
    "parquet_brotli_default",
    *(
        f"{file_format}_{codec}_{level}"
        for file_format in ("parquet", "feather")
        for codec in ("zstd", "lz4")
        for level in (1, 5, 10, 18)
    ),
]

# Rows keep the order they have in df (no sorting is applied here).
df_interesting_levels = df.loc[
    df["method_compression_with_level"].isin(interesting_method_compression_with_level)
].copy()
df_interesting_levels

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,2.31,1.06,276.737055,zstd,1,2.838,parquet_zstd,parquet_zstd_1
4,parquet,4.28,1.02,245.694203,zstd,5,3.197,parquet_zstd,parquet_zstd_5
9,parquet,11.04,0.94,234.672159,zstd,10,3.347,parquet_zstd,parquet_zstd_10
17,parquet,85.79,1.0,223.130116,zstd,18,3.52,parquet_zstd,parquet_zstd_18
20,parquet,1.54,0.68,375.513327,lz4,1,2.092,parquet_lz4,parquet_lz4_1
24,parquet,10.05,0.67,297.175719,lz4,5,2.643,parquet_lz4,parquet_lz4_5
29,parquet,23.11,0.67,293.824037,lz4,10,2.673,parquet_lz4,parquet_lz4_10
37,parquet,33.36,0.67,293.376779,lz4,18,2.677,parquet_lz4,parquet_lz4_18
40,feather,1.63,0.79,274.08423,zstd,1,2.866,feather_zstd,feather_zstd_1
44,feather,3.86,0.87,239.80018,zstd,5,3.275,feather_zstd,feather_zstd_5


In [9]:
# Compression ratio for the hand-picked configurations; also exported as SVG.
fig = px.bar(
    df_interesting_levels,
    x="method_compression_with_level",
    y="comp_ratio",
    labels=dict(
        comp_ratio="compression ratio",
        method_compression_with_level="format_compression_level",
    ),
    text_auto=True,
)
fig.show()
fig.write_image("./plots/pandas_ff_bar_comp_ratio_feather.svg")


In [10]:
# Save time for the hand-picked configurations; also exported as SVG.
fig = px.bar(
    df_interesting_levels,
    x="method_compression_with_level",
    y="save_time",
    labels=dict(
        save_time="time to save data in seconds",
        method_compression_with_level="format_compression_level",
    ),
    text_auto=True,
)
fig.show()
fig.write_image("./plots/pandas_ff_bar_save_time_feather.svg")

In [11]:
# Read time for the hand-picked configurations; also exported as SVG.
fig = px.bar(
    df_interesting_levels,
    x="method_compression_with_level",
    y="read_time",
    labels=dict(
        read_time="time to read data in seconds",
        method_compression_with_level="format_compression_level",
    ),
    text_auto=True,
)
fig.show()
fig.write_image("./plots/pandas_ff_bar_read_time_feather.svg")

In [12]:
# zstd runs with an explicit level (> 0) for the feather and parquet formats.
# To look at every codec and level instead, keep only the "method" condition.
feather_df = df.loc[
    df["method"].isin(["feather", "parquet"])
    & df["compression_level"].gt(0)
    & df["compression"].eq("zstd")
].copy()

feather_df

Unnamed: 0,method,save_time,read_time,file_size,compression,compression_level,comp_ratio,type,method_compression_with_level
0,parquet,2.31,1.06,276.737055,zstd,1,2.838,parquet_zstd,parquet_zstd_1
1,parquet,2.56,1.04,262.036419,zstd,2,2.998,parquet_zstd,parquet_zstd_2
2,parquet,2.83,1.03,253.571889,zstd,3,3.098,parquet_zstd,parquet_zstd_3
3,parquet,2.8,1.01,251.290937,zstd,4,3.126,parquet_zstd,parquet_zstd_4
4,parquet,4.28,1.02,245.694203,zstd,5,3.197,parquet_zstd,parquet_zstd_5
5,parquet,5.91,1.0,240.567246,zstd,6,3.265,parquet_zstd,parquet_zstd_6
6,parquet,6.74,0.99,238.347815,zstd,7,3.295,parquet_zstd,parquet_zstd_7
7,parquet,8.51,0.97,236.484128,zstd,8,3.321,parquet_zstd,parquet_zstd_8
8,parquet,8.6,0.97,236.070246,zstd,9,3.327,parquet_zstd,parquet_zstd_9
9,parquet,11.04,0.94,234.672159,zstd,10,3.347,parquet_zstd,parquet_zstd_10


In [13]:
# Compression ratio against compression level, one colour/symbol per "type";
# save and read times are shown on hover. Also exported as SVG.
fig = px.scatter(
    feather_df,
    x="compression_level",
    y="comp_ratio",
    color="type",
    symbol="type",
    hover_data=["save_time", "read_time"],
    labels=dict(
        comp_ratio="compression ratio",
        compression_level="level of compression",
    ),
)
fig.update_traces(marker_size=8)
fig.show()
fig.write_image("./plots/pandas_ff_scatter_comp_ratio_feather.svg")

In [14]:
# Save time against compression level, one colour/symbol per "type";
# compression ratio and read time are shown on hover. Also exported as SVG.
fig = px.scatter(
    feather_df,
    x="compression_level",
    y="save_time",
    color="type",
    symbol="type",
    hover_data=["comp_ratio", "read_time"],
    labels=dict(
        save_time="time to save data in seconds",
        compression_level="level of compression",
    ),
)
fig.update_traces(marker_size=8)
fig.show()
fig.write_image("./plots/pandas_ff_scatter_save_time_feather.svg")

In [15]:
# Read time against compression level, one colour/symbol per "type";
# compression ratio and save time are shown on hover. Also exported as SVG.
fig = px.scatter(
    feather_df,
    x="compression_level",
    y="read_time",
    color="type",
    symbol="type",
    hover_data=["comp_ratio", "save_time"],
    labels=dict(
        read_time="time to read data in seconds",
        compression_level="level of compression",
    ),
)
fig.update_traces(marker_size=8)
fig.show()
fig.write_image("./plots/pandas_ff_scatter_read_time_feather.svg")