## Импорт библиотек

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import warnings
import yaml
import matplotlib.pyplot as plt
import pandas as pd
warnings.filterwarnings("ignore")
from scripts.create_nab_dataset import create_nab_data
from scripts.create_country_df import get_country_df
from sz.SZ3.tools.pysz.pysz import SZ
from settings import RESULTS_PATH, MODEL_PARAMETERS_PATH
from compress.sprintz_encode import compress_sprintz, decompress_sprintz, get_compress_info_sprintz
from compress.bypass import spatial_clustering
from compress.general_functions import get_errors, get_peak_resource, cnn_resource_usage
from compress.xor_encode import xor_compress_df, get_compress_info_xor, decompress_xor_df
from compress.lz4_encode import LZ4_compress_df, LZ4_decompress_df, get_compress_info_lz4
from compress.spatial_lz4 import spatial_clustering_PCA_LZ4, spatial_lz4_decompress, get_compress_info_spatial_PCA_LZ4
from compress.spatial_xor import spatial_clustering_xor, spatial_XOR_decompress, get_compress_info_spatial_xor
from compress.spatial_sprintz import spatial_clustering_sprintz, spatial_sprintz_decompress, get_compress_info_spatial_sprintz
from compress.sz3_encode import compress_sz3_df, decompress_sz3, get_compress_info_sz3, compress_sz3_all
from compress.cnn_encode import compress_cnn_sz3, decomress_cnn_sz3, get_compress_info_cnn_sz3, compress_cnn_cluster

## Загрузка тестовых данных

In [2]:
df_speed = create_nab_data()
df_wind, wind_geo_dict = get_country_df()
with open(MODEL_PARAMETERS_PATH, 'r') as f:
    parameters = yaml.safe_load(f)
parameters = parameters["country_df"]

In [3]:
spatial_clustering(df_wind, wind_geo_dict, parameters["cor_lvl"])

{'sensor_0': ['sensor_0', 'sensor_4', 'sensor_3', 'sensor_5'],
 'sensor_1': ['sensor_1'],
 'sensor_2': ['sensor_2']}

## Коэффициент сжатия и ошибки декодирования

In [4]:
comp = {}
mape = {}
mse = {}

In [5]:
print("Sprintz \n")
speed_sprintz_res = compress_sprintz(df=df_speed, chunk_size=8)
decode_res = decompress_sprintz(speed_sprintz_res, num_cols = 5, chunk_size=7)
decode_res.columns = df_speed.columns
comp["sprintz"] = get_compress_info_sprintz(df_speed, speed_sprintz_res)
print("\n")

print("XOR \n")
compressed_df = xor_compress_df(df_wind)
decompressed_df = decompress_xor_df(compressed_df)
decompressed_df.columns = [f'sensor_{i}' for i in range(df_wind.shape[1])]
comp["xor"] = get_compress_info_xor(df_wind, compressed_df)
print("\n")

print("LZ4 \n")
compressed_df = LZ4_compress_df(df_wind)
decompressed_df = LZ4_decompress_df(compressed_df)
comp["lz4"] = get_compress_info_lz4(df_wind, compressed_df)
print("\n")

print("Spatial XOR \n")
df = df_wind.round(15).copy()
res = spatial_clustering_xor(df, wind_geo_dict, parameters["cor_lvl"])
decompressed_df = spatial_XOR_decompress(res)
comp["spatial_xor"] = get_compress_info_spatial_xor(df, res)
print("\n")

print("Spatial Sprintz \n")
df = df_wind*100
df = df.astype(int)
res = spatial_clustering_sprintz(df.copy(), wind_geo_dict, parameters["cor_lvl"])
clust_dict = spatial_clustering(
    df, wind_geo_dict, parameters["cor_lvl"])
decode_res = spatial_sprintz_decompress(res,
                                        clust_dict)
comp["spatial_sprintz"] = get_compress_info_spatial_sprintz(df, res)
print("\n")

print("PCA \n")
res = spatial_clustering_PCA_LZ4(df_wind, wind_geo_dict, parameters["cor_lvl"])
comp["pca"] = get_compress_info_spatial_PCA_LZ4(df_wind, res)
clusters = spatial_clustering(df_wind, wind_geo_dict, parameters["cor_lvl"])
dec_res = spatial_lz4_decompress(res, clusters)
mse["pca"], mape["pca"] = get_errors(df_wind, dec_res)
print("\n")

print("SZ3 separately \n")
data_enc = compress_sz3_df(df_wind, parameters["er_abs_sz3"])
comp["sz3_separately"] = get_compress_info_sz3(df_wind, data_enc)
data = df_wind.values.transpose()
shape = [d.shape for d in data]
type = data[0].dtype
data_dec = decompress_sz3(data_enc, shape, type)
mse["sz3_separately"], mape["sz3_separately"] = get_errors(df_wind, data_dec)
print("\n")

print("SZ3 together")
data = df_wind.values.transpose()
data_enc = compress_sz3_all(df_wind, parameters["er_abs_sz3"])
comp["sz3_together"] = get_compress_info_sz3(df_wind, data_enc)
data_dec = decompress_sz3(data_enc, data.shape, data.dtype)
mse["sz3_together"], mape["sz3_together"] = get_errors(df_wind, data_dec)
print("\n")

print("CNN ZSTD \n")
enc_df = compress_cnn_sz3(df_wind,
                          wind_geo_dict,
                          cor_lvl=parameters["cor_lvl"],
                          use_dwt=False,
                          window_size=parameters["window_size"],
                          num_epochs=parameters["num_epochs"],
                          extra_layer=parameters["extra_layer"],
                          conv_filter=parameters["conv_filter"],
                          plot_flag=False,
                          er_abs_sz3=parameters["er_abs_sz3"],
                          model_compress="zstd")
comp["CNN_zstd"] = get_compress_info_cnn_sz3(df_wind, enc_df)
dec_df = decomress_cnn_sz3(enc_df, (5371,), use_dwt=False)
dec_df = dec_df.sort_index(axis=1)
mse["CNN_zstd"], mape["CNN_zstd"] = get_errors(df_wind, dec_df)
print("\n")

print("CNN LZ4 \n")
enc_df = compress_cnn_sz3(df_wind,
                          wind_geo_dict,
                          cor_lvl=parameters["cor_lvl"],
                          use_dwt=False,
                          window_size=parameters["window_size"],
                          num_epochs=parameters["num_epochs"],
                          extra_layer=parameters["extra_layer"],
                          conv_filter=parameters["conv_filter"],
                          plot_flag=False,
                          er_abs_sz3=parameters["er_abs_sz3"],
                          model_compress="lz4")
comp["CNN_lz4"] = get_compress_info_cnn_sz3(df_wind, enc_df)
dec_df = decomress_cnn_sz3(enc_df, (5371,), use_dwt=False, window_size=64, model_compress="lz4")
dec_df = dec_df.sort_index(axis=1)
mse["CNN_lz4"], mape["CNN_lz4"] = mse["CNN_zstd"], mape["CNN_zstd"]
print("\n")

print("CNN(dwt) ZSTD \n")
enc_df = compress_cnn_sz3(df_wind,
                          wind_geo_dict,
                          cor_lvl=parameters["cor_lvl"],
                          use_dwt=True,
                          window_size=parameters["window_size"],
                          num_epochs=parameters["num_epochs_dwt"],
                          conv_filter=parameters["conv_filter_dwt"],
                          plot_flag=False,
                          er_abs_sz3=parameters["er_abs_sz3"],
                          model_compress="zstd")
comp["CNN_dwt_zstd"] = get_compress_info_cnn_sz3(df_wind, enc_df)
dec_df = decomress_cnn_sz3(enc_df, (5371,), use_dwt=True, model_compress="zstd")
dec_df = dec_df.sort_index(axis=1)
mse["CNN_dwt_zstd"], mape["CNN_dwt_zstd"] = get_errors(df_wind, dec_df)
print("\n")

print("CNN(dwt) LZ4 \n")
enc_df = compress_cnn_sz3(df_wind,
                          wind_geo_dict,
                          cor_lvl=parameters["cor_lvl"],
                          use_dwt=True,
                          window_size=parameters["window_size"],
                          num_epochs=parameters["num_epochs_dwt"],
                          conv_filter=parameters["conv_filter_dwt"],
                          plot_flag=False,
                          er_abs_sz3=parameters["er_abs_sz3"],
                          model_compress="lz4")
comp["CNN_dwt_lz4"] = get_compress_info_cnn_sz3(df_wind, enc_df)
dec_df = decomress_cnn_sz3(enc_df, (5371,), use_dwt=True, model_compress="lz4")
dec_df = dec_df.sort_index(axis=1)
mse["CNN_dwt_lz4"], mape["CNN_dwt_lz4"] = mse["CNN_dwt_zstd"], mape["CNN_dwt_zstd"]

Sprintz 

Размер исходных данных: 2179 байт 

Размер сжатых данных: 1973 байт 

Коэффициент сжатия: 1.104


XOR 

Размер исходных данных: 257808 байт 

Размер сжатых XOR данных: 249461 байт 

Коэффициент сжатия: 1.033


LZ4 

Размер исходных данных: 257808 байт 

Размер сжатых данных: 191643 байт 

Коэффициент сжатия: 1.345


Spatial XOR 

Размер исходных данных: 257808 байт 

Размер сжатых данных: 255656.625 байт 

Коэффициент сжатия: 1.008


Spatial Sprintz 

Размер исходных данных: 16428 байт 

Размер сжатых данных: 42420 байт 

Коэффициент сжатия: 0.387


PCA 

Размер исходных данных: 257808 байт 

Размер сжатых данных: 148768 байт 

Коэффициент сжатия: 1.733
MSE: 0.001079 

MAPE: 13.78 % 



SZ3 separately 

Размер исходных данных: 257808 байт 

Размер сжатых данных: 12635 байт 

Коэффициент сжатия: 20.404
MSE: 0.000271 

MAPE: 20.64 % 



SZ3 together
Размер исходных данных: 257808 байт 

Размер сжатых данных: 11523 байт 

Коэффициент сжатия: 22.373
MSE: 0.000263 

MAPE: 22.64 % 

In [6]:
res_country_df = pd.DataFrame({
    'алгоритм': list(comp.keys()),
    'коэффициент сжатия': list(comp.values()),
    'mse': [mse.get(key, "-") for key in comp.keys()],
    'mape': [mape.get(key, "-") for key in comp.keys()]
})
res_country_df.to_excel(RESULTS_PATH / "res_country_df.xlsx", index=False)
res_country_df

Unnamed: 0,алгоритм,коэффициент сжатия,mse,mape
0,sprintz,1.104,-,-
1,xor,1.033,-,-
2,lz4,1.345,-,-
3,spatial_xor,1.008,-,-
4,spatial_sprintz,0.387,-,-
5,pca,1.733,0.001079,13.78
6,sz3_separately,20.404,0.000271,20.64
7,sz3_together,22.373,0.000263,22.64
8,CNN_zstd,23.163,0.004245,35.04
9,CNN_lz4,21.473,0.004245,35.04


## Замер времени

In [7]:
time_dict = {}

In [8]:
print("Sprintz \n")
speed_sprintz_res = compress_sprintz(df=df_speed, chunk_size=8)
time_dict['sprintz (enc)'] = %timeit -o compress_sprintz(df=df_speed, chunk_size=8)
time_dict['sprintz (dec)'] = %timeit -o  decompress_sprintz(speed_sprintz_res, num_cols = 5, chunk_size=7)

print("\n")
print("XOR \n")
compressed_df = xor_compress_df(df_wind)
time_dict['xor (enc)'] = %timeit -o xor_compress_df(df_wind)
time_dict['xor (dec)'] = %timeit -o decompress_xor_df(compressed_df)

print("\n")
print("LZ4 \n")
compressed_df = LZ4_compress_df(df_wind)
time_dict['lz4 (enc)'] = %timeit -o LZ4_compress_df(df_wind)
time_dict['lz4 (dec)'] = %timeit -o LZ4_decompress_df(compressed_df)

print("\n")
print("PCA \n")
res = spatial_clustering_PCA_LZ4(df_wind, wind_geo_dict, parameters["cor_lvl"])
time_dict['pca (enc)'] = %timeit -o spatial_clustering_PCA_LZ4(df_wind, wind_geo_dict, parameters["cor_lvl"])
clusters = spatial_clustering(df_wind, wind_geo_dict, parameters["cor_lvl"])
time_dict['pca (dec)'] = %timeit -o spatial_lz4_decompress(res, clusters)

print("\n")
print("SZ3 \n")
data_enc = compress_sz3_df(df_wind, parameters["er_abs_sz3"])
time_dict['sz3 (enc)'] = %timeit -o compress_sz3_df(df_wind, parameters["er_abs_sz3"])
data = df_wind.values.transpose()
shape = [d.shape for d in data]
type = data[0].dtype
time_dict['sz3 (dec)'] = %timeit -o decompress_sz3(data_enc, shape, type)

print("\n")
print("CNN \n")
cluster = ['sensor_0', 'sensor_4', 'sensor_3', 'sensor_5']
enc_df = {}
enc_df[tuple(cluster)] = compress_cnn_cluster(df_wind[cluster], use_dwt=False,
                                window_size=parameters["window_size"],
                                num_epochs=parameters["num_epochs"],
                                extra_layer=parameters["extra_layer"],
                                conv_filter=parameters["conv_filter"], plot_flag=False,
                                er_abs_sz3=parameters["er_abs_sz3"],  model_compress="zstd")
time_dict['cnn zstd (enc)'] = %timeit -o compress_cnn_cluster(df_wind[cluster], use_dwt=False, num_epochs=parameters["num_epochs"], extra_layer=parameters["extra_layer"], conv_filter=parameters["conv_filter"], plot_flag=False, er_abs_sz3=parameters["er_abs_sz3"],  model_compress="zstd")
time_dict['cnn lz4 (enc)'] = %timeit -o compress_cnn_cluster(df_wind[cluster], use_dwt=False, num_epochs=parameters["num_epochs"], extra_layer=parameters["extra_layer"], conv_filter=parameters["conv_filter"], plot_flag=False, er_abs_sz3=parameters["er_abs_sz3"],  model_compress="lz4")
time_dict['cnn zstd (dec)'] = %timeit -o decomress_cnn_sz3(enc_df, (5371,), use_dwt=False, model_compress="zstd")
enc_df[tuple(cluster)] = compress_cnn_cluster(df_wind[cluster], use_dwt=False,
                                window_size=parameters["window_size"],
                                num_epochs=parameters["num_epochs"],
                                extra_layer=parameters["extra_layer"],
                                conv_filter=parameters["conv_filter"], plot_flag=False,
                                er_abs_sz3=parameters["er_abs_sz3"], model_compress="lz4")
time_dict['cnn lz4 (dec)'] = %timeit -o decomress_cnn_sz3(enc_df, (5371,), use_dwt=False, model_compress="lz4")


Sprintz 

294 ms ± 97.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
130 ms ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


XOR 

229 ms ± 7.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
293 ms ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


LZ4 

804 µs ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
475 µs ± 112 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


PCA 

5.96 ms ± 912 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
97.5 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


SZ3 

7.36 ms ± 620 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
93.8 ms ± 168 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


CNN 

Время обучения: 92.86 секунд
Size of compressed model (bytes): 3634
Время обучения: 117.91 секунд
Size of compressed model (bytes): 3643
Время обучения: 101.65 секунд
Size of compressed model (bytes): 3634
Время обучения: 96.43 секунд
Size of compres

In [9]:
rows = []
for key, value in time_dict.items():
    mean = value.average
    rows.append({'Method': key, 'Mean (s)': mean})
time_df = pd.DataFrame(rows)
time_df.to_excel(RESULTS_PATH / "time_usage.xlsx", index=False)
time_df

Unnamed: 0,Method,Mean (s)
0,sprintz (enc),0.293924
1,sprintz (dec),0.130208
2,xor (enc),0.229465
3,xor (dec),0.293414
4,lz4 (enc),0.000804
5,lz4 (dec),0.000475
6,pca (enc),0.005963
7,pca (dec),0.097542
8,sz3 (enc),0.007359
9,sz3 (dec),0.093816


## Замер потребления оперативной памяти (RAM)

In [10]:
ram_usage = {}

In [11]:
# SPRINTZ
print("Sprintz")
speed_sprintz_res, ram_usage["SPRINTZ (enc)"] = get_peak_resource(compress_sprintz, df_speed)
_, ram_usage["SPRINTZ (dec)"] = get_peak_resource(decompress_sprintz,
                                               speed_sprintz_res, num_cols=5,
                                               chunk_size=7)
# XOR
print("\n")
print("XOR")
compressed_df, ram_usage["XOR (enc)"] = get_peak_resource(xor_compress_df, df_wind)
_, ram_usage["XOR (dec)"] = get_peak_resource(decompress_xor_df, compressed_df)
# LZ4
print("\n")
print("lz4")
compressed_df, ram_usage["LZ4 (enc)"] = get_peak_resource(LZ4_compress_df, df_wind)
_, ram_usage["LZ4 (dec)"] = get_peak_resource(LZ4_decompress_df, compressed_df)
# Spatial clustering + PCA + LZ4
print("\n")
print("PCA")
res, ram_usage["PCA (enc)"] = get_peak_resource(spatial_clustering_PCA_LZ4, df_wind,
                                           wind_geo_dict, parameters["cor_lvl"])
clusters = spatial_clustering(df_wind, wind_geo_dict, parameters["cor_lvl"])
_, ram_usage["PCA (dec)"] = get_peak_resource(spatial_lz4_decompress, res, clusters)
# CNN
print("\n")
print("CNN")
cluster = ['sensor_0', 'sensor_4', 'sensor_3', 'sensor_5']
enc_df = {}
enc_df[tuple(cluster)], ram_usage["CNN (enc)"] = cnn_resource_usage(compress_cnn_cluster,
                                df_wind[cluster],
                                use_dwt=False,
                                window_size=parameters["window_size"],
                                num_epochs=parameters["num_epochs"],
                                extra_layer=parameters["extra_layer"],
                                conv_filter=parameters["conv_filter"], plot_flag=False,
                                er_abs_sz3=parameters["er_abs_sz3"],  model_compress="zstd")
_, ram_usage["CNN (dec)"] = cnn_resource_usage(decomress_cnn_sz3, enc_df, (5371,), use_dwt=False)

Sprintz


XOR


lz4


PCA


CNN
Время обучения: 89.26 секунд
Size of compressed model (bytes): 3647


In [12]:
df_ram = pd.DataFrame([
    {"Алгоритм": k, "Пиковое RAM (KiB)": v}
    for k, v in ram_usage.items()
])
df_ram.to_excel(RESULTS_PATH / "ram_usage.xlsx", index=False)
df_ram

Unnamed: 0,Алгоритм,Пиковое RAM (KiB)
0,SPRINTZ (enc),75.63
1,SPRINTZ (dec),305.3
2,XOR (enc),3716.54
3,XOR (dec),3529.65
4,LZ4 (enc),271.95
5,LZ4 (dec),507.16
6,PCA (enc),503.91
7,PCA (dec),3785.88
8,CNN (enc),18780.0
9,CNN (dec),2832.0
