In [1]:
import sys, os
# Location of the IGI.ML.Server checkout that provides the `ml_app` package.
# The IGI_ML_SERVER_DIR environment variable, when set, overrides the default
# so the notebook can run on machines where the project lives elsewhere.
app_dir = os.environ.get('IGI_ML_SERVER_DIR', r'D:\code\projects\IGI.ML.Server')
# Guard the append so that re-running this cell does not stack duplicate entries.
if app_dir not in sys.path:
    sys.path.append(app_dir)

In [2]:
from ml_app.modelling.session_register import active_sessions
from ml_app.modelling.entities.model import ModelOptions, ModelClass, ModelType
from ml_app.modelling.entities.option_types import PreprocessingOptions, TrainingOptions, NullReplacement

# Setup

1. Create a model for each model type, using a large training file.
2. Compare the raw (pickled) object size with the size of the string-encoded form produced by the current code.

In [3]:
# Training data used for every size test below. The path is relative —
# presumably resolved against the notebook's working directory (TODO confirm).
fpath = 'non_linear_darcy_weisbach.csv'
# Create a session from that file; the experiments further down are built through it.
session = active_sessions.create_session_from_filepath(fpath)

INFO:     Created session with key 0dd45fda-7c73-48bf-8ee6-06992d40cf52


In [4]:
# Shared option objects: every model variant below is built from these same
# preprocessing and training settings.
preproc = PreprocessingOptions(
    standardise=True,
    normalise=False,
    null_replacement=NullReplacement.auto,
)
training = TrainingOptions(
    training_split=0.8,
    random_seed=42,
)

def create_model_options(model_type: ModelType):
    """Return regression ``ModelOptions`` that differ only in ``model_type``.

    The preprocessing options, training options and result column are taken
    from the notebook-level ``preproc`` / ``training`` objects and a fixed
    column name, so models built from the returned options share every
    setting except the estimator type.
    """
    shared_settings = dict(
        preproc=preproc,
        training=training,
        result_column='Pressure Drop (Pascals)',
        model_class=ModelClass.Regression,
    )
    return ModelOptions(model_type=model_type, **shared_settings)

# One options object per estimator under test, in the order LR, GBR, GPR.
lr_options, gbr_options, gpr_options = (
    create_model_options(kind)
    for kind in (
        ModelType.LinearRegression,
        ModelType.GradientBoostingRegressor,
        ModelType.GaussianProcessRegressor,
    )
)

In [5]:
# Train one model per options object (LR, then GBR, then GPR) and keep only
# the fitted model from each experiment.
lr_model, gbr_model, gpr_model = (
    session.build_experiment(options).model
    for options in (lr_options, gbr_options, gpr_options)
)

In [6]:
# Create the serialised model strings using the current serialisation process.
from ml_app.dtos.serialise.pickler import get_pickled_model_string

# Column info from the session data is passed alongside every model.
cols = session.data.columns
lr_str, gbr_str, gpr_str = [
    get_pickled_model_string(model=trained, col_info=cols)
    for trained in (lr_model, gbr_model, gpr_model)
]

In [23]:
from datetime import datetime
# Timestamp (to the minute) keeps log files from separate runs apart.
nowish = datetime.now().strftime("%y%m%d_%H%M")
log_file = f"compression_tests_10k_6_cols_{nowish}.csv"
# Number of rows written so far; 0 means the header has not been written yet.
log_counter = 0

# create log file to capture sizes etc
def append_log(test_desc, lr_bytes, gbr_bytes, gpr_bytes):
    """Append one row of model sizes (in MB) to the tab-separated log file.

    The first call after this cell is run (re)creates ``log_file`` and writes
    the header row; later calls append to it.

    Args:
        test_desc: Short description of the test; left-aligned and padded
            to 46 characters.
        lr_bytes: Size in bytes for the linear regression model.
        gbr_bytes: Size in bytes for the gradient boosting model.
        gpr_bytes: Size in bytes for the Gaussian process model.

    Side effects:
        Writes to ``log_file`` and increments the global ``log_counter``.
    """
    global log_counter
    bytes_per_mb = 1024 * 1024
    is_first_row = log_counter == 0

    # Format straight from the exact MB value: rounding to 3 d.p. first and
    # then to 2 d.p. misreports values just below a half-cent boundary.
    size_cells = "\t".join(
        f"{n_bytes / bytes_per_mb: >9.2f}"
        for n_bytes in (lr_bytes, gbr_bytes, gpr_bytes))

    # 'w' on the first row truncates any stale file of the same name.
    with open(log_file, 'w' if is_first_row else 'a') as f:
        if is_first_row:
            desc = "Test Description"
            f.write(f"{desc: <46}\tLR (MB)  \t GBR (MB)\t GPR (MB)\n")
        f.write(f"{test_desc: <46}\t{size_cells}\n")
    log_counter += 1

In [24]:
# Baseline 1: size of each model after a plain pickle.dumps.
# Note: sys.getsizeof reports the in-memory size of the bytes/str object,
# i.e. the payload plus a small fixed per-object overhead.
import sys, pickle
pickled_models = (pickle.dumps(trained) for trained in (lr_model, gbr_model, gpr_model))
sizes = list(map(sys.getsizeof, pickled_models))
append_log('Model object just pickled (no further changes)', *sizes)

# Baseline 2: size of each string produced by the current serialisation process.
str_sizes = list(map(sys.getsizeof, (lr_str, gbr_str, gpr_str)))
append_log('Serialised (incl pickle, dto conv, b64 & hmac)', *str_sizes)

# Attempt 1

Compress the serialised strings with zlib, base64-encode the result, and compare the sizes with the uncompressed baselines.

In [25]:
import zlib, codecs

# zlib at its default level: compress the UTF-8 bytes of each serialised string.
compressed_bytes = list(map(
    lambda text: zlib.compress(text.encode()),
    (lr_str, gbr_str, gpr_str)))
# Back to text via the "base64" codec so the result is comparable with the
# original strings. This codec inserts line breaks into its output, and those
# are counted in the measured size.
compressed_str = list(map(
    lambda blob: codecs.encode(blob, "base64").decode(),
    compressed_bytes))
# Lazy, one-shot iterator of sizes; it is consumed by the unpacking below.
comp_str_sizes = map(sys.getsizeof, compressed_str)
append_log('Serialised (as above +zlib default level -b64)', *comp_str_sizes)


In [26]:
# zlib at maximum compression (level 9), measured the same way as the
# default-level run.
compressed_bytes = list(map(
    lambda text: zlib.compress(text.encode(), level=9),
    (lr_str, gbr_str, gpr_str)))
# Text form via the "base64" codec; its inserted line breaks are part of the
# measured size.
compressed_str = list(map(
    lambda blob: codecs.encode(blob, "base64").decode(),
    compressed_bytes))
# Lazy, one-shot iterator of sizes; it is consumed by the unpacking below.
comp_str_sizes = map(sys.getsizeof, compressed_str)
append_log('Serialised (as above +zlib level 9 -b64)', *comp_str_sizes)