# CIFAR 10 Compression + Split Resnet

We want to find out how to maximize the performance given a total amount of memory.

Chart Layout:

* x-axis: number of memory-slots
* y-axis: model acc
* legend: different models

The following experiments might be interesting: How much do different compression techniques change the performance of the architecture?

1. (CutR/ Nothing) + Nothing + SplitR (Baseline)
    * already performed `split_resnet_res_con_exp.ipynb`
2. (CutR/ Nothing) + Thinning + SplitR
3. (CutR/ Nothing) + Quantization + SplitR
    * 3b. (CutR/ Nothing) + transfer-quantization + SplitR
4. (CutR/ Nothing) + Sparse Coding + SplitR
5. (No encoding) + Convolutional Autoencoder + SplitR
6. (CutR/ Nothing) + Fully Connected Autoencoder + SplitR

## Note about the sizes

CIFAR10-Dataset

| cut...                    | output shape | output size | bits (bytes) to address output coordinates |
|---------------------------|--------------|-------------|--------------------------------------------|
| after Block 1             | 8x8x64       |  4096       | 12 (2)                                    |
| after Block 2             | 4x4x128      |  2048       | 11 (2)                                    |
| after Block 3             | 2x2x256      |  1024       | 10 (2)                                    |
| after Block 4             | 1x1x512      |   512       |  9 (2)                                    |

We assume

* one float takes up 4 Byte of Memory
* uints are used for output-coordinates


In [None]:
# --- Experiment configuration shared by all cells below ---
MEMORY_BASE_SIZE = 1 << 10
BACKBONE_BLOCK = 3
SEED = 0

# Number of values stored per sample, with and without the encoder.
OUTPUT_SIZE_ENCODED = 1024
OUTPUT_SIZE_UNENCODED = 3 * 32 * 32

# Assumed storage cost in bytes of one float, one uint and one coordinate.
FLOAT_SIZE_BYTE = 4
UINT_SIZE_BYTE = 1
COORDINATE_SIZE_BYTE = 2

# Paths written into the generated command lines.
LOG_DIR = '/home/marwei/code/EncodedGDumb/logs/'
DATA_DIR = '/daten/marwei/pytorch'

TOTAL_DS_SIZE = 50_000   # number of samples in cifar

In [None]:
from pathlib import Path

# Total replay-memory budget in bytes (10 MiB).
memory_size_byte = 10 * 2**20

def write_save(name, contents, overwrite=True):
    """Write *contents* to the file ``../scripts/<name>``.

    The target path is resolved relative to the current working directory.

    Args:
        name: File name inside the ``scripts`` directory.
        contents: Text to write to the file.
        overwrite: If true (the default), an existing file is replaced.
            If false, an existing file is left untouched and a notice is
            printed instead.
    """
    out_path = Path('..', 'scripts', name).resolve()
    if out_path.exists() and not overwrite:
        print('File already exists, nothing has been overwritten')
        return
    with open(out_path, 'w') as f:
        f.write(contents)

# Formulation 1: Baseline

with encoder

In [None]:
from math import floor

# Baseline with the CutR encoder: every stored sample costs
# OUTPUT_SIZE_ENCODED floats, so the byte budget fixes the number of slots.
n_memory_samples = floor(memory_size_byte / (FLOAT_SIZE_BYTE * OUTPUT_SIZE_ENCODED))
print(n_memory_samples)
assert n_memory_samples < TOTAL_DS_SIZE

n_enc = "cifar10_m{m}_cutr{b}_splitr{b}__s{s}".format(
    m=n_memory_samples, b=BACKBONE_BLOCK, s=SEED
)

# Assemble the command line group by group; the flag order is preserved.
l_enc = ["python3 src/main.py"]
l_enc += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
l_enc += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
l_enc += ["--num_passes", "128", "--sampler", "greedy_sampler"]
l_enc += ["--encoder", "cutr", "--encoding_block", str(BACKBONE_BLOCK)]
l_enc += ["--compressor", "none"]
l_enc += ["--backbone", "resnet", "--backbone_block", str(BACKBONE_BLOCK)]
l_enc += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n_enc]

exps_base = [" ".join(l_enc)]

# Drop the slot count so it cannot leak into later cells.
del n_memory_samples

without encoder

In [None]:
# Baseline without encoder: every stored sample costs
# OUTPUT_SIZE_UNENCODED uints.
n_memory_samples = floor(memory_size_byte / (UINT_SIZE_BYTE * OUTPUT_SIZE_UNENCODED))
print(n_memory_samples)
assert n_memory_samples < TOTAL_DS_SIZE

n_unenc = "cifar10_m{m}_resnet__s{s}".format(m=n_memory_samples, s=SEED)

# Assemble the command line group by group; the flag order is preserved.
l_unenc = ["python3 src/main.py"]
l_unenc += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
l_unenc += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
l_unenc += ["--num_passes", "128", "--sampler", "greedy_sampler"]
l_unenc += ["--encoder", "none", "--compressor", "none"]
l_unenc += ["--backbone", "resnet", "--backbone_block", "0"]
l_unenc += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n_unenc]

exps_base.append(" ".join(l_unenc))

# Drop the slot count so it cannot leak into later cells.
del n_memory_samples

In [None]:
# Store the baseline commands, one per line.
# NOTE(review): this file name spells 'splirR' while the sibling scripts use
# 'splitR' — confirm nothing depends on the current name before renaming.
write_save(
    name='cifar10_splirR_compression_base.sh',
    contents='\n'.join(exps_base),
)

# Formulation 2: Thinning

## Setup


In [None]:
from pathlib import Path

# Fractions of each sample's elements that thinning removes;
# the loops below keep (1 - factor) of the elements.
compression_factors = [
    0.5,
    0.8,
    0.9,
    0.95,
]

with encoder

In [None]:
from math import floor

exps_thinning = []
for this_compression_factor in compression_factors:

    # Each kept element is stored as one float plus one coordinate.
    n_elements_per_sample = floor(OUTPUT_SIZE_ENCODED * (1-this_compression_factor))
    sample_size_byte = n_elements_per_sample * (FLOAT_SIZE_BYTE + COORDINATE_SIZE_BYTE)
    n_memory_samples = floor(memory_size_byte / sample_size_byte)
    print(n_memory_samples)
    assert n_memory_samples < TOTAL_DS_SIZE

    n = "cifar10_m{m}_cutr{b}_thinning{t}_splitr{b}__s{s}".format(
        m=n_memory_samples,
        b=BACKBONE_BLOCK,
        t=int(this_compression_factor*100),
        s=SEED,
    )

    # Assemble the command line group by group; the flag order is preserved.
    l = ["python3 src/main.py"]
    l += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
    l += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
    l += ["--num_passes", "128", "--sampler", "greedy_sampler"]
    l += ["--encoder", "cutr", "--encoding_block", str(BACKBONE_BLOCK)]
    l += ["--compressor", "thinning", "--compression_factor", str(this_compression_factor)]
    l += ["--backbone", "resnet", "--backbone_block", str(BACKBONE_BLOCK)]
    l += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n]
    exps_thinning.append(" ".join(l))

without encoder

In [None]:
for this_compression_factor in compression_factors:

    # Each kept element is stored as one uint plus one coordinate.
    n_elements_per_sample = floor(OUTPUT_SIZE_UNENCODED * (1-this_compression_factor))
    sample_size_byte = n_elements_per_sample * (UINT_SIZE_BYTE + COORDINATE_SIZE_BYTE)
    n_memory_samples = floor(memory_size_byte / sample_size_byte)
    print(n_memory_samples)
    assert n_memory_samples < TOTAL_DS_SIZE

    n = "cifar10_m{m}_thinning{t}_resnet__s{s}".format(
        m=n_memory_samples,
        t=int(this_compression_factor*100),
        s=SEED,
    )

    # Assemble the command line group by group; the flag order is preserved.
    l = ["python3 src/main.py"]
    l += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
    l += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
    l += ["--num_passes", "128", "--sampler", "greedy_sampler"]
    l += ["--encoder", "none"]
    l += ["--compressor", "thinning", "--compression_factor", str(this_compression_factor)]
    l += ["--backbone", "resnet"]
    l += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n]
    exps_thinning.append(" ".join(l))

In [None]:
# Store the thinning commands, one per line.
write_save(
    name='cifar10_splitR_compression_thinning.sh',
    contents='\n'.join(exps_thinning),
)

# Formulation 3: local Quantization

In [None]:
# Numbers of quantization states to try: the powers of two from 2 to 32.
n_states_list = [2**exponent for exponent in range(1, 6)]

with encoder

In [None]:
from math import log2, floor, ceil

exps_quantization = []
for n_states in n_states_list:

    # Per sample: the packed state indices plus n_states floats
    # (bytes_quantil_mids) that are stored with every sample.
    bit_for_compressed_number = ceil(log2(n_states))
    bytes_quantil_mids = n_states * FLOAT_SIZE_BYTE
    sample_size_byte = ceil(OUTPUT_SIZE_ENCODED * bit_for_compressed_number / 8) + bytes_quantil_mids
    n_memory_samples = floor(memory_size_byte / sample_size_byte)
    assert n_memory_samples < TOTAL_DS_SIZE

    n = "cifar10_m{m}_cutr{b}_quantization{q}_splitr{b}__s{s}".format(
        m=n_memory_samples, b=BACKBONE_BLOCK, q=n_states, s=SEED
    )

    # Assemble the command line group by group; the flag order is preserved.
    l = ["python3 src/main.py"]
    l += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
    l += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
    l += ["--num_passes", "128", "--sampler", "greedy_sampler"]
    l += ["--encoder", "cutr", "--encoding_block", str(BACKBONE_BLOCK)]
    l += ["--compressor", "quantization", "--strategy", "local", "--n_states", str(n_states)]
    l += ["--backbone", "resnet", "--backbone_block", str(BACKBONE_BLOCK)]
    l += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n]
    exps_quantization.append(" ".join(l))

without encoder

In [None]:
for n_states in n_states_list:

    # Per sample: the packed state indices plus n_states uints
    # (bytes_quantil_mids) that are stored with every sample.
    bit_for_compressed_number = ceil(log2(n_states))
    bytes_quantil_mids = n_states * UINT_SIZE_BYTE
    sample_size_byte = ceil(OUTPUT_SIZE_UNENCODED * bit_for_compressed_number / 8) + bytes_quantil_mids
    n_memory_samples = floor(memory_size_byte / sample_size_byte)
    assert n_memory_samples < TOTAL_DS_SIZE

    n = "cifar10_m{m}_quantization{q}_resnet__s{s}".format(
        m=n_memory_samples, q=n_states, s=SEED
    )

    # Assemble the command line group by group; the flag order is preserved.
    l = ["python3 src/main.py"]
    l += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
    l += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
    l += ["--num_passes", "128", "--sampler", "greedy_sampler"]
    l += ["--encoder", "none"]
    l += ["--compressor", "quantization", "--strategy", "local", "--n_states", str(n_states)]
    l += ["--backbone", "resnet"]
    l += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n]
    exps_quantization.append(" ".join(l))

In [None]:
# Store the local-quantization commands, one per line.
write_save(
    name='cifar10_splitR_compression_quantization_local.sh',
    contents='\n'.join(exps_quantization),
)

# Formulation 3b: Transfer Quantization

In [None]:
# Numbers of quantization states to try: the powers of two from 2 to 32.
n_states_list = [2**exponent for exponent in range(1, 6)]

with encoder

In [None]:
from math import log2, floor, ceil

exps_quantization = []
for n_states in n_states_list:

    # n_states floats are charged once against the whole budget;
    # each sample then only costs its packed state indices.
    bit_for_compressed_number = ceil(log2(n_states))
    sample_size_byte = ceil(OUTPUT_SIZE_ENCODED * bit_for_compressed_number / 8)
    available_mem = memory_size_byte - n_states * FLOAT_SIZE_BYTE
    n_memory_samples = floor(available_mem / sample_size_byte)
    print(n_memory_samples)
    assert n_memory_samples < TOTAL_DS_SIZE

    n = "cifar10_m{m}_cutr{b}_quantization{q}_transTinyImagenet_splitr{b}__s{s}".format(
        m=n_memory_samples, b=BACKBONE_BLOCK, q=n_states, s=SEED
    )

    # Assemble the command line group by group; the flag order is preserved.
    l = ["python3 src/main.py"]
    l += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
    l += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
    l += ["--num_passes", "128", "--sampler", "greedy_sampler"]
    l += ["--encoder", "cutr", "--encoding_block", str(BACKBONE_BLOCK)]
    l += ["--compressor", "quantization", "--n_states", str(n_states)]
    l += ["--strategy", "tiny_imagenet_transfer"]
    l += ["--backbone", "resnet", "--backbone_block", str(BACKBONE_BLOCK)]
    l += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n]
    exps_quantization.append(" ".join(l))

without encoder

In [None]:
for n_states in n_states_list:

    # n_states floats are charged once against the whole budget;
    # each sample then only costs its packed state indices.
    bit_for_compressed_number = ceil(log2(n_states))
    sample_size_byte = ceil(OUTPUT_SIZE_UNENCODED * bit_for_compressed_number / 8)
    available_mem = memory_size_byte - n_states * FLOAT_SIZE_BYTE
    n_memory_samples = floor(available_mem / sample_size_byte)
    print(n_memory_samples)
    assert n_memory_samples < TOTAL_DS_SIZE

    n = "cifar10_m{m}_quantization{q}_transTinyImagenet_resnet__s{s}".format(
        m=n_memory_samples, q=n_states, s=SEED
    )

    # Assemble the command line group by group; the flag order is preserved.
    l = ["python3 src/main.py"]
    l += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
    l += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
    l += ["--num_passes", "128", "--sampler", "greedy_sampler"]
    l += ["--encoder", "none"]
    l += ["--compressor", "quantization", "--n_states", str(n_states)]
    l += ["--strategy", "tiny_imagenet_transfer"]
    l += ["--backbone", "resnet"]
    l += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR, "--exp_name", n]
    exps_quantization.append(" ".join(l))

In [None]:
# Store the transfer-quantization commands, one per line.
write_save(
    name='cifar10_splitR_compression_quantization_trans.sh',
    contents='\n'.join(exps_quantization),
)

# Formulation 5: Conv Autoencoder

We do not encode the sample before compression because the spatial dimensions will be very low, so we cannot perform convolution and pooling.

In [None]:
# Latent channel counts to try: the powers of two from 1 to 16.
latent_sizes = [2**exponent for exponent in range(5)]

# Size of the conv autoencoder per latent channel count.
# NOTE(review): values look like MiB (each times 1024**2 is a whole
# byte count) — confirm the unit.
AE_SIZE = {
    1: 0.00452423095703125,
    2: 0.0056304931640625,
    4: 0.007843017578125,
    8: 0.01226806640625,
    16: 0.0211181640625,
}

In [None]:
from math import floor

# Build one command per latent size for the convolutional autoencoder.
exps_autoencoder = []
for latent_size in latent_sizes:
    # A compressed sample is an 8x8 map with latent_size channels of floats.
    n_numbers = 8*8*latent_size
    byte_per_sample = n_numbers * FLOAT_SIZE_BYTE
    # AE_SIZE is MiB-scale (each value times 1024**2 is a whole byte count),
    # so convert it to bytes before charging the model against the byte
    # budget; the fully connected autoencoder section converts its size
    # table the same way.
    available_mem_byte = memory_size_byte - AE_SIZE[latent_size] * 1024**2
    n_memory_samples = floor(available_mem_byte / byte_per_sample)
    assert(n_memory_samples < TOTAL_DS_SIZE)
    print(n_memory_samples)
    n = f"cifar10_m{n_memory_samples}_ae{latent_size}_resnet__s{SEED}"
    l =  ["python3 src/main.py",
          "--dataset", "CIFAR10",
          "--num_classes_per_task", "2",
          "--num_tasks", "5",
          "--seed", str(SEED),
          "--memory_size", str(n_memory_samples),
          "--num_passes", "128",
          "--sampler", "greedy_sampler",
          "--encoder", "none",
          "--compressor", "autoencoder",
          "--latent_channels", str(latent_size),
          "--backbone", "resnet",
          "--data_dir", DATA_DIR,
          "--log_dir", LOG_DIR,
          "--exp_name", n]
    exps_autoencoder.append(" ".join(l))

    

In [None]:
# Store the conv-autoencoder commands, one per line.
write_save(
    name="cifar10_splitR_compression_convae.sh",
    contents='\n'.join(exps_autoencoder),
)

# Formulation 6: Fully Connected Autoencoder

Note:

* The autoencoder size when encoder==none is bigger than the available memory of 4 MiB
* a bottleneck_size=2 produces more samples than there are in the dataset

In [None]:
# Bottleneck widths to try: the powers of two from 2 to 64.
bottleneck_sizes = [2**exponent for exponent in range(1, 7)]
encoders = ['none', 'cutr3']

# Size in MiB of the fully connected autoencoder per bottleneck width
# when no encoder is used.
SIZE_FCAE_NONE_MB = {
    64: 52.08887481689453,
    32: 43.0804443359375,
    16: 36.295753479003906,
    8: 30.972145080566406,
    4: 26.713584899902344,
    2: 23.205177307128906,
}

# The same table for the 'cutr3' encoder.
SIZE_FCAE_CUTR_MB = {
    64: 8.331954956054688,
    32: 6.5640106201171875,
    16: 5.3397216796875,
    8: 4.450263977050781,
    4: 3.7687225341796875,
    2: 3.2361679077148438,
}

In [None]:
from math import floor

exps_fcae = []

for encoder in encoders:
    for bottleneck_size in bottleneck_sizes:
        # The autoencoder itself is charged against the byte budget; the
        # remainder is filled with samples of bottleneck_size floats each.
        if encoder == 'cutr3':
            available_mem_byte = memory_size_byte - SIZE_FCAE_CUTR_MB[bottleneck_size] * 1024**2
            byte_per_sample = bottleneck_size * FLOAT_SIZE_BYTE
            n_memory_samples = floor(available_mem_byte / byte_per_sample)
            n = "cifar10_m{m}_cutr{b}_fcae{f}_resnet__s{s}".format(
                m=n_memory_samples, b=BACKBONE_BLOCK, f=bottleneck_size, s=SEED
            )
            e = ["--encoder", "cutr", "--encoding_block", str(BACKBONE_BLOCK)]
            e += ["--backbone", "resnet", "--backbone_block", str(BACKBONE_BLOCK)]
            e += ["--exp_name", n]
        elif encoder == 'none':
            available_mem_byte = memory_size_byte - SIZE_FCAE_NONE_MB[bottleneck_size] * 1024**2
            byte_per_sample = bottleneck_size * FLOAT_SIZE_BYTE
            n_memory_samples = floor(available_mem_byte / byte_per_sample)
            n = "cifar10_m{m}_fcae{f}_resnet__s{s}".format(
                m=n_memory_samples, f=bottleneck_size, s=SEED
            )
            e = ["--encoder", "none", "--backbone", "resnet", "--exp_name", n]
        else:
            raise ValueError('Unknown Encoder')

        # Skip configurations that cannot be run; report why.
        if available_mem_byte <= 0:
            print('Model to large')
        elif n_memory_samples > TOTAL_DS_SIZE:
            print(f'Not enough elements in dataset for {bottleneck_size=}')
        else:
            print(n_memory_samples)

            # Shared flags first, then the encoder-specific ones.
            l = ["python3 src/main.py"]
            l += ["--dataset", "CIFAR10", "--num_classes_per_task", "2", "--num_tasks", "5"]
            l += ["--seed", str(SEED), "--memory_size", str(n_memory_samples)]
            l += ["--num_passes", "128", "--sampler", "greedy_sampler"]
            l += ["--compressor", "fcae"]
            l += ["--data_dir", DATA_DIR, "--log_dir", LOG_DIR]
            l += ["--bottleneck_neurons", str(bottleneck_size)]
            l += e

            exps_fcae.append(' '.join(l))

In [None]:
# Store the fully-connected-autoencoder commands, one per line.
write_save(
    name='cifar10_splitR_compression_fcae.sh',
    contents="\n".join(exps_fcae),
)

# Results

In [None]:
from pathlib import Path

# Directory whose entries are scanned for checkpoint logs below.
parent_dir = Path(
    '/home/marwei/code/Archived_Logs/cifar10_compression_cutr'
)

In [None]:
import re

# Collect one record per experiment from the 'checkpoint.log' of every entry
# in parent_dir. The first log line is searched for the run arguments
# (memory_size=..., compressor='...', ...), the last one for 'Acc: [...]'.
log_paths = [Path(d, 'checkpoint.log') for d in parent_dir.glob('*')]
c = []

for exp in log_paths:
    with open(exp) as infile:
        loglines = infile.read().splitlines()

    mem_size = int(re.findall(r"memory_size=(\d+)", loglines[0])[0])
    try:
        final_acc = float(re.findall(r"Acc: \[(.*?)\]", loglines[-1])[0])
    except IndexError:
        # The last line holds no 'Acc: [...]' entry. Report the offending
        # log and skip it, so that no record is stored with a missing or
        # stale accuracy taken from the previous iteration.
        print(f'could not read {exp}')
        continue
    compressor = re.findall(r"compressor=\'(.*?)\'", loglines[0])[0]
    encoder = re.findall(r"encoder=\'(.*?)\'", loglines[0])[0]

    # Compressor-specific parameter (used as the point annotation) and the
    # display name used for the legend.
    if compressor == 'thinning':
        compressor_param = float(re.findall(r"compression_factor=(.*?),", loglines[0])[0])
        compressor_name = 'Thinning'
    elif compressor == 'quantization':
        compressor_param = int(re.findall(r"n_states=(\d+)", loglines[0])[0])
        try:
            strategy = re.findall(r"strategy=\'(.*?)\'", loglines[0])[0]
        except IndexError:
            # Logs without a strategy entry are treated as local quantization.
            strategy = 'local'

        if strategy == 'tiny_imagenet_transfer':
            compressor_name = 'Quantization (transfer)'
        elif strategy == 'local':
            compressor_name = 'Quantization (local)'
        else:
            raise ValueError('Unknown Strategy')
    elif compressor == 'autoencoder' or compressor == 'convae':
        compressor_param = int(re.findall(r"latent_channels=(\d+)", loglines[0])[0])
        # Both spellings are stored under the single key 'convae'.
        compressor = 'convae'
        compressor_name = 'Conv. Autoencoder'
    elif compressor == 'fcae':
        compressor_param = int(re.findall(r"bottleneck_neurons=(\d+)", loglines[0])[0])
        compressor_name = 'FC Autoencoder'
    elif compressor == 'none':
        compressor_param = ''
        compressor_name = 'No Compression'
    else:
        raise ValueError(f'Unknown Compressor: {compressor}')

    if encoder == 'cutr':
        encoding_block = int(re.findall(r"encoding_block=(\d+)", loglines[0])[0])
        encoder_name = f'CutR18({encoding_block})'
    elif encoder == 'none':
        encoder_name = 'ResNet-18'
    else:
        raise ValueError('Unknown Encoder')


    c.append({
        'mem_size': mem_size,
        'final_acc': final_acc,
        'compressor': compressor,
        'compressor_name': compressor_name,
        'annotation': compressor_param,
        'encoder': encoder,
        'encoder_name': encoder_name,
    })


In [None]:
import pandas as pd

# One row per parsed experiment log.
df = pd.DataFrame.from_records(c)

# Runs with neither encoder nor compressor are labelled 'GDumb'.
df.loc[(df['encoder']=='none') & (df['compressor']=='none'), 'annotation'] = 'GDumb'

# Sort keys for the compressors. The 'compressor' column holds the plain
# value 'quantization' for both strategies; only 'compressor_name' tells
# local and transfer quantization apart, so those two are matched on it.
df.loc[df['compressor'] == 'none', 'order_comp'] = 0
df.loc[df['compressor'] == 'thinning', 'order_comp'] = 1
df.loc[df['compressor_name'] == 'Quantization (local)', 'order_comp'] = 2
df.loc[df['compressor_name'] == 'Quantization (transfer)', 'order_comp'] = 3
df.loc[df['compressor'] == 'convae', 'order_comp'] = 4
df.loc[df['compressor'] == 'fcae', 'order_comp'] = 5

# Sort keys for the encoders.
df.loc[df['encoder'] == 'none', 'order_enc'] = 0
df.loc[df['encoder'] == 'cutr', 'order_enc'] = 1


# df.sort_values(['order_enc', 'order_comp', 'mem_size'], ascending=True, inplace=True)

In [None]:
import plotly.express as px
from plot_utils import science_config_png, science_config_svg, science_template

# Accuracy over memory slots: one line per compressor, one facet per
# encoder, every point labelled with its compressor parameter.
fig = px.line(
    data_frame=df.sort_values(by='mem_size'),
    x='mem_size',
    y='final_acc',
    color='compressor_name',
    facet_col='encoder_name',
    text='annotation',
    markers=True,
    log_x=True,
    title='',
    template=science_template,
    labels=dict(
        mem_size='Number of Memory Slots',
        final_acc='Accuracy',
        name='Name',
        compressor_param='Parameter',
        compressor='Compressor',
        compressor_name='Compressor',
        encoder_name='Encoder',
        none='no Compression',
    ),
    category_orders=dict(
        compressor_name=[
            'No Compression',
            'Thinning',
            'Quantization (local)',
            'Quantization (transfer)',
            'Conv. Autoencoder',
            'FC Autoencoder',
        ],
    ),
)

In [None]:
def update_annotation(a):
    """Shorten an annotation's text to the part after its last '='.

    Annotations whose text contains no '=' are left unchanged.
    """
    label: str = a.text
    if '=' not in label:
        return
    a.update(text=label.rpartition('=')[2])

# Run update_annotation on every annotation of the figure, so that each
# text containing '=' is cut down to the part after its last '='.
fig.for_each_annotation(update_annotation)

In [None]:
# Show the figure with the browser renderer; the image-export options are
# set to format 'svg' and file name 'plot'.
fig.show(
    renderer='browser',
    config=dict(
        displaylogo=False,
        toImageButtonOptions=dict(
            format='svg',  # one of png, svg, jpeg, webp
            filename='plot',
        ),
    ),
)

In [None]:
# Compact variant of the plot: no point labels and a horizontal legend
# placed above the plot area.
fig_small = px.line(
    # Sort by memory size, as the main figure does: line traces connect
    # their points in row order, so unsorted rows give zig-zag lines.
    df.sort_values('mem_size'),
    x='mem_size',
    y='final_acc',
    color='compressor_name',
    facet_col='encoder_name',
    markers=True,
    log_x=True,
    template=science_template,
    labels={
        'mem_size': 'Number of Memory Slots',
        'final_acc': 'Accuracy',
        'name': 'Name',
        'compressor_param': 'Parameter',
        'compressor': 'Compressor',
        'compressor_name': '',
        'encoder_name': 'Encoder',
        'none': 'no Compression',
    }
)
# Cut every annotation text containing '=' down to the part after it.
fig_small.for_each_annotation(update_annotation)
fig_small.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.05,
    xanchor="left",
    x=0
))
fig_small.show(renderer='browser', config={
    'displaylogo': False,
    'toImageButtonOptions': {
        'format': 'svg', # one of png, svg, jpeg, webp
        'filename': 'plot',
    }
})