# Train Quantization

We analyze the TinyImagenet dataset to find the max and min values of the original data and of the data after cutr3 encoding. Because of outliers (especially after encoding), we also save the max and min that remain after cutting off the top and bottom 5 % of the values. This is necessary to use the quantization compression with global quantiles.

This Notebook has no outputs. We hardcode the results in quantization.py -> _get_statistics.

We also cannot store an array of all tensor values, so this notebook is just an approximation.

First some params for this Notebook:

In [None]:
# Machine-specific locations of the dataset and of the log output.
DATA_DIR = '/data/marwei/pytorch/'
LOG_DIR = '/Users/marwei/code/encodedgdumb/logs/'

# Command-line options as (flag, value) pairs, one option per row.
_ARG_PAIRS = (
    ('--dataset', 'TinyImagenet'),
    ('--num_classes_per_task', '5'),
    ('--num_tasks', '20'),
    ('--seed', '0'),
    ('--memory_size', '100'),
    ('--num_passes', '256'),
    ('--encoder', 'cutr'),
    ('--encoding_block', '3.5'),
    ('--compressor', 'none'),
    ('--backbone', 'mlp'),
    ('--batch_size', '100'),
    ('--data_dir', DATA_DIR),
    ('--log_dir', LOG_DIR),
    ('--exp_name', 'test'),
    ('--device', 'cuda'),
)

# Flatten the pairs into the argv-style list the argument parsers consume.
args = [token for pair in _ARG_PAIRS for token in pair]


Change the working directory so that the imports work.

In [None]:
import os
import argparse
import numpy as np

# The project modules below are imported by bare name, so the working
# directory is switched to ../src before importing them.
# NOTE(review): this relies on the current directory being on the import
# path and on the notebook being started from a sibling directory of
# src — confirm.
os.chdir('../src')

from opts import add_general_args, parse_general_args
from datasets import get_dataset
from encoders import get_encoder, get_encoder_arg_fn

# Step back up one level once the imports are done. This lands in the
# parent of src, which is not necessarily the directory the notebook
# started in.
os.chdir('../')


In [None]:
# Two-stage parse: the general options are parsed first because the
# selected encoder (general_args.encoder) decides which encoder-specific
# argument function is looked up.
general_args = parse_general_args(args)
add_encoder_args = get_encoder_arg_fn(general_args.encoder)
# Build one parser, let both helpers act on it (presumably each registers
# its own argument group — confirm in opts / encoders), then parse the
# same argument list again to obtain the final options.
parser = argparse.ArgumentParser()
add_general_args(parser)
add_encoder_args(parser)
opt = parser.parse_args(args)


In [None]:

# Build the dataset selected by the parsed options.
dataset = get_dataset(opt)
# The encoder is constructed from the options and dataset.info(), then
# moved to the device given by opt.device.
encoder = get_encoder(opt, dataset.info()).to(opt.device)
# Only the first of the two returned loaders is kept; going by its name it
# is assumed to be the training loader — TODO confirm against
# get_task_loaders().
train_loader, _ = dataset.get_task_loaders()

In [None]:
from dataclasses import dataclass
import torch

@dataclass
class OutlierStatistic:
    """Running extrema over a stream of per-batch statistics.

    ``min`` / ``max`` hold the smallest / largest value passed to
    :meth:`update` so far. ``min5`` / ``max5`` hold the smallest /
    largest outlier-corrected bound passed so far. All four are stored
    as tensors so that :meth:`to_dict` can call ``.item()`` on them.
    """

    # The attributes are annotated so that @dataclass registers them as
    # fields; un-annotated class attributes are ignored, which left the
    # class with an empty repr and with all instances comparing equal.
    # Sharing the default tensors between instances is safe because
    # update() rebinds the attributes and never mutates them in place.
    min: torch.Tensor = torch.tensor(float('inf'))
    min5: torch.Tensor = torch.tensor(float('inf'))   # outlier corrected
    max: torch.Tensor = torch.tensor(-float('inf'))
    max5: torch.Tensor = torch.tensor(-float('inf'))  # outlier corrected

    @staticmethod
    def _snapshot(value):
        """Return an independent copy of *value* as a tensor.

        Callers typically pass 0-dim views into a large (sorted) batch.
        Storing the view itself would keep the whole batch alive and
        would alias memory the caller may overwrite later.
        """
        return torch.as_tensor(value).detach().clone()

    def update(self, min, min5, max, max5):
        """Fold the statistics of one batch into the running extrema.

        Each argument replaces the stored value only if it is more
        extreme: smaller for ``min`` / ``min5``, larger for ``max`` /
        ``max5``.
        """
        if min < self.min:
            self.min = self._snapshot(min)
        if min5 < self.min5:
            self.min5 = self._snapshot(min5)
        if max > self.max:
            self.max = self._snapshot(max)
        if max5 > self.max5:
            self.max5 = self._snapshot(max5)

    def to_dict(self):
        """Return the four statistics as a dict of plain Python numbers."""
        return {
            'min': self.min.item(),
            'min5': self.min5.item(),
            'max': self.max.item(),
            'max5': self.max5.item()
        }
    

and go

In [None]:
def _tail_statistics(values, fraction=0.05):
    """Return the extrema and outlier-corrected bounds of one tensor.

    The tensor is flattened and sorted. ``min`` / ``max`` are the first
    and last sorted elements; ``min5`` / ``max5`` are the elements that
    remain as extrema after skipping ``int(numel * fraction)`` elements
    at the respective end. ``fraction`` is expected to lie in [0, 1).

    The result is keyed like the keyword arguments of
    ``OutlierStatistic.update`` so it can be passed on with ``**``.
    """
    # reshape(-1) also handles non-contiguous tensors, where view() raises.
    flat, _ = values.detach().reshape(-1).sort()
    # Number of elements treated as outliers at each end.
    k = int(flat.numel() * fraction)
    return {
        'min': flat[0],
        'min5': flat[k],
        'max': flat[-1],
        # -1 - k mirrors index k from the top. A plain -k is off by one
        # and wraps to index 0 (the minimum) when k == 0.
        'max5': flat[-1 - k],
    }


s_unencoded = OutlierStatistic()
s_encoded = OutlierStatistic()

# Only statistics are collected, so the encoder forward pass does not
# need an autograd graph.
with torch.no_grad():
    for i, (data, _) in enumerate(train_loader):
        du = data.to(opt.device)
        de = encoder(du)

        s_unencoded.update(**_tail_statistics(du))
        s_encoded.update(**_tail_statistics(de))

        if i % 100 == 0:
            print(f"{i} / {len(train_loader)}")


In [None]:
# Report both result sets: the statistics of the raw data are collected by
# the loop as well and would otherwise never be shown.
print("unencoded:", s_unencoded.to_dict())
print("encoded:", s_encoded.to_dict())