In [1]:
import torch
import matplotlib.pyplot as plt
import struct
from collections import  defaultdict

from transformers import GPT2LMHeadModel
from transformers import AutoTokenizer
from transformers import LlamaForCausalLM
from transformers import T5ForConditionalGeneration

import tqdm

In [2]:
def get_exponent(model, nbits):
    exponent = {}
    for idx, (name, param) in tqdm.tqdm(enumerate(model.named_parameters())):
        # assert param.dtype is torch.float16
        if param.ndim == 2 and param.shape[0] != 1 and param.shape[1] != 1:
            r, c = param.shape
            tensor_data = list(
                map(lambda x: int(
                        "{}".format(
                                bin(
                                    int.from_bytes(
                                    struct.pack('>e', x), byteorder='big'
                                    )
                                )[3: 3+nbits]      # torch.float16 [3: 8] torch.bfloat16[3: 11]
                            ), 
                            base=2
                    ),
                    param.abs().neg().reshape(-1).tolist()
                )
            )
            tensor = torch.tensor(tensor_data, dtype=torch.int8).reshape(r, c)
            exponent[name] = tensor
    return exponent

In [3]:
### Get the exponent ###
models_hub = {
    "t5": {
        "path": "/home/styaeng/project/delta-compress/pretrained_model/t5",       ### 这里要写成下载后的模型权重文件所在的路径
        "hdlr": T5ForConditionalGeneration.from_pretrained
    },
    "gpt2": {
        "path": "/home/styaeng/project/delta-compress/pretrained_model/gpt2",       ### 这里要写成下载后的模型权重文件所在的路径
        "hdlr": GPT2LMHeadModel.from_pretrained
    },
    "llama2": {
        "path": "/home/styaeng/project/delta-compress/pretrained_model/llama2",       ### 这里要写成下载后的模型权重文件所在的路径
        "hdlr": LlamaForCausalLM.from_pretrained
    },
}

In [4]:
t5_model = models_hub['t5']['hdlr'](models_hub['t5']['path'])
t5_exponent = get_exponent(t5_model, nbits=5)

131it [00:23,  5.64it/s]


In [5]:
# gpt2_model = models_hub['gpt2']['hdlr'](models_hub['gpt2']['path'])
# gpt2_exponent = get_exponent(gpt2_model, nbits=5)

In [6]:
# llama2_model = models_hub['llama2']['hdlr'](models_hub['llama2']['path'])
# llama2_exponent = get_exponent(llama2_model, nbits=5)

In [7]:
PageSize = 4 * 1024 * 8 # bits
FP16 = 16   # bits

'''
2 bits
    [0 | 1 | 2 | 3]
3 bits
    [0 | 1 | 2 | 3 | 4 | 5 | 6 | 7]
4 bits
    [0-15]
5 bits
    [0]
'''
'''
in bits and in pages
compression ratio
saved storage space
fraction
# how many pages will be read in average and maximum
'''
pattern = [
    [0, 1, 2, 3],
    [0, 1, 2, 3, 4, 5, 6, 7],
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [i for i in range(2**5)],
]

In [105]:
tensor_gran_pattern = defaultdict(dict)
for k, tensor in t5_exponent.items():
    row, col = tensor.shape
    tensor_gran_pattern[k] = defaultdict(dict)
    if tensor.numel() * FP16 <= PageSize:
        tensor_gran_pattern[k] == "LTP"
    else:
        for shift in range(3, -1, -1):
            idx = 3 - shift
            pat = pattern[idx]
            tmp = tensor >> shift
            for elem in pat:
                tensor_gran_pattern[k][idx+2][elem] = torch.count_nonzero(tmp == elem).item()

In [181]:
### parser the list tensor_gran_pattern in bit
bits = [5, 4, 3, 2]
road_map_bits = defaultdict(dict)
for name, candidate in tensor_gran_pattern.items():
    # print(name+':')
    max_ratio = 0
    if len(candidate) == 0:
        # compressed_model_size += 1
        road_map_bits[name]["max_ratio"] = [
            ("uncompressed", t5_exponent[name].numel() * FP16)
        ]
        # print(f"Orig_page:{1}\tunnecessary to compress")
        continue
    else:
        for bit in bits:
            stats = sorted(candidate[bit].items(), key=lambda x: x[1], reverse=True)
            total_bits = {}
            cnt_stat = 0
            for pat, cnt in stats:
                after_compressed_bits = cnt * (FP16 - bit) + bit
                pattern_page = (after_compressed_bits + PageSize - 1) // PageSize
                cnt_stat += cnt
                if pattern_page == 1:
                    frac_bits = PageSize - (cnt * (FP16 - bit) + bit)
                    if frac_bits > after_compressed_bits:
                        break
                elif pattern_page > 0:
                    total_bits[pat] = after_compressed_bits
            road_map_bits[name][bit] = total_bits

            stats = dict(stats)
            original_tensor_bits = sum(stats.values()) * FP16
            compressed_part = sum(total_bits.values())
            uncompressed_part = (sum(stats.values()) - cnt_stat) * FP16
            compressed_bits = compressed_part + uncompressed_part
            ratio = (original_tensor_bits - compressed_bits)/original_tensor_bits
            if ratio > max_ratio:
                max_ratio = ratio
                road_map_bits[name]['max_ratio'] = [
                    ("bits", bit),
                    ("compression_ratio", max_ratio),
                    ("compressed_bits", compressed_bits),
                ]
                # print(f"Bit:{bit}\tOrig_bits:{original_tensor_bits}\tcompressed_bits:{compressed_bits}\tcompression_ratio:{ratio* 100 :.2f}")

# whole model compression_ratio in tensor granularity
# bits
storage_space_bits = 0
compressed_storage_space_bits = 0
for k, v in t5_exponent.items():
    storage_space_bits += v.numel() * FP16
    compressed_storage_space_bits += road_map_bits[k]['max_ratio'][-1][-1]
print(f"model_compression_ratio: {(storage_space_bits - compressed_storage_space_bits) / storage_space_bits * 100:.2f} %")
print(f"saved_space: {(storage_space_bits - compressed_storage_space_bits) / (2**20 * 8):.3f} MB")

model_compression_ratio: 31.58 %
saved_space: 36.441 MB


In [160]:
### parser the list tensor_gran_pattern in page
bits = [5, 4, 3, 2]
road_map = defaultdict(dict)
# results = defaultdict(dict)
for name, candidate in tensor_gran_pattern.items():
    # print(name+':')
    max_ratio = 0
    if len(candidate) == 0:
        # compressed_model_size += 1
        road_map[name]["max_ratio"] = [
            ("compressed_page", 1),
            ("page_size", 1)
        ]
        # print(f"Orig_page:{1}\tunnecessary to compress")
        continue
    else:
        for bit in bits:
            stats = sorted(candidate[bit].items(), key=lambda x: x[1], reverse=True)
            total_page = {}
            cnt_stat = 0
            for pat, cnt in stats:
                after_compressed_bit = cnt * (FP16 - bit) + bit
                pattern_page = (after_compressed_bit + PageSize - 1) // PageSize
                cnt_stat += cnt
                if pattern_page == 1:
                    frac_bits = PageSize - (cnt * (FP16 - bit) + bit)
                    if frac_bits > after_compressed_bit:
                        break
                elif pattern_page > 0:
                    total_page[pat] = pattern_page
            road_map[name][bit] = total_page

            stats = dict(stats)
            original_tensor_page = (sum(list(stats.values())) * FP16 + PageSize - 1) // PageSize
            compressed_part = sum(list(total_page.values()))
            uncompressed_part = ((sum(stats.values()) - cnt_stat) * FP16 + PageSize - 1) // PageSize
            compressed_page = compressed_part + uncompressed_part
            ratio = (original_tensor_page - compressed_page)/original_tensor_page
            if ratio > max_ratio:
                max_ratio = ratio
                road_map[name]['max_ratio'] = [
                    ("compressed_page", compressed_page),
                    ("bits", bit),
                    ("compression_ratio", max_ratio)
                ]
                # print(f"Bit:{bit}\tOrig_page:{original_tensor_page}\tcompressed_page:{compressed_page}\tcompression_ratio:{ratio* 100 :.2f}")

# whole model compression_ratio in tensor granularity
# page
compressed_ratio = 0
storage_space = 0
compressed_storage_space = 0
for k, v in t5_exponent.items():
    storage_space += (v.numel() * FP16 + PageSize) // PageSize
    compressed_storage_space += road_map[k]['max_ratio'][0][-1]
print(f"model_compression_ratio: {(storage_space - compressed_storage_space) / storage_space * 100:.2f} %")
print(f"saved_space: {(storage_space - compressed_storage_space) * PageSize/(2**20 * 8):.3f} MB")

# how many pages will be read in average
average_access = 0
for name, tensor in t5_exponent.items():
    avg_count = 0
    if len(road_map[name]['max_ratio']) == 2:
        avg_count += 1
    elif len(road_map[name]['max_ratio']) == 3:
        max_bits = road_map[name]['max_ratio'][1][-1]
        count = 0
        for idx, cnt in enumerate(list(road_map[name][max_bits].values())):
            count += cnt
            avg_count += tensor.shape[1] * count * cnt/(sum(road_map[name][max_bits].values()) + 1)
        
        avg_count += 1 * 1/(sum(road_map[name][max_bits].values()) + 1)
        # avg_count *= tensor.shape[1]
        
    average_access += avg_count
print(f"model_average_access: {average_access/len(road_map):.2f}")

# how many pages will be read in maxmium
max_access = 0
for name, tensor in t5_exponent.items():
    max_count = 0
    if len(road_map[name]['max_ratio']) == 2:
        max_count += 1
    elif len(road_map[name]['max_ratio']) == 3:
        max_count += sum(road_map[name][max_bits].values()) + 1
    max_access += (max_count * tensor.shape[1])
print(f"model_max_access: {max_access/len(road_map):.2f}")

model_compression_ratio: 30.23 %
saved_space: 34.996 MB
model_average_access: 102655.00
model_max_access: 173118.22


In [143]:
### parser the list tensor_gran_pattern in page to get the fraction
bits = [5, 4, 3, 2]
road_map = defaultdict(dict)
for name, candidate in tensor_gran_pattern.items():
    # print(name+':')
    max_ratio = 0
    frac_bits = 0
    if len(candidate) == 0:
        # compressed_model_size += 1
        original_bits = t5_exponent[name].numel() * FP16
        frac_bits = PageSize - original_bits
        road_map[name]["max_ratio"] = [
            ("page_size", 1),
            ("frac_bits", frac_bits)
        ]
        continue
    else:
        for bit in bits:
            stats = sorted(candidate[bit].items(), key=lambda x: x[1], reverse=True)
            total_page = {}
            cnt_stat = 0
            frac_bits = {}
            for pat, cnt in stats:
                after_compressed_bit = cnt * (FP16 - bit) + bit
                pattern_page = (after_compressed_bit + PageSize - 1) // PageSize
                cnt_stat += cnt
                frac_bits[pat] = pattern_page * PageSize - (cnt * (FP16 - bit) + bit)
                if pattern_page == 1:
                    if frac_bits[pat] > after_compressed_bit:
                        break
                elif pattern_page > 0:
                    total_page[pat] = pattern_page
            road_map[name][bit] = total_page

            stats = dict(stats)
            original_tensor_page = (sum(list(stats.values())) * FP16 + PageSize - 1) // PageSize
            compressed_part = sum(list(total_page.values()))
            uncompressed_part = ((sum(stats.values()) - cnt_stat) * FP16 + PageSize - 1) // PageSize
            compressed_page = compressed_part + uncompressed_part
            ratio = (original_tensor_page - compressed_page)/original_tensor_page
            if  max_ratio < ratio:
                max_ratio = ratio
                original_bits = t5_exponent[name].numel() * FP16
                road_map[name]['max_ratio'] = [
                    ("bits", bit),
                    ("compression_ratio", max_ratio),
                    ("frac_bits", frac_bits)
                ]
                # print(f"Bit:{bit}\tOrig_page:{original_tensor_page}\tcompressed_page:{compressed_page}\tcompression_ratio:{ratio* 100 :.2f}")

# whole model compression_ratio in tensor granularity
# page
for name, tensor in t5_exponent.items():
    print(name)
    print(\
        road_map[name]['max_ratio'][-1][-1]
    )

shared.weight
{19: 22240, 18: 12663, 20: 29040, 17: 16225, 16: 31767, 15: 26556, 14: 20442, 21: 15296, 13: 11, 12: 29075, 11: 15030, 10: 24012, 9: 29020, 8: 13512, 7: 7276, 6: 19706}
encoder.block.0.layer.0.SelfAttention.q.weight
{10: 26665, 11: 20809, 9: 26816, 8: 31741, 7: 27467, 12: 6129, 6: 14303, 5: 22191, 4: 11610, 3: 21961}
encoder.block.0.layer.0.SelfAttention.k.weight
{13: 2235, 14: 26331, 12: 24671, 11: 458, 10: 27874, 15: 26588, 9: 12664, 8: 22488, 7: 12820, 6: 21774}
encoder.block.0.layer.0.SelfAttention.v.weight
{13: 28908, 14: 17993, 12: 17851, 11: 491, 10: 28611, 15: 9342, 9: 13577, 8: 23918, 7: 12160, 6: 21510}
encoder.block.0.layer.0.SelfAttention.o.weight
{13: 14908, 14: 32498, 12: 12321, 11: 21601, 15: 659, 10: 22452, 9: 28548, 8: 29594, 16: 7419, 7: 14866, 6: 23963}
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight
28672
encoder.block.0.layer.1.DenseReluDense.wi.weight
{14: 27458, 13: 4023, 12: 21133, 15: 16320, 11: 13219, 10: 21000, 9: 4948, 8: 2

In [24]:
### Get the percentage in tile granularity ###
tile_gran_pattern = []
PageSize = 4 * 1024 * 8 # bits
FP16 = 16   # bits
for k, tensor in t5_exponent.items():
    tile_pattern = defaultdict(dict)
    row, col = tensor.shape
    if col * FP16 <= PageSize:
        tile_pattern[k] = "LTP"     # a.k.a of "Less Than PageSize"
    else:
        for row_idx in range(row):
            tile = tensor[row_idx]
            for shift in range(3, -1, -1):
                tile_pattern[k][row_idx] = defaultdict(dict)
                idx = 3 - shift
                pat = pattern[idx]
                tmp = tile >> shift
                for elem in pat:
                    tile_pattern[k][row_idx][shift][elem] = torch.count_nonzero(tmp == elem).item()
    tile_gran_pattern.append(tile_pattern)

In [19]:
### Get the percentage in matrix granularity ###
PageSize = 4 * 1024 * 8 # bits
FP16 = 16   # bits
mat_gran_pattern = []
for name, tensor in t5_exponent.items():
    row, col = tensor.shape
    higher_cur = 0
    lower_cur = higher_cur + PageSize // (col * FP16)
    tensor_pattern = defaultdict(dict)
    tensor_pattern['name'] = name
    while lower_cur <= row:
        tile_pattern = {}
        tile = tensor[higher_cur: lower_cur]
        for shift in range(3, -1, -1):
            idx = 3 - shift
            pat = pattern[idx]
            tmp = tile >> shift
            tile_pattern[idx+2] = {}
            for elem in pat:
                tile_pattern[idx+2][elem] = torch.count_nonzero(tmp == elem).item()
        for bit in range(2, 5):
            values = torch.tensor(list(tile_pattern[bit].values())).sort()[0]
            values = torch.sort(values, descending=True)[0]
            wasted_bits = (tile.numel() * FP16 + PageSize - 1) // PageSize * PageSize - values.sum() * FP16
            flag = False
            compressed_count = 0
            for grp_count, v in enumerate(values):
                v = v.item()
                wasted_bits -= (v * (FP16 - bit) + bit - v * FP16)
                compressed_count += v
                original_bit_count = tile.numel() * FP16
                original_page_count = (original_bit_count + PageSize - 1) // PageSize
                compressed_bit_count = (v * (FP16 - bit) + bit) * grp_count + (values.sum().item() - compressed_count) * FP16
                compressed_page_count = (compressed_bit_count + PageSize - 1) // PageSize
                frac = compressed_page_count * PageSize - compressed_bit_count
                if wasted_bits >= PageSize:
                    flag = True
                    break
            if flag:
                break
        if lower_cur == row:
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["original_bit_count"] = original_bit_count
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_bit_count"] = compressed_bit_count
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_page_count"] = compressed_page_count
            break
        if not flag:
            if (row - higher_cur) * col * FP16 < PageSize:
                lower_cur = row
            else:
                lower_cur += 1
        else:
            if frac > compressed_bit_count:
                if (row - higher_cur) * col * FP16 < PageSize:
                    lower_cur = row
                else:
                    lower_cur += 1
            else:
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["original_bit_count"] = original_bit_count
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_bit_count"] = compressed_bit_count
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_page_count"] = compressed_page_count
                if (row - higher_cur) * col * FP16 < PageSize:
                    lower_cur = row
                else:
                    higher_cur = lower_cur
                    lower_cur = higher_cur + PageSize // (col * FP16)
                    if lower_cur > row:
                        lower_cur = row
    mat_gran_pattern.append(tensor_pattern)

In [23]:
compressed_ratio = 0
cp = 0
ucp = 0
for idx, (k, v) in enumerate(t5_exponent.items()):
    stats = list(mat_gran_pattern[idx].values())[1:]
    for stat in stats:
        cp += stat['compressed_page_count']
    ucp += (v.numel() * FP16 // PageSize)
        # compressed_ratio += cp_r
print(f"Compression ratio = {(ucp - cp) / ucp * 100:.2f}%")


Compression ratio = 47.02%


In [23]:
### Get the percentage in matrix granularity ###
PageSize = 4 * 1024 * 8 # bits
FP16 = 16   # bits
mat_gran_pattern = []
for name, tensor in gpt2_exponent.items():
    row, col = tensor.shape
    higher_cur = 0
    lower_cur = higher_cur + PageSize // (col * FP16)
    tensor_pattern = defaultdict(dict)
    tensor_pattern['name'] = name
    while lower_cur <= row:
        tile_pattern = {}
        tile = tensor[higher_cur: lower_cur]
        for shift in range(3, -1, -1):
            idx = 3 - shift
            pat = pattern[idx]
            tmp = tile >> shift
            tile_pattern[idx+2] = {}
            for elem in pat:
                tile_pattern[idx+2][elem] = torch.count_nonzero(tmp == elem).item()
        for bit in range(2, 5):
            values = torch.tensor(list(tile_pattern[bit].values())).sort()[0]
            values = torch.sort(values, descending=True)[0]
            wasted_bits = (tile.numel() * FP16 + PageSize - 1) // PageSize * PageSize - values.sum() * FP16
            flag = False
            compressed_count = 0
            for grp_count, v in enumerate(values):
                v = v.item()
                wasted_bits -= (v * (FP16 - bit) + bit - v * FP16)
                compressed_count += v
                original_bit_count = tile.numel() * FP16
                original_page_count = (original_bit_count + PageSize - 1) // PageSize
                compressed_bit_count = (v * (FP16 - bit) + bit) * grp_count + (values.sum().item() - compressed_count) * FP16
                compressed_page_count = (compressed_bit_count + PageSize - 1) // PageSize
                frac = compressed_page_count * PageSize - compressed_bit_count
                if wasted_bits >= PageSize:
                    flag = True
                    break
        if lower_cur == row:
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["original_bit_count"] = original_bit_count
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_bit_count"] = compressed_bit_count
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_page_count"] = compressed_page_count
            break
        if not flag:
            if (row - higher_cur) * col * FP16 < PageSize:
                lower_cur = row
            else:
                lower_cur += 1
        else:
            if frac > compressed_bit_count:
                if (row - higher_cur) * col * FP16 < PageSize:
                    lower_cur = row
                else:
                    lower_cur += 1
            else:
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["original_bit_count"] = original_bit_count
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_bit_count"] = compressed_bit_count
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_page_count"] = compressed_page_count
                if (row - higher_cur) * col * FP16 < PageSize:
                    lower_cur = row
                else:
                    higher_cur = lower_cur
                    lower_cur = higher_cur + PageSize // (col * FP16)
                    if lower_cur > row:
                        lower_cur = row
    mat_gran_pattern.append(tensor_pattern)

In [24]:
compressed_ratio = 0
cp = 0
ucp = 0
for idx, (k, v) in enumerate(gpt2_exponent.items()):
    stats = list(mat_gran_pattern[idx].values())[1:]
    for stat in stats:
        cp += stat['compressed_page_count']
    ucp += (v.numel() * FP16 // PageSize)
        # compressed_ratio += cp_r
print(f"Compression ratio = {(ucp - cp) / ucp * 100:.2f}%")

Compression ratio = 19.54%


In [None]:
### Get the percentage in matrix granularity ###
PageSize = 4 * 1024 * 8 # bits
FP16 = 16   # bits
mat_gran_pattern = []
for name, tensor in llama2_exponent.items():
    row, col = tensor.shape
    higher_cur = 0
    lower_cur = higher_cur + PageSize // (col * FP16)
    tensor_pattern = defaultdict(dict)
    tensor_pattern['name'] = name
    while lower_cur <= row:
        tile_pattern = {}
        tile = tensor[higher_cur: lower_cur]
        for shift in range(3, -1, -1):
            idx = 3 - shift
            pat = pattern[idx]
            tmp = tile >> shift
            tile_pattern[idx+2] = {}
            for elem in pat:
                tile_pattern[idx+2][elem] = torch.count_nonzero(tmp == elem).item()
        for bit in range(2, 5):
            values = torch.tensor(list(tile_pattern[bit].values())).sort()[0]
            values = torch.sort(values, descending=True)[0]
            wasted_bits = (tile.numel() * FP16 + PageSize - 1) // PageSize * PageSize - values.sum() * FP16
            flag = False
            compressed_count = 0
            for grp_count, v in enumerate(values):
                v = v.item()
                wasted_bits -= (v * (FP16 - bit) + bit - v * FP16)
                compressed_count += v
                original_bit_count = tile.numel() * FP16
                original_page_count = (original_bit_count + PageSize - 1) // PageSize
                compressed_bit_count = (v * (FP16 - bit) + bit) * grp_count + (values.sum().item() - compressed_count) * FP16
                compressed_page_count = (compressed_bit_count + PageSize - 1) // PageSize
                frac = compressed_page_count * PageSize - compressed_bit_count
                if wasted_bits >= PageSize:
                    flag = True
                    break
        if lower_cur == row:
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["original_bit_count"] = original_bit_count
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_bit_count"] = compressed_bit_count
            tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_page_count"] = compressed_page_count
            break
        if not flag:
            if (row - higher_cur) * col * FP16 < PageSize:
                lower_cur = row
            else:
                lower_cur += 1
        else:
            if frac > compressed_bit_count:
                if (row - higher_cur) * col * FP16 < PageSize:
                    lower_cur = row
                else:
                    lower_cur += 1
            else:
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["original_bit_count"] = original_bit_count
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_bit_count"] = compressed_bit_count
                tensor_pattern[f"{higher_cur}:{lower_cur}"]["compressed_page_count"] = compressed_page_count
                if (row - higher_cur) * col * FP16 < PageSize:
                    lower_cur = row
                else:
                    higher_cur = lower_cur
                    lower_cur = higher_cur + PageSize // (col * FP16)
                    if lower_cur > row:
                        lower_cur = row
    mat_gran_pattern.append(tensor_pattern)

In [None]:
compressed_ratio = 0
cp = 0
ucp = 0
for idx, (k, v) in enumerate(llama2_exponent.items()):
    stats = list(mat_gran_pattern[idx].values())[1:]
    for stat in stats:
        cp += stat['compressed_page_count']
    ucp += (v.numel() * FP16 // PageSize)
        # compressed_ratio += cp_r
print(f"Compression ratio = {(ucp - cp) / ucp * 100:.2f}%")