In [94]:
import os
from pathlib import Path
from functools import lru_cache
import numpy as np
import pandas as pd
from collections import Counter

In [25]:
REPO_ROOT = os.path.realpath(os.path.join(os.getcwd(), ".."))
REPO_ROOT

'/home/khizbud/latenciaga'

In [26]:
class TpuData:
    # Compiler optimization
    OPTIM = ["layout", "tile"]
    # Source
    SRC = ["xla", "nlp"]
    # Search strategy
    SEARCH = ["default", "random"]
    # Dataset split
    SPLIT = ["train", "valid", "test"]
    # Collection
    COLL = [
        "layout-nlp-default",
        "layout-nlp-random",
        "layout-xla-default",
        "layout-xla-random",
        "tile-xla"
    ]

    def __init__(
            self,
            data_root: Path,
            coll: str,
            split: str,
            ):

        super().__init__()

        assert os.path.exists(data_root)

        self.data_root = data_root
        assert coll in self.COLL
        self.coll = coll
        assert split in self.SPLIT
        self.split = split

        self.data_dir = self._get_coll_root(coll)
        file_name_list = []
        for file in os.listdir(self.data_dir):
            if not file.endswith(".npz"):
                continue
            data_file = str(self.data_dir/file)
            file_name_list.append(data_file)
        self.file_name_list = file_name_list
        
    def _get_coll_root(self, coll: str) -> Path:
        """Parse the collection and return the corresponding data root.
        
        Parameters:
            coll: collection

        Return
            data_root: data root of the collection
        """
        coll_terms = coll.split("-")
        if len(coll_terms) == 3:
            optim, src, search = coll_terms
            assert search in self.SEARCH
            data_root = self.data_root/f"{optim}/{src}/{search}/{self.split}"
        else:
            optim, src = coll_terms
            data_root = self.data_root/f"{optim}/{src}/{self.split}"
        
        assert optim in self.OPTIM
        assert src in self.SRC

        return data_root

    @lru_cache(maxsize=1)
    def __getitem__(self, idx: int):
        file_path = self.file_name_list[idx]
        npz_dict = dict(np.load(file_path))
        fname_wo_ext = os.path.splitext(os.path.basename(file_path))[0]
        return {**npz_dict, 'fname_wo_ext': fname_wo_ext}

    def __len__(self):
        return len(self.file_name_list)



In [63]:
np.set_printoptions(edgeitems=30, linewidth=100)

In [95]:
data_root = Path("/home/khizbud/latenciaga/data/npz_all/npz")

In [101]:
# dataset = TpuData(data_root, "layout-xla-random", "train")
dataset = TpuData(data_root, "layout-nlp-random", "train")

max_covar_coef_list = []
print(f"{len(dataset)=}")
for ig, graph in enumerate(dataset):
    if ig >= 100:
        break
    print("-"*80)
    print(f"{ig=}")
    config_runtime = graph['config_runtime']
    print(f"{config_runtime.shape=}")
    node_config_feat = graph['node_config_feat'].astype(int)
    # print(f"{node_config_feat.shape=}")
    # print(node_config_feat.shape, node_config_feat.dtype)
    config_hash_list = []
    for c in range(node_config_feat.shape[0]):
        config = np.ascontiguousarray(node_config_feat[c])
        config_hash_list.append(hash(config.tobytes()))
    config_hashes = np.array(config_hash_list)
    # print(config_hashes.shape, config_hashes.dtype)
    unique_vals, unique_index, unique_inverse_index, unique_counts = \
        np.unique(config_hashes, axis=0, return_index=True, return_inverse=True, return_counts=True)
    # print(f"{unique_index=}")
    print(f"{unique_inverse_index.shape=}")
    # print(f"{unique_inverse_index=}")
    print(f"{unique_counts=}")
    print(f"{len(unique_counts)=}")
    print(f"{np.sum(unique_counts == 1) / len(config_hashes)=}")
    duplicate_groups = dict()
    covar_coefs = dict()
    for label in range(len(unique_counts)):
        mask = unique_inverse_index == label
        duplicate_runtimes = config_runtime[mask]
        duplicate_groups[label] = duplicate_runtimes
        # covar_coef = np.std(duplicate_runtimes) / np.mean(duplicate_runtimes)
        covar_coef = np.max(duplicate_runtimes) / np.min(duplicate_runtimes) - 1
        covar_coefs[label] = covar_coef
        # print(f"{covar_coef=}")
    max_covar_coef = max([v for k, v in covar_coefs.items()])
    print("max covar coef", max_covar_coef)
    max_covar_coef_list.append(max_covar_coef)
    # print(duplicate_groups[0])
    print("grand max covar coef", max(max_covar_coef_list))

len(dataset)=207
ig=0
--------------------------------------------------------------------------------
config_runtime.shape=(37768,)
unique_inverse_index.shape=(37768,)
unique_counts=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       ..., 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1])
len(unique_counts)=37729
np.sum(unique_counts == 1) / len(config_hashes)=0.9989409023511968
max covar coef 0.0008934606529489297
grand max covar coef 0.0008934606529489297
ig=1
--------------------------------------------------------------------------------
config_runtime.shape=(13136,)
unique_inverse_index.shape=(13136,)
unique_counts=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       ..., 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1])
len(unique_counts)=13097
np.sum(unique_counts == 1) / len(config_