In [1]:
from omegaconf import OmegaConf

In [2]:
data_cfg = OmegaConf.load("/root/similaritybench/nlp/config/dataset/sst2.yaml")
model_cfg = OmegaConf.load("/root/similaritybench/nlp/config/model/multibert.yaml")

print(model_cfg)
print(data_cfg)

{'name': 'google/multiberts-seed_${.seed}', 'name_human': 'multibert-${.seed}', 'seed': 0, 'remove_sos_token': False, 'token_pos': 0, 'kwargs': {'tokenizer_name': 'google/multiberts-seed_${..seed}', 'model_type': None}}
{'path': 'sst2', 'name': None, 'split': 'test', 'prompt_template': None, 'feature_column': ['sentence'], 'target_column': 'label', 'finetuning': {'num_labels': 2, 'trainer': {'_target_': 'transformers.Trainer', 'args': {'_target_': 'transformers.TrainingArguments', 'output_dir': '${hydra:runtime.output_dir}', 'overwrite_output_dir': True, 'warmup_ratio': 0.1, 'evaluation_strategy': 'steps', 'eval_steps': 1000, 'save_steps': 1000, 'per_device_train_batch_size': 64, 'per_device_eval_batch_size': 64, 'seed': 123456789, 'num_train_epochs': 10, 'save_total_limit': 2, 'load_best_model_at_end': True}}, 'eval_dataset': ['validation']}}


In [3]:
from repsim.nlp import get_dataset, get_tokenizer
import transformers

In [4]:
ds = get_dataset(data_cfg.path, data_cfg.name, )

In [5]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cfg.kwargs.tokenizer_name)
tokenizer


BertTokenizerFast(name_or_path='google/multiberts-seed_0', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [11]:
from typing import Any
from datasets import DatasetDict
from functools import partial
import numpy as np


class MemorizableLabelAdder:
    def __init__(
        self, dataset: DatasetDict, p: float, new_n_labels: int, label_column: str, seed: int = 1234567890
    ) -> None:
        self.dataset = dataset
        self.p = p
        self.new_n_labels = new_n_labels
        self.label_column = label_column
        self.new_label_column = "memorizable_label"

        self.seed = seed
        self.rng = np.random.default_rng(seed)

    def add_labels(self):
        for key, ds in self.dataset.items():
            n_existing_labels = len(np.unique(ds[self.label_column]))
            new_labels = np.arange(n_existing_labels, n_existing_labels + self.new_n_labels)
            idxs = np.arange(len(ds))
            idxs_new_labels = self.rng.choice(idxs, size=int(self.p * len(ds)))

            def _new_labels(example: dict[str, Any]):
                curr_label = example[self.label_column]
                if example["idx"] in idxs_new_labels:
                    new_label = self.rng.choice(new_labels)
                else:
                    new_label = curr_label
                return {self.new_label_column: new_label}

            self.dataset[key] = ds.map(_new_labels)
        return self.dataset, self.new_label_column


adder = MemorizableLabelAdder(ds, 0.75, 5, "label", seed=123)
new_ds, new_label_col = adder.add_labels()

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [12]:
new_ds["validation"][:10]

{'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 'sentence': ["it 's a charming and often affecting journey . ",
  'unflinchingly bleak and desperate ',
  'allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ',
  "the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . ",
  "it 's slow -- very , very slow . ",
  'although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . ',
  'a sometimes tedious film . ',
  "or doing last year 's taxes with your ex-wife . ",
  "you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . ",
  "in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey . "],
 'label': [1, 0, 1, 1, 0, 1, 0, 0, 1, 0],
 'memorizable_label': [1, 0, 1, 3, 3, 6, 5, 0, 2, 0]}

In [13]:
adder.dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label', 'memorizable_label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label', 'memorizable_label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label', 'memorizable_label'],
        num_rows: 1821
    })
})

In [14]:
def new_label_eq_old_label(example: dict[str, Any]) -> dict[str, str]:
    label = example["label"]
    new_label = example[new_label_col]
    return {"label_eq": label == new_label}

x = new_ds["train"].map(new_label_eq_old_label)
sum(x["label_eq"])/len(x["label_eq"])


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

0.47207827881631503

ground truth label distribution

In [15]:
counts = np.unique(ds["train"]["label"], return_counts=True)[1]
counts/counts.sum()

array([0.44217435, 0.55782565])

distribution after scrambling labels

In [16]:
counts = np.unique(new_ds["train"]["memorizable_label"], return_counts=True)[1]
counts/counts.sum()

array([0.2088524 , 0.26322588, 0.10398076, 0.10430741, 0.10585161,
       0.10699491, 0.10678703])