In [None]:
from sampling.utils import create_history

import pandas as pd
import numpy as np
import torch
import tqdm

In [None]:
def load_model(name, revision=None, device=None):
    """Load a causal language model and its tokenizer by checkpoint name.

    The model class is picked from substrings of ``name``: ``"gpt2"``,
    ``"gpt-neo"`` or ``"pythia"``. The tokenizer is always created with
    ``padding_side="left"``.

    Args:
        name: Checkpoint identifier handed to ``from_pretrained``.
        revision: Optional checkpoint revision. It is only forwarded for
            "pythia" models, and only when truthy (``None``/``""`` are ignored).
        device: Target device. When ``None``, ``"cuda"`` is used if
            ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.

    Returns:
        A ``(model, tokenizer, device)`` tuple, with the model moved to ``device``.

    Raises:
        ValueError: If ``name`` matches none of the supported families.
    """
    def update_model_and_tokenizer(model, tokenizer):
        # Default: the family needs no pad-token adjustments.
        pass

    model_kwargs = {}
    tokenizer_kwargs = {}

    if "gpt2" in name:
        from transformers import GPT2LMHeadModel
        model_class = GPT2LMHeadModel

        def update_model_and_tokenizer(model, tokenizer):
            # Reuse EOS as the padding token on both the tokenizer and the model config.
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
            model.config.pad_token_id = model.config.eos_token_id

    elif "gpt-neo" in name:
        from transformers import GPTNeoForCausalLM
        model_class = GPTNeoForCausalLM

        def update_model_and_tokenizer(model, tokenizer):
            # Reuse EOS as the padding token on both the tokenizer and the model config.
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id

    elif "pythia" in name:
        from transformers import GPTNeoXForCausalLM
        model_class = GPTNeoXForCausalLM
        if revision:
            model_kwargs.update(revision=revision)
    else:
        raise ValueError(f"Undefined: {name}")

    # Imported only after the name has been validated, so an unsupported name
    # fails with ValueError before transformers is imported.
    from transformers import AutoTokenizer

    # The collected kwargs must actually be forwarded; otherwise a requested
    # revision is silently dropped and the default checkpoint is loaded.
    model = model_class.from_pretrained(name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(name, padding_side="left", **tokenizer_kwargs)
    update_model_and_tokenizer(model, tokenizer)

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model.to(device)
    return model, tokenizer, device


# Checkpoint used throughout the notebook. The revision is left empty, which
# is falsy, so no explicit revision is requested from load_model.
MODEL_NAME = "EleutherAI/pythia-70m"
MODEL_REVISION = ""
MODEL, TOKENIZER, DEVICE = load_model(
    name=MODEL_NAME,
    revision=MODEL_REVISION,
    device=None,
)

In [None]:
def generate(
    prefix: str, num_sequences: int, batch_size: int, model, tokenizer, device, seed=None, **sampling_kwargs,
):
    """Generate sequences in batches and score every generated token.

    Args:
        prefix: Text to condition on. When not ``None`` it is tokenized with
            the tokenizer's BOS token prepended; when ``None`` no prefix ids
            are passed to ``create_history``.
        num_sequences: Total number of sequences to generate.
        batch_size: Maximum number of sequences generated per batch.
        model: Model exposing ``generate`` and a forward call that returns
            ``loss`` and ``logits``.
        tokenizer: Tokenizer providing ``bos_token``, ``bos_token_id`` and
            ``eos_token_id``.
        device: Device the prompt ids are moved to.
        seed: If not ``None``, seeds both NumPy and PyTorch before generating.
        **sampling_kwargs: Extra keyword arguments for ``model.generate``;
            they override the defaults set below.

    Returns:
        A tuple of four lists with one entry per generated sequence:
        token ids, summed token log-probability, per-token log-probabilities,
        and per-position entropy of the predictive distribution.
    """
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    generation_kwargs = dict(
        return_dict_in_generate=True,
        output_scores=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    generation_kwargs.update(sampling_kwargs)

    seqs = []
    seq_scores = []
    seq_trans_scores = []
    seq_entr_scores = []
    for start in range(0, num_sequences, batch_size):
        # The last batch may be smaller than batch_size.
        size = min(batch_size, num_sequences - start)

        prefix_ids = (
            tokenizer(
                tokenizer.bos_token + prefix, return_tensors="pt", add_special_tokens=False
            ).input_ids
            if prefix is not None
            else None
        )
        input_ids = create_history(size, prefix_ids, tokenizer.bos_token_id).to(device)
        attention_mask = torch.ones_like(input_ids)

        # Generate sequences
        outputs = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
        sequences = outputs.sequences

        # Scoring only reads values, so run it without autograd to avoid
        # building and holding a gradient graph for every batch.
        with torch.no_grad():
            # NOTE(review): the mask is all ones and the labels are the full
            # sequences, so any padding positions present in `sequences` are
            # scored like ordinary tokens — confirm this is intended.
            results = model(sequences, attention_mask=torch.ones_like(sequences), labels=sequences)
            batch_score = -results.loss.item()

            # Based on the discussion at
            # https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075/20
            log_probs = torch.log_softmax(results.logits, dim=-1)

            # The distribution at position i predicts the token at position i + 1,
            # so drop the last distribution and the first token.
            log_probs = log_probs[:, :-1, :]
            target_ids = sequences[:, 1:, None]

            # Log-probability of each realised token.
            batch_seq_scores = torch.gather(log_probs, 2, target_ids).squeeze(-1)

            # Entropy of each predictive distribution: -sum(p * log p).
            # log_probs is already normalised, so exp() recovers p directly.
            token_entropy = -(log_probs.exp() * log_probs).sum(dim=-1)

        # Sanity check: mean token log-probability must match the model's own loss.
        _avg_loss = batch_seq_scores.mean(dim=-1).mean().item()
        assert abs(_avg_loss - batch_score) <= 1e-5, (
            f"Loss does not match (targets: {target_ids}): |{_avg_loss} - {batch_score}| > 1e-5"
        )

        seqs.extend(sequences.cpu().numpy().tolist())
        seq_scores.extend(batch_seq_scores.sum(dim=-1).cpu().numpy().tolist())
        seq_trans_scores.extend(batch_seq_scores.cpu().numpy())
        seq_entr_scores.extend(token_entropy.cpu().numpy())

    return seqs, seq_scores, seq_trans_scores, seq_entr_scores


In [None]:
# Draw 16 sequences without a prefix, 8 per batch, using do_sample=True with a
# single beam and at most 100 new tokens per sequence. The seed is fixed at 24.
(
    sampled_seq,
    sampled_scores,
    sampled_seq_trans_scores,
    sampled_seq_entr_scores,
) = generate(
    None, 16, 8, MODEL, TOKENIZER, DEVICE,
    seed=24, do_sample=True, num_beams=1, max_new_tokens=100,
)

In [None]:
# One row per sampled sequence: the decoded text next to its total
# log-probability, per-token log-probabilities and per-position entropies.
results = {
    "sequence_log_prob": [*sampled_scores],
    "sequence": [*TOKENIZER.batch_decode(sampled_seq, skip_special_tokens=True)],
    "seq_trans_log_probs": [*sampled_seq_trans_scores],
    "seq_entropy_log_probs": [*sampled_seq_entr_scores],
}
pd.DataFrame(results)

In [1]:
from transformers.utils import logging
# logging.set_verbosity_info()  # uncomment to enable verbose transformers logging

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Scratch arithmetic (evaluates to 131072). The meaning of the operands is not
# stated in this notebook — presumably a sequence length times a count; TODO confirm.
2048*64

131072

In [5]:
# Scratch arithmetic; `/` is true division, so the displayed result is the float 8192.0.
# The meaning of the operands is not stated in this notebook — TODO confirm.
32768 / 4

8192.0

In [7]:
import pandas as pd

# Load the saved generations for pythia-70m (file name suggests top-p sampling),
# using the first CSV column as the index.
d = pd.read_csv(
    filepath_or_buffer="/extra/ucinlp1/cbelem/experiment-ro-prompts/generations/uncond/EleutherAI__pythia-70m-top_p.csv",
    index_col=0,
)

In [None]:
# NOTE(review): this cell reads the same CSV into the same name as the cell
# before it; one of the two is likely redundant.
d = pd.read_csv(
    "/extra/ucinlp1/cbelem/experiment-ro-prompts/generations/uncond/EleutherAI__pythia-70m-top_p.csv",
    index_col=0,
)

In [None]:
# Preview the first five rows of the loaded generations.
d.head(n=5)