In [4]:
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
%load_ext autoreload
%autoreload 2

import llminference as L
import llminference.experiments as xp
import torch
# Number of threads PyTorch may use for CPU intra-op parallelism in this notebook.
TORCH_NUM_THREADS = 32
torch.set_num_threads(TORCH_NUM_THREADS)

# High-level Experiment API

In [5]:
# Assemble the experiment description piece by piece, then run it once.
squad_task = xp.Task("squad", shots=1, samples=10, confusion_contexts=0)
cpu_execution = xp.Execution(
    device="cpu",
    dtype="float32",
    batch_size=10,
    pipeline_stages=1,
    wandb=False,
)
ann_sparsity = xp.Sparsity(
    "ann",
    k=64,
    local_k=16,
    score="sparse_q",
    rank=16,
    reallocate_to_mean_value=True,
)
experiment = xp.Experiment(
    "test",
    task=squad_task,
    model="EleutherAI/pythia-410m",
    execution=cpu_execution,
    sparsity=ann_sparsity,
)
out = xp.run_one(experiment)

# Display the output with the "model_config" and "results" entries left out.
hidden_keys = {"model_config", "results"}
display({name: value for name, value in out.items() if name not in hidden_keys})

# Manual evaluation

This codebase provides & interfaces with multiple harnesses for evaluating language models, with a particular focus on text generation.

In [6]:
# Build the adapter from the pretrained checkpoint; the cells below use
# `adapter` and `adapter.model` for the manual evaluations.
pretrained_name = "EleutherAI/pythia-410m"
adapter = L.Adapter.from_pretrained(pretrained_name)

## SQuAD

We evaluate SQuAD using a custom harness. It is quite bare-bones, so it's easy to get hands-on with the data & results.

In [3]:
from llminference.tasks import qa

# The shot count is used both to pick the prompt template and to build the
# few-shot prompts, so keep it in one variable to stop the two drifting apart.
n_shots = 1
n_examples = 10

squad_data = qa.SQuAD.data()

# The template lookup takes no per-example argument, so do it once up front
# instead of once per example.
prompt_template = qa.get_default_prompt_template(
    adapter.model.config._name_or_path, shots=n_shots
)
examples = [
    qa.add_few_shot_prompt(squad_data[i], k=n_shots, prompt_template=prompt_template)
    for i in range(n_examples)
]
display(examples[3])

# Evaluate all examples in a single batch and show one result for inspection.
results = list(qa.evaluate(adapter, examples, batch_size=n_examples))
display(results[3])
print("accuracy", sum(r["match"] for r in results) / len(results))

## Outcompare

Outcompare is a custom harness for comparing the greedy generations of a language model against a reference output (e.g. the same model, before quantisation to low-precision).

Note - this requires data from `generate_outcompare_datasets.py`, or the `data` branch.

In [None]:
# Load the reference dataset from disk, then evaluate the current model
# against it and show the outcome.
outcompare = L.tasks.outcompare
outcompare_data = outcompare.Dataset.load("../data/pythia-410m.json")
outcompare_report = outcompare.evaluate(
    adapter.model,
    outcompare_data,
    batch_size=16,
    limit=64,
)
display(outcompare_report)

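Now, deliberately mess up the model (by zeroing one attention weight matrix) and see what happens. Note that this modifies `adapter.model` in place, so re-create the adapter to get the original model back.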

In [None]:
# Overwrite one weight matrix with zeros, in place, then repeat the same
# evaluation on the damaged model.
damaged_layer = adapter.model.gpt_neox.layers[4]
damaged_layer.attention.dense.weight.data.zero_()
damaged_report = L.tasks.outcompare.evaluate(
    adapter.model,
    outcompare_data,
    batch_size=16,
    limit=64,
)
display(damaged_report)

In [None]:
# NOTE(review): leftover cell that only prints a blank line — consider removing it.
print()