# Test inference on our trained model

This was originally a Kaggle notebook. It is not guaranteed to work on Colab,
because the two platforms provide different virtual environments.

## pip install and import statements 

In [1]:
# according to hf modelcard need transformers >= 4.43.0 onward for Llama 3.1
!pip install -q transformers==4.43.1

In [2]:
!pip install -q accelerate bitsandbytes peft flash-attn

In [3]:
import gc
import json
import re

import huggingface_hub
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed

## Log in to Hugging Face

In [4]:
# Authenticate with the Hugging Face Hub; in a notebook this renders a login widget.
# Presumably required to download the LLMsForHepth dataset and model used below.
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Define which hf things to download

In [5]:
# Hugging Face Hub repo ids of the dataset and model used in the rest of the notebook.
# Swap the two dataset_name lines to switch between the small and the full dataset.
dataset_name = "LLMsForHepth/arxiv_hepth_first_overfit"  # 5 abstracts
# dataset_name = "LLMsForHepth/arxiv_hepth_first"  # 15,825 abstracts
model_name = "LLMsForHepth/test_llama_3.1_batch48"

## Model

Notes:
- Running inference with a quantized model usually gives slightly poorer quality results.
- Inference using a PeftModel incurs an overhead as extra operations are needed
to add the adapters 

In [6]:
# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=None,  # alternative that was tried: torch.bfloat16
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Keyword arguments forwarded to `from_pretrained` below.
model_cfg = dict(
    attn_implementation="sdpa",  # can be "eager" (default), "sdpa" or "flash_attention_2"
    device_map="auto",
    quantization_config=bnb_config,
)

# Downloads (if needed) and loads the quantized model named in `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name, **model_cfg)

adapter_config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

## Tokenizer

In [7]:
# Load the tokenizer from the same Hub repo as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad on the left so that, in a padded batch, every prompt ends at the final
# position and the generated tokens follow the prompt text directly.
tokenizer.padding_side="left"  # left pad for inference

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

## Get datasets

In [8]:
# Load the 'train' split of the dataset chosen in `dataset_name`.
# When switching to LLMsForHepth/arxiv_hepth_first, change this to split='test'.
ds = load_dataset(dataset_name, split='train')  # use test for dataset arxiv_hepth_first

Downloading readme:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [9]:
def split_abstracts(example):
    """
    Cuts an abstract in two at a sentence boundary.

    A sentence boundary is any run of whitespace that follows '.', '!' or '?'.
    Of the n sentences found, the first ceil(n / 2) become the prompt and the
    remaining ones become the ground truth, so the prompt never has fewer
    sentences than the ground truth.

    Args:
        example (dict): A record holding the text to split under the 'abstract' key.

    Returns:
        dict: {'prompt': ..., 'y_true': ...}, where each value is the relevant
            sentences joined by single spaces ('y_true' is '' if nothing is left).
    """
    sentence_list = re.split(r'(?<=[.!?])\s+', example['abstract'])
    # Ceiling division: with an odd sentence count the extra sentence goes to the prompt.
    n_prompt_sentences = (len(sentence_list) + 1) // 2
    head = sentence_list[:n_prompt_sentences]
    tail = sentence_list[n_prompt_sentences:]
    return {'prompt': ' '.join(head), 'y_true': ' '.join(tail)}

In [10]:
# Apply split_abstracts to each row individually (not batched), which adds the
# 'prompt' and 'y_true' fields it returns to every example.
ds = ds.map(split_abstracts, batched=False)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [13]:
# Sanity check: character length of each full abstract next to that of its prompt part.
for row in ds:
    print(len(row['abstract']), len(row['prompt']))

526 352
1922 645
679 344
696 389
993 539


## Set a random seed for reproducibility

In [15]:
# Fix the random seed so that the sampling-based generation configs give repeatable
# output. `set_seed` is imported in the import cell at the top of the notebook.
set_seed(42)

# Inference using `model.generate`

### Set up some function definitions

In [25]:
def get_batches(dataset, batch_size):
    """
    Splits a dataset into a list of consecutive batches.

    Args:
        dataset: Any sized object that supports slicing, i.e. ``len(dataset)``
            and ``dataset[i:j]`` (for example a list).
        batch_size (int): Maximum number of items per batch; must be positive.

    Returns:
        list: The slices ``dataset[0:batch_size]``, ``dataset[batch_size:2 * batch_size]``
            and so on. The last batch may be shorter; the list is empty when the
            dataset is empty.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # Without this check a negative batch_size makes range() empty, so the function
    # would silently return no batches at all instead of reporting the mistake.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    return [dataset[start:start + batch_size] for start in range(0, len(dataset), batch_size)]

In [35]:
def tokenize_batch(batch):
    """
    Tokenizes the prompts of one batch into padded PyTorch tensors.

    Relies on the notebook-level `tokenizer` and `model` objects.

    Args:
        batch: Mapping whose 'prompt' entry holds the prompt text(s) to tokenize.

    Returns:
        The tokenizer output, moved to `model.device`.
    """
    tokenizer_kwargs = {
        "return_tensors": "pt",
        "padding": 'longest',         # pad each prompt up to the longest one in the batch
        "truncation": False,          # prompts are never cut short
        "pad_to_multiple_of": 8,
        "add_special_tokens": False,
    }
    encoded = tokenizer(batch["prompt"], **tokenizer_kwargs)
    return encoded.to(model.device)

In [44]:
def generate_batch(batch_tok, generation_config):
    """
    Generates text for a tokenized batch and decodes it back to strings.

    Relies on the notebook-level `model` and `tokenizer` objects.

    Args:
        batch_tok: Tokenized inputs; unpacked as keyword arguments to `model.generate`.
        generation_config (dict): Additional keyword arguments for `model.generate`.

    Returns:
        The decoded sequences, with special tokens skipped.
        NOTE(review): presumably each string still begins with its prompt, which is
        what `parse_y_pred` strips off -- confirm.
    """
    # Gradients are not needed for inference.
    with torch.no_grad():
        generated = model.generate(**batch_tok, **generation_config)
        generated_cpu = generated.to("cpu")
        return tokenizer.batch_decode(generated_cpu, skip_special_tokens=True)

In [54]:
def clear_cache():
    """
    Frees memory that is no longer in use.

    Python garbage collection runs first so that objects kept alive only by
    reference cycles (including tensors) are released; PyTorch is then asked to
    release its cached, unused CUDA memory. In the opposite order, the memory
    freed by the collector would stay in PyTorch's cache until the next call.

    Returns:
        None
    """
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
def parse_y_pred(example):
    """
    Strips the prompt off the front of a generated prediction.

    The first len(prompt) characters of the prediction are dropped. The cut is
    made purely by character count: it is not checked that the prediction
    actually starts with the prompt.

    Args:
        example (dict): A record with 'prompt' and 'predictions' entries.

    Returns:
        dict: {'y_pred': ...} holding what remains of the prediction after the cut.
    """
    n_prompt_chars = len(example['prompt'])
    full_prediction = example['predictions']
    return {'y_pred': full_prediction[n_prompt_chars:]}

### Get batches and tokenize them

In [None]:
# Split the dataset into batches of at most 8 rows each.
batches = get_batches(ds, 8)
# Only the first batch is tokenized here; the generation cells below run on it alone.
batch_tok = tokenize_batch(batches[0])  # select a single batch for speedy testing

### Set up generation configurations

In [69]:
# greedy search
gen_cfg1 = {'pad_token_id': tokenizer.pad_token_id, 'max_new_tokens': 1024}
# contrastive search
gen_cfg2 = {'pad_token_id': tokenizer.pad_token_id, 'max_new_tokens': 1024, 'penalty_alpha': 0.6, 'top_k': 4}
# multinomial sampling
gen_cfg3 = {'pad_token_id': tokenizer.pad_token_id, 'max_new_tokens': 1024, 'do_sample': True, 'num_beams': 1}
# beam-search decoding
gen_cfg4 = {'pad_token_id': tokenizer.pad_token_id, 'max_new_tokens': 1024, 'num_beams': 5}
# beam-search multinomial sampling
gen_cfg5 = {'pad_token_id': tokenizer.pad_token_id, 'max_new_tokens': 1024, 'num_beams': 5, 'do_sample': True}
# diverse beam search decoding
gen_cfg6 = {'pad_token_id': tokenizer.pad_token_id, 'max_new_tokens': 1024, 'num_beams': 5,
            'num_beam_groups': 5, 'diversity_penalty': 1.0, 'do_sample': False}

num_gen_cfgs = 6  # how many configurations have we set up immediately above

### Return predictions using a single generation config

In [75]:
# Trial run on the tokenized batch with one config only
# (gen_cfg5: beam-search multinomial sampling).
result = generate_batch(batch_tok, gen_cfg5)

### Iterate through all the generation configs and return a list of dictionaries

Currently only uses a single batch

In [77]:
# List the configs explicitly instead of looking variables up by name with eval();
# num_gen_cfgs still limits how many of them are run.
gen_cfgs = [gen_cfg1, gen_cfg2, gen_cfg3, gen_cfg4, gen_cfg5, gen_cfg6][:num_gen_cfgs]

results = []
for gen_cfg in tqdm(gen_cfgs):
    result = generate_batch(batch_tok, gen_cfg)
    # Keep the settings next to their predictions so every entry is self-describing.
    results.append({**gen_cfg, 'predictions': result})
    # Release unused memory before the next config runs.
    clear_cache()

100%|██████████| 6/6 [52:15<00:00, 522.56s/it]


In [79]:
# Save the results in the current working directory rather than under a
# user-specific absolute path, so the cell works on any machine.
# `json` is imported in the import cell at the top of the notebook.
filepath = 'results.json'
with open(filepath, 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=4)

## Ideas for inference speedup

* Use unsloth - limited to a single GPU, but claims a 2x inference speedup
* Try using torch.compile(model)
* Use accelerate for multiple gpu use