<a href="https://colab.research.google.com/github/NoamMichael/Comparing-Confidence-in-LLMs/blob/main/LlamaEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialize

In [3]:
%pip install transformers accelerate torch bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1


In [136]:
import pandas as pd
import numpy as np
import torch
import json
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from google.colab import userdata

# Parameters
hf_key = userdata.get('hf_llama_token')  # Hugging Face access token, read from the Colab secret store
model_name = "Meta-Llama-3-8B-Instruct"  # bare model name; run() also embeds it in result filenames
temperature = 0  # NOTE(review): not read by any visible cell; get_tokens always takes the top token
max_new_tokens = 3  # default cap on tokens generated per prompt (get_response's max_tokens default)
token_dist_length = 100 # Get top 100 tokens

# 4-bit NF4 quantization settings with bfloat16 compute.
# Only fields that BitsAndBytesConfig defines are passed: `offload_buffers` is a
# model-loading/dispatch option, not a quantization field, and is ignored
# (reported as an unused kwarg) when given here.
# NOTE(review): the visible model-loading cell does not pass this config to
# from_pretrained, so as written the model is loaded unquantized.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
)

## Initialize Llama

In [37]:
# `model_name` is a bare model name (it doubles as a filename prefix), but the
# Hugging Face Hub needs a namespaced repo id. Keep an explicit id or a local
# checkpoint directory untouched; otherwise resolve under the meta-llama org.
if "/" in model_name or os.path.isdir(model_name):
    hub_repo_id = model_name
else:
    hub_repo_id = f"meta-llama/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(hub_repo_id, token=hf_key)
# device_map="auto" lets accelerate decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(hub_repo_id, token=hf_key, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



In [138]:
from sys import path
def format_input(input: str, system: str):
    """Build a Llama 3 Instruct prompt from a system prompt and one user turn.

    The layout follows the Llama 3 chat format: every turn is
    ``<|start_header_id|>{role}<|end_header_id|>`` + blank line + content +
    ``<|eot_id|>``, and the prompt ends with an open assistant header so the
    next predicted token is the first token of the reply.

    Args:
        input: The user message.
        system: The system prompt.

    Returns:
        The fully formatted prompt string, starting with ``<|begin_of_text|>``.
    """
    bos = '<|begin_of_text|>'
    eot = '<|eot_id|>'  # exactly one end-of-turn marker per turn

    def header(role: str) -> str:
        # Role headers are followed by a blank line in the Llama 3 format.
        return f'<|start_header_id|>{role}<|end_header_id|>\n\n'

    return (
        bos
        + header('system') + system + eot
        + header('user') + input + eot
        + header('assistant')
    )

def get_tokens(input: str) -> tuple[str, dict]:
    """Return the most likely next token and the top-k next-token distribution.

    Runs a single forward pass of the global ``model`` over ``input`` and reads
    the logits at the last position.

    Args:
        input: The full prompt text to condition on.

    Returns:
        A ``(top_token, token_dict)`` tuple: ``top_token`` is the decoded
        highest-probability token, and ``token_dict`` maps decoded token text
        to probability for up to ``token_dist_length`` tokens, in descending
        probability order.
    """
    # A prompt that already begins with the BOS marker must not get a second
    # one prepended by the tokenizer.
    bos = getattr(tokenizer, 'bos_token', None)
    already_has_bos = bool(bos) and input.startswith(bos)
    batch = tokenizer(
        input,
        return_tensors="pt",
        add_special_tokens=not already_has_bos,
    ).to(model.device)  # follow the model's placement instead of assuming 'cuda'

    with torch.no_grad():
        outputs = model(**batch)

    # Softmax in float32 so small probabilities are not rounded away when the
    # model runs in half precision.
    next_token_logits = outputs.logits[0, -1].float()
    probs = torch.softmax(next_token_logits, dim=-1)

    # Never ask topk for more entries than the vocabulary holds.
    k = min(token_dist_length, probs.shape[-1])
    top_k_probs, top_k_indices = torch.topk(probs, k, sorted=True)

    top_k_tokens = [tokenizer.decode([token_id]) for token_id in top_k_indices.tolist()]

    top_token = top_k_tokens[0]

    # Different token ids can decode to the same text. Entries arrive in
    # descending probability, so keep the first (highest) one rather than
    # letting a later, less likely duplicate overwrite it.
    token_dict = {}
    for token_text, prob in zip(top_k_tokens, top_k_probs.tolist()):
        token_dict.setdefault(token_text, prob)

    return top_token, token_dict



def get_response(input: str, system: str, max_tokens = max_new_tokens):
    """Greedily generate a short reply, recording the distribution at each step.

    The prompt is built with ``format_input`` and extended one token at a time
    with the top token returned by ``get_tokens``. Generation stops once
    ``max_tokens`` steps have been taken or the top token is ``<|eot_id|>``
    (which is still recorded and included in the returned text).

    Args:
        input: The user message.
        system: The system prompt.
        max_tokens: Maximum number of generation steps.

    Returns:
        A ``(content, tokens)`` tuple: the concatenated top tokens, and one
        ``{"top_token", "top_100"}`` record per generation step.
    """
    prompt = format_input(input, system)
    steps = []
    pieces = []

    # TODO (from the original author): maybe stop on '}' instead, to close the JSON.
    while len(steps) < max_tokens:
        best_token, distribution = get_tokens(prompt)
        steps.append({
            "top_token": best_token,
            "top_100": distribution
        })
        pieces.append(best_token)

        if best_token == "<|eot_id|>":
            break
        # Feed the chosen token back in as text for the next step.
        prompt = prompt + best_token

    return "".join(pieces), steps

def run_qset(df):
    """Run every prompt in ``df`` through ``get_response`` and collect the results.

    Args:
        df: DataFrame with 'Question ID', 'System Prompt' and 'Full Prompt' columns.

    Returns:
        Dict keyed by the DataFrame index label. Each value holds the question
        id, the system prompt, the input prompt and the generated output.
        The output key is spelled ``"ouput"`` (sic); anything reading the
        saved results must use that spelling.
    """
    collected = {}
    for row_label, record in df.iterrows():
        qid = record['Question ID']
        system_text = record['System Prompt']
        prompt_text = record['Full Prompt']

        reply_text, token_trace = get_response(prompt_text, system_text)

        collected[row_label] = {
            "qid": qid,
            "system": system_text,
            "input": prompt_text,
            "ouput": {
                "content": reply_text,
                "tokens": token_trace,
            },
        }
    return collected


def save_output(data: dict, filename: str):
    """Write ``data`` as indented JSON to ``Results/<filename>``.

    The ``Results`` directory is created relative to the current working
    directory if it does not exist yet. An existing file is overwritten.

    Args:
        data: JSON-serializable dictionary to store.
        filename: Name of the file to create inside ``Results``.
    """
    target_dir = 'Results'
    # No error if the folder is already there.
    os.makedirs(target_dir, exist_ok=True)

    destination = os.path.join(target_dir, filename)
    with open(destination, 'w') as handle:
        json.dump(data, handle, indent=2)

def run(datasets):
    """Evaluate the model on each dataset and save one JSON results file per dataset.

    Args:
        datasets: Mapping of dataset name to a DataFrame accepted by ``run_qset``.

    Each result set is written through ``save_output`` as
    ``<model>_<dataset>_results.json``.
    """
    # A Hub-style id such as "org/model" contains a path separator, which would
    # point the results file into a sub-directory that does not exist. Use only
    # the final component for filenames; a bare model name is unchanged.
    file_prefix = model_name.replace('\\', '/').rstrip('/').split('/')[-1]

    for dataset_name, dataset in datasets.items():
        print(f'Running {model_name} on {dataset_name}')
        results = run_qset(dataset)
        print('Evaluation completed. Saving results as JSON')
        file_path = f"{file_prefix}_{dataset_name}_results.json"
        save_output(results, file_path)
        print(f'Results saved to {file_path}')


In [139]:
# Smoke test: run the full pipeline on the first three BoolQ validation prompts.
test_df = pd.read_csv('/content/boolq_valid_prompts.csv').head(3)

# Dataset name -> DataFrame; the name becomes part of the results filename.
test_datasets = dict(boolq=test_df)

run(test_datasets)

Running Meta-Llama-3-8B-Instruct on boolq
Evaluation competed. Saving results as JSON
Results saved to Meta-Llama-3-8B-Instruct_boolq_results.json


In [119]:
# Build the responses in this cell so it does not rely on a variable left over
# from an earlier kernel session (no visible cell defines `responses_test`).
responses_test = run_qset(test_df)
save_output(responses_test, 'test_results.json')