# EECS E6892 Reinforcement Learning - Final Project
### Group DPO-RAIT
Qinhao Chen, Alex Wei

In [None]:
#!pip install -U transformers accelerate bitsandbytes

In [None]:
from huggingface_hub import login

# Log in to the Hugging Face Hub, which is where the LLaMA model is downloaded from.
# The token is left blank on purpose: supply your own access token before running.
login(token="")

In [None]:
import torch

# Report whether CUDA is usable and describe every visible GPU.
print(f"CUDA available: {torch.cuda.is_available()}")

gpu_count = torch.cuda.device_count()
print(f"Number of GPUs: {gpu_count}")

for i in range(gpu_count):
    # total_memory is reported in bytes; convert to GiB for display.
    total_bytes = torch.cuda.get_device_properties(i).total_memory
    print(f"--- GPU {i} ---")
    print(f"Name: {torch.cuda.get_device_name(i)}")
    print(f"Memory: {total_bytes / (1024 ** 3):.2f} GB")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Fetch the tokenizer and model weights for the checkpoint from the Hub.
hub_id = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(hub_id)
model = AutoModelForCausalLM.from_pretrained(hub_id)

# Store both in one local directory so later cells can load them from disk.
save_path = "./llama3.2-3b"
for artifact in (tokenizer, model):
    artifact.save_pretrained(save_path)

In [None]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm
from accelerate import init_empty_weights, infer_auto_device_map

# Globals
# STOP / SURE / UNSURE are single-element token-id holders:
# run_knowledge_boundary_eval clears each list and appends one id to it.
STOP = []  # first token id produced by tokenizing "."
SURE = []  # first token id produced by tokenizing "sure"
UNSURE = []  # first token id produced by tokenizing "unsure"
# Option labels used when rendering prompts; inference() also accepts only
# these strings as a valid predicted answer.
choices = ["A", "B", "C", "D"]
# Per-question result records; rebound to a fresh list by run_knowledge_boundary_eval.
results = []

def format_subject(subject):
    """Return *subject* with every underscore replaced by a space."""
    return subject.replace("_", " ")

def format_example(input_list):
    """Render one unanswered question as prompt text.

    ``input_list[0]`` is the question. Every entry between the first and the
    last is printed as a lettered option using ``choices``; the last entry is
    not rendered. The text ends with an open ``Answer:`` slot.
    """
    pieces = [input_list[0]]
    for idx, option in enumerate(input_list[1:-1]):
        pieces.append(f"\n{choices[idx]}. {option}")
    pieces.append("\nAnswer:")
    return "".join(pieces)

def format_shots(prompt_data):
    """Render solved few-shot examples as one text block.

    Each record in *prompt_data* contributes its question (index 0), its
    lettered options (entries between the first and the last), and an
    ``Answer:`` line filled with the record's last entry, followed by a blank
    line.
    """
    rendered = []
    for shot in prompt_data:
        option_lines = "".join(
            f"\n{choices[idx]}. {option}" for idx, option in enumerate(shot[1:-1])
        )
        rendered.append(shot[0] + option_lines + f"\nAnswer:{shot[-1]}\n\n")
    return "".join(rendered)

def gen_prompt(input_list, subject, prompt_data):
    """Assemble the full prompt: subject header, solved shots, then the open question."""
    header = f"The following are multiple choice questions (with answers) about {format_subject(subject)}.\n\n"
    return header + format_shots(prompt_data) + format_example(input_list)

def inference(tokenizer, model, input_text, subject, prompt_data):
    """Predict the answer letter for one multiple-choice question.

    Builds the few-shot prompt with ``gen_prompt``, generates exactly one new
    token, and accepts that token only if it is one of ``choices``.

    Args:
        tokenizer: Called as ``tokenizer(text, return_tensors="pt")`` and used
            to decode the generated token id.
        model: Model exposing ``generate``. Inputs are moved to the first
            device listed in ``model.hf_device_map`` when that attribute is
            present and non-empty, otherwise to ``model.device``.
        input_text: One question record; index 0 is the question and the last
            entry is the reference answer (both are printed for debugging).
        subject: Subject name inserted into the prompt header.
        prompt_data: Solved few-shot records passed through to ``gen_prompt``.

    Returns:
        ``(answer, full_input, confidence)``: ``answer`` is the generated
        token when it is in ``choices`` and ``"UNKNOWN"`` otherwise;
        ``full_input`` is the prompt string; ``confidence`` is ``1.0`` for an
        accepted answer and ``0.0`` otherwise.
    """
    full_input = gen_prompt(input_text, subject, prompt_data)
    inputs = tokenizer(full_input, return_tensors="pt")

    # Move inputs to the model's (first) device. Models loaded without a
    # device map have no ``hf_device_map``; fall back to ``model.device`` so
    # they work here too.
    device_map = getattr(model, "hf_device_map", None)
    target_device = next(iter(device_map.values())) if device_map else model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    outputs = model.generate(
        inputs["input_ids"],
        # Forward the tokenizer's mask instead of letting generate() guess it.
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=1,
        return_dict_in_generate=True,
    )

    # With max_new_tokens=1 the last position of the sequence is the new token.
    generated_token_id = outputs["sequences"][0][-1].item()
    generated_token = tokenizer.decode(generated_token_id).strip()
    if generated_token in choices:
        answer = generated_token
        confidence = 1.0
    else:
        answer = "UNKNOWN"
        confidence = 0.0

    print(f"[DEBUG] Subject: {subject}")
    print(f"[DEBUG] Question: {input_text[0]}")
    print(f"[DEBUG] Predicted Answer: {answer}")
    print(f"[DEBUG] Reference Answer: {input_text[-1]}")
    print(f"[DEBUG] Confidence: {confidence}")

    return answer, full_input, confidence

def checksure(tokenizer, model, input_text):
    """Score how "sure" the model is about a previous answer.

    Appends a self-assessment question ending in ``I am`` to *input_text*,
    generates one token, and compares the probabilities assigned to the token
    ids stored in ``SURE[0]`` and ``UNSURE[0]``.

    Returns:
        ``(normalized_conf, generated_token, generated_id)``: the probability
        of the "sure" id divided by the sum of both probabilities (plus a
        small epsilon), the decoded generated token with whitespace stripped,
        and its token id.
    """
    # NOTE(review): SURE/UNSURE are filled elsewhere by tokenizing "sure" and
    # "unsure" without a leading space; whether the model's continuation after
    # "I am" uses those same ids depends on the tokenizer -- confirm.
    full_input = f"{input_text}. Are you sure you accurately answered the question based on your internal knowledge? I am"
    encoded = tokenizer(full_input, return_tensors="pt").to(model.device)

    outputs = model.generate(
        encoded["input_ids"],
        max_new_tokens=1,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Scores of the single generation step for the only sequence in the batch.
    step_scores = outputs["scores"][0][0]
    probs = torch.softmax(step_scores, dim=0)
    sure_prob = probs[SURE[0]]
    unsure_prob = probs[UNSURE[0]]
    denominator = sure_prob + unsure_prob + 1e-6
    normalized_conf = sure_prob / denominator

    generated_id = outputs["sequences"][0][-1].item()
    generated_token = tokenizer.decode([generated_id]).strip()

    print(f"[DEBUG] Self-assessment prompt: {full_input}")
    print(f"[DEBUG] Generated token: '{generated_token}'")
    print(f"[DEBUG] Token ID: {generated_id}")
    print(f"[DEBUG] P(sure) = {sure_prob.item():.4f}, P(unsure) = {unsure_prob.item():.4f}")
    print(f"[DEBUG] Normalized confidence: {normalized_conf.item():.4f}")
    print(f"SURE[0]: {SURE[0]}, 'sure' tokenized: {tokenizer('sure', add_special_tokens=False)['input_ids']}")

    return normalized_conf.item(), generated_token, generated_id

def run_knowledge_boundary_eval(model_path, domain="ID", result_name="MMLU"):
    """Run few-shot multiple-choice inference and save per-question results.

    Loads the tokenizer and model from *model_path*, refreshes the ``STOP`` /
    ``SURE`` / ``UNSURE`` token ids, runs ``inference`` on every question in
    the data file, and writes one record per question to
    ``/kaggle/working/results3b/{result_name}_{domain}_test_inference3b.json``.

    Args:
        model_path: Directory or hub id accepted by ``from_pretrained``.
        domain: Used only in the output file name; the input paths are fixed.
        result_name: Prefix of the output file name.

    Side effects:
        Rebinds the module-level ``tokenizer``, ``model`` and ``results``.
    """
    global tokenizer, model, results

    # Imported locally because this cell's import block provides neither name.
    import json

    from transformers import AutoConfig

    # NOTE(review): the special-token overrides are passed through unchanged;
    # confirm they match this checkpoint's vocabulary.
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, unk_token="<unk>", bos_token="<s>", eos_token="</s>", add_bos_token=False)

    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Build a weightless skeleton FIRST and plan the device placement from it.
    # Previously the device map was inferred from the global ``model`` before
    # this function had created it, i.e. from a stale or undefined object.
    config = AutoConfig.from_pretrained(model_path)
    with init_empty_weights():
        skeleton = AutoModelForCausalLM.from_config(config)
    skeleton.tie_weights()
    device_map = infer_auto_device_map(
        skeleton,
        max_memory={0: "14GiB", 1: "14GiB"},
        # Keep blocks the model declares as unsplittable on a single device.
        no_split_module_classes=getattr(skeleton, "_no_split_modules", None),
        # Size modules in the dtype the real weights will be loaded in.
        dtype=torch_dtype,
    )
    del skeleton
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device_map, torch_dtype=torch_dtype)

    # Refresh the shared token-id holders for the tokenizer just loaded.
    STOP.clear()
    SURE.clear()
    UNSURE.clear()
    STOP.append(tokenizer(".")["input_ids"][0])
    SURE.append(tokenizer("sure", add_special_tokens=False)["input_ids"][0])
    UNSURE.append(tokenizer("unsure", add_special_tokens=False)["input_ids"][0])

    results = []

    # NOTE(review): the evaluated questions come from the *_prompt.json file
    # and the few-shot examples from the *_train.json file, which looks
    # swapped relative to the variable names -- confirm this is intended.
    # Both paths ignore ``domain``.
    with open("/kaggle/input/mmlu-test/MMLU_ID_prompt.json", 'r') as f:
        data = json.load(f)
    with open("/kaggle/input/mmlu-test/MMLU_ID_train.json", 'r') as f:
        prompt = json.load(f)

    for subject in tqdm(data.keys(), desc="Subjects"):
        prompt_data = prompt[subject]
        for instance in tqdm(data[subject], leave=False, desc=subject):
            # Only the predicted answer is needed; prompt text and confidence are unused.
            output, _full_input, _predict_conf = inference(tokenizer, model, instance, subject, prompt_data)
            reference_answer = instance[-1]
            results.append({
                "is_correct": 1 if output == reference_answer else 0,
                "predicted": output,
                "reference": reference_answer,
            })

        # Release cached GPU memory after each subject.
        torch.cuda.empty_cache()

    os.makedirs("/kaggle/working/results3b", exist_ok=True)
    with open(f"/kaggle/working/results3b/{result_name}_{domain}_test_inference3b.json", 'w') as f:
        json.dump(results, f, indent=2)


In [None]:
# Evaluate the locally saved checkpoint; "ID" and "MMLU" are used by the
# function to build the name of the results file it writes.
run_knowledge_boundary_eval("./llama3.2-3b", "ID", "MMLU")

In [None]:
import json
from collections import defaultdict

# Load a previously saved results file.
with open("/kaggle/working/results3/MMLU_ID.json", "r") as f:
    results = json.load(f)

# Group the decoded token strings by token id: {token_id: {stripped token text, ...}}.
token_id_to_token = defaultdict(set)
for record in results:
    token_id = record.get("self_assessed_token_id")
    token_text = record.get("self_assessed_token")
    # Records missing either field are skipped.
    if token_id is None or token_text is None:
        continue
    token_id_to_token[token_id].add(token_text.strip())

# List every distinct token id together with the decoded forms seen for it.
print("unique self-assessed tokens and their token IDs:\n")
for tid, token_set in sorted(token_id_to_token.items()):
    print(f"Token ID {tid}: {', '.join(map(repr, token_set))}")

In [None]:
# Load the ten saved inference runs with one loop instead of ten copy-pasted
# open/load statements.
_inference_runs = []
for _run_idx in range(1, 11):
    with open(f"/kaggle/working/results4/MMLU_ID_inference{_run_idx}.json", "r") as f:
        _inference_runs.append(json.load(f))

# Later cells refer to the runs by these individual names, so keep them bound.
data1, data2, data3, data4, data5, data6, data7, data8, data9, data10 = _inference_runs

In [None]:
# Build one [correct_count, label] pair per question from the ten runs.
# NOTE: the name ``list`` shadows the builtin; it is kept because a later cell
# reads ``list[count]``.
list = []
know = 0
rough = 0  # never updated below, so the second print always shows 0

all_runs = (data1, data2, data3, data4, data5, data6, data7, data8, data9, data10)
for idx in range(len(data1)):
    # Number of runs (out of ten) that answered this question correctly.
    n_correct = sum(run[idx]["is_correct"] for run in all_runs)
    if n_correct >= 6:
        know += 1
        label = "sure"
    else:
        label = "unsure"
    list.append([n_correct, label])

print(know)
print(rough)
print(len(data1))

In [None]:
# Attach each question's [correct_count, label] pair (from ``list``) to a copy
# of its training record, walking subjects and instances in file order.
with open("/kaggle/input/mmlu-test/MMLU_ID_train.json", 'r') as f:
    data = json.load(f)

flat_instances = [instance for subject in data for instance in data[subject]]
new_list = [[*instance, list[pos]] for pos, instance in enumerate(flat_instances)]
count = len(new_list)

print(new_list[0])
with open("/kaggle/working/results4/MMLU_ID_train10.json", 'w') as f:
    json.dump(new_list, f, indent=2)
# ``results`` is not assigned in this cell; this prints whatever an earlier
# cell left bound to that name.
print(results[0])

In [None]:
with open("/kaggle/working/results2/MMLU_ID.json", "r") as f:
    results = json.load(f)

# Records are indexed positionally; per the labels printed below, index 0 is
# the correctness flag, index 1 the model confidence and index 2 the
# self-assessed confidence.
num_correct = sum(item[0] for item in results)
total = len(results)
accuracy = num_correct / total if total > 0 else 0

print(f"Total questions: {total}")
print(f"Correct predictions: {num_correct}")
print(f"Final Accuracy: {accuracy:.4f}")
print("\nSample results:")
# Slice instead of indexing 0..2 so that a file with fewer than three records
# (including an empty one, which the accuracy guard above already allows for)
# does not raise IndexError.
for i, item in enumerate(results[:3]):
    print(f"{i+1}. Correct: {item[0]}, Model Confidence: {item[1]:.4f}, Self-assessed: {item[2]:.4f}")