# Login

In [None]:
!huggingface-cli login

# G-NLL Analysis

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_llama7b_model(model_path, device='cuda'):
    """Load a tokenizer and a causal language model for inference.

    Args:
        model_path: Identifier or path forwarded to ``from_pretrained`` for
            both the tokenizer and the model.
        device: Not used inside this function; weight placement is delegated
            to ``device_map="auto"``. Kept so call sites can pass it.

    Returns:
        ``(tokenizer, model)`` where the model was loaded with
        ``torch_dtype=torch.float16`` and switched to eval mode.
    """
    text_tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Half-precision weights, automatic device placement.
    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
    causal_lm = AutoModelForCausalLM.from_pretrained(model_path, **load_options)
    causal_lm.eval()

    return text_tokenizer, causal_lm

def generate_text_gnll_once(
    model,
    tokenizer,
    prompt,
    max_length=50,
    top_p=0.9,
    temperature=1.0,
    device='cuda'
):
    """
    Sample one continuation of ``prompt`` and return it with its G-NLL.

    Decoding applies temperature scaling followed by top-p (nucleus)
    filtering. The G-NLL is the sum of ``-log p`` over the sampled tokens,
    where ``p`` is the token's probability under the temperature-scaled,
    top-p-renormalised distribution it was drawn from.

    Only a single sequence (batch size 1) is supported.

    Args:
        model: Called as ``model(input_ids=...)``; must return an object whose
            ``.logits`` is indexed as ``[batch, position, vocab]``.
        tokenizer: Called as ``tokenizer(prompt, return_tensors="pt")``; must
            expose ``.input_ids`` on the result, plus ``eos_token_id`` and
            ``decode``.
        prompt: Text to continue.
        max_length: Maximum number of new tokens to sample.
        top_p: Nucleus threshold. Tokens are kept in descending-probability
            order up to and including the first one whose cumulative
            probability exceeds ``top_p``.
        temperature: Divisor applied to the logits; must be > 0.
        device: Device the prompt token ids are moved to.

    Returns:
        ``(generated_text, total_gnll)``: the decoded prompt plus
        continuation, and the accumulated G-NLL as a Python float.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated_ids = input_ids.clone()
    total_gnll = 0.0
    eos_token_id = tokenizer.eos_token_id
    neg_inf = float('-inf')

    # Pure inference: without no_grad every forward pass would keep its
    # autograd graph alive, wasting memory on each decoding step.
    with torch.no_grad():
        for _ in range(max_length):
            logits = model(input_ids=generated_ids).logits

            # Work in float32 so the probabilities and their logs are not
            # limited by a half-precision model dtype.
            scaled_logits = logits[:, -1, :].float() / temperature

            sorted_logits, sorted_indices = torch.sort(scaled_logits, descending=True)
            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

            # Keep a token iff the cumulative mass *before* it is still
            # <= top_p; the top-ranked token is therefore always kept.
            keep_sorted = torch.ones_like(cumulative_probs, dtype=torch.bool)
            keep_sorted[..., 1:] = cumulative_probs[..., :-1] <= top_p
            filtered_sorted = sorted_logits.masked_fill(~keep_sorted, neg_inf)

            # Undo the sort so that positions are vocabulary ids again.
            filtered_logits = torch.full_like(scaled_logits, neg_inf).scatter(
                -1, sorted_indices, filtered_sorted
            )

            log_probs = torch.log_softmax(filtered_logits, dim=-1)
            next_token_id = torch.multinomial(log_probs.exp(), num_samples=1)
            total_gnll -= log_probs.gather(-1, next_token_id).item()

            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

            if eos_token_id is not None and next_token_id.item() == eos_token_id:
                break

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_text, total_gnll

def sample_n_times(
    model,
    tokenizer,
    prompt,
    n=5,
    max_length=50,
    top_p=0.9,
    temperature=1.0,
    device='cuda'
):
    """
    Run sampling-based decoding ``n`` times on the same prompt.

    Each run is an independent call to ``generate_text_gnll_once`` with the
    same settings.

    Returns:
        A list of ``(generated_text, gnll)`` tuples, one per run, in the
        order the runs were made. Empty when ``n`` is not positive.
    """
    sampling_args = dict(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        max_length=max_length,
        top_p=top_p,
        temperature=temperature,
        device=device,
    )
    # Lazily produce one draw per iteration, then materialise them as tuples.
    draws = (generate_text_gnll_once(**sampling_args) for _ in range(n))
    return [(text, gnll) for text, gnll in draws]

def analyze_gnll_distribution(results):
    """
    Summarise the G-NLL values of several sampled generations.

    Args:
        results: Non-empty sequence of items whose element ``[0]`` is the
            generated text and element ``[1]`` is its G-NLL.

    Returns:
        A dict with:
          - ``mean_gnll``, ``std_gnll``, ``min_gnll``, ``max_gnll``: floats
            (``std_gnll`` is NumPy's default population standard deviation).
          - ``best_text``: text of the sample with the lowest G-NLL.
          - ``worst_text``: text of the sample with the highest G-NLL.
        On ties the earliest sample wins.

    Raises:
        ValueError: If ``results`` is empty.
    """
    # Fail early with a clear message instead of letting NumPy emit
    # warnings and then raise an opaque reduction error.
    if not results:
        raise ValueError(
            "analyze_gnll_distribution() requires at least one (text, gnll) result"
        )

    gnll_values = np.asarray([r[1] for r in results], dtype=float)
    idx_min = int(np.argmin(gnll_values))
    idx_max = int(np.argmax(gnll_values))

    return {
        "mean_gnll": float(np.mean(gnll_values)),
        "std_gnll": float(np.std(gnll_values)),
        "min_gnll": float(np.min(gnll_values)),
        "max_gnll": float(np.max(gnll_values)),
        "best_text": results[idx_min][0],
        "worst_text": results[idx_max][0],
    }



if __name__ == "__main__":
    # Demo: sample the same prompt several times and compare the G-NLL of
    # each generation.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = "meta-llama/Llama-3.1-8B-Instruct"
    tokenizer, model = load_llama7b_model(model_path, device=device)

    prompt = "What is 3+3*2/(1+5)?"
    n_samples = 5

    # Decoding settings shared by every sample.
    decoding_settings = dict(
        max_length=500,
        top_p=0.9,
        temperature=1.0,
        device=device,
    )
    results = sample_n_times(
        model, tokenizer, prompt=prompt, n=n_samples, **decoding_settings
    )
    analysis = analyze_gnll_distribution(results)

    # Per-sample report.
    print(f"=== 샘플링 {n_samples}회 결과 ===")
    for idx, (txt, gnll_val) in enumerate(results):
        print(f"[Sample {idx+1}] G-NLL={gnll_val:.4f}")
        print(f"         {txt}\n")

    # Aggregate statistics.
    print("=== G-NLL 통계량 ===")
    print(f"Mean   : {analysis['mean_gnll']:.4f}")
    print(f"Std    : {analysis['std_gnll']:.4f}")
    print(f"Min    : {analysis['min_gnll']:.4f}")
    print(f"Max    : {analysis['max_gnll']:.4f}")

    # Lowest G-NLL sample, then highest G-NLL sample.
    print("\n--- G-NLL 가장 낮은 시퀀스 (모델이 비교적 확신을 보인 샘플) ---")
    print(analysis["best_text"])

    print("\n--- G-NLL 가장 높은 시퀀스 (모델이 상대적으로 불확실했던 샘플) ---")
    print(analysis["worst_text"])


# Normalize G-NLL

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

def check_sentence_end(decoded_text_so_far: str):
    """Return True if the text, ignoring surrounding whitespace, ends with
    '.', '?' or '!'; return False otherwise (including for blank text)."""
    trimmed = decoded_text_so_far.strip()
    # endswith() on an empty string is False, which covers the blank case.
    return trimmed.endswith(('.', '?', '!'))

def sample_single_sentence(
    model,
    tokenizer,
    context_ids,
    top_p=0.9,
    temperature=1.0,
    max_tokens_per_sentence=50,
    device='cuda'
):
    """
    Sample one sentence that continues ``context_ids``.

    Tokens are drawn with temperature scaling and top-p (nucleus) filtering
    until one of the following happens:
      - the sampled token is ``tokenizer.eos_token_id``;
      - the text decoded from the *newly generated* tokens ends with
        '.', '?' or '!' (see ``check_sentence_end``);
      - ``max_tokens_per_sentence`` tokens have been generated.

    Only a single sequence (batch size 1) is supported.

    Args:
        model: Called as ``model(input_ids=...)``; must return an object whose
            ``.logits`` is indexed as ``[batch, position, vocab]``.
        tokenizer: Must expose ``eos_token_id`` and ``decode``.
        context_ids: Token ids generated so far; indexed as
            ``[batch, position]``.
        top_p: Nucleus threshold. Tokens are kept in descending-probability
            order up to and including the first one whose cumulative
            probability exceeds ``top_p``.
        temperature: Divisor applied to the logits; must be > 0.
        max_tokens_per_sentence: Upper bound on tokens sampled per call.
        device: Not used inside this function; new tokens stay on the device
            of ``context_ids``.

    Returns:
        generated_sentence (str)    : text decoded from the new tokens only
        total_gnll (float)          : sum of ``-log p`` over the new tokens,
                                      under the filtered sampling distribution
        generated_token_count (int) : number of new tokens
        new_ids (tensor)            : ``context_ids`` with the new tokens appended

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    context_len = context_ids.shape[1]
    new_ids = context_ids.clone()
    total_gnll = 0.0
    generated_token_count = 0
    eos_token_id = tokenizer.eos_token_id
    neg_inf = float('-inf')

    # Pure inference: without no_grad every forward pass would keep its
    # autograd graph alive, wasting memory on each decoding step.
    with torch.no_grad():
        for _ in range(max_tokens_per_sentence):
            logits = model(input_ids=new_ids).logits

            # Work in float32 so the probabilities and their logs are not
            # limited by a half-precision model dtype.
            scaled_logits = logits[:, -1, :].float() / temperature

            sorted_logits, sorted_indices = torch.sort(scaled_logits, descending=True)
            cprobs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

            # Keep a token iff the cumulative mass *before* it is still
            # <= top_p; the top-ranked token is therefore always kept.
            keep_sorted = torch.ones_like(cprobs, dtype=torch.bool)
            keep_sorted[..., 1:] = cprobs[..., :-1] <= top_p
            filtered_sorted = sorted_logits.masked_fill(~keep_sorted, neg_inf)

            # Undo the sort so that positions are vocabulary ids again.
            filtered_logits = torch.full_like(scaled_logits, neg_inf).scatter(
                -1, sorted_indices, filtered_sorted
            )

            log_probs = torch.log_softmax(filtered_logits, dim=-1)
            next_token_id = torch.multinomial(log_probs.exp(), num_samples=1)
            total_gnll -= log_probs.gather(-1, next_token_id).item()
            generated_token_count += 1

            new_ids = torch.cat([new_ids, next_token_id], dim=1)

            if eos_token_id is not None and next_token_id.item() == eos_token_id:
                break

            # Inspect only the new tokens. Decoding the whole context would
            # end the sentence immediately whenever the context already ends
            # in '.', '?' or '!' and the first sampled token is whitespace.
            sentence_so_far = tokenizer.decode(
                new_ids[0, context_len:], skip_special_tokens=True
            )
            if check_sentence_end(sentence_so_far):
                break

    added_tokens = new_ids[0, context_len:]
    generated_sentence = tokenizer.decode(added_tokens, skip_special_tokens=True)

    return generated_sentence, total_gnll, generated_token_count, new_ids

def sample_sentence_candidates_and_pick_best(
    model,
    tokenizer,
    context_ids,
    n_candidates=3,
    uncertainty_method="total",
    top_p=0.9,
    temperature=1.0,
    max_tokens_per_sentence=50,
    device='cuda'
):
    """
    Sample ``n_candidates`` sentences from the same context and return the
    one with the HIGHEST uncertainty score, i.e. the least confident one.

    The score is either
      (1) the total (unnormalised) G-NLL of the sentence, or
      (2) the G-NLL divided by the sentence's token count.

    Args:
        model, tokenizer: Forwarded to ``sample_single_sentence``.
        context_ids (tensor): Context generated so far.
        n_candidates (int): Number of candidate sentences to sample.
        uncertainty_method (str): ``"total"`` selects score (1); any other
            value selects score (2).
        top_p, temperature, max_tokens_per_sentence, device: Forwarded to
            ``sample_single_sentence``.

    Returns:
        best_sentence (str) : the selected sentence
        best_gnll (float)   : total G-NLL of that sentence (never normalised)
        best_ids (tensor)   : ``context_ids`` extended with that sentence
    """
    score_by_total = uncertainty_method == "total"

    scored = []
    for _ in range(n_candidates):
        sentence, sentence_gnll, n_tokens, extended_ids = sample_single_sentence(
            model=model,
            tokenizer=tokenizer,
            context_ids=context_ids,
            top_p=top_p,
            temperature=temperature,
            max_tokens_per_sentence=max_tokens_per_sentence,
            device=device
        )
        if score_by_total:
            score = sentence_gnll
        else:
            # Guard against dividing by zero when no token was generated.
            divisor = n_tokens if n_tokens > 0 else 1
            score = sentence_gnll / divisor
        scored.append((score, sentence, sentence_gnll, extended_ids))

    # max() returns the first entry among equal scores, so the earliest
    # candidate wins ties.
    _, best_sentence, best_gnll, best_ids = max(scored, key=lambda entry: entry[0])
    return best_sentence, best_gnll, best_ids

def generate_text_sentence_by_sentence(
    model,
    tokenizer,
    prompt,
    max_sentences=5,
    n_candidates=3,
    uncertainty_method="total",
    top_p=0.9,
    temperature=1.0,
    max_tokens_per_sentence=50,
    device='cuda'
):
    """
    Generate text one sentence at a time.

      - Prompt -> sentence 1 (chosen among N samples) -> sentence 2 -> ...
      - At most ``max_sentences`` sentences are produced.
      - Each candidate is sampled with top-p and temperature.
      - The candidate with the LARGEST score (total G-NLL, or G-NLL per
        token, depending on ``uncertainty_method``) is kept at every step.
      - Generation stops early once the kept sentence ends with the
        tokenizer's EOS token.

    Returns:
        The decoded prompt followed by every kept sentence (str).
    """
    context_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # Collect the decoded prompt and each kept sentence, joined at the end.
    pieces = [tokenizer.decode(context_ids[0], skip_special_tokens=True)]

    for _ in range(max_sentences):
        chosen_sentence, _chosen_gnll, context_ids = sample_sentence_candidates_and_pick_best(
            model=model,
            tokenizer=tokenizer,
            context_ids=context_ids,
            n_candidates=n_candidates,
            uncertainty_method=uncertainty_method,
            top_p=top_p,
            temperature=temperature,
            max_tokens_per_sentence=max_tokens_per_sentence,
            device=device
        )
        pieces.append(chosen_sentence)

        reached_eos = (
            tokenizer.eos_token_id is not None
            and context_ids[0, -1].item() == tokenizer.eos_token_id
        )
        if reached_eos:
            break

    return "".join(pieces)


if __name__ == "__main__":
    # Demo: generate sentence by sentence twice from the same prompt, once
    # selecting candidates by total G-NLL and once by G-NLL per token.
    # NOTE: load_llama7b_model is not defined in this cell; it comes from
    # the "G-NLL Analysis" cell, which must have been run first.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = "meta-llama/Llama-3.1-8B-Instruct"
    tokenizer, model = load_llama7b_model(model_path, device=device)

    prompt = """How many r in the word 'strawberry?'"""

    # Everything except the selection criterion is identical across runs.
    shared_settings = dict(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        max_sentences=10,
        n_candidates=10,
        top_p=0.9,
        temperature=1.0,
        max_tokens_per_sentence=100,
        device=device,
    )

    print("\n=== [A] total G-NLL 기준 ===")
    text_total = generate_text_sentence_by_sentence(
        uncertainty_method="total",  # G-NLL
        **shared_settings
    )
    print("[Result - total G-NLL]\n", text_total)

    print("\n=== [B] normalized G-NLL 기준 ===")
    text_normalized = generate_text_sentence_by_sentence(
        uncertainty_method="normalized",  # G-NLL / token_length
        **shared_settings
    )
    print("[Result - normalized G-NLL]\n", text_normalized)
