In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise it is requested through a hidden prompt. Either way the secret is
# never written into the notebook source, so it cannot be committed by accident.
HF_API_KEY = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(HF_API_KEY)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from colorama import Fore
import torch

MERGED_DIR = "Savoxism/gpt2-instruction-finetuned"  # Update to your GPT-2 model
SEED = 42  # Fixed seed so sampled generations repeat under Restart & Run All

# Generation in this notebook samples (do_sample=True), so seed PyTorch's
# random number generators once, before any text is produced.
torch.manual_seed(SEED)

# Use half precision only when a CUDA device is available. With
# device_map="auto" the model can end up on CPU, where float16 kernels may be
# missing or very slow depending on the PyTorch build; float32 is the safe choice.
MODEL_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"{Fore.CYAN}Loading tokenizer and model from {MERGED_DIR}…{Fore.RESET}")
tokenizer = AutoTokenizer.from_pretrained(
    MERGED_DIR,
    use_fast=True
)

# Set pad token for GPT-2 if not already set (fall back to the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MERGED_DIR,
    torch_dtype=MODEL_DTYPE,
    device_map="auto"
)
print(f"{Fore.GREEN}Loaded successfully!{Fore.RESET}")

In [None]:
def generate_response(model, tokenizer, question: str, max_new_tokens: int = 256) -> str:
    """
    Generate a response using the GPT-2 instruction format.

    The prompt follows the template
    ``### Instruction:\\n{question}\\n\\n### Response:\\n`` and is printed
    before generation.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes the prompt (its result must support
            ``.to(device)``, ``["input_ids"]`` and ``**`` unpacking) and that
            provides ``decode`` and ``eos_token_id``.
        question: Instruction text inserted into the prompt template.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 256, the value previously hard-coded).

    Returns:
        The generated text with surrounding whitespace stripped, cut off at
        the first ``### Instruction:`` marker the model may have produced.
    """
    # Format the prompt using the instruction/response template
    prompt = f"### Instruction:\n{question}\n\n### Response:\n"

    print(f"Prompt:\n{prompt}\n")

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Remember how many tokens belong to the prompt so they can be dropped
    # from the output sequence, which starts with the prompt tokens.
    prompt_len = inputs["input_ids"].shape[-1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Slicing by token count avoids
    # searching the decoded text for "### Response:", which picks the wrong
    # split point when the question itself contains that marker.
    generated_ids = outputs[0][prompt_len:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Remove any additional instruction blocks that might have been generated
    response = response.split("### Instruction:", 1)[0]
    return response.strip()

In [None]:
from colorama import Fore, Style
# Single-question smoke test: echo the question in cyan, then the model's
# answer in green.
question = "In a TM1 cube, what's the minimum number of dimensions?"
print(f"{Fore.CYAN}Question: {question}{Style.RESET_ALL}\n")

answer = generate_response(model=model, tokenizer=tokenizer, question=question)
print(f"{Fore.GREEN}Answer: {answer}{Style.RESET_ALL}")

In [None]:
# Run several prompts through the model one after another.
# Note: the loop rebinds the notebook-level `question` and `answer` names.
test_questions = [
    "What advice do you have for someone starting their career?",
    "How do you optimize TM1 cube performance?",
    "What are the best practices for data modeling?",
    "Explain the difference between FEEDERS and DB functions in TM1.",
]

for i, question in enumerate(test_questions, start=1):
    # Question in yellow, answer in green, then a separator rule
    print(f"{Fore.YELLOW}Test {i}: {question}{Style.RESET_ALL}")
    answer = generate_response(model=model, tokenizer=tokenizer, question=question)
    print(f"{Fore.GREEN}Answer: {answer}{Style.RESET_ALL}")
    print("-" * 80)