In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import os
from dotenv import load_dotenv

load_dotenv()
model_name = "speakleash/Bielik-7B-v0.1"
hf_token = os.getenv("HF_TOKEN")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=hf_token
)
model.to("cuda")
!nvidia-smi

In [None]:
# Sanity check: let the base (non-adapted) model continue a raw prompt,
# without any chat template.
question = "Co sądzisz o aborcji?"
# Send the encoded prompt to the device the model actually lives on instead
# of assuming a fixed "cuda" target.
inputs = tokenizer(question, return_tensors="pt").to(model.device)
print("Tokenized Input IDs:", inputs)
# `return_dict_in_generate` + `output_scores` make generate() return a
# structured object exposing `.sequences` and per-step `.scores`.
output = model.generate(
    **inputs,
    max_new_tokens=128,
    return_dict_in_generate=True,
    output_scores=True
)
print("Raw Generated Output:", output)
# sequences[0] holds the prompt tokens followed by the generated tokens.
generated_text = tokenizer.decode(output.sequences[0])
print("Decoded Text:", generated_text)

In [None]:
peft_model_name = "alignment-handbook/zephyr-7b-sft-qlora"
peft_tokenizer = AutoTokenizer.from_pretrained(peft_model_name, token=hf_token)
peft_model = PeftModel.from_pretrained(model, peft_model_name)
peft_model.to("cuda")
!nvidia-smi

In [None]:
# Inspect the template string the adapter's tokenizer uses to lay out a chat.
print("Chat Template:", peft_tokenizer.chat_template)

# A minimal two-turn conversation: one system instruction, one user question.
messages = [
    dict(
        role="system",
        content="Jesteś pomocnym asystentem, który posiada własne poglądy polityczne i odpowiada tylko po polsku.",
    ),
    dict(
        role="user",
        content="Co sądzisz o likwidacji TVP?",
    ),
]
# tokenize=False returns the rendered prompt as text rather than token ids,
# so the exact string fed to the model can be read by eye.
formatted_input = peft_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
)
print("Formatted Chat Input:", formatted_input)

In [None]:
# Interactive chat with the adapter-wrapped model.
# `chat` accumulates the whole conversation so every new turn is generated
# with the full history as context. An empty input line ends the session.
chat = [
    {"role": "system", "content": "Jesteś pomocnym asystentem, który posiada własne poglądy polityczne i odpowiada tylko po polsku."}
]

while True:
    prompt = input("Next instruction").strip()
    if not prompt:
        break
    chat.append({"role": "user", "content": prompt})

    # return_dict=True yields both input_ids and attention_mask, so generate()
    # receives an explicit mask and everything can be passed by keyword.
    chat_inputs = peft_tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(peft_model.device)
    chat_input = chat_inputs["input_ids"]
    prompt_length = chat_input.shape[-1]

    # Use the same tokenizer for padding and decoding that produced the
    # prompt, rather than mixing in the base model's tokenizer.
    peft_output = peft_model.generate(
        **chat_inputs,
        max_new_tokens=256,
        return_dict_in_generate=True,
        output_scores=True,
        pad_token_id=peft_tokenizer.eos_token_id
    )

    # Keep only the tokens produced after the prompt. Slicing by length is
    # robust even if the "<|assistant|>" marker appears inside user text or
    # inside the model's own reply, which a text split would mishandle.
    new_tokens = peft_output.sequences[0][prompt_length:]
    output = peft_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    print("Generated Response:", output)
    chat.append({"role": "assistant", "content": output})