In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Hugging Face Hub id of the base causal language model.
model_name = "speakleash/Bielik-11B-v2.2-Instruct"

# Load the weights 4-bit quantized via bitsandbytes; float16 is the dtype used
# for computation on the quantized weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# A bitsandbytes-quantized model is placed on its device while it is being
# loaded, so the target GPU is chosen here through `device_map` instead of
# calling `model.to("cuda")` afterwards (moving an already-quantized model is
# rejected by some transformers/bitsandbytes versions and is redundant on the
# others). Ending the cell on an assignment also keeps the notebook from
# printing the full module tree.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}  # whole model on the first CUDA device
)

In [None]:
# Hub id of the PEFT adapter that is attached on top of the quantized base model.
# NOTE(review): the adapter id says "7b" while the base model id says "11B" --
# confirm the adapter was really trained for this base architecture.
peft_model_name = "alignment-handbook/zephyr-7b-sft-qlora"

# Attach the adapter to the already-loaded base model. No `.to("cuda")` here:
# the base model is quantized with bitsandbytes and already lives on its
# device, so moving the wrapper is redundant and, as the cell's last
# expression, would print the whole module tree.
peft_model = PeftModel.from_pretrained(model, peft_model_name)

# Inference only: make sure dropout layers are disabled.
peft_model.eval()

# Tokenizer shipped with the adapter repository; the chat cell uses its chat
# template to format prompts.
peft_tokenizer = AutoTokenizer.from_pretrained(peft_model_name)

In [None]:
# Conversation history as a list of {"role", "content"} messages. It starts
# with the system prompt and grows by one user and one assistant message per
# turn, so every generation sees the full dialogue so far.
chat = [
    {"role": "system", "content": "Jesteś pomocnym asystentem, który posiada własne poglądy polityczne i odpowiada tylko po polsku."}
]

# Interactive loop: submitting an empty line ends the session.
while True:
    prompt = input("Next instruction").strip()
    if not prompt:
        break
    chat.append({"role": "user", "content": prompt})

    # Render the whole history with the adapter tokenizer's chat template and
    # append the generation prompt so the model continues with its reply.
    chat_input = peft_tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    # One unpadded sequence, so every position is a real token. Passing the
    # mask explicitly matters because pad_token_id is set to the EOS id below
    # and generate() cannot reliably infer the mask in that situation.
    attention_mask = torch.ones_like(chat_input)

    # Per-step scores are not requested: nothing in this cell reads them and
    # keeping them for every generated token only costs GPU memory.
    generated = peft_model.generate(
        input_ids=chat_input,
        attention_mask=attention_mask,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id
    )

    # The returned sequence is prompt + completion. Slice off the prompt by
    # token count instead of splitting the decoded text on "<|assistant|>\n",
    # which breaks when that marker occurs inside the generated text or is
    # missing from the decoded string.
    new_tokens = generated[0, chat_input.shape[-1]:]
    output = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    print("\nGenerated Response:", output)
    chat.append({"role": "assistant", "content": output})