In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
import torch
from peft import PeftModel

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (prompts for an access token in the notebook).
# NOTE(review): presumably required because the base checkpoint used in this
# notebook needs an authenticated account to download — confirm.
notebook_login()

In [None]:
# Hub id of the base checkpoint; shared by the model and the tokenizer loads.
base_model = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 quantization with double quantization enabled; float16 is used as
# the compute dtype for the quantized layers.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# device_map {"": 0} maps the root module (and therefore the whole model) to
# device index 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Fast tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

In [None]:
# Hub id of the PEFT adapter to attach to the already-loaded base model.
adapter_id = "kaitchup/Llama-2-7b-mt-French-to-English"

# This cell rebinds `model` to the adapter-wrapped model. Without the guard,
# re-running the cell would pass the already-wrapped model back into
# PeftModel.from_pretrained and wrap it a second time; the isinstance check
# makes the cell safe to re-run.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(model, adapter_id)

In [None]:
# French sentence to translate.
my_text_translate = "Tu es le seul client du magasin."

# Prompt format: the source sentence followed by the " ###>" marker.
# NOTE(review): presumably the format the adapter was fine-tuned on — confirm
# against the adapter's model card.
prompt = my_text_translate+" ###>"

tokenized_input = tokenizer(prompt, return_tensors="pt")
# Place the inputs on whatever device the model lives on instead of assuming
# the default CUDA device, and keep the attention mask so generate() does not
# have to guess it.
input_ids = tokenized_input["input_ids"].to(model.device)
attention_mask = tokenized_input["attention_mask"].to(model.device)
# Number of prompt tokens; used below to separate the prompt from the continuation.
prompt_length = input_ids.shape[1]

generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=6,  # beam search with 6 beams
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=130,  # upper bound on generated tokens
)
for seq in generation_output.sequences:
    # Each returned sequence starts with the prompt tokens. Decode only the newly
    # generated tokens so the result does not depend on finding the marker in the
    # decoded prompt (no IndexError, and a "###>" inside the source sentence
    # cannot confuse the parsing).
    output = tokenizer.decode(seq[prompt_length:], skip_special_tokens=True)
    # If the model goes on to emit another "###>" marker, keep only the text
    # before it — the same span the original `split("###>")[1]` selected.
    print(output.split("###>")[0].strip())