In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model(model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct",) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a 4-bit quantized causal language model and its tokenizer.

    Args:
        model_name: Checkpoint identifier handed to both ``from_pretrained``
            calls (model and tokenizer).

    Returns:
        A ``(model, tokenizer)`` pair for ``model_name``.
    """
    # 4-bit NF4 weights, float16 compute, double quantization disabled.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        low_cpu_mem_usage=True,
    )
    matching_tokenizer = AutoTokenizer.from_pretrained(model_name)

    return quantized_model, matching_tokenizer

In [3]:
# Instantiate the quantized model and tokenizer used by the cells below.
# Alternative checkpoints are kept commented out; enable exactly one call.
model, tokenizer = load_model(model_name="mistralai/Mistral-7B-Instruct-v0.2")
# model, tokenizer = load_model(model_name="meta-llama/Llama-2-7b-chat-hf")
# model, tokenizer = load_model(model_name="google/gemma-7b-it")

Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.03s/it]


In [52]:
def generate_prompt(movie: str) -> str:
    """Build the instruction that asks the LLM to write a vague movie request.

    Args:
        movie: Title of the movie the recommender should end up suggesting;
            it is inserted verbatim between double quotes.

    Returns:
        The full instruction text. It ends with a newline followed by four
        spaces, exactly as the original triple-quoted literal did.
    """
    sentences = (
        "You are interacting with a movie recommendation system. ",
        'Your goal is to make a short request that will help the system to suggest the movie "',
        format(movie, ""),
        '" without mentioning its title, characters, or ANY plot elements. ',
        "The response should instead use GENERAL characteristics like the genre, tone, and themes of the movie. ",
        "Your request should be extremely short, sound conversational, not be too enthusiastic, and use informal wording. ",
        'As an example, for the movie "La La Land" you should give a request like "I want to watch a rom com." ',
        "Reply ONLY with the human-like request for a movie. ",
        "DO NOT include any other text.",
        "\n    ",  # trailing newline + indent carried over from the original literal
    )
    return "".join(sentences)
    
# Delimiter that each chat template places between the prompt and the reply.
# The response below is no longer extracted by splitting on it (see "Decode");
# the constant stays defined in case other cells still reference it.
# SPLIT_STR = "\nmodel\n" # Gemma
SPLIT_STR = "[/INST] " # Mistral & Llama-2

# Form prompt
chat = [{"role": "user", "content": generate_prompt("The Notebook")}]

prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# Tokenize
# add_special_tokens=False: the prompt text is already the output of the chat template.
input_tokens = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")

# Generate
prompt_ids = input_tokens.to(model.device)
output_tokens = model.generate(
    prompt_ids,
    # Single un-padded sequence, so every position is attended to. Passing the
    # mask explicitly matters because pad_token_id is set to the EOS id below.
    attention_mask=torch.ones_like(prompt_ids),
    max_new_tokens=150,
    do_sample=True,  # sampling: output differs between runs
    pad_token_id=tokenizer.eos_token_id,
)[0]

# Decode
# Drop the prompt tokens and decode only what was generated. This works for any
# chat template, unlike splitting the decoded text on a model-specific marker,
# and cannot be confused by a reply that happens to contain that marker.
prompt_length = input_tokens.shape[-1]
response = tokenizer.decode(output_tokens[prompt_length:], skip_special_tokens=True).strip()

print(response)

I'd like to watch a romantic, melodramatic movie.


In [5]:
# Display the prompt token ids returned by `tokenizer.encode` in the generation cell.
# NOTE(review): this cell's execution count (5) is lower than the generation cell's (52),
# so the saved output may come from an earlier prompt; re-run this cell to confirm.
input_tokens

tensor([[    1,   733, 16289, 28793,   995,   460,   264,  1338, 14113,   288,
           395,   264,  5994, 26077,  1587, 28723,  3604,  5541,   349,   298,
          1038,   264,  2485,  2159,   369,   622,  1316,   272,  1587,   298,
          3397,   272,  5994,   345,  1981,  1067,   811,   978,  3690,   366,
           768, 28739,  1671,  4389,   288,   871,  3941, 28725,  6128, 28725,
           442,  5788,  9242,  5176, 28723,   415,  2899,  1023,  3519,   938,
         25778,   725,  1086, 15559,   737,   272, 15926, 28725, 10294, 28725,
           304, 18978,   302,   272,  5994, 28723,  3604,  2159,  1023,   347,
          3078,   864, 28725,  2622,  5315,  1249, 28725,   304,   459,   347,
          1368, 18185,  3953, 28723,  1263,  2757, 28725,   272,  7918,  5994,
           345, 28743,  7853,   662,  8064,  7481, 28739,  1023,  2111,   264,
          2159,   737,   345, 28737,   947,   298,  3054,   264,  5423,   432,
           611,  3357,   346,  9688,  9880,   395,  