# Inference Notebook for Prompt Engineering

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model(
    model_name: str = "google/gemma-7b-it",
) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a 4-bit quantized causal language model together with its tokenizer.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair. The tokenizer is prepared for batched
        generation: it pads on the left and is guaranteed to have a pad token.
    """
    # NF4 4-bit weights, float16 compute, double quantization disabled.
    config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True,
        quantization_config=config,
        attn_implementation="flash_attention_2",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # A decoder-only model continues from the last position of every row, so
    # padded batches must carry their padding on the left. Set it explicitly
    # instead of relying on the default of whichever checkpoint was requested.
    tokenizer.padding_side = "left"

    # Tokenizing with padding=True fails when no pad token is defined; fall
    # back to the end-of-sequence token in that case.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [3]:
def generate_prompt(movie: str) -> str:
    """Build the instruction that asks the model to play a user seeking ``movie``.

    Args:
        movie: Title of the hidden movie; it is inserted verbatim between
            double quotes.

    Returns:
        The instruction text as a single paragraph, terminated by a newline
        followed by four spaces.
    """
    sentences = (
        "You are a person interacting with a movie recommendation system.",
        f'Your goal is to make a short request that will help the system to suggest the movie "{movie}" without mentioning its title, characters, or ANY plot elements.',
        "The response should instead use GENERAL characteristics like the genre, tone, and themes of the movie.",
        "Your request should be concise, sound conversational, and not be too enthusiastic.",
        'For example, the hidden movie "Crazy Stupid Love" should give a request like "I\'m looking for a silly romantic comedy with a happy ending. Any suggestions?"',
        "Reply ONLY with the human-like request for a movie.",
        "DO NOT include any other text.",
    )

    # The sentences are separated by single spaces; the tail reproduces the
    # newline and indentation that close the prompt.
    return " ".join(sentences) + "\n    "

In [4]:
class SimulatorDataset(Dataset):
    """Dataset that yields one chat-templated prompt string per movie row."""

    def __init__(self, movies: pd.DataFrame, tokenizer: AutoTokenizer) -> None:
        """Keep references to the movie table and the tokenizer.

        Args:
            movies: Table with a ``"title"`` column, one row per movie.
            tokenizer: Object whose ``apply_chat_template`` formats the prompt.
        """
        self.tokenizer = tokenizer
        self.movies = movies

    def __len__(self):
        """Return the number of movie rows."""
        return len(self.movies)

    def __getitem__(self, idx):
        """Return the templated prompt for the movie at position ``idx``."""
        title = self.movies.iloc[idx]["title"]

        # Single-turn conversation: the instruction is the only user message.
        messages = [{"role": "user", "content": generate_prompt(title)}]

        # Returned as text (not token ids), ending with the generation prompt
        # so the model answers as the assistant turn.
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

In [5]:
def simulate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    dataloader: DataLoader,
    max_length: int = 2048,
) -> list:
    """Generate one response for every prompt yielded by ``dataloader``.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        dataloader: Iterable of batches of prompt strings that have already
            been passed through the chat template.
        max_length: Maximum number of *newly generated* tokens per prompt. It
            is forwarded to ``generate`` as ``max_new_tokens``, so it does not
            bound the total sequence length.

    Returns:
        The decoded responses, in the order the prompts were yielded.
    """
    responses = []

    for batch in tqdm(dataloader, desc="Simulating", unit="batch"):
        # The prompts are already chat-templated, so no further special tokens
        # are added; the encoded batch is moved to the model's device.
        inputs = tokenizer(
            batch, return_tensors="pt", add_special_tokens=False, padding=True
        ).to(model.device)

        outputs = model.generate(**inputs, max_new_tokens=max_length)

        # The generated sequences start with the (padded) prompt tokens. Drop
        # them by position rather than splitting the decoded text on a
        # model-specific turn marker, which would truncate any response that
        # happens to contain that marker.
        prompt_length = inputs["input_ids"].shape[1]
        completions = outputs[:, prompt_length:]

        responses.extend(
            tokenizer.batch_decode(completions, skip_special_tokens=True)
        )

    return responses

In [6]:
# Load the quantized model and its tokenizer, using load_model's default
# checkpoint.
model, tokenizer = load_model()

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.10s/it]


In [7]:
# Load the movie catalogue, discard the unused "genres" column and give the
# id column a snake_case name.
movies = (
    pd.read_csv("data/movies.csv")
    .drop(columns="genres")
    .rename(columns={"movieId": "movie_id"})
)

movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [None]:
# Optional quick-trial subset: keep only the first 10 movies (skip this cell to
# process every movie). The explicit copy detaches the subset from the full
# frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
movies = movies.head(10).copy()

In [8]:
# Wrap the movie table so that each item is a ready-to-tokenize prompt.
dataset = SimulatorDataset(movies=movies, tokenizer=tokenizer)

# Serve the prompts one at a time, in table order.
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

In [9]:
# Generate one simulated user request per prompt in the dataloader.
responses = simulate(model=model, tokenizer=tokenizer, dataloader=dataloader)

Simulating:   0%|          | 174/62423 [04:30<26:54:55,  1.56s/batch]


KeyboardInterrupt: 

In [None]:
# A partial run (for example an interrupted simulation) leaves fewer responses
# than movies; fail with a clear message instead of a misaligned assignment.
assert len(responses) == len(movies), (
    f"Expected {len(movies)} responses but got {len(responses)}; "
    "re-run the simulation cell to completion."
)

# assign() builds a new frame instead of inserting a column in place, so the
# cell can be re-run safely and never writes into a slice of another frame.
movies = movies.assign(prompt=responses)

movies.head()

In [None]:
# Persist the movies together with their generated prompts, omitting the index.
movies.to_csv(path_or_buf="data/prompts.csv", index=False)