# Inference Notebook for Prompt Engineering

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model(
    model_name: str = "google/gemma-2b-it",
) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a 4-bit quantized causal language model together with its tokenizer.

    Args:
        model_name: Identifier handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` pair, in that order.
    """
    # 4-bit NF4 weights with float16 compute; double quantization is left off.
    quantization = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,
        load_in_4bit=True,
    )

    # Tuple elements are evaluated left to right, so the model is still
    # loaded before the tokenizer.
    return (
        AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization,
            low_cpu_mem_usage=True,
        ),
        AutoTokenizer.from_pretrained(model_name),
    )

In [3]:
def generate_prompt(movie: str):
    """Build the single-turn chat prompt that asks the model to request `movie`.

    The prompt is one user turn instructing the model to describe the movie
    without naming it, followed by the opening marker of the model turn so
    that generation continues as the model's reply.

    Args:
        movie: Title inserted verbatim into the instruction text.

    Returns:
        The complete prompt string, including the turn markers.
    """
    # Adjacent literals are concatenated at compile time; the pieces below
    # join into exactly one line of instruction text.
    user_turn = (
        "<bos><start_of_turn>user\n"
        "You are a movie enthusiast browsing for something to watch. "
        "You're in the mood for a specific type of film, but you can't quite put your finger on it. "
        "You want to provide the recommendation system with enough hints to suggest the perfect movie for you. "
        f"The movie that you want to watch is: {movie}. "
        "Now, craft a sentence that will help the recommendation system suggest this movie to you, "
        "WITHOUT mentioning the title or any specifics like character names. "
        "Think about the genre, the tone, or any other characteristic that might help narrow down your search. "
        "Remember, the goal is to guide the recommendation engine to suggest the hidden movie you have in mind. "
        "Your response should sound conversational and not be too enthusiastic."
        "Reply ONLY with the human-like request. "
        "DO NOT include any other text."
        "<end_of_turn>\n"
    )
    model_turn = "<start_of_turn>model\n"
    return user_turn + model_turn

In [4]:
class SimulatorDataset(Dataset):
    """Dataset whose items are simulator prompts, one per row of a movie table."""

    def __init__(self, movies: pd.DataFrame) -> None:
        """Keep a reference to `movies`; prompts are built on access, not here."""
        self.movies = movies

    def __len__(self):
        """Return the number of rows in the movie table."""
        row_count = len(self.movies)
        return row_count

    def __getitem__(self, idx):
        """Return the prompt for the row at position `idx`."""
        # Positional lookup: the frame's index labels are not consulted.
        row = self.movies.iloc[idx]
        title = row["title"]
        return generate_prompt(title)

In [5]:
def simulate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    dataloader: DataLoader,
    max_length: int = 2048,
) -> list:
    """Generate one response for every prompt yielded by `dataloader`.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer that matches `model`.
        dataloader: Yields batches of prompt strings.
        max_length: Upper bound on the number of *newly generated* tokens per
            prompt (forwarded as ``max_new_tokens``).

    Returns:
        The decoded responses, in the order the prompts were yielded.
    """
    responses = []

    for batch in tqdm(dataloader, desc="Simulating", unit="batch"):
        # No special tokens are added at tokenization time; the prompt text
        # is passed through as-is. Padding aligns prompts within a batch.
        inputs = tokenizer(
            batch, return_tensors="pt", add_special_tokens=False, padding=True
        ).to(model.device)

        # Generate responses
        outputs = model.generate(**inputs, max_new_tokens=max_length)

        # For a causal LM, generate() returns the (padded) prompt followed by
        # the continuation, so every row shares the same prompt width. Slicing
        # by that width isolates the response without searching the decoded
        # text for a "\nmodel\n" marker, which would truncate any response
        # that happens to contain that substring itself.
        prompt_width = inputs["input_ids"].shape[1]
        responses.extend(
            tokenizer.batch_decode(
                outputs[:, prompt_width:], skip_special_tokens=True
            )
        )

    return responses

In [13]:
# Load the movie table, discard the unused genre column, and
# give the id column a snake_case name
movies = (
    pd.read_csv("data/movies.csv")
    .drop(columns="genres")
    .rename(columns={"movieId": "movie_id"})
)

# Preview the first rows
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [14]:
# Wrap the movie table so that each item is a ready-to-send prompt
dataset = SimulatorDataset(movies=movies)

# Serve the prompts one per batch, in table order
dataloader = DataLoader(dataset=dataset, shuffle=False, batch_size=1)

In [8]:
# Instantiate the quantized model and its tokenizer using the default checkpoint
(model, tokenizer) = load_model()

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


bin c:\Users\troyd\Documents\Programs\ECE364D\CERS\.venv\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.21s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [15]:
# Run generation over every prompt and collect the model's replies
responses = simulate(model=model, tokenizer=tokenizer, dataloader=dataloader)

Simulating:   0%|          | 17/62423 [01:08<70:16:25,  4.05s/batch]


KeyboardInterrupt: 