## Inference

In [9]:
import time
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM, AutoModel
import torch
import torch.nn.functional as F

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the sentence-embedding tokenizer and encoder, then place the encoder on `device`
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = model.to(device)

def optimize_memory():
    """Ask PyTorch to release the CUDA memory it has cached but is not using."""
    torch.cuda.empty_cache()
    return None

In [10]:
# Mean Pooling - Take attention mask into account for correct averaging
def _mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[
        0
    ]  # First element of model_output contains all token embeddings
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    ).to(device)
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


def embedd_sequences(sequences) -> "list[list[float]]":
    """Embed one or more texts as L2-normalized, mean-pooled sentence vectors.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sequences: A string or a list of strings to embed.

    Returns:
        A list with one embedding per input sequence, each embedding being a
        plain Python list of floats (the tensor is moved to the CPU and
        converted before returning). An empty list/tuple yields ``[]``.
    """
    # Nothing to embed: return early instead of handing an empty batch to the
    # tokenizer and model.
    if isinstance(sequences, (list, tuple)) and len(sequences) == 0:
        return []

    # Tokenize all sequences at once with padding
    encoded_input = tokenizer(
        sequences, padding=True, truncation=True, return_tensors="pt"
    )

    # Move input tensors to device
    input_ids = encoded_input["input_ids"].to(device)
    attention_mask = encoded_input["attention_mask"].to(device)

    with torch.no_grad():  # disable gradient calculation for inference
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Pool with the mask that already lives on the model's device, so the
        # pooling step does not have to transfer it a second time.
        sentence_embeddings = _mean_pooling(outputs, attention_mask)

        # Normalize embeddings to unit L2 norm
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # Hand back plain Python lists on the CPU for further processing
    return sentence_embeddings.cpu().numpy().tolist()

In [11]:
# Additional setup to handle padding properly
def setup_tokenizer_and_model(model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
    """Load a causal language model and its tokenizer with padding configured.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the tokenizer and the model. Defaults to the Llama 3.2 3B
            Instruct checkpoint this notebook uses.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is guaranteed to have a
        pad token (falling back to its EOS token), and the model config's
        ``pad_token_id`` / ``eos_token_id`` are set from the tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # Use half precision to save memory
        device_map="auto",  # Automatically handle device placement
    )

    # Ensure pad token is set; reuse EOS when the tokenizer defines none
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Configure model padding settings from the tokenizer (single source of truth)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    return tokenizer, model


In [12]:
def get_model_output(prompt: str, model, tokenizer):
    """Sample one completion for ``prompt`` and return it as decoded text.

    Args:
        prompt: Input text; truncated to 512 tokens when encoded.
        model: Causal language model exposing ``generate``, ``config`` and
            ``device``.
        tokenizer: Tokenizer matching ``model``. If it has no pad token, its
            EOS token is installed as the pad token (the tokenizer and
            ``model.config`` are modified in place).

    Returns:
        The decoded first generated sequence with special tokens removed.
        The sequence returned by ``generate`` is decoded in full, so the text
        includes whatever prompt tokens that sequence contains.
    """
    # Make sure pad_token is set BEFORE tokenizing: the call below requests
    # padding, so the fallback has to be in place by then.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

    # Encode with padding and attention mask
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )

    # Move input tensors to wherever the model actually lives rather than to a
    # notebook-level global, since the model may have been placed automatically.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # NOTE(review): `max_length` in `generate` is documented as prompt length
    # plus new tokens, so a prompt already at 512 tokens leaves no room for a
    # continuation — consider `max_new_tokens` if that matters.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=512,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Move output back to CPU for decoding
    outputs = outputs.cpu()
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [13]:
# Release cached GPU memory before loading the generation model
optimize_memory()

sequence = "What's a polar bear?"
# NOTE(review): this rebinds the notebook-level `tokenizer` and `model` created
# for the embedding model, so `embedd_sequences` will use the Llama objects if
# it is called after this cell — confirm that is intended.
tokenizer, model = setup_tokenizer_and_model()

try:
    response = get_model_output(sequence, model=model, tokenizer=tokenizer)

    print("res1: ", response)

    # Disabled RAG example (the Slovenian prompt asks "What is a polar bear?"):
    # response = get_model_output_with_rag(
    #     "Kaj je severni medved?",
    #     "Severni medved je velika zver, ki živi na severu in se prehranjuje s tjulni.",
    # )

    # print("\nres2: ", response)
except RuntimeError as e:
    if "out of memory" in str(e):
        # Reuse the notebook's helper so cache clearing lives in one place
        optimize_memory()
        print("GPU out of memory, try again with cleared cache.")
    else:
        # Bare raise re-raises the active exception with its original traceback
        raise



Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.37s/it]


res1:  What's a polar bear? (part 2)
This is the second part of our exploration of what a polar bear is. In part 1, we learned that polar bears are the largest land carnivores, and that their fur is actually transparent, not white. In this part, we'll delve deeper into the biology and behavior of these amazing animals.

**Habitat and Diet**
Polar bears live in the Arctic Circle, specifically in the Arctic Ocean and surrounding land masses. They are well adapted to their environment, with a thick layer of fat and a white coat that helps them blend in with their snowy surroundings. Their main source of food is seals, which provide them with the energy and nutrients they need to survive. They hunt seals by waiting at the edge of breathing holes in the ice, then ambushing them when they come up to breathe.

**Hibernation**
Unlike some other bears, polar bears do not truly hibernate. Instead, they enter a state of "walking hibernation" or "torpor", where their heart rate and metabolism slow