In [None]:
# Install required packages
!pip install -U bitsandbytes transformers accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.cache_utils import DynamicCache
from huggingface_hub import notebook_login
# Authenticate this session with the Hugging Face Hub before any model is downloaded.
notebook_login()  # Login with your HF token



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Checkpoint to load; swap in any model you have access to (e.g. LLaMA 2).
#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit loading settings, handed to the model loader as its quantization config.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)



In [None]:
# Fetch the (fast) tokenizer and the quantized causal LM for `model_id`;
# device placement of the model is left to `device_map="auto"`.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=bnb_config
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Sample knowledge base: three tech-gadget incidents, each described by
# Device / Problem / Fix lines. This is the text that gets run through the
# model once so that its KV cache can be reused for every question.
knowledge_base = """
Incident 1: Smartwatch Sync Issue
Device: TechFit Pro Smartwatch
Problem: Fails to sync with Android phones due to firmware bug.
Fix: Firmware update version 2.1.4 resolved Bluetooth sync issues.

Incident 2: Noise Cancelling Headphones Overheating
Device: SoundBliss NC700
Problem: Excessive heat during long use, especially when ANC is on.
Fix: Manufacturer advised users to limit use, offered refunds.

Incident 3: Fitness Band Step Miscount
Device: FitRun 360
Problem: Inaccurate step count, overestimates by 20–30%.
Fix: Patch 1.0.5 released to fix accelerometer calibration.
"""



In [None]:
# Pre-fill a KV cache with the knowledge text
def preload_knowledge(knowledge):
    """Run ``knowledge`` through the model once and return the filled KV cache.

    Args:
        knowledge: Text to encode with the module-level ``tokenizer``.

    Returns:
        The ``DynamicCache`` instance that was handed to the model for the
        forward pass.
    """
    prefill_cache = DynamicCache()
    knowledge_ids = tokenizer.encode(knowledge, return_tensors="pt")
    knowledge_ids = knowledge_ids.to(model.device)

    # The forward pass is run only for the cache it fills; its outputs are
    # discarded, and no gradients are needed.
    with torch.no_grad():
        model(input_ids=knowledge_ids, past_key_values=prefill_cache, use_cache=True)

    return prefill_cache


In [None]:
# Encode the knowledge base once; the resulting KV cache is reused for every question.
kv_cache = preload_knowledge(knowledge=knowledge_base)

In [None]:
# Generate answer based on cached knowledge
def ask_question(question, kv_cache, max_new_tokens=100):
    input_ids = tokenizer.encode(question, return_tensors="pt").to(model.device)
    output_ids = input_ids.clone()
    next_token = input_ids

    with torch.no_grad():
        for _ in range(max_new_tokens):
            outputs = model(
                input_ids=next_token,
                use_cache=True,
                past_key_values=kv_cache
            )
            next_token_logits = outputs.logits[:, -1, :]
            next_token = next_token_logits.argmax(dim=-1).unsqueeze(-1)

            output_ids = torch.cat([output_ids, next_token], dim=-1)

            if next_token.item() == tokenizer.eos_token_id:
                break

            # Update KV cache
            kv_cache = outputs.past_key_values

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
# Three prompts against the cached knowledge:
#   q1 - general info, q2 - problem-focused, q3 - fix-focused.
q1 = "List all devices mentioned in the incidents. Just names."
q2 = "What was wrong with the TechFit Pro Smartwatch? Keep it brief."
q3 = "What solution was offered for overheating headphones?"

# Ask each question in turn and print its answer followed by a blank line.
r1 = ask_question(question=q1, kv_cache=kv_cache)
print(f"{r1}\n")

r2 = ask_question(question=q2, kv_cache=kv_cache)
print(f"{r2}\n")

r3 = ask_question(question=q3, kv_cache=kv_cache)
print(f"{r3}\n")


List all devices mentioned in the incidents. Just names.
TechFit Pro Smartwatch
SoundBliss NC700
FitRun 360

What was wrong with the TechFit Pro Smartwatch? Keep it brief. 
Firmware bug caused sync issues. 

What solution was offered for overheating headphones? 
Manufacturer advised users to limit use and offered refunds. 

