In [None]:
# Basic setup
!pip install transformers accelerate bitsandbytes datasets
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q faiss-cpu  # Optional if you want to use retrieval

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import random
import numpy as np


In [None]:
model_name = "gpt2"  # You can later switch to LLaMA or Mistral

# Half precision is only dependable on a GPU: on CPU many float16 kernels are
# missing or very slow, so fall back to float32 when CUDA is not available.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",       # let accelerate place the weights on the available device(s)
    torch_dtype=model_dtype,
)
model.eval()  # inference mode (disables dropout)



In [None]:
# Three single-turn prompts; each one ends with an open "AI:" slot for the
# model to complete.
session_contexts = [
    f"User: {utterance}\nAI:"
    for utterance in (
        "What's the weather like today?",
        "Book me a cab to the airport.",
        "Can you recommend a vegetarian recipe?",
    )
]

# Simulate user personalization tokens
user_profile = dict(
    user_id=42,
    preferences=["vegetarian", "travel", "weather"],
)


In [None]:
class KVCacheSimulator:
    """Toy KV cache: a list of ``{"token", "importance"}`` entries.

    Entries are always kept in insertion order, so FIFO eviction removes the
    oldest surviving entry regardless of which other evictions ran before.
    """

    def __init__(self):
        self.cache = []

    def add(self, token_id, importance=1.0):
        """Append an entry for ``token_id`` with the given importance score."""
        self.cache.append({"token": token_id, "importance": importance})

    def evict_fifo(self):
        """Remove the oldest entry; do nothing when the cache is empty."""
        if self.cache:
            self.cache.pop(0)

    def evict_least_important(self):
        """Remove the entry with the lowest importance; no-op when empty.

        Ties are broken in favour of the oldest entry. The remaining entries
        keep their insertion order (the cache is not re-sorted), so a later
        ``evict_fifo`` still removes the oldest entry.
        """
        if not self.cache:
            return
        # min() returns the first index among equal importances.
        victim = min(
            range(len(self.cache)),
            key=lambda pos: self.cache[pos]["importance"],
        )
        self.cache.pop(victim)

    def print_cache(self):
        """Print the cached tokens (without importances) in cache order."""
        print([entry["token"] for entry in self.cache])


In [None]:
def sage_kv_attention_sparsity(attn_weights):
    """Average ``attn_weights`` over its last dimension and return plain Python values.

    This is a simplified placeholder: it only computes a mean and converts the
    result with ``.tolist()``.
    """
    mean_over_last_dim = attn_weights.mean(dim=-1)
    return mean_over_last_dim.tolist()  # Simplified

def ke_diff_similarity(kv_cache):
    """Dot product of each entry's ``"token"`` value with its predecessor's.

    Returns a list of ``len(kv_cache) - 1`` values, in cache order; the list is
    empty when the cache holds fewer than two entries.
    """
    return [
        np.dot(kv_cache[pos]["token"], kv_cache[pos - 1]["token"])
        for pos in range(1, len(kv_cache))
    ]


In [None]:
# Use the third example prompt (index 2 of `session_contexts`).
input_text = session_contexts[2]

# Tokenize, then move the tensors to the device the model lives on.
inputs = tokenizer(input_text, return_tensors="pt")
inputs = inputs.to(model.device)

# Ask for a structured result (sequences plus per-step scores).
outputs = model.generate(
    **inputs,
    max_new_tokens=30,
    return_dict_in_generate=True,
    output_scores=True,
)

# Decode the first (only) returned sequence back to text.
generated = tokenizer.decode(
    outputs.sequences[0],
    skip_special_tokens=True,
)

print("Generated:", generated)


In [None]:
import pandas as pd


In [None]:
# Example cache state, built from (token, importance) pairs
cache = [
    {"token": token_text, "importance": score}
    for token_text, score in (
        ("User", 0.9),
        ("vegetarian", 0.8),
        ("airport", 0.6),
        ("recipe", 0.95),
    )
]

# Create DataFrame
df_cache = pd.DataFrame(data=cache)
display(df_cache)


In [None]:
class KVCacheSimulatorPandas:
    """DataFrame-backed toy KV cache with FIFO and importance-based eviction.

    ``self.df`` holds one row per cached token, in insertion order, with the
    columns ``token`` and ``importance``.
    """

    def __init__(self):
        # Give the empty frame typed columns so ``importance`` is numeric from
        # the start rather than the default ``object`` dtype.
        self.df = pd.DataFrame({
            "token": pd.Series(dtype="object"),
            "importance": pd.Series(dtype="float64"),
        })

    def add(self, token, importance):
        """Append one ``(token, importance)`` row at the end of the cache."""
        new_row = pd.DataFrame([{"token": token, "importance": importance}])
        if self.df.empty:
            # Avoid concatenating onto an empty frame: recent pandas versions
            # deprecate the dtype inference that case relies on.
            self.df = new_row
        else:
            self.df = pd.concat([self.df, new_row], ignore_index=True)

    def evict_fifo(self):
        """Drop the oldest row; an empty cache stays empty."""
        self.df = self.df.iloc[1:].reset_index(drop=True)

    def evict_least_important(self):
        """Drop the row with the smallest importance; no-op on an empty cache."""
        if self.df.empty:
            return
        # to_numeric guards against an object-dtype column, for which idxmin
        # is not supported.
        idx = pd.to_numeric(self.df["importance"]).idxmin()
        self.df = self.df.drop(index=idx).reset_index(drop=True)

    def show_cache(self):
        """Render the current cache with the notebook's rich ``display``."""
        display(self.df)


In [None]:
cache_sim = KVCacheSimulatorPandas()

# Adding tokens
for token_text, score in (
    ("User", 0.9),
    ("vegetarian", 0.8),
    ("airport", 0.6),
    ("recipe", 0.95),
):
    cache_sim.add(token_text, score)

# Showing current cache
cache_sim.show_cache()

# Evicting least important, then showing what is left
cache_sim.evict_least_important()
cache_sim.show_cache()


In [None]:
# Chronological record of evictions: one dict per evicted token.
eviction_log = []

def log_eviction(token, reason):
    """Append a ``{"token": ..., "reason": ...}`` record to the module-level ``eviction_log``."""
    record = dict(token=token, reason=reason)
    eviction_log.append(record)

# Log example
log_eviction(token="airport", reason="least_important")

# Convert to DataFrame (one row per logged eviction)
df_evictions = pd.DataFrame(data=eviction_log)
display(df_evictions)


# 🧠 Personalization-Aware KV Cache Eviction
This notebook explores techniques for prioritizing and evicting token-level key-value (KV) cache entries in multi-turn LLM inference, with a focus on **user-specific behavior** and **attention-based importance**.

The goal is to eventually train a lightweight model that can learn to decide which tokens to retain or evict during long conversations.



In [None]:
!pip install transformers datasets pandas matplotlib --quiet

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np



In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Request the "eager" attention implementation together with
# output_attentions=True: fused attention kernels may not return the
# per-head attention weights the later cells read. The same setting is used
# where this model is reloaded further down in the notebook.
model = AutoModelForCausalLM.from_pretrained(
    "distilgpt2",
    output_attentions=True,
    attn_implementation="eager",
)
model.eval()


In [None]:
# Four-turn example conversation; each string is processed as one turn.
session = list((
    "Hello, I need help with my math homework.",
    "Sure! What topic are you struggling with?",
    "It's about solving quadratic equations.",
    "Great. Can you give me one example?",
))


In [None]:
token_data = []

for turn_id, sentence in enumerate(session):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Last layer, first batch item, last head -> a 2-D (q_len, k_len) matrix.
    last_head_attn = output.attentions[-1][0][-1]
    # Average over the query axis so every token (key position) gets exactly
    # one scalar. Indexing the 2-D matrix directly would hand float() an
    # entire row, which raises for any sentence longer than one token.
    attn_scores = last_head_attn.mean(dim=0).cpu().numpy()  # (k_len,)

    for i, token in enumerate(tokens):
        token_data.append({
            "turn": turn_id,
            "token": token,
            "token_id": int(inputs['input_ids'][0][i]),
            "attention": float(attn_scores[i]),
        })


In [None]:
token_data = []

for turn_id, sentence in enumerate(session):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)

    input_ids = inputs['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # ---- attention handling ----
    # Last layer, first batch item: (heads, q_len, k_len). Average over heads,
    # then over query positions, leaving one score per key position.
    attn = output.attentions[-1][0]
    attn_scores = attn.mean(dim=0).mean(dim=0).cpu().numpy()   # (k_len,)

    # One record per token of this turn.
    token_data.extend(
        {
            "turn":       turn_id,
            "token":      token_text,
            "token_id":   int(token_id),
            "attention":  float(score),
        }
        for token_text, token_id, score in zip(tokens, input_ids, attn_scores)
    )


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd

# 1. Load model (eager attention so the attention weights are returned)
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained(
    "distilgpt2",
    output_attentions=True,
    attn_implementation="eager",
)
model.eval()

# 2. Session input
session = ["Hi, how are you?", "Can you suggest me a good movie?", "Thanks!"]

# 3. Get token-level attention
token_data = []

for turn_id, sentence in enumerate(session):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)

    turn_ids = inputs['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(turn_ids)

    # Last layer, first batch item: (heads, q_len, k_len) -> mean over heads,
    # then over query positions -> (k_len,)
    attn = output.attentions[-1][0]
    attn_scores = attn.mean(dim=0).mean(dim=0).cpu().numpy()

    for position in range(len(tokens)):
        token_data.append(dict(
            turn=turn_id,
            token=tokens[position],
            token_id=int(turn_ids[position]),
            attention=float(attn_scores[position]),
        ))

# 4. Display as table (first 20 token rows)
df = pd.DataFrame(token_data)
display(df.head(20))


In [None]:
# Flag a token for eviction when its attention score is below the threshold.
df['evict'] = df['attention'].lt(0.05)  # Example threshold



In [None]:
# Features: token id, turn index and attention score. Target: the evict flag.
X = df.loc[:, ['token_id', 'turn', 'attention']]
y = df.loc[:, 'evict']


In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE: this rebinds `model`. From here on it is the scikit-learn eviction
# classifier, no longer the distilgpt2 language model loaded earlier.
model = LogisticRegression(
)
model.fit(X=X, y=y)


In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): this cell repeats the previous cell; it re-creates and re-fits
# the same LogisticRegression on the same X and y.
model = LogisticRegression(
)
model.fit(X=X, y=y)


In [None]:
from sklearn.metrics import accuracy_score

# Predict on the same data the model was fitted on (just for a basic check;
# this is not a held-out evaluation)
y_pred = model.predict(X)

# See accuracy
training_accuracy = accuracy_score(y_true=y, y_pred=y_pred)
print("Training Accuracy:", training_accuracy)


In [None]:
# Put the classifier's predictions next to the threshold-based labels.
df['predicted_evict'] = y_pred
df.loc[:, ['token', 'attention', 'evict', 'predicted_evict']].head(10)


In [None]:
import torch.nn as nn

class TokenImportanceModel(nn.Module):
    """Small MLP (input_dim -> 32 -> 16 -> 1) ending in a sigmoid.

    ``forward`` returns one value in (0, 1) per input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names double as state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_dim, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()  # Output score between 0 and 1

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> sigmoid."""
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        hidden = self.fc2(hidden)
        hidden = self.relu(hidden)
        logit = self.fc3(hidden)
        return self.sigmoid(logit)


***The prompt-tagging system — the main idea of this notebook — starts here.***

In [None]:
import pandas as pd

# Sample prompts and topic labels, kept together as (prompt, label) pairs
labelled_prompts = [
    ("How does retrieval-augmented generation work?", "LLM"),
    ("Can I eat three chilas at once?", "Weight loss"),
    ("How to improve aim in Valorant?", "Gaming"),
    ("Tell me about KV cache in LLMs", "LLM"),
    ("Give me a diet plan for weight loss", "Weight loss"),
    ("What is the best crosshair for Vandal?", "Gaming"),
    ("Can you help me with my gym routine?", "Weight loss"),
    ("Difference between BERT and GPT?", "LLM"),
    ("Suggest me good graphics settings for Valorant", "Gaming"),
    ("Will skipping dinner help in fat loss?", "Weight loss"),
]

# Column-oriented view of the same pairs
data = {
    "prompt": [prompt_text for prompt_text, _ in labelled_prompts],
    "label": [topic for _, topic in labelled_prompts],
}

df = pd.DataFrame(data)
df



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Encode the labels (text to integers)
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

# TF-IDF vectorization (fitted on every prompt in `df`)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['prompt'])

# Train/test split: 20% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Train the topic classifier on the TF-IDF features first. Without this step
# `model` is still the eviction classifier fitted on three numeric columns
# (token_id, turn, attention) and cannot score the TF-IDF matrix.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Only pass the labels that actually appear in y_test, so `labels` and
# `target_names` always have matching lengths on this tiny test split.
unique_test_labels = np.unique(y_test)
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=unique_test_labels,
        target_names=label_encoder.inverse_transform(unique_test_labels),
        zero_division=0,   # avoids divide-by-zero warnings for small datasets
    ),
)


In [None]:
def classify_prompt(prompt):
    """Return the predicted topic label for a single prompt string.

    Relies on the module-level ``vectorizer``, ``model`` and ``label_encoder``
    as they exist when the function is called.
    """
    features = vectorizer.transform([prompt])
    encoded_prediction = model.predict(features)
    decoded_prediction = label_encoder.inverse_transform(encoded_prediction)
    return decoded_prediction[0]

# Test: print the predicted topic for a few unseen prompts
for sample_prompt in (
    "Help me with fine-tuning GPT models",
    "Can I have cold coffee on keto?",
    "How to rank up in Valorant?",
):
    print(classify_prompt(sample_prompt))


In [None]:
# Example prompts for the "llm" topic (30 entries). The cell that builds the
# combined DataFrame pairs this list with 30 "llm" labels, so keep the count
# in sync if prompts are added or removed.
llm_prompts = [
    "What are key-value caches in transformers?",
    "Explain retrieval-augmented generation with an example.",
    "How can LLMs adapt to user behavior over time?",
    "What is SAGE-KV in attention mechanisms?",
    "How does a sliding window attention work in transformers?",
    "Which models support dynamic context expansion?",
    "Compare rotary positional embeddings and absolute embeddings.",
    "What does attention sparsity mean in transformer models?",
    "How does fine-tuning differ from instruction tuning?",
    "Can BART be used with FAISS for retrieval?",
    "Explain token prioritization in multi-turn LLMs.",
    "Are KV caches useful in inference optimization?",
    "What is the benefit of dropout in transformers?",
    "How do LoRA adapters modify transformer layers?",
    "What’s the best cache eviction strategy for long sessions?",
    "What is dynamic prompt injection?",
    "How can user personalization improve chatbot performance?",
    "What is the role of embeddings in document retrieval?",
    "How does FAISS work under the hood?",
    "What are sentence embeddings used for?",
    "Explain BERT vs SBERT in simple terms.",
    "Which LLMs support streaming token output?",
    "What are attention gates used for in LLMs?",
    "How do we quantify token importance in a KV cache?",
    "What is token salience ranking?",
    "Is KV cache storage a bottleneck in LLMs?",
    "Explain the concept of token-level importance modeling.",
    "How does OpenAI's GPT handle multi-user memory?",
    "What are the risks of incorrect KV cache eviction?",
    "Which metrics are used to evaluate cache-aware LLMs?"
]


In [None]:
# Example prompts for the "weight_loss" topic (30 entries). The cell that
# builds the combined DataFrame pairs this list with 30 "weight_loss" labels,
# so keep the count in sync if prompts are added or removed.
weight_loss_prompts = [
    "Can I eat 2 chillas at once during diet?",
    "Is cold coffee okay for weight loss?",
    "What can I replace jaggery with in tea?",
    "How many calories in paneer bhurji?",
    "Can I do intermittent fasting with Indian food?",
    "Is roti better than rice for weight loss?",
    "What time should I stop eating at night?",
    "How to plan a vegetarian low-carb diet?",
    "Can I drink fruit juice while dieting?",
    "Is ghee harmful during weight loss?",
    "Is skipping dinner better than breakfast?",
    "How often should I eat during IF?",
    "What is a good weight loss chutney recipe?",
    "Can I use rock salt in weight loss meals?",
    "Is poha a good breakfast for dieting?",
    "Can I drink buttermilk while fasting?",
    "What are slow carbs and fast carbs?",
    "Is it okay to have cheat meals?",
    "How many steps should I walk per day?",
    "Can I drink flavored water during fast?",
    "What’s a good low-calorie Indian dessert?",
    "How to reduce bloating during diet?",
    "Can I eat after 8 PM during weight loss?",
    "Is coconut water good for weight loss?",
    "Can I use peanut butter in my diet?",
    "Should I stop dairy to lose weight?",
    "What foods cause water retention?",
    "Best home workouts without equipment?",
    "Can I add lemon to my green tea?",
    "Does fasting affect metabolism long-term?"
]


In [None]:
# Example prompts for the "valorant" topic (30 entries). The cell that builds
# the combined DataFrame pairs this list with 30 "valorant" labels, so keep
# the count in sync if prompts are added or removed.
valorant_prompts = [
    "How to fix low FPS in Valorant?",
    "Why is Valorant stuck at 30 FPS?",
    "What’s the best sensitivity for beginners?",
    "How to check GPU usage while gaming?",
    "Should I use low latency mode in NVIDIA settings?",
    "How to enable high performance for Valorant?",
    "Which is the best agent for ranked solo play?",
    "How to reduce input lag in Valorant?",
    "Best crosshair settings for headshots?",
    "How to use killjoy effectively?",
    "Can I play Valorant with integrated graphics?",
    "How to set Valorant on dedicated GPU?",
    "What are the best system settings for smoother gameplay?",
    "How to update drivers for Valorant performance?",
    "Should I cap FPS or leave it unlocked?",
    "Why does my ping spike in Valorant?",
    "Can Discord cause FPS drops in Valorant?",
    "What does 'anti-aliasing' do in Valorant settings?",
    "Best resolution for competitive Valorant?",
    "What is 4:3 stretched resolution and is it allowed?",
    "How to prioritize RAM for gaming?",
    "Is Valorant CPU or GPU heavy?",
    "What is the best monitor refresh rate for Valorant?",
    "How to check if Valorant is using dedicated GPU?",
    "Does RAM size affect FPS in Valorant?",
    "How to fix Valorant update errors?",
    "Can I undervolt CPU for better temps while gaming?",
    "Does Valorant support dual monitors?",
    "What is the ideal aim training routine?",
    "How to record Valorant gameplay without lag?"
]


In [None]:
import pandas as pd

# Combine prompts and labels into a DataFrame.
# Label counts are derived from the list lengths, so prompts and labels stay
# aligned even if a prompt list grows or shrinks.
data = pd.DataFrame({
    "prompt": llm_prompts + weight_loss_prompts + valorant_prompts,
    "label": (
        ["llm"] * len(llm_prompts)
        + ["weight_loss"] * len(weight_loss_prompts)
        + ["valorant"] * len(valorant_prompts)
    ),
})

# Shuffle for good measure. The seed keeps the row order, and therefore the
# later seeded train/test split, identical across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview
data.head()


In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# Print mapping (class name -> integer code)
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
print(dict(zip(class_names, class_codes)))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for every prompt. The vectorizer is fitted on ALL prompts,
# including the ones that are later held out for testing.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.loc[:, 'prompt'])
y = data.loc[:, 'label_encoded']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the prompts for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Topic classifier trained on the TF-IDF training split.
clf = LogisticRegression(
    max_iter=2000,
)
clf.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification Report
# Pass the full label set explicitly so `target_names` always lines up with
# the reported rows, even if a class is missing from this test split.
all_labels = label_encoder.transform(label_encoder.classes_)
print(
    classification_report(
        y_test,
        y_pred,
        labels=all_labels,
        target_names=label_encoder.classes_,
        zero_division=0,  # a class with no test samples reports 0 without a warning
    )
)


In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline

# Cross-validate on the raw prompts with TF-IDF inside the pipeline, so the
# vectorizer is re-fitted on each fold's training part only. Scoring `clf`
# on the pre-computed `X` would let every fold's held-out prompts influence
# the IDF statistics, because `X` was vectorized on the full corpus.
cv_pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=2000))
scores = cross_val_score(cv_pipeline, data['prompt'], y, cv=5)
print("Cross-validation scores:", scores)
print("Average:", scores.mean())
