In [None]:
import json
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from huggingface_hub import login
import os
import pandas as pd  # For visualization

In [None]:
class EmbeddingCalculator:
    """
    Turn a piece of text into a single embedding vector with a Hugging Face model.

    The tokenizer and model are loaded once in ``__init__``; every call to
    ``calculate_embedding`` runs one forward pass and pools the last hidden
    layer into one vector.
    """

    # Pooling strategies understood by ``calculate_embedding``.
    POOLING_MODES = ("mean", "last", "cls")

    def __init__(self, model_name, hf_token):
        """
        Initialize the embedding calculator with a specific model.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
            hf_token: Hugging Face access token used to download the model.
        """
        # `token` is the current keyword; `use_auth_token` is deprecated in transformers.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        self.model = AutoModel.from_pretrained(
            model_name,
            token=hf_token,
            device_map="auto",  # Automatically distribute model layers across devices
            torch_dtype=torch.float16  # Load the model in half precision to save memory
        )

    def calculate_embedding(self, text, pooling="mean"):
        """
        Calculate the embedding for a given text.

        Args:
            text: Text to embed; it is truncated to 512 tokens by the tokenizer.
            pooling: How the per-token states of the last hidden layer are
                reduced to one vector:
                  * ``"mean"`` (default) - attention-mask-weighted average of
                    all tokens.
                  * ``"last"`` - state of the last attended token (assumes
                    right padding, which always holds for a single text).
                  * ``"cls"`` - state of the first token. Only meaningful for
                    bidirectional encoders: in a causal model the first token
                    cannot attend to the rest of the text, so its state is the
                    same for every input.

        Returns:
            numpy.ndarray: the pooled embedding with singleton dimensions
            squeezed out (1-D for a single text).

        Raises:
            ValueError: if ``pooling`` is not one of ``POOLING_MODES``.
        """
        # Validate before paying for tokenization and a forward pass.
        if pooling not in self.POOLING_MODES:
            raise ValueError(
                f"Unknown pooling mode {pooling!r}; expected one of {self.POOLING_MODES}."
            )

        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        # Move inputs to the same device as the model
        inputs = {key: value.to(self.model.device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            hidden = outputs.hidden_states[-1]  # indexed as (batch, token, feature)

            mask = inputs.get("attention_mask")
            if mask is None:
                # No mask supplied by the tokenizer: treat every position as a real token.
                mask = torch.ones(hidden.shape[:2], dtype=torch.long)
            # With device_map="auto" the output may live on a different device than the inputs.
            mask = mask.to(hidden.device)

            if pooling == "cls":
                pooled = hidden[:, 0, :]
            elif pooling == "last":
                last_index = mask.sum(dim=1).clamp(min=1) - 1
                rows = torch.arange(hidden.shape[0], device=hidden.device)
                pooled = hidden[rows, last_index]
            else:
                # Accumulate in float32: summing many half-precision activations can overflow.
                weights = mask.unsqueeze(-1).to(torch.float32)
                token_sum = (hidden.to(torch.float32) * weights).sum(dim=1)
                token_count = weights.sum(dim=1).clamp(min=1.0)
                pooled = (token_sum / token_count).to(hidden.dtype)

            embedding = pooled.squeeze().cpu().numpy()
        return embedding

In [None]:
class DynamicMemory:
    """Dictionary-backed, in-process store that maps a key to its embedding."""

    def __init__(self):
        """
        Start with an empty store.
        """
        # Plain dict: lookups by key, insertion order kept for display.
        self.memory = dict()

    def add_embedding(self, key, embedding):
        """
        Store `embedding` under `key`, replacing any previous entry for that key.
        """
        self.memory[key] = embedding

    def get_embedding(self, key):
        """
        Return the embedding stored under `key`, or None when the key is unknown.
        """
        return self.memory.get(key)

    def visualize_memory(self):
        """
        Print a table with one row per stored entry: its key and the
        `.shape` of its embedding. Prints a notice when nothing is stored.
        """
        if len(self.memory) == 0:
            print("Dynamic memory is empty.")
            return

        # One record per entry; only the shape is shown, not the vector itself.
        rows = []
        for label, vector in self.memory.items():
            rows.append({"Key": label, "Embedding Shape": vector.shape})
        print(pd.DataFrame(rows))


In [None]:
def save_single_embedding(embedding, filename):
    """
    Save a single embedding to a NumPy file.

    Args:
        embedding: Array-like data to write with ``np.save``.
        filename: Destination path (``str`` or ``os.PathLike``) or an open
            binary file object. For paths, missing parent directories are
            created first so saving into a fresh output folder does not fail.
    """
    # Only path-like targets have a directory to prepare; file objects are passed through.
    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filename))
        if parent:
            os.makedirs(parent, exist_ok=True)
    np.save(filename, embedding)

In [None]:
def _load_qna_json():
    """Get the input JSON (Colab upload, else a typed path) and return its parsed content."""
    # Only the import decides which environment we are in; keep the try block that narrow.
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is not None:
        print("Please upload the JSON file containing the original query and related questions.")
        uploaded = files.upload()  # Returns a dictionary of uploaded files
        if not uploaded:
            raise ValueError("no file was uploaded")
        json_file = next(iter(uploaded))  # Get the first uploaded file
    else:
        json_file = input("Enter the path to the JSON file: ").strip()

    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)


def main():
    """
    Run the embedding pipeline end to end.

    Steps: authenticate with Hugging Face, load and validate the input JSON
    (an object with an 'original_query' string and a 'qna_pairs' list of
    {'question', 'answer'} objects), embed the original query and every valid
    pair, save each embedding as a .npy file, keep them in a DynamicMemory,
    and finally print the memory table and a retrieval example.

    Problems with the input are reported with a message and end the run
    (or skip the offending pair) instead of raising.
    """
    import getpass  # local import: only needed for the hidden token prompt

    # Authenticate with Hugging Face
    print("Authenticating with Hugging Face...")
    # Prefer the HF_TOKEN environment variable; otherwise prompt without echoing,
    # so the token never ends up in the notebook's saved output.
    hf_token = os.environ.get("HF_TOKEN", "").strip()
    if not hf_token:
        hf_token = getpass.getpass("Enter your Hugging Face token: ").strip()
    if not hf_token:
        print("No Hugging Face token provided.")
        return
    login(token=hf_token)

    # Load and validate the input before loading the (large) model, so bad input fails fast.
    try:
        data = _load_qna_json()
    except (OSError, ValueError) as exc:  # unreadable file, invalid JSON, or nothing uploaded
        print(f"Could not load the JSON file: {exc}")
        return

    # Extract original query and related questions
    if not isinstance(data, dict):
        # e.g. a top-level JSON array, which has no .get()
        print(
            f"Invalid JSON format. Expected a JSON object at the top level, got {type(data).__name__}. "
            "Ensure the file contains 'original_query' and 'qna_pairs' keys."
        )
        return
    original_query = data.get("original_query")
    qna_pairs = data.get("qna_pairs")  # List of question-answer pairs
    if (
        not isinstance(original_query, str)
        or not original_query.strip()
        or not isinstance(qna_pairs, list)
        or not qna_pairs
    ):
        print("Invalid JSON format. Ensure the file contains 'original_query' and 'qna_pairs' keys.")
        return

    print(f"Original query: {original_query}")
    print(f"Number of QnA pairs: {len(qna_pairs)}")

    # Specify the model name
    model_name = "meta-llama/Llama-3.2-3B-Instruct"  # Replace with your model

    # Initialize the embedding calculator and dynamic memory
    calculator = EmbeddingCalculator(model_name, hf_token)
    memory = DynamicMemory()

    # Calculate and save the original query embedding
    print("Calculating embedding for the original query...")
    original_query_embedding = calculator.calculate_embedding(original_query)
    save_single_embedding(original_query_embedding, "original_query.npy")
    memory.add_embedding("original_query", original_query_embedding)
    print("Original query embedding saved to 'original_query.npy'.")

    # Process and save embeddings dynamically for each QnA pair
    print("Calculating embeddings for questions and answers...")
    output_dir = "dynamic_embeddings"
    os.makedirs(output_dir, exist_ok=True)  # np.save does not create missing directories
    for i, pair in enumerate(qna_pairs, start=1):
        question = pair.get("question") if isinstance(pair, dict) else None
        answer = pair.get("answer") if isinstance(pair, dict) else None
        if not isinstance(question, str) or not question or not isinstance(answer, str) or not answer:
            # Keep going: one malformed entry should not discard the rest of the run.
            print(f"Skipping QnA Pair {i}: expected an object with non-empty 'question' and 'answer' strings.")
            continue

        print(f"Processing QnA Pair {i}:")
        print(f"Question: {question}")
        print(f"Answer: {answer}")

        # Calculate embeddings
        question_embedding = calculator.calculate_embedding(question)
        answer_embedding = calculator.calculate_embedding(answer)

        # Add embeddings to dynamic memory
        memory.add_embedding(f"question_{i}", question_embedding)
        memory.add_embedding(f"answer_{i}", answer_embedding)

        # Save embeddings to files
        save_single_embedding(question_embedding, f"{output_dir}/question_{i}.npy")
        save_single_embedding(answer_embedding, f"{output_dir}/answer_{i}.npy")

    print("All embeddings for questions and answers saved in 'dynamic_embeddings/'.")

    # Visualize dynamic memory
    print("\nVisualizing dynamic memory:")
    memory.visualize_memory()

    # Retrieval example
    print("\nRetrieving first question embedding from memory:")
    retrieved_embedding = memory.get_embedding("question_1")
    if retrieved_embedding is not None:
        print("Retrieved embedding shape:", retrieved_embedding.shape)
    else:
        print("Embedding not found.")

# Run the pipeline only when this cell/script is executed directly, not when imported as a module.
if __name__ == "__main__":
    main()

Authenticating with Hugging Face...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Enter your Hugging Face token: <redacted>




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Please upload the JSON file containing the original query and related questions.


Saving questions_answers.json to questions_answers.json


AttributeError: 'list' object has no attribute 'get'