### Split the data into chunks to maintain contextual information, then embed and index them

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load the sentence-embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load text data.  The encoding is passed explicitly so the file is decoded the
# same way on every platform instead of falling back to the locale default.
# NOTE(review): assumes combined_text.txt is UTF-8 - confirm against how it was produced.
file_path = 'combined_text.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Chunk the data (e.g., by word count)
def chunk_text(text, chunk_size=100, overlap=0):
    """Split ``text`` into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split. Words are separated with ``str.split()``, so
            any run of whitespace counts as a single separator.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words each chunk shares with the previous one.
            Must satisfy ``0 <= overlap < chunk_size``. The default of 0
            produces disjoint chunks.

    Returns:
        A list of chunk strings with the words re-joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is out
            of range.
    """
    if chunk_size <= 0:
        # A negative size would otherwise silently produce no chunks at all.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must be in [0, chunk_size), got {overlap} for chunk_size={chunk_size}"
        )

    words = text.split()
    if not words:
        return []

    step = chunk_size - overlap
    # Stop before a start position whose words are all already contained in
    # the previous chunk's overlap; with overlap=0 this is simply len(words).
    stop = max(len(words) - overlap, 1)
    return [' '.join(words[start:start + chunk_size]) for start in range(0, stop, step)]

chunks = chunk_text(text, chunk_size=100)
if not chunks:
    # Fail early with a clear message instead of an obscure shape error below.
    raise ValueError(f"No text found in {file_path!r}; nothing to index")

# Generate embeddings for the chunks.
# FAISS expects a C-contiguous float32 matrix, so normalise the array once here
# (this is a no-op when the encoder already returns that layout).
chunk_embeddings = np.ascontiguousarray(embedding_model.encode(chunks), dtype=np.float32)

# Build FAISS index
dimension = chunk_embeddings.shape[1]  # Embedding size
index = faiss.IndexFlatL2(dimension)  # L2 (Euclidean distance) index
index.add(chunk_embeddings)  # Row i of the index corresponds to chunks[i]

### Hugging Face Login

In [None]:
# Install the ipywidgets package, which notebook_login() needs in order to
# display its login widget in the notebook
%pip install ipywidgets

In [6]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget in the notebook.
# NOTE(review): presumably required because the model loaded further down is
# access-restricted on the Hub - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Retrieve the Model and Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hugging Face Hub identifier of the causal language model used to answer queries
model_name = "meta-llama/Llama-3.2-3B"  # Replace with your model name
# Load the matching tokenizer, then the model weights in float16 with device_map="auto"
llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
llama_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

# generate_answer() tokenizes with padding=True, so make sure a padding token is defined
if llama_tokenizer.pad_token_id is None:
    llama_tokenizer.pad_token_id = llama_tokenizer.eos_token_id  # Use EOS token as padding token

### Helper Functions

In [62]:
# Get the top_k (default 3) most relevant chunks of data from the index
def retrieve_relevant_chunks(query, index, chunks, embedding_model, top_k=3):
    """Return the chunks whose embeddings are nearest to the query embedding.

    Args:
        query: The query string to embed and search for.
        index: A FAISS-style index exposing ``search(vectors, k)``. Position
            ``i`` in the index must correspond to ``chunks[i]``.
        chunks: The chunk strings that were added to ``index``, in order.
        embedding_model: A model exposing ``encode(list_of_str)`` and
            returning one embedding row per input string.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order returned by the index search.
        Fewer are returned when the index holds fewer than ``top_k`` vectors;
        an empty list is returned when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    # FAISS expects float32 input; asarray avoids a copy when it already is.
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, top_k)

    # FAISS pads missing results with -1 when the index holds fewer than top_k
    # vectors. Without this filter chunks[-1] would silently return the last
    # chunk for every padded slot.
    return [chunks[idx] for idx in indices[0] if idx >= 0]

def combine_chunks(chunks):
    """Merge the given chunk strings into one string, separated by newlines."""
    newline = "\n"
    combined = newline.join(chunks)
    return combined

def generate_answer(context, query):
    """Generate an answer to ``query`` from ``context`` using the loaded Llama model.

    Relies on the module-level ``llama_tokenizer`` and ``llama_model``.

    Args:
        context: Retrieved text to place before the question in the prompt.
        query: The user's question.

    Returns:
        The newly generated text only (the prompt is not echoed back), with
        special tokens removed and surrounding whitespace stripped.
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    inputs = llama_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048, padding=True)

    # Move *all* input tensors to wherever the model lives. Hard-coding 'cuda'
    # for input_ids only left attention_mask on the CPU and broke on machines
    # without a GPU.
    inputs = inputs.to(llama_model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = llama_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=256,
        # temperature only takes effect when sampling is enabled
        do_sample=True,
        temperature=0.5,
        num_return_sequences=1,
        # Pass the padding token explicitly rather than letting generate() guess
        pad_token_id=llama_tokenizer.pad_token_id,
    )

    # For a causal LM the returned sequence starts with the prompt tokens;
    # slice them off so only the answer is decoded.
    answer = llama_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()

### Generate Answers from Queries

In [None]:
# Interactive question-answering loop. It ends on an empty query, on
# "exit"/"quit", or when input is interrupted (Ctrl-C / kernel interrupt / EOF).
print('Type "exit" or "quit" (or submit an empty query) to stop.')
while True:
    try:
        query = input("Enter your query: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Leave the loop quietly instead of surfacing a traceback
        break

    if not query or query.lower() in {"exit", "quit"}:
        break

    # Retrieve relevant chunks and combine them into one text
    retrieved_chunks = retrieve_relevant_chunks(query, index, chunks, embedding_model)
    context = combine_chunks(retrieved_chunks)

    # Generate the answer with the LLM
    answer = generate_answer(context, query)
    print(answer)