In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the sentence-embedding model once at module level. get_embeddings() reads
# it through the global name `model`, and the search cells pass it explicitly.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_embeddings(chunks):
    """Embed every text chunk with the module-level sentence encoder ``model``.

    All chunks are handed to ``model.encode`` in a single call so the library
    can batch them, instead of running one encode call per chunk.

    Args:
        chunks: Iterable of strings to embed.

    Returns:
        list: One 1-D embedding vector per chunk, in input order. An empty
        list when ``chunks`` is empty.
    """
    texts = list(chunks)  # materialise so generators work and emptiness can be checked
    if not texts:
        return []
    # encode() on a list yields one row per input; list() splits the rows so
    # callers still receive a list of per-chunk vectors, as before.
    return list(model.encode(texts))

def store_embeddings(embeddings):
    """Build a flat L2 FAISS index from a collection of embedding vectors.

    Args:
        embeddings: Sequence of equal-length 1-D vectors, or a 2-D array with
            one embedding per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing every vector, in input order.

    Raises:
        ValueError: If ``embeddings`` is empty or does not form a 2-D
            (count, dimension) matrix.
    """
    # FAISS only accepts C-contiguous float32 matrices; coerce up front so
    # float64 arrays or plain Python lists do not fail inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty collection of equal-length vectors"
        )
    dimension = vectors.shape[1]  # Size of each embedding vector
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

def save_faiss_index(index, path):
    """Write a FAISS index to ``path``, creating the parent directory if needed.

    Args:
        index: FAISS index object to serialise.
        path: Destination file path (``str`` or ``os.PathLike``).
    """
    import os  # local import so the helper does not depend on cell execution order

    path = os.fspath(path)  # hand write_index a plain string, not a Path object
    parent = os.path.dirname(path)
    if parent:
        # The original assumed the directory already existed.
        os.makedirs(parent, exist_ok=True)
    faiss.write_index(index, path)

def load_faiss_index(path):
    """Read a serialised FAISS index from the file at ``path`` and return it."""
    loaded_index = faiss.read_index(path)
    return loaded_index

def search_faiss(query, index, model, k=5):
    """Return the row ids of the stored vectors closest to ``query``.

    Args:
        query: Query text to embed.
        index: FAISS index to search.
        model: Encoder exposing ``encode(list_of_str)``.
        k: Maximum number of neighbours to return.

    Returns:
        Integer array of shape ``(1, m)`` with ``m = min(k, index.ntotal)``,
        holding the ids of the most similar vectors (closest first).
    """
    # FAISS pads the result with -1 when fewer than k vectors are available,
    # and callers use these ids as list positions (where -1 means "last item").
    # Capping k at the index size removes that padding for flat indexes.
    k = min(k, index.ntotal)
    if k <= 0:
        return np.empty((1, 0), dtype=np.int64)
    # index.search() requires a C-contiguous float32 matrix.
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
    _, neighbour_ids = index.search(query_embedding, k)
    return neighbour_ids

# Load the text chunks from files on disk
import os

def load_chunks_from_folder(folder_path):
    """Read every regular file directly inside ``folder_path`` as UTF-8 text.

    Args:
        folder_path: Directory holding one text chunk per file.

    Returns:
        list[str]: File contents ordered by sorted file name. Sub-directories
        are skipped.
    """
    chunks = []
    # Plain sorted() order is kept on purpose: the notebook uses positions in
    # this list as FAISS row ids, so changing the order would invalidate any
    # index that was already saved.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # e.g. a .ipynb_checkpoints directory; open() on it would raise.
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            chunks.append(file.read())
    return chunks

# Directory containing the text files
folder_path = 'output/chunks/'
chunks = load_chunks_from_folder(folder_path)
if not chunks:
    # Fail here with a clear message instead of deep inside the index build.
    raise ValueError(f"No chunk files found in {folder_path!r}")

# Embed and store embeddings
embeddings = get_embeddings(chunks)
faiss_index = store_embeddings(embeddings)

# Save and load FAISS index.
# The path is named once so the save and the reload cannot drift apart, and the
# output directory is created first because write_index does not create it.
index_path = 'model_embeddings/faiss_index.index'
os.makedirs(os.path.dirname(index_path), exist_ok=True)
save_faiss_index(faiss_index, index_path)
faiss_index = load_faiss_index(index_path)




RuntimeError: Error in __cdecl faiss::FileIOWriter::FileIOWriter(const char *) at D:\a\faiss-wheels\faiss-wheels\faiss\faiss\impl\io.cpp:98: Error: 'f' failed: could not open model_embeddings/faiss_index.index for writing: Permission denied

In [11]:
# Re-run of the save/load round trip: persist the in-memory index, then rebind
# `faiss_index` to the copy read back from disk. The directory is created first
# because write_index does not create missing parent directories.
index_path = 'model_embeddings/faiss_index.index'
os.makedirs(os.path.dirname(index_path), exist_ok=True)
save_faiss_index(faiss_index, index_path)
faiss_index = load_faiss_index(index_path)


Directory exists: model_embeddings
Saving FAISS index to: model_embeddings/faiss_index.index
FAISS index saved successfully.


In [12]:

# Perform a search
query = "Describe the anatomy of the human heart."
indices = search_faiss(query, faiss_index, model, k=5)

# Retrieve and print results.
# FAISS reports missing neighbours as -1, and chunks[-1] would silently return
# the last chunk instead of failing. Keep only ids that are real positions in
# `chunks`.
retrieved_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
print("Retrieved Chunks:")
for chunk in retrieved_chunks:
    print(chunk)


Retrieved Chunks:
As you read this chapter, try to keep these twin concepts in mind: pump and
muscle.

 Although the term “heart” is an English word, cardiac (heart-related) terminology can be traced back to the Latin term,
“kardia.” Cardiology is the study of the heart, and cardiologists are the physicians who deal primarily with the heart.

 19.1 | Heart Anatomy

By the end of this section, you will be able to:
¢ Describe the location and position of the heart within the body cavity
¢ Describe the internal and external anatomy of the heart

dentify the tissue layers of the heart

¢ Relate the structure of the heart to its function as a pump

* Compare systemic circulation to pulmonary circulation

¢ Identify the veins and arteries of the coronary circulation system

 ¢ Trace the pathway of oxygenated and deoxygenated blood thorough the chambers of the heart

The vital importance of the heart is obvious.
The pericardial
sac consists of two fused layers: an outer fibrous capsule and an

In [6]:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

import faiss
def load_faiss_index(path):
    """Read a serialised FAISS index from the file at ``path`` and return it."""
    loaded_index = faiss.read_index(path)
    return loaded_index

# Reload the persisted index and re-create the sentence encoder so this cell
# binds both `faiss_index` and `model` for the query helpers below.
# NOTE(review): `SentenceTransformer` is imported only in the notebook's first
# cell — this cell presumably fails on a fresh kernel unless that cell ran first.
faiss_index = load_faiss_index('model_embeddings/faiss_index.index')
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


    
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# do not reload the GPT-2 weights every time.
_GPT2_CACHE = {}


def _load_gpt2(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it once."""
    if model_name not in _GPT2_CACHE:
        _GPT2_CACHE[model_name] = (
            GPT2Tokenizer.from_pretrained(model_name),
            GPT2LMHeadModel.from_pretrained(model_name),
        )
    return _GPT2_CACHE[model_name]


def format_output(text, model_name="gpt2", max_length=500):
    """Use a GPT-2 model to generate formatted output from ``text``.

    Args:
        text: Prompt string fed to the language model.
        model_name: GPT-2 checkpoint to use; other variants such as
            "gpt2-medium" or "gpt2-large" also work.
        max_length: Upper bound on prompt plus generated tokens.

    Returns:
        str: The decoded sequence (prompt followed by the continuation) with
        special tokens removed.
    """
    # Named `generator` so it does not shadow the notebook-level `model`
    # (the sentence encoder).
    tokenizer, generator = _load_gpt2(model_name)

    # Truncate the prompt below max_length so generate() always has room for at
    # least one new token instead of being handed an over-long input.
    prompt_limit = max(1, max_length - 1)
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=prompt_limit
    )
    outputs = generator.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        # GPT-2 defines no pad token; reuse EOS so generate() need not guess.
        pad_token_id=tokenizer.eos_token_id,
    )

    formatted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return formatted_text






def search_faiss(query, index, model, k=7):
    """Return the row ids of the stored vectors closest to ``query``.

    Args:
        query: Query text to embed.
        index: FAISS index to search.
        model: Encoder exposing ``encode(list_of_str)``.
        k: Maximum number of neighbours to return.

    Returns:
        Integer array of shape ``(1, m)`` with ``m = min(k, index.ntotal)``,
        holding the ids of the most similar vectors (closest first).
    """
    # FAISS pads the result with -1 when fewer than k vectors are available,
    # and callers use these ids as list positions (where -1 means "last item").
    # Capping k at the index size removes that padding for flat indexes.
    k = min(k, index.ntotal)
    if k <= 0:
        return np.empty((1, 0), dtype=np.int64)
    # index.search() requires a C-contiguous float32 matrix.
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
    _, neighbour_ids = index.search(query_embedding, k)
    return neighbour_ids


def retrieve_and_format_results(query):
    """Retrieve the chunks most similar to ``query`` and run them through GPT-2.

    Relies on the notebook globals ``faiss_index``, ``model`` (the sentence
    encoder) and ``chunks`` (the texts the index was built from, in the same
    order).

    Args:
        query: Natural-language question to search for.

    Returns:
        str: Output of ``format_output`` for the retrieved chunk text, or an
        empty string when nothing was retrieved.
    """
    # Retrieve neighbour ids from FAISS
    neighbour_ids = search_faiss(query, index=faiss_index, model=model)

    # The search returns row ids, not text: map them back to chunk strings and
    # drop -1 padding or ids outside `chunks` before building the prompt.
    retrieved = [chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)]
    if not retrieved:
        return ""

    # Format the retrieved text using the local GPT-2 model
    formatted_results = format_output("\n\n".join(retrieved))
    return formatted_results

# Example usage: run one demo query end to end and print the formatted text.
# NOTE(review): inside a notebook kernel `__name__` is normally '__main__', so
# this guard presumably does not skip anything when the cell is executed.
if __name__ == '__main__':
    query = "Describe the anatomy of the human heart."
    formatted_results = retrieve_and_format_results(query)
    print(formatted_results)


NameError: name 'faiss' is not defined