In [None]:
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import faiss
import pickle

def load_faiss_index(path):
    """
    Read a serialized FAISS index from disk.

    Args:
        path: Location of the index file, passed straight to ``faiss.read_index``.

    Returns:
        Whatever ``faiss.read_index`` returns for that file.
    """
    loaded_index = faiss.read_index(path)
    return loaded_index

def load_text_chunks(path):
    """
    Unpickle and return the object stored at ``path``.

    Args:
        path: Location of the pickle file; opened in binary read mode.

    Returns:
        The unpickled object.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading,
    so this should only be pointed at files produced by a trusted pipeline.
    """
    with open(path, mode='rb') as chunk_file:
        chunks = pickle.load(chunk_file)
    return chunks

# Load the FAISS index and the pickled text chunks from disk.
# NOTE(review): retrieval below uses FAISS result ids as positions in
# text_chunks, so both files presumably come from the same build — confirm.
faiss_index = load_faiss_index('model_embeddings/faiss_index.index')
text_chunks = load_text_chunks('model_embeddings/text_chunks.pkl')
# Sentence-embedding model handed to search_faiss to encode the query.
# NOTE(review): presumably the same model that produced the indexed vectors — confirm.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def format_output(text, max_new_tokens=200):
    """
    Use GPT-2 model to generate formatted output.

    The prompt is truncated so that prompt plus generated tokens fit inside
    the model's context window. Generation length is bounded with
    ``max_new_tokens`` rather than ``max_length``, because ``max_length``
    counts the prompt tokens too and a long prompt would leave no room to
    generate anything.

    Args:
        text: Prompt string that GPT-2 continues.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The decoded first returned sequence with special tokens removed.
    """
    model_name = "gpt2"  # You can use other variants like "gpt2-medium", "gpt2-large", etc.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Named `lm` so the module-level embedding `model` is not shadowed.
    lm = GPT2LMHeadModel.from_pretrained(model_name)

    # Reserve room for the continuation inside the model's position limit.
    prompt_budget = max(1, lm.config.n_positions - max_new_tokens)
    inputs = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=prompt_budget,
    )
    outputs = lm.generate(
        inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        # GPT-2 defines no pad token; reuse EOS so generate() has one.
        pad_token_id=tokenizer.eos_token_id,
    )

    formatted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return formatted_text

def search_faiss(query, index, model, k=7):
    """
    Encode ``query`` and look up its ``k`` nearest entries in ``index``.

    Args:
        query: Query string; encoded as a single-item batch via ``model.encode``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, ids)``.
        model: Object exposing ``encode(list_of_str)``.
        k: Number of neighbours to request.

    Returns:
        The ids object returned by ``index.search`` (the full batch result,
        not just the first row).
    """
    # The query matrix is handed to the index as float32.
    query_vectors = np.array(model.encode([query]), dtype=np.float32)
    _distances, neighbor_ids = index.search(query_vectors, k)
    return neighbor_ids

def retrieve_and_format_results(query):
    """
    Retrieve and format results using GPT-2.

    Looks up the nearest chunks for ``query`` in the module-level
    ``faiss_index``, joins the matching entries of ``text_chunks`` with
    spaces, and passes the joined text to ``format_output``.

    Args:
        query: Natural-language query string.

    Returns:
        The string returned by ``format_output``, or
        ``"No relevant information found."`` when no usable chunk id is
        returned by the search.
    """
    # Retrieve indices from FAISS
    indices = search_faiss(query, index=faiss_index, model=model)

    # FAISS fills missing neighbours with -1. Used as a Python index, -1
    # would silently select the last chunk, so keep only in-range ids.
    valid_indices = [int(idx) for idx in indices[0] if 0 <= idx < len(text_chunks)]
    if not valid_indices:
        return "No relevant information found."

    # Retrieve text chunks corresponding to the indices
    results = " ".join(text_chunks[idx] for idx in valid_indices)

    # Format results using GPT-2
    formatted_results = format_output(results)
    return formatted_results

# Example usage
if __name__ == '__main__':
    query = "Describe the anatomy of the human heart."
    formatted_results = retrieve_and_format_results(query)
    # Single print call: banner line first, then the generated text.
    print("GPT :()", formatted_results, sep="\n")


In [9]:
import os
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_community.llms import Ollama

# LLM configuration: Ollama model name and the client object that
# format_output calls to generate answers.
MODEL = "mistral"  # Use the Mistral model
model = Ollama(model=MODEL)

# Paths
text_chunks_folder = 'output/chunks/'  # folder read by load_text_chunks_from_folder (one chunk per file)
faiss_index_path = 'model_embeddings/faiss_index.index'  # NOTE(review): not referenced by the code visible in this cell
vector_db_path = 'model_embeddings/embeddings.pkl'  # pickle cache holding (text_chunks, embeddings)

def load_text_chunks_from_folder(folder_path):
    """
    Read every regular file directly inside ``folder_path`` as one text chunk.

    Files are visited in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, so sorting keeps chunk positions (and
    therefore the ids used to look chunks up later) stable across runs and
    machines.

    Args:
        folder_path: Directory containing the chunk files. Sub-directories
            are skipped; files are decoded as UTF-8.

    Returns:
        A list of file contents as strings, one per file; empty when the
        folder contains no regular files.
    """
    text_chunks = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text_chunks.append(file.read())
    return text_chunks

def embed_text_chunks(text_chunks, model):
    """
    Encode all chunks in one call.

    Args:
        text_chunks: Collection of strings handed to ``model.encode`` as-is.
        model: Object exposing an ``encode`` method.

    Returns:
        Whatever ``model.encode(text_chunks)`` returns.
    """
    chunk_embeddings = model.encode(text_chunks)
    return chunk_embeddings

def save_embeddings(embeddings, text_chunks, path):
    """
    Pickle the chunks and their embeddings together into one file.

    The stored tuple is ``(text_chunks, embeddings)`` — chunks first, which
    is the reverse of this function's parameter order.

    Args:
        embeddings: Embedding data to store.
        text_chunks: Chunks the embeddings belong to.
        path: Destination file; opened in binary write mode (overwritten).
    """
    payload = (text_chunks, embeddings)
    with open(path, mode='wb') as out_file:
        pickle.dump(payload, out_file)

def load_embeddings(path):
    """
    Unpickle and return the object stored at ``path``.

    Args:
        path: Pickle file to read; opened in binary read mode.

    Returns:
        The unpickled object.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only use this on cache files created by a trusted process.
    """
    with open(path, mode='rb') as cache_file:
        cached = pickle.load(cache_file)
    return cached

def build_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index over the given embeddings.

    The input is first coerced to a C-ordered float32 matrix, the layout
    FAISS works with, so nested lists and float64 arrays are accepted too.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example,
            when it is empty), since the vector dimension cannot be derived.
    """
    vectors = np.array(embeddings, dtype=np.float32, order='C')
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D (n_vectors, dim) array, got shape {vectors.shape}"
        )

    dim = vectors.shape[1]  # Dimension of embeddings
    index = faiss.IndexFlatL2(dim)  # L2 distance index
    index.add(vectors)  # Add embeddings to index
    return index

from transformers import AutoTokenizer, AutoModelForCausalLM

def format_output(context, question):
    """
    Use Mistral model to generate formatted output.

    Fills a fixed prompt template with ``context`` and ``question`` and
    sends it to the module-level Ollama client ``model``.

    Args:
        context: Retrieved text the answer must be grounded in.
        question: The user's question.

    Returns:
        The text returned by the model for the filled-in prompt.
    """
    # Define the template
    template = """
    Answer the question based on the context below. If you can't 
    answer the question, reply "I don't know".
    Only give me the answers based on the context below.
    Only answer the question asked. Do not provide additional information.
    Give a clear and concise answer.
    


    Context: {context}

    Question: {question}
    """

    # Format the template with context and question
    prompt_text = template.format(context=context, question=question)

    # Generate a response using the Mistral model. `invoke` replaces the
    # deprecated direct-call form `model(prompt_text)`.
    response = model.invoke(prompt_text)

    # Return the formatted output
    return response

# Example usage: smoke-test the prompt template with placeholder inputs.
formatted_results = format_output(
    context="Here is some context",
    question="Here is a question",
)
print(formatted_results)




def search_faiss(query, index, model, k=5):
    """
    Encode ``query`` and return the ids of its ``k`` nearest index entries.

    The encoded query is cast to float32 before searching, so the lookup
    does not depend on the dtype the encoder happens to return.

    Args:
        query: Query string; encoded as a single-item batch via ``model.encode``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, ids)``.
        model: Object exposing ``encode(list_of_str)``.
        k: Number of neighbours to request.

    Returns:
        The first row of the ids returned by ``index.search``, i.e. the ids
        for this single query.
    """
    query_embedding = np.array(model.encode([query]), dtype=np.float32)
    _distances, neighbor_ids = index.search(query_embedding, k)  # Search for top-k similar embeddings
    return neighbor_ids[0]  # Returns the indices of the most similar chunks

def retrieve_and_format_results(query, index, text_chunks, model):
    """
    Retrieve the chunks closest to ``query`` and ask the LLM to answer from them.

    Args:
        query: The user's question; used for retrieval and as the question
            passed to ``format_output``.
        index: Search index handed to ``search_faiss``.
        text_chunks: Sequence of chunk strings, addressed by the ids the
            search returns.
        model: Embedding model handed to ``search_faiss``.

    Returns:
        The string returned by ``format_output``, or
        ``"No relevant information found."`` when the search yields no id
        that addresses an existing chunk.
    """
    indices = search_faiss(query, index, model)

    # Keep only ids that address an existing chunk. FAISS pads missing
    # neighbours with -1, so the raw result is never empty and the check
    # has to be made on the filtered list.
    valid_indices = [int(i) for i in indices if 0 <= i < len(text_chunks)]

    # Handle case where no usable indices are returned, so the LLM is not
    # called with an empty context.
    if not valid_indices:
        return "No relevant information found."

    results = " ".join(text_chunks[i] for i in valid_indices)  # Concatenate retrieved chunks

    formatted_results = format_output(results, query)
    return formatted_results

# Initialize models
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load or create embeddings and FAISS index.
# With no cache file yet: read the chunk files, embed them and write the cache.
# Otherwise: reuse the cached (text_chunks, embeddings) pair.
if not os.path.exists(vector_db_path):
    text_chunks = load_text_chunks_from_folder(text_chunks_folder)
    embeddings = embed_text_chunks(text_chunks, embedding_model)
    save_embeddings(embeddings, text_chunks, vector_db_path)
else:
    text_chunks, embeddings = load_embeddings(vector_db_path)

# The index itself is rebuilt in memory from the embeddings on every run.
faiss_index = build_faiss_index(embeddings)

# Example usage


 I don't have the specific question in the provided context. Please provide the complete question for an accurate response.




In [15]:

query = "what are DNA made up of, explain in detail with help of flowchart"
formatted_results = retrieve_and_format_results(
    query,
    faiss_index,
    text_chunks,
    embedding_model,
)
# Single print call: banner, a blank line, then the model's answer.
print("RAG :(\n", formatted_results, sep="\n")


RAG :(

1. DNA (Deoxyribonucleic Acid) is a long, twisted molecule that carries genetic information in the form of a code made up of four different nucleotides: Adenine (A), Thymine (T), Guanine (G), and Cytosine (C).

2. Each DNA strand consists of a sugar-phosphate backbone, where alternating sugar (deoxyribose) and phosphate groups are connected together. Attached to each sugar molecule is one of the four nitrogenous bases: Adenine, Thymine, Guanine, or Cytosine.

3. The two strands of DNA twist around each other to form a double helix structure (Figure 2.29). Each base pair in the backbone attaches via hydrogen bonds between the complementary bases on opposite strands: Adenine pairs with Thymine, and Cytosine pairs with Guanine.

4. The sequence of these nitrogenous bases forms the genes that act as a molecular code instructing cells in the assembly of amino acids into proteins (Figure 3.25).

   Figure: DNA structure flowchart

   - Backbone: Sugar-phosphate group alternating unit