In [3]:
import os
import spacy

# Load SpaCy's large English model for better performance on semantic chunking

import spacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_lg")

# Increase the maximum text length
nlp.max_length = 2000000  # Set this to a larger value as needed

# Define paths
text_directory = 'output/text/'
chunk_output_folder = 'output/chunks/'

# Ensure the output directory exists
os.makedirs(chunk_output_folder, exist_ok=True)

def semantic_chunking(text, min_chunk_size=50, max_chunk_size=200):
    doc = nlp(text)
    
    chunks = []
    chunk = []
    word_count = 0

    for sent in doc.sents:
        chunk.append(sent.text)
        word_count += len(sent)

        # If the current chunk exceeds the maximum chunk size, finalize the chunk
        if word_count >= max_chunk_size:
            chunks.append(" ".join(chunk))
            chunk = []
            word_count = 0
        
        # If the chunk is within the minimum size and the next sentence would push it over the max size, finalize the chunk
        elif word_count >= min_chunk_size and (word_count + len(sent)) > max_chunk_size:
            chunks.append(" ".join(chunk))
            chunk = []
            word_count = 0
    
    # Add any remaining text as the last chunk
    if chunk:
        chunks.append(" ".join(chunk))
    
    return chunks

def process_text_file(text_file_path, min_chunk_size, max_chunk_size):
    with open(text_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    chunks = semantic_chunking(text, min_chunk_size, max_chunk_size)

    base_filename = os.path.splitext(os.path.basename(text_file_path))[0]
    for i, chunk in enumerate(chunks):
        chunk_filename = os.path.join(chunk_output_folder, f"{base_filename}_chunk_{i+1}.txt")
        with open(chunk_filename, 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(chunk)

def chunk_all_text_files(min_chunk_size=50, max_chunk_size=200):
    text_files = [f for f in os.listdir(text_directory) if f.lower().endswith('.txt')]

    for text_file in text_files:
        text_file_path = os.path.join(text_directory, text_file)
        print(f"Processing {text_file_path}...")
        process_text_file(text_file_path, min_chunk_size, max_chunk_size)
    print("Semantic chunking completed.")

if __name__ == '__main__':
    chunk_all_text_files(min_chunk_size=50, max_chunk_size=200)


Processing output/text/anatomy_vol_1.txt...
Processing output/text/anatomy_vol_2.txt...
Processing output/text/anatomy_vol_3.txt...
Semantic chunking completed.


In [4]:
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

def chunk_text_with_textsplit(text, chunk_size=1000):
    """
    Chunk text into smaller pieces.
    """
    # Simple chunking by length
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def generate_embeddings(text_chunks):
    """
    Generate embeddings for a list of text chunks.
    """
    embeddings = model.encode(text_chunks, show_progress_bar=True)
    return embeddings

def create_faiss_index(embeddings):
    """
    Create a FAISS index and add the embeddings.
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # Using L2 distance

    # Add embeddings to the index
    index.add(embeddings)
    return index

def search_faiss_index(index, query_embedding, k=5):
    """
    Search the FAISS index for the top-k most similar vectors.
    """
    distances, indices = index.search(np.array([query_embedding]), k)
    return distances, indices

def process_and_store_embeddings(text_directory):
    text_files = [f for f in os.listdir(text_directory) if f.lower().endswith('.txt')]
    
    all_chunks = []
    all_embeddings = []

    for text_file in text_files:
        text_file_path = os.path.join(text_directory, text_file)
        with open(text_file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        chunks = chunk_text_with_textsplit(text, chunk_size=1000)
        all_chunks.extend(chunks)
    
    # Generate embeddings
    embeddings = generate_embeddings(all_chunks)

    # Create and store embeddings in FAISS
    index = create_faiss_index(embeddings)
    
    # Save the index to disk
    faiss.write_index(index, 'faiss_index.index')

def search(text, index_path='faiss_index.index'):
    # Load the FAISS index
    index = faiss.read_index(index_path)

    # Generate embedding for the query text
    query_embedding = model.encode([text])[0]

    # Perform the search
    distances, indices = search_faiss_index(index, query_embedding, k=5)
    return distances, indices

if __name__ == '__main__':
    # Define your text directory
    text_directory = 'output/chunks/'

    # Process and store embeddings
    process_and_store_embeddings(text_directory)

    # Example search
    query = "context of the book"
    distances, indices = search(query)
    print(f"Distances: {distances}")
    print(f"Indices: {indices}")


  from tqdm.autonotebook import tqdm, trange


Batches:   0%|          | 0/197 [00:00<?, ?it/s]

Distances: [[1.0200987 1.1858795 1.2546614 1.2967848 1.3206853]]
Indices: [[6057 2658 2674 1394 3002]]


In [5]:
def retrieve_documents(text_directory, indices):
    """
    Retrieve the documents corresponding to the indices returned by the FAISS search.
    """
    document_chunks = []
    for index in indices[0]:
        if index >= 0 and index < len(all_chunks):
            document_chunks.append(all_chunks[index])
    
    return document_chunks

if __name__ == '__main__':
    # Define your text directory
    text_directory = 'output/chunks/'

    # Process and store embeddings
    process_and_store_embeddings(text_directory)

    # Example search
    query = "example query text"
    distances, indices = search(query)

    # Retrieve documents based on search results
    retrieved_docs = retrieve_documents(text_directory, indices)
    for doc in retrieved_docs:
        print(doc)


Batches:   0%|          | 0/197 [00:00<?, ?it/s]

NameError: name 'all_chunks' is not defined

In [None]:
from oollama import Mistral

# Update with the correct model path
model_path = r'C:\\Users\SURYA\\.ollama\\models\\manifests\\gistry.ollama.ai\\library\\mistral'

# Initialize Mistral with the local model path
mistral = Mistral(model_path=model_path)

def format_output(text):
    """
    Use the local Mistral model to format the output text.
    """
    formatted_text = mistral.format(text)
    return formatted_text

def search_faiss(query):
    """
    Dummy implementation of FAISS search.
    Replace this with your actual FAISS search code.
    """
    # Example placeholder for FAISS search
    return "Sample results from FAISS search related to: " + query

def retrieve_and_format_results(query):
    """
    Retrieve and format results using Mistral.
    """
    # Retrieve results from FAISS
    results = search_faiss(query)  # or however you retrieve results

    # Format results using the local Mistral model
    formatted_results = format_output(results)
    return formatted_results

# Example usage
if __name__ == '__main__':
    query = "Describe the anatomy of the human heart."
    formatted_results = retrieve_and_format_results(query)
    print(formatted_results)


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the embedding model
model = SentenceTransformer('sentence-transformers/scibert-scivocab-cased')

def get_embeddings(chunks):
    return [model.encode(chunk) for chunk in chunks]

def store_embeddings(embeddings):
    dimension = embeddings[0].shape[0]  # Size of the embedding vector
    index = faiss.IndexFlatL2(dimension)  # Initialize a FAISS index
    index.add(np.array(embeddings))  # Add embeddings to the index
    return index

def save_faiss_index(index, path):
    faiss.write_index(index, path)

def load_faiss_index(path):
    return faiss.read_index(path)

def search_faiss(query, index, model, k=5):
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding), k)  # Search for top-k similar embeddings
    return I  # Returns the indices of the most similar chunks

# Example chunks of text
chunks = ["Chunk 1 text here", "Chunk 2 text here", "Chunk 3 text here"]

# Embed and store embeddings
embeddings = get_embeddings(chunks)
faiss_index = store_embeddings(embeddings)

# Save and load FAISS index
save_faiss_index(faiss_index, 'path/to/faiss_index.index')
faiss_index = load_faiss_index('path/to/faiss_index.index')

# Perform a search
query = "Describe the anatomy of the human heart."
indices = search_faiss(query, faiss_index, model, k=5)

# Retrieve and print results
retrieved_chunks = [chunks[i] for i in indices[0]]
print("Retrieved Chunks:")
for chunk in retrieved_chunks:
    print(chunk)
