In [1]:
import os
from llama_index.core import SimpleDirectoryReader, Settings, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Directory containing your files

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")  # Adjust device as needed



files_directory = "./data/"  # Replace with your directory path

# Get all files in the directory
file_paths = []
for root, _, files in os.walk(files_directory):
    for file in files:
        # You can add file extension filters here if needed
        # if file.endswith('.txt') or file.endswith('.pdf'):
        file_paths.append(os.path.join(root, file))

print(f"Found {len(file_paths)} files to process")



# Initialize an empty list to collect all nodes
all_nodes = []

# Configure global settings for the RAG pipeline
Settings.embed_model = embed_model  # Using your existing embed_model

# Configure document splitters
splitter = SemanticSplitterNodeParser(
    buffer_size=20, 
    breakpoint_percentile_threshold=95, 
    embed_model=embed_model
)
base_splitter = SentenceSplitter(chunk_size=512)

# Process each file individually
for i, file_path in enumerate(file_paths):
    try:
        print(f"Processing file {i+1}/{len(file_paths)}: {file_path}")
        
        # Load a single document
        documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
        
        # Process document into nodes
        nodes = splitter.get_nodes_from_documents(documents)
        
        print(f"  - Extracted {len(nodes)} nodes from file")
        if nodes:
            print(f"  - Sample content: {nodes[0].get_content()[:100]}...")
            
        # Add nodes from this file to the collection
        all_nodes.extend(nodes)
        
    except Exception as e:
        print(f"  - Error processing file {file_path}: {str(e)}")

print(f"\nTotal nodes collected from all files: {len(all_nodes)}")

# Create a vector index from all collected nodes
index = VectorStoreIndex(all_nodes)

# Configure a retriever with customized search parameters
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=20,  # Number of most relevant chunks to retrieve
)

# Test the vector database with a sample query
test_query = "your test query here"  # Replace with your actual query
print(f"\nTesting retrieval with query: '{test_query}'")
retrieval_results = retriever.retrieve(test_query)
print(f"Retrieved {len(retrieval_results)} chunks")

if retrieval_results:
    print(f"Top result score: {retrieval_results[0].score}")
    print(f"Top result content: {retrieval_results[0].node.get_content()[:150]}...")

# Persist the vector database
storage_path = "./vector_db_storage_only_5"
print(f"\nSaving vector database to {storage_path}")
index.storage_context.persist(storage_path)
print("Vector database saved successfully")

Found 5 files to process
Processing file 1/5: ./data/FINANC_1_Istisna’a and Parallel Istisna’a (10).PDF
  - Extracted 84 nodes from file
  - Sample content: Financial Accounting Standard No. (10)Financial Accounting Standard No. (10)
Istisna’a and Parallel ...
Processing file 2/5: ./data/Ijarah (32).pdf
  - Extracted 59 nodes from file
  - Sample content:  
 
 
 
 
AAOIFI Financial Accounting Standard 32 
Ijarah 
  ...
Processing file 3/5: ./data/FI922A_1_Murabaha and Other Deferred Payment Sales (28).PDF
  - Extracted 29 nodes from file
  - Sample content:  
 
 
 
 
AAOIFI Financial Accounting Standard 28 
Murabaha and Other Deferred Payment Sales 
  ...
Processing file 4/5: ./data/FI5F55_1_Musharaka Financing(4).PDF
  - Extracted 66 nodes from file
  - Sample content: Financial Accounting Standard No. (4)Financial Accounting Standard No. (4)
Musharaka FinancingMushar...
Processing file 5/5: ./data/FI28ED_1_Salam and Parallel Salam (07).PDF
  - Extracted 50 nodes from file
  - Sample 