# Notebook to generate FAISS index files

#### Import libraries

In [None]:
import time
from pathlib import Path
from typing import List, Tuple

from langchain import PromptTemplate, LLMChain

from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter,MarkdownHeaderTextSplitter,Language
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.document_loaders.parsers.pdf import PDFPlumberParser 

from langchain_community.document_loaders import TextLoader, UnstructuredPDFLoader,PyPDFLoader,DirectoryLoader,UnstructuredHTMLLoader,BSHTMLLoader,DataFrameLoader,UnstructuredExcelLoader
from langchain_community.document_loaders.pdf import PDFPlumberLoader
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader,CSVLoader
from langchain_community.document_loaders import MHTMLLoader
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

from langchain_community.document_loaders import ConfluenceLoader,UnstructuredXMLLoader

from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

import pickle
import os

In [None]:
# Set variables.
# data_dir:  directory that load_documents() lists to find Markdown (.md) files.
# index_dir: directory the FAISS index and retriever.pkl are written to, and
#            from which the index is reloaded for the test query.
data_dir = "./data"
index_dir = "./data_index"

In [None]:
def save_object(obj, filename):
    """Pickle ``obj`` into ``filename`` using the highest pickle protocol.

    The target is opened in binary write mode, so any existing file at that
    path is replaced.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

In [None]:
def initialize_embeddings() -> HuggingFaceEmbeddings:
    """Create the embedding model used for both indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` whose ``model_name`` points at the
        ``./models/all-mpnet-base-v2/`` directory, with ``device='cpu'``
        passed as a model kwarg and ``normalize_embeddings=False`` passed
        as an encode kwarg.
    """
    return HuggingFaceEmbeddings(
        model_name="./models/all-mpnet-base-v2/",  # required: keep model at this path
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

# Built once at module level; the index-build and query cells reuse it.
embeddings = initialize_embeddings()

In [None]:
## Load the Markdown (.md) files found in data_dir and split them on their headers
def load_documents() -> List:
    """Load every Markdown file in ``data_dir`` and split it on its headers.

    Only regular files whose name ends with ``.md`` (case-sensitive) are
    read; sub-directories are not searched. Each file is decoded as UTF-8
    and split on ``#``, ``##`` and ``###`` headers with
    ``MarkdownHeaderTextSplitter``; ``strip_headers=False`` is passed so the
    header lines stay in the text.

    Files are processed in sorted name order so repeated runs over the same
    data yield the documents in the same order (``os.listdir`` on its own
    returns entries in arbitrary order).

    Returns:
        A list of the split documents. Each one has ``metadata['source']``
        set to the name of the file it came from.
    """
    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    md_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )

    docs = []
    for name in sorted(os.listdir(data_dir)):
        md_path = Path(data_dir) / name
        # Skip non-Markdown entries, and directories that merely end in ".md".
        if not name.endswith('.md') or not md_path.is_file():
            continue
        text = md_path.read_text(encoding="utf-8")
        for section in md_splitter.split_text(text):
            section.metadata['source'] = name
            docs.append(section)
    return docs

In [None]:
## split the data into smaller chunks
def split_chunks(sources: List, child_splitter) -> List:
    """Split ``sources`` with ``child_splitter`` and return the pieces as a list.

    Args:
        sources: Documents to split.
        child_splitter: Object exposing ``split_documents(sources)``.

    Returns:
        A new list holding every item produced by
        ``child_splitter.split_documents(sources)``, in the same order.
    """
    return list(child_splitter.split_documents(sources))

## generate index
def generate_index(chunks: List, embeddings: HuggingFaceEmbeddings) -> FAISS:
    """Build a FAISS vector store from ``chunks``.

    Args:
        chunks: Documents exposing ``page_content`` and ``metadata``.
        embeddings: Embedding model handed to ``FAISS.from_texts``.

    Returns:
        The store returned by ``FAISS.from_texts``, given each chunk's text
        and, in the same order, its metadata.
    """
    texts = []
    metadatas = []
    for chunk in chunks:
        texts.append(chunk.page_content)
        metadatas.append(chunk.metadata)
    return FAISS.from_texts(texts, embeddings, metadatas=metadatas)

In [None]:
# Build the FAISS index and the parent-document store, then persist both
# under index_dir.
print("INFO: Generating Index...\n")
# perf_counter is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during the run.
start = time.perf_counter()

sources = load_documents()

# Child splitter: 256-character chunks, no overlap, used for the embedded text.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0, separators=[" ", ",", "\n"])
# Parent splitter: 2048-character chunks with 256 characters of overlap.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=256)

chunks = split_chunks(sources, child_splitter)

vectorstore = generate_index(chunks, embeddings)
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)
# NOTE(review): generate_index() has already embedded child chunks of
# `sources`; add_documents() presumably embeds child chunks again through the
# retriever's own splitters. Confirm the resulting duplicate vectors are
# intended.
retriever.add_documents(sources)

vectorstore.save_local(index_dir)
# The file is named retriever.pkl but holds the pickled docstore (`store`),
# not the retriever object. os.path.join keeps the path correct when
# index_dir is absolute (prefixing './' would turn it into a relative path).
save_object(store, os.path.join(index_dir, 'retriever.pkl'))

end = time.perf_counter()
# Covers loading, splitting, embedding and saving, rounded to whole seconds.
emb_time = round((end - start), 0)
print("INFO: Index generated.\n")
print("\nEmbedding time : " +str(emb_time) +" sec")

In [None]:
# Smoke test: reload the saved index from index_dir and run one query.
query = '<query-here>'
print(f"QUERY - {query}")

# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized data from index_dir; only point this at an index you trust,
# such as the one written by the cell above.
new_db = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)

docs = new_db.similarity_search_with_score(query)
# Bare expression so the notebook cell displays the results.
docs