In [1]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.pdf import PyMuPDFLoader
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import TokenTextSplitter
from operator import itemgetter
import os

## How to load these indexes into code 

In [2]:
# Reference snippet, kept commented out so running this cell loads nothing:
# FAISS.load_local takes the folder an index was saved to plus an embeddings object.
# NOTE(review): presumably the embedding model passed here has to be the same one the
# index was built with — confirm before reusing this for the nomic indexes.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based loading;
# only point it at index folders you created yourself.
#db = FAISS.load_local("../VectorStores/NameOfFolder", OllamaEmbeddings(model =  "mxbai-embed-large"), allow_dangerous_deserialization= True)

## Pre-requisites

In [3]:
# Ollama model tags for the two embedding back-ends this notebook builds indexes with.
Embedding_Model1, Embedding_Model2 = (
    "mxbai-embed-large",  # indexes saved under ../VectorStores/mxbai
    "nomic-embed-text",   # indexes saved under ../VectorStores/nomic
)

## mxbai-embed-large model (context length is only about 500 tokens, which limits how large each chunk can be)

### HowWebWorks PDF

In [4]:
# Build the mxbai FAISS index from the PDFs in ../Data/testDataPDF.
# NOTE(review): CharacterTextSplitter only cuts at its separator, so pages that
# lack that separator may come out longer than chunk_size — confirm the chunks
# actually fit the embedding model's context.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=400)
loader = DirectoryLoader(
    path="../Data/testDataPDF",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
)
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model1),
)

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/mxbai", exist_ok=True)
db.save_local(folder_path="../VectorStores/mxbai/webPDF")

### HowWebWorks MD

In [5]:
# Build the mxbai FAISS index from the *.txt files in ../Data/testDataMD.
loader = DirectoryLoader(path="../Data/testDataMD", glob="*.txt")
documents = loader.load()

# First pass: cut at "##" so that chunks follow the section headings.
# CharacterTextSplitter only ever cuts at the separator, so a section longer than
# chunk_size is kept whole and merely reported ("Created a chunk of size N, ...").
text_splitter = CharacterTextSplitter(separator="##", chunk_size=400, chunk_overlap=0)
section_docs = text_splitter.split_documents(documents)

# Second pass: hard-cap every section at 400 tokens so that no chunk is longer than
# what this embedding model can take in. Sections already under the cap pass through
# as a single chunk.
# NOTE(review): TokenTextSplitter counts tokens with its own tokenizer, not the
# embedding model's, so 400 is meant to leave headroom — confirm against the
# model's real limit.
docs = TokenTextSplitter(chunk_size=400, chunk_overlap=0).split_documents(section_docs)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(docs, OllamaEmbeddings(model=Embedding_Model1))

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/mxbai", exist_ok=True)
db.save_local("../VectorStores/mxbai/webMD")

Created a chunk of size 1161, which is longer than the specified 1000
Created a chunk of size 2033, which is longer than the specified 1000
Created a chunk of size 2603, which is longer than the specified 1000
Created a chunk of size 1782, which is longer than the specified 1000
Created a chunk of size 5099, which is longer than the specified 1000
Created a chunk of size 2110, which is longer than the specified 1000
Created a chunk of size 1398, which is longer than the specified 1000
Created a chunk of size 3672, which is longer than the specified 1000
Created a chunk of size 1463, which is longer than the specified 1000
Created a chunk of size 1654, which is longer than the specified 1000
Created a chunk of size 2628, which is longer than the specified 1000
Created a chunk of size 1706, which is longer than the specified 1000
Created a chunk of size 1773, which is longer than the specified 1000
Created a chunk of size 2232, which is longer than the specified 1000
Created a chunk of s

### Single Printer Book

In [6]:
# Build the mxbai FAISS index from the PDFs in ../Data/onePrinterBook.
# NOTE(review): CharacterTextSplitter only cuts at its separator, so pages that
# lack that separator may come out longer than chunk_size — confirm the chunks
# actually fit the embedding model's context.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=400)
loader = DirectoryLoader(
    path="../Data/onePrinterBook",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
)
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model1),
)

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/mxbai", exist_ok=True)
db.save_local(folder_path="../VectorStores/mxbai/SinglePrinter")

### All Printer Books

In [7]:
# Build the mxbai FAISS index from every PDF under ../Data/multiplePrinterBooks
# that the "**/*.pdf" pattern matches.
loader = DirectoryLoader(
    "../Data/multiplePrinterBooks",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
)
# Unlike the other mxbai cells, this one uses TokenTextSplitter rather than
# CharacterTextSplitter.
text_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=400)
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model1),
)

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/mxbai", exist_ok=True)
db.save_local(folder_path="../VectorStores/mxbai/AllPrinters")

## nomic-embed-text model used for embedding

### HowWebWorks PDF

In [4]:
# Build the nomic FAISS index from the PDFs in ../Data/testDataPDF, using
# chunk_size=1000 with an overlap of 200 between neighbouring chunks.
text_splitter = CharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
loader = DirectoryLoader(
    path="../Data/testDataPDF",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
)
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model2),
)

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/nomic", exist_ok=True)
db.save_local(folder_path="../VectorStores/nomic/webPDF")

### HowWebWorks MD

In [5]:
# Build the nomic FAISS index from the *.txt files in ../Data/testDataMD.
# The text is cut at "##" only, so a section longer than chunk_size is kept whole;
# the splitter reports each such case ("Created a chunk of size N, which is longer
# than the specified 1000").
text_splitter = CharacterTextSplitter(
    separator="##",
    chunk_overlap=0,
    chunk_size=1000,
)
loader = DirectoryLoader("../Data/testDataMD", glob="*.txt")
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model2),
)

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/nomic", exist_ok=True)
db.save_local(folder_path="../VectorStores/nomic/webMD")

Created a chunk of size 1161, which is longer than the specified 1000
Created a chunk of size 2033, which is longer than the specified 1000
Created a chunk of size 2603, which is longer than the specified 1000
Created a chunk of size 1782, which is longer than the specified 1000
Created a chunk of size 5099, which is longer than the specified 1000
Created a chunk of size 2110, which is longer than the specified 1000
Created a chunk of size 1398, which is longer than the specified 1000
Created a chunk of size 3672, which is longer than the specified 1000
Created a chunk of size 1463, which is longer than the specified 1000
Created a chunk of size 1654, which is longer than the specified 1000
Created a chunk of size 2628, which is longer than the specified 1000
Created a chunk of size 1706, which is longer than the specified 1000
Created a chunk of size 1773, which is longer than the specified 1000
Created a chunk of size 2232, which is longer than the specified 1000
Created a chunk of s

### Single Printer Book

In [6]:
# Build the nomic FAISS index from the PDFs in ../Data/onePrinterBook, using
# chunk_size=1000 with an overlap of 200 between neighbouring chunks.
text_splitter = CharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
loader = DirectoryLoader(
    path="../Data/onePrinterBook",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
)
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model2),
)

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/nomic", exist_ok=True)
db.save_local(folder_path="../VectorStores/nomic/SinglePrinter")

### All Printer Books

In [4]:
# Build the nomic FAISS index from every PDF under ../Data/multiplePrinterBooks
# that the "**/*.pdf" pattern matches.
loader = DirectoryLoader(
    "../Data/multiplePrinterBooks",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
)
# Unlike the other nomic cells, this one uses TokenTextSplitter rather than
# CharacterTextSplitter, with chunk_size=1000 and an overlap of 300.
text_splitter = TokenTextSplitter(chunk_overlap=300, chunk_size=1000)
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Embed every chunk through Ollama and collect the vectors in a FAISS store.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model2),
)

# Persist the index; the parent folder is created first when it is missing.
os.makedirs("../VectorStores/nomic", exist_ok=True)
db.save_local(folder_path="../VectorStores/nomic/AllPrinters")