In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
def load_docs(docs_path, glob="**/*.html"):
    """Load the files under ``docs_path`` that match ``glob``.

    Args:
        docs_path: Directory passed to ``DirectoryLoader`` as the place to
            load from.
        glob: Pattern forwarded to ``DirectoryLoader`` to select files. The
            default, ``"**/*.html"``, is the pattern this notebook has always
            used, so existing calls behave exactly as before.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns; the rest of the notebook
        iterates over it and reads ``.page_content`` from each item.
    """
    loader = DirectoryLoader(docs_path, glob=glob)
    return loader.load()

# Load the HTML pages found under the relative path 'omniscien.com'.
documents = load_docs('omniscien.com')

In [3]:
def clean_duplicate(documents):
    """Return ``documents`` with repeated page contents removed.

    Two documents count as duplicates when their ``page_content`` values are
    equal. The first occurrence is kept and the original order is preserved.

    Args:
        documents: Iterable of objects exposing a ``page_content`` attribute.
            The values must be hashable (assumed to be strings; confirm if a
            loader ever yields something else).

    Returns:
        A new list holding the first document seen for each distinct
        ``page_content``. The input is not modified.
    """
    # A set gives constant-time membership checks, and a single pass means
    # one-shot iterators are handled correctly as well as lists.
    seen_contents = set()
    documents_clean = []
    for doc in documents:
        content = doc.page_content
        if content in seen_contents:
            continue
        seen_contents.add(content)
        documents_clean.append(doc)
    return documents_clean
# Keep only the first page for each distinct page_content.
documents_clean = clean_duplicate(documents)

In [4]:
def split_docs(documents, chunk_size=1000, chunk_overlap=200):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(documents)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
# Chunk the de-duplicated pages with split_docs' defaults (chunk_size=1000, chunk_overlap=200).
sp_docs = split_docs(documents_clean)

Current Embedding (all-MiniLM-L6-v2)

In [18]:
# Baseline model: embed the chunks with all-MiniLM-L6-v2 (device set to CPU)
# and build a FAISS store from them.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
)
db = FAISS.from_documents(sp_docs, embeddings)

In [19]:
# Ask the all-MiniLM-L6-v2 store (`db`) for the 10 chunks most similar to the question.
query = "Which book are Philipp writing?"
docs = db.similarity_search(query, k = 10)
# Bare expression so the notebook displays the retrieved documents.
docs

[Document(page_content='Dion was a founder of The ActiveX Factory, where he was the recipient of the Chairman’s Commendation Award presented by Microsoft’s Bill Gates for the best showcase of software developed in the Philippines. The US Government has recognized Dion as being in the top 5% of his field worldwide and he is a former holder of a US O1 Extraordinary Ability Visa.\n\nPhilipp Koehn\n\nChief Scientist\n\nBehind many of the tools design is Omniscien’s Chief Scientist, Professor Philipp Koehn who leads our team of researchers and developers. Philipp is a pioneer in the machine translation space, his books on Statistical Machine Translation and Neural Machine Translation are the leading academic textbooks globally on machine translation. Both books are available now from Amazon.com or leading book stores.', metadata={'source': 'omniscien.com/about-us/company/index.html'}),
 Document(page_content='To find a way to address that problem, we flew in Philipp Koehn to introduce him t

# bge-small-en

In [7]:
# Candidate model: BAAI/bge-small-en (device set to CPU).
# NOTE(review): the store is bound to `db_gte` although this is a BGE model;
# the name is kept because the query cell that follows reads `db_gte`.
# NOTE(review): the BGE model card suggests a query instruction prefix for
# retrieval; none is added here -- confirm whether that skews the comparison.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en",
    model_kwargs={'device': 'cpu'},
)
db_gte = FAISS.from_documents(sp_docs, embeddings)

In [8]:
# Same question, answered from whatever store `db_gte` is currently bound to
# (the cell above binds it to the bge-small-en store).
query = "Which book are Philipp writing?"
docs = db_gte.similarity_search(query, k = 10)
# Bare expression so the notebook displays the retrieved documents.
docs

[Document(page_content='To find a way to address that problem, we flew in Philipp Koehn to introduce him to our concepts and ideas. Philipp at the time was a promising researcher in Statistical Machine Translation (SMT) and recently created a large corpus of bilingual content derived from European Parliament documents and had just released the first version of the Moses SMT decoder which in the meantime has become the de-facto platform for SMT. Soon after, Philipp joined our team as Chief Scientist and has been driving research and development efforts ever since. Philipp joining the team gave us the in-depth knowledge of MT that few had at the time and allowed us to stay ahead of the competition from a technical perspective. While Philipp was the father of the Moses decoder, his Master’s thesis in the 1990’s was on Neural Networks before they become practical for machine translation.', metadata={'source': 'omniscien.com/blog/riding-machine-translation-hype-cycle/index.html'}),
 Documen

# gte-base

In [16]:
# Candidate model: thenlper/gte-base (device set to CPU).
# Rebinding `embeddings` and `db_gte` replaces the objects left by earlier cells.
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-base",
    model_kwargs={'device': 'cpu'},
)
db_gte = FAISS.from_documents(sp_docs, embeddings)

In [17]:
# Same question, answered from whatever store `db_gte` is currently bound to
# (the cell above binds it to the gte-base store).
query = "Which book are Philipp writing?"
docs = db_gte.similarity_search(query, k = 10)
# Bare expression so the notebook displays the retrieved documents.
docs

[Document(page_content='Naturally, Omniscien tools and technologies are heavily reliant on high-quality specialized data to power our platform and technologies. Because we understand the importance of high-quality data, the Omniscien team is dedicated to breaking new ground with novel research and approaches to creating, mining, harvesting, synthesizing, and manufacturing data. Omniscien has built a variety of powerful tools for data creation, preparation, and analysis.\n\nBehind many of the tools design is Omniscien’s Chief Scientist, Professor Philipp Koehn who leads our team of researchers and developers. Philipp is a pioneer in the machine translation space, his books on Statistical Machine Translation and Neural Machine Translation are the leading academic textbooks globally on machine translation. Both books are available now from Amazon.com or leading book stores.\n\nProfessor Philipp Koehn,Chief Scientist,Omniscien Technologies.\n\nRelated Links\n\nWhat is Rules-Based Machine T

#  gte-small

In [11]:
# Candidate model: thenlper/gte-small (device set to CPU).
# Rebinding `embeddings` and `db_gte` replaces the objects left by earlier cells.
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-small",
    model_kwargs={'device': 'cpu'},
)
db_gte = FAISS.from_documents(sp_docs, embeddings)

Downloading (…)2e6d8/.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 2.63MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 317kB/s]
Downloading (…)9e0ce2e6d8/README.md: 100%|██████████| 68.1k/68.1k [00:00<00:00, 257kB/s]
Downloading (…)0ce2e6d8/config.json: 100%|██████████| 583/583 [00:00<00:00, 1.06MB/s]
Downloading pytorch_model.bin: 100%|██████████| 66.8M/66.8M [00:22<00:00, 2.94MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 57.0/57.0 [00:00<00:00, 93.0kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 202kB/s]
Downloading (…)2e6d8/tokenizer.json: 100%|██████████| 712k/712k [00:00<00:00, 725kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 394/394 [00:00<00:00, 672kB/s]
Downloading (…)9e0ce2e6d8/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 4.08MB/s]
Downloading (…)ce2e6d8/modules.json: 100%|██████████| 385/385 [00:00<00:00, 542kB/s]


In [13]:
# Same question, answered from whatever store `db_gte` is currently bound to
# (the cell above binds it to the gte-small store).
query = "Which book are Philipp writing?"
docs = db_gte.similarity_search(query, k = 10)
# Bare expression so the notebook displays the retrieved documents.
docs

[Document(page_content='To find a way to address that problem, we flew in Philipp Koehn to introduce him to our concepts and ideas. Philipp at the time was a promising researcher in Statistical Machine Translation (SMT) and recently created a large corpus of bilingual content derived from European Parliament documents and had just released the first version of the Moses SMT decoder which in the meantime has become the de-facto platform for SMT. Soon after, Philipp joined our team as Chief Scientist and has been driving research and development efforts ever since. Philipp joining the team gave us the in-depth knowledge of MT that few had at the time and allowed us to stay ahead of the competition from a technical perspective. While Philipp was the father of the Moses decoder, his Master’s thesis in the 1990’s was on Neural Networks before they become practical for machine translation.', metadata={'source': 'omniscien.com/blog/riding-machine-translation-hype-cycle/index.html'}),
 Documen

#  e5-base

In [14]:
# Candidate model: intfloat/e5-base (device set to CPU).
# NOTE(review): the store is bound to `db_gte` although this is an E5 model;
# the name is kept because the query cell that follows reads `db_gte`.
# NOTE(review): the E5 model card describes "query: " / "passage: " input
# prefixes; none are added here -- confirm whether that skews the comparison.
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base",
    model_kwargs={'device': 'cpu'},
)
db_gte = FAISS.from_documents(sp_docs, embeddings)

Downloading (…)06d6a/.gitattributes: 100%|██████████| 1.48k/1.48k [00:00<00:00, 2.80MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 200/200 [00:00<00:00, 335kB/s]
Downloading (…)20b0006d6a/README.md: 100%|██████████| 67.6k/67.6k [00:00<00:00, 286kB/s]
Downloading (…)b0006d6a/config.json: 100%|██████████| 645/645 [00:00<00:00, 1.03MB/s]
Downloading model.safetensors: 100%|██████████| 438M/438M [02:30<00:00, 2.91MB/s] 
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [02:22<00:00, 3.06MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 57.0/57.0 [00:00<00:00, 75.3kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 131kB/s]
Downloading (…)06d6a/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 931kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 356/356 [00:00<00:00, 558kB/s]
Downloading (…)20b0006d6a/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 4.79MB/s]
Downloading (…)0006d6a/modules.json: 100%|█████████

In [15]:
# Same question, answered from whatever store `db_gte` is currently bound to
# (the cell above binds it to the e5-base store).
query = "Which book are Philipp writing?"
docs = db_gte.similarity_search(query, k = 10)
# Bare expression so the notebook displays the retrieved documents.
docs

[Document(page_content='To find a way to address that problem, we flew in Philipp Koehn to introduce him to our concepts and ideas. Philipp at the time was a promising researcher in Statistical Machine Translation (SMT) and recently created a large corpus of bilingual content derived from European Parliament documents and had just released the first version of the Moses SMT decoder which in the meantime has become the de-facto platform for SMT. Soon after, Philipp joined our team as Chief Scientist and has been driving research and development efforts ever since. Philipp joining the team gave us the in-depth knowledge of MT that few had at the time and allowed us to stay ahead of the competition from a technical perspective. While Philipp was the father of the Moses decoder, his Master’s thesis in the 1990’s was on Neural Networks before they become practical for machine translation.', metadata={'source': 'omniscien.com/blog/riding-machine-translation-hype-cycle/index.html'}),
 Documen