https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore

In [45]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

In [48]:
# Path handed to FAISS.load_local below.
# NOTE(review): this is an absolute, machine-specific path (/home/sira/...);
# the cell will not run elsewhere without editing it.
DB_FAISS_PATH = "/home/sira/sira_project/meta-Llama2/vectorstores_clean_doc_gte-base/db_faiss"
# Embedding model (run on CPU) that is passed to load_local together with the path.
# NOTE(review): the directory name contains "gte-base", which suggests the index
# was built with this same model — confirm, since a mismatch would make the
# search scores meaningless.
embeddings = HuggingFaceEmbeddings(model_name = "thenlper/gte-base",
                                model_kwargs = {'device': 'cpu'})
# `db` is the vector store object queried by the cells that follow.
db = FAISS.load_local(DB_FAISS_PATH, embeddings)

# Top k

In [62]:
# Query text reused by the retrieval experiments in the following cells.
question = "Which role is Gregory in?"
# Plain similarity search on `db`, asking for k=5 results.
docs_ss = db.similarity_search(question,k=5)

In [63]:
docs_ss

[Document(page_content='Gregory Binger\n\nChairman, Chief Operating Officer, Co-Founder\n\nGregory Binger has been a senior officer, board member, senior counsel or adviser to many of the world’s best-known Internet brands, technology, communications, media and entertainment companies. His positions have included Senior Counsel and Management Board for at Yahoo! UK & Ireland, Executive Vice President & General Counsel at Lycos Asia, Acting Senior Corporate Counsel at nineMSN Australia, and Acting Corporate Counsel Europe at CompuServe.\n\nHe has taken a hands-on approach in securing successful deals on a wide range of issues relating to technology and media licensing and procurement, intellectual property, information technology, technology outsourcing, systems integration, electronic, online and mobile content, games and applications, animation and video postproduction, VoIP, WiFi, and online betting and gaming.', metadata={'source': 'omniscien.com/about-us/company/index.html'}),
 Doc

In [64]:
# Same question with k=20, using the variant that also returns a score.
# The next cells index each result as (document, score).
# NOTE(review): the printed scores increase down the list, so they look like
# distances (lower = closer match) — confirm against the FAISS/LangChain docs.
score = db.similarity_search_with_score(question, k=20)

In [65]:
# For every result, print the document's metadata (i[0].metadata) next to
# its score (i[1]).
for i in score :
    print(i[0].metadata, i[1])

{'source': 'omniscien.com/about-us/company/index.html'} 0.3711669
{'source': 'omniscien.com/about-us/company/index.html'} 0.41423225
{'source': 'omniscien.com/blog/speech-recognition-and-speech-synthesis-glossary-terminology-that-you-should-know/index.html'} 0.46924442
{'source': 'omniscien.com/about-us/company/index.html'} 0.4727887
{'source': 'omniscien.com/blog/speech-recognition-speech-synthesis-glossary-o-u/index.html'} 0.48094168
{'source': 'omniscien.com/blog/speech-recognition-and-speech-synthesis-glossary-terminology-that-you-should-know/index.html'} 0.48192027
{'source': 'omniscien.com/technology/index.html'} 0.48200208
{'source': 'omniscien.com/blog/speech-recognition-and-speech-synthesis-glossary-terminology-that-you-should-know/index.html'} 0.484687
{'source': 'omniscien.com/blog/speech-recognition-and-speech-synthesis-glossary-terminology-that-you-should-know/index.html'} 0.48496088
{'source': 'omniscien.com/lsev6/features/amp/advanced-media-processing-overview/index.html

In [66]:
score[0][1]

0.3711669

# Maximum Marginal Relevance Retrieval

In [126]:
# Maximum-marginal-relevance search for the same question: k=5 results.
# NOTE(review): fetch_k=10 is presumably the size of the candidate pool MMR
# re-ranks from — confirm in the LangChain docs linked at the top of the notebook.
docs_mmr = db.max_marginal_relevance_search(question,k=5, fetch_k=10)

In [127]:
docs_mmr

[Document(page_content='Euromatrix and\n\nCASMACAT. Koehn’s research has been funded by the\n\nEuropean Union,\n\nDARPA,\n\nGoogle,\n\nFacebook,\n\nAmazon,\n\nBloomberg, and several other funding agencies. Koehn received his PhD in 2003 from the\n\nUniversity of Southern California and was a postdoctoral research associate at\n\nMIT. He was a finalist for the\n\nEuropean Patent Office’s European Inventor Award in 2013 and received the Award of Honor from the International Association of Machine Translation in 2015.\n\nAt Omniscien, Koehn refined machine translation technology for use in real-world deployments and helped develop methods for data acquisition and refinement. Koehn continues to drive innovation and technological development at Omniscien.\n\nSee how Omniscien can helpsolve your unique language and document processing challenges\n\nRequest a Demo\n\nContact Us\n\nSubscribe to our Mailing List\n\nProducts\n\nLanguage Studio\n\nMedia Studio\n\nServices\n\nCustom MT Engines\n\n

In [92]:
# Embed the question explicitly with the `embeddings` object, then run the
# by-vector MMR variant on that embedding.
embedded_query = embeddings.embed_query(question)
# NOTE(review): fetch_k is 20 here but 10 in the text-based MMR call
# (docs_mmr), so the two result lists are not a like-for-like comparison.
docs_mmr_vector = db.max_marginal_relevance_search_by_vector(embedded_query,k=5, fetch_k=20)

In [93]:
docs_mmr_vector

[Document(page_content='Dion was a founder of The ActiveX Factory, where he was the recipient of the Chairman’s Commendation Award presented by Microsoft’s Bill Gates for the best showcase of software developed in the Philippines. The US Government has recognized Dion as being in the top 5% of his field worldwide and he is a former holder of a US O1 Extraordinary Ability Visa.\n\nPhilipp Koehn\n\nChief Scientist\n\nBehind many of the tools design is Omniscien’s Chief Scientist, Professor Philipp Koehn who leads our team of researchers and developers. Philipp is a pioneer in the machine translation space, his books on Statistical Machine Translation and Neural Machine Translation are the leading academic textbooks globally on machine translation. Both books are available now from Amazon.com or leading book stores.', metadata={'source': 'omniscien.com/about-us/company/index.html'}),
 Document(page_content='Related Links\n\n→ Index', metadata={'source': 'omniscien.com/blog/localization-gl

# Similarity Score Threshold Retrieval

In [100]:
# Wrap `db` as a retriever that filters results with score_threshold=0.3.
# NOTE(review): the recorded output of this experiment is [], while
# similarity_search_with_score reported a best score of 0.3711669 for the same
# question. Check how the installed LangChain version compares score_threshold
# with FAISS scores (raw distance vs. normalized relevance) before concluding
# that no document is relevant.
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.3})
docs_threshold = retriever.get_relevant_documents(question)



In [101]:
docs_threshold

[]

This method returns an empty list for this question with `score_threshold` set to 0.3.

# -------------------------------------------------------------------------

In [77]:
# Peek at the stored chunks through the docstore's private `_dict` attribute:
# materialize its items as a list and keep the first 20 as a new dict.
# NOTE(review): the variable is called `first_two` but the slice is [:20].
first_two = dict(list(db.docstore._dict.items())[:20])
first_two

{'e13dcc41-cdec-4520-aa67-1c46d88bdac3': Document(page_content='Home\n\nProducts Private and Secure Artificial Intelligence Tools for EnterpriseOverview\xa0\xa0\xa0|\xa0\xa0\xa0FeaturesEditionsSecure CloudEnterprise Project Management, Editing & Subtitle Data ProcessingOverview\xa0\xa0\xa0|\xa0\xa0\xa0FeaturesEditionsProject Management and Editing PlatformData Processing Platform Data Workflow Automation & Natural Language ProcessingOverview\xa0\xa0\xa0|\xa0\xa0\xa0Features\xa0Learn aboutMachine TranslationCustom MT EnginesIndustry DomainsLanguagesDocument FormatsDeployment ModelsWays to TranslateHuman Language Technology Enhanced by Artificial Intelligence\n\nServices\n\nSolutions\n\nTechnology\n\nResourcesAbout UsCompanyCareersInternshipsNewsOffice LocationsLegalContact UsFrequently Asked Questions (FAQ)SupportOmniscien BlogWebinarsEvents and ConferencesAI, MT and Language Processing SymposiumCase StudiesTestimonialsIntegrated Solution PartnersTechnology PartnersLanguage Pairs – Mach

In [34]:
string1 = "Speakers:\n\nDr Joseph SweeneyIndustry Analyst,Intelligent Business Research Services (IBRS)\n\nGuest Speaker Bio\n\nDr. Joseph Sweeney, is a seasoned advisor and consultant at Intelligent Business Research Services (IBRS), where he specializes in business and technology strategy, emphasizing AI, data privacy, and compliance. He helps organizations navigate the complex landscape of emerging technologies, with deep expertise in education, the public sector, and Microsoft technologies. Dr. Sweeney’s background in AI, cloud computing, and technology trends enables him to offer valuable insights for developing strategic solutions. As a sought-after advisor, he guides organizations through digital transformation, market research, and the integration of cutting-edge technologies.\n\nRedefining Domain Adaptation for Machine Translation and Voice Recognition – An Essential Primer\n\nThursday 16 February 2023\n\nWatch the Replay"
string2 = "Omniscien Technologies products and technologies are deployed globally to a wide variety of customers including governments, multinationals, language service providers, e-commerce providers, media organizations and some of the world’s leading companies and best-known global brands.\n\nCovering nearly 600 global language pairs and with several industry-specific solutions, Omniscien Technologies remains the partner of choice for customers with complex, high-volume bespoke data processing and machine translation needs.\n\nFounded in 2007, Omniscien Technologies Headquarters are located in Singapore, with R&D based out of Bangkok, Thailand, and a commercial presence in several locations in Europe and North America.\n\nTeam\n\nThe Omniscien team comes from a broad set of complementary backgrounds that empower the organization and bridge the cultural and language barriers that define the cross-border and language industries.\n\nGregory Binger\n\nChairman, Chief Operating Officer, Co-Founder"
string3 = "Far from being an ivory tower academic, Greg has excelled at managing and achieving successful results from multi-disciplinary teams of all sizes dealing with marketing, business development and corporate governance. He has significant experience managing companies in Asia. For example at Lycos Asia, he had oversight management of the company secretarial function for the primary holding company and 23 affiliated companies around the world.\n\nGreg has been a frequent speaker and publisher of articles on such diverse topics as hyper-linking risks, copyright and information distribution on computer networks, data security, information privacy, content supply, resale and distribution, intellectual property and technology convergence. He co-authored Computer Evidence: A Forensic Investigations Handbook, a book that deals with covert electronic surveillance, legal and ethical hacking and evidence gathering and preservation techniques.\n\nDion Wiggins\n\nChief Technology Officer, Co-Founder"
string4 = """Join Omniscien\'s Chief Scientist, Professor Philipp Koehn, a leading researcher in the field of AI and MT, along with Dion Wiggins, Omniscien’s CTO, for an hour of information sharing and insights.\n\nThis is Part 2 of a 2 part series. Part 1 is titled "Advances in Artificial Intelligence and Machine Translation: 2022 and Beyond" and is on Wednesday 17 November 2021. See above for registration details.\n\nWatch the Replay\n\nAdvances in Artificial Intelligence and Machine Translation: 2022 and Beyond\n\nWednesday , November 17, 2021"""

In [35]:
# Character count of each sample chunk, one value per output line.
print(
    *(len(chunk_text) for chunk_text in (string1, string2, string3, string4)),
    sep="\n",
)

921
987
993
530


In [40]:
docs_score = db.similarity_search_with_score(question, k=5)

In [41]:
docs_score

[(Document(page_content='Gregory Binger\n\nChairman, Chief Operating Officer, Co-Founder\n\nGregory Binger has been a senior officer, board member, senior counsel or adviser to many of the world’s best-known Internet brands, technology, communications, media and entertainment companies. His positions have included Senior Counsel and Management Board for at Yahoo! UK & Ireland, Executive Vice President & General Counsel at Lycos Asia, Acting Senior Corporate Counsel at nineMSN Australia, and Acting Corporate Counsel Europe at CompuServe.\n\nHe has taken a hands-on approach in securing successful deals on a wide range of issues relating to technology and media licensing and procurement, intellectual property, information technology, technology outsourcing, systems integration, electronic, online and mobile content, games and applications, animation and video postproduction, VoIP, WiFi, and online betting and gaming.', metadata={'source': 'omniscien.com/about-us/company/index.html'}),
  0

In [54]:
db

<langchain.vectorstores.faiss.FAISS at 0x7fdf3a690f70>

# Pre filter 

In [23]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [24]:
def load_docs(docs_path, glob="**/*.html"):
    """Load every file under ``docs_path`` that matches ``glob``.

    Args:
        docs_path: Directory passed to ``DirectoryLoader`` as its root.
        glob: Pattern selecting which files are loaded. Defaults to
            ``"**/*.html"``, the pattern this function used to hard-code,
            so existing one-argument calls behave exactly as before.

    Returns:
        Whatever ``DirectoryLoader(...).load()`` returns for the matched files.
    """
    loader = DirectoryLoader(docs_path, glob=glob)
    return loader.load()

documents = load_docs('omniscien.com')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
def clean_duplicate(documents):
    """Return ``documents`` without entries whose ``page_content`` repeats.

    The first document carrying a given ``page_content`` is kept and every
    later one with the same content is dropped; the relative order of the
    kept documents is unchanged.

    Args:
        documents: Iterable of objects exposing a ``page_content`` attribute.
            The content must be hashable (it is stored in a set).

    Returns:
        A new list holding the kept document objects themselves (not copies).
    """
    # A set gives constant-time membership checks; the previous list-based
    # lookups made the whole pass quadratic in the number of documents.
    seen_contents = set()
    documents_clean = []
    for doc in documents:
        if doc.page_content in seen_contents:
            continue
        seen_contents.add(doc.page_content)
        documents_clean.append(doc)
    return documents_clean
documents_clean = clean_duplicate(documents)

In [28]:
len(documents_clean)

118

In [4]:
def split_docs(documents,chunk_size=2000,chunk_overlap=200):
    """Split ``documents`` into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(documents)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
sp_docs = split_docs(documents_clean)

In [22]:
# Rebinds `embeddings` to the all-MiniLM-L6-v2 model (CPU) and rebuilds `db`
# from the split documents, replacing whatever `db` referred to before.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'sp_docs' is not defined" — the cell that assigns `sp_docs`
# has to be executed first (execution counts in this notebook are out of order).
embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2",
                                model_kwargs = {'device': 'cpu'})
db = FAISS.from_documents(sp_docs, embeddings)

NameError: name 'sp_docs' is not defined

In [27]:
query = "Which book are Philipp writing?"
docs = db.similarity_search(query, k = 10)
docs

[Document(page_content='Previously Dion was Vice President and Research Director for Gartner based in Hong Kong, where he was the most senior and highly-respected analyst based in all of Asia. Dion’s research reports on ICT in China helped change the way the world views this market.\n\nDion is also a well-known pioneer of the Asian Internet Industry, being the founder of one of Asia’s first ever ISPs (Asia Online in Hong Kong). In his role at Gartner and in various other consulting positions prior to that, Dion advised literally hundreds of enterprises on their ICT strategy.\n\nDion was a founder of The ActiveX Factory, where he was the recipient of the Chairman’s Commendation Award presented by Microsoft’s Bill Gates for the best showcase of software developed in the Philippines. The US Government has recognized Dion as being in the top 5% of his field worldwide and he is a former holder of a US O1 Extraordinary Ability Visa.\n\nPhilipp Koehn\n\nChief Scientist\n\nBehind many of the t

In [None]:
# NOTE(review): this cell repeats the earlier all-MiniLM-L6-v2 index-building
# cell verbatim and has no recorded execution count; keep only one copy.
# Like that cell, it rebinds both `embeddings` and `db` and needs `sp_docs`.
embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2",
                                model_kwargs = {'device': 'cpu'})
db = FAISS.from_documents(sp_docs, embeddings)

In [None]:
# NOTE(review): this import sits mid-notebook; it belongs with the other imports.
from transformers import pipeline

# Token-classification pipeline built from the
# vblagoje/bert-english-uncased-finetuned-pos checkpoint (presumably a
# part-of-speech tagger, judging by the model name — confirm), tried on a
# sample sentence.
classifier = pipeline("token-classification", model = "vblagoje/bert-english-uncased-finetuned-pos")
classifier("Hello I'm Omar and I live in Zürich.")

In [None]:
classifier("Which book are Philipp writing?")

In [31]:
# Rebinds `embeddings` to thenlper/gte-base (CPU) and builds a separate index,
# `db_gte`, from the same `sp_docs`; `db` itself is not reassigned here.
embeddings = HuggingFaceEmbeddings(model_name = "thenlper/gte-base",
                                model_kwargs = {'device': 'cpu'})
db_gte = FAISS.from_documents(sp_docs, embeddings)

Downloading (…)9c8a9/.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 337kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 250kB/s]
Downloading (…)db4ec9c8a9/README.md: 100%|██████████| 68.1k/68.1k [00:00<00:00, 12.0MB/s]
Downloading (…)4ec9c8a9/config.json: 100%|██████████| 618/618 [00:00<00:00, 690kB/s]
Downloading (…)8a9/onnx/config.json: 100%|██████████| 630/630 [00:00<00:00, 986kB/s]
Downloading model.onnx: 100%|██████████| 436M/436M [02:13<00:00, 3.27MB/s] 
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 204kB/s]
Downloading (…)/onnx/tokenizer.json: 100%|██████████| 712k/712k [00:00<00:00, 731kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 314/314 [00:00<00:00, 493kB/s]
Downloading (…)9c8a9/onnx/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 861kB/s]
Downloading pytorch_model.bin: 100%|██████████| 219M/219M [01:07<00:00, 3.26MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 57.0/57.0

In [32]:
# Same query text as the earlier cell that searched `db`, now run against
# `db_gte` with k = 10 so the two result lists can be compared.
# NOTE(review): `docs` is overwritten here, so the earlier result list is lost.
query = "Which book are Philipp writing?"
docs = db_gte.similarity_search(query, k = 10)
docs

[Document(page_content='Search\n\nThe Omniscien Advantage – We wrote the leading academic machine translation textbooks!!\n\nNov 11, 2022\n\nBuilt on the world’s leading translation, language processing, workflow automation, and artificial intelligence technologies.\n\nTranslation and language processing technologies have evolved substantially over the last decade. The Omniscien team has been at the forefront of research and development, leading the way with a comprehensive set of integrated tools, features, and technologies that are powered by and drive artificial intelligence and machine learning.\n\nNaturally, Omniscien tools and technologies are heavily reliant on high-quality specialized data to power our platform and technologies. Because we understand the importance of high-quality data, the Omniscien team is dedicated to breaking new ground with novel research and approaches to creating, mining, harvesting, synthesizing, and manufacturing data. Omniscien has built a variety of 