In [1]:
!pip install -U sentence-transformers rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")


# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
top_k = 32                          # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder retrieves `top_k` candidate passages. We use a cross-encoder to re-rank
# that result list to improve the quality.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# As dataset, we use Simple English Wikipedia. Compared to the full English Wikipedia, it has only
# about 170k articles. Each line of the gzipped JSONL file is one article whose 'paragraphs'
# field is a list of paragraph strings.
wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'

# Download the dump only once; later runs reuse the local copy.
if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip()
        if not line:
            # A blank line is not a JSON record; json.loads('') would raise.
            continue
        data = json.loads(line)
        paragraphs = data['paragraphs']

        # Only the first paragraph of each article is indexed. To index every paragraph,
        # use passages.extend(paragraphs) instead.
        # Articles without any paragraph are skipped rather than raising IndexError.
        if paragraphs:
            passages.append(paragraphs[0])

print("Passages:", len(passages))

# NOTE: the passages are not encoded in this cell. Encoding all of them into the vector
# space takes about 5 minutes (depends on your GPU speed).


Passages: 169597


In [4]:
# Preview only the first few passages; displaying the whole list (~170k strings)
# floods the cell output and bloats the saved notebook.
passages[:5]

['Ted Cassidy (July 31, 1932 - January 16, 1979) was an American actor. He was best known for his roles as Lurch and Thing on "The Addams Family".',
 'Aileen Carol Wuornos Pralle (born Aileen Carol Pittman; February 29, 1956\xa0– October 9, 2002) was an American serial killer. She was born in Rochester, Michigan. She confessed to killing six men in Florida and was executed in Florida State Prison by lethal injection for the murders. Wuornos said that the men she killed had raped her or tried to rape her while she was working as a prostitute.',
 "A crater is a round dent on a planet. They are usually shaped like a circle or an oval. They are usually made by something like a meteor hitting the surface of a planet. Underground activity such as volcanoes or explosions can also cause them but it's not as likely.",
 'Store has several meanings:',
 'Chinese New Year, known in China as the SpringFestival and in Singapore as the LunarNewYear, is a holiday on and around the new moon on the first

In [9]:
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import FAISS
from datetime import datetime
from langchain.embeddings.openai import OpenAIEmbeddings

# Azure OpenAI connection settings.
# SECURITY: the API key is a secret and must never be stored in the notebook. It is read
# from the OPENAI_API_KEY environment variable, with an interactive prompt as a fallback.
# Any key that was previously hardcoded in this cell should be treated as leaked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback
    os.environ["OPENAI_API_KEY"] = getpass("Azure OpenAI API key: ")

os.environ['OPENAI_API_VERSION'] = '2023-07-01-preview'
os.environ['OPENAI_API_BASE'] = 'https://tv-llm-applications.openai.azure.com/'
os.environ['OPENAI_API_TYPE'] = 'azure'

# Load the FAQ Word document into LangChain documents.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where it exists.
documents_query = []
doc_path = r"C:\Users\samuel.t\OneDrive - Technovert\Azure\GMBOT\documents\Guardsman Group FAQ.docx"
loader = Docx2txtLoader(doc_path)
documents_query.extend(loader.load())

# Split the loaded text into chunks of at most 1000 characters with no overlap.
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs=text_splitter.split_documents(documents_query)

# Embedding client for the 'ada-embed' Azure deployment. Connection values are taken from
# the environment variables set above, so the secret never appears as a literal.
embeddings = OpenAIEmbeddings(deployment='ada-embed',
                              openai_api_key=os.environ["OPENAI_API_KEY"],
                              openai_api_base=os.environ['OPENAI_API_BASE'],
                              openai_api_type=os.environ['OPENAI_API_TYPE'],
                              openai_api_version=os.environ['OPENAI_API_VERSION'],
                              chunk_size=16)

# Build the FAISS vector store from the text of each chunk (chunk metadata is not passed on).
document_search = FAISS.from_texts([t.page_content for t in docs], embeddings)


In [29]:
# Bare expression: the notebook displays the repr of the FAISS vector store built above,
# which only confirms that the object exists.
document_search

<langchain.vectorstores.faiss.FAISS at 0x2a75afdea10>

In [16]:
# Query the FAISS store for the chunks most similar to the search phrase.
# Note that this rebinds `docs`, which previously held the split FAQ chunks.
search_query = 'vacation leave'
docs = document_search.similarity_search(search_query)

In [21]:
# Check which class the first search hit is an instance of.
first_hit = docs[0]
type(first_hit)

langchain.schema.document.Document

In [26]:
from langchain.schema.document import Document

In [27]:
# Wrap the first Wikipedia passage in a LangChain Document, tagging where it came from.
passages_lang = Document(
    page_content=passages[0],
    metadata=dict(source="local"),
)

In [28]:
# Bare expression: display the Document created from the first passage.
passages_lang

Document(page_content='Ted Cassidy (July 31, 1932 - January 16, 1979) was an American actor. He was best known for his roles as Lurch and Thing on "The Addams Family".', metadata={'source': 'local'})

In [52]:
documents_query = []
# Backslashes are escaped: in a normal (non-raw) string literal, "C:\Users" starts a
# \UXXXXXXXX unicode escape and raises the 'unicodeescape' SyntaxError.
doc_path = "C:\\Users\\samuel.t\\OneDrive - Technovert\\Azure\\GMBOT\\documents\\Guardsman Group FAQ.docx"
# loader = Docx2txtLoader(doc_path)

# Import python-docx's Document under an alias. A plain `from docx import Document` would
# rebind the name `Document` that an earlier cell imported from langchain.schema.document,
# so re-running that cell's `Document(page_content=...)` call would hit the wrong factory.
from docx import Document as DocxDocument


# Open the FAQ Word file with python-docx.
doc = DocxDocument(doc_path)


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (2351203319.py, line 2)

In [42]:
from sentence_transformers import SentenceTransformer, util
# Load the bi-encoder checkpoint used for the similarity sanity check.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Embed the query first, then the first Wikipedia passage, with the same model.
query_embedding, passage_embedding = (
    model.encode(text) for text in ('vacation leaves', passages[0])
)

# Dot-product score between the two embeddings.
similarity = util.dot_score(query_embedding, passage_embedding)
print("Similarity:", similarity)

Similarity: tensor([[-0.0354]])


['Ted Cassidy (July 31, 1932 - January 16, 1979) was an American actor. He was best known for his roles as Lurch and Thing on "The Addams Family".',
 'Aileen Carol Wuornos Pralle (born Aileen Carol Pittman; February 29, 1956\xa0– October 9, 2002) was an American serial killer. She was born in Rochester, Michigan. She confessed to killing six men in Florida and was executed in Florida State Prison by lethal injection for the murders. Wuornos said that the men she killed had raped her or tried to rape her while she was working as a prostitute.',
 "A crater is a round dent on a planet. They are usually shaped like a circle or an oval. They are usually made by something like a meteor hitting the surface of a planet. Underground activity such as volcanoes or explosions can also cause them but it's not as likely.",
 'Store has several meanings:',
 'Chinese New Year, known in China as the SpringFestival and in Singapore as the LunarNewYear, is a holiday on and around the new moon on the first