BGE (BAAI General Embedding) Model, from the Beijing Academy of Artificial Intelligence (BAAI)

BGE models on Hugging Face are among the best open-source embedding models.

In [7]:
from collections import namedtuple
from pypdf import PdfReader
# Lightweight record for one extracted PDF page: its id, its text, and a metadata dict.
Page = namedtuple("Page", ["id", "page_content", "metadata"])


def pdf_reader(file_path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        file_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A list of ``Page`` tuples in document order. ``id`` and
        ``metadata["page_number"]`` both hold the page's zero-based index in
        the PDF; pages whose stripped text is empty are left out, so ids may
        have gaps.
    """
    extracted = []
    for index, pdf_page in enumerate(PdfReader(file_path).pages):
        stripped_text = pdf_page.extract_text().strip()
        if not stripped_text:
            continue  # nothing extractable on this page
        extracted.append(
            Page(
                id=index,
                page_content=stripped_text,
                metadata={"page_number": index},  # Add any additional metadata as needed
            )
        )
    return extracted

# Relative path, so it is resolved against the kernel's current working directory.
file_path = '../data/RaptorContract.pdf'
# One Page record per PDF page that has extractable text.
pages = pdf_reader(file_path)

In [8]:
from langchain.text_splitter import CharacterTextSplitter
# Merge all page texts into one string, with a blank line between pages so the
# splitter below can also break at page boundaries.
text = '\n\n'.join(page.page_content for page in pages)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n\n",
    chunk_size=1200,      # maximum chunk length, measured in tokens
    chunk_overlap=100,    # tokens shared between neighbouring chunks
    is_separator_regex=False,
    # The tokenizer is looked up from the model name; `encoding_name` is only
    # consulted when no model name is given, so it is not passed here.
    model_name='text-embedding-3-small',
)

# create_documents() expects a list of texts. A bare string would be iterated
# character by character, giving one Document per character.
doc_list = text_splitter.create_documents([text])  # list of Document objects
line_list = text_splitter.split_text(text)  # the same chunks as plain strings
line_list[0]

'[R&G Draft 12.__.2021] \n112923184_5  \n \nSTOCK PURCHASE AGREEMENT \nBY AND AMONG \n[BUYER], \n[TARGET COMPANY], \nTHE SELLERS LISTED ON SCHEDULE I HERETO \nAND  \nTHE SELLERS ’ REPRESENTATIVE NAMED HEREIN \nDated as of [●]  \n \n[This document is intended solely to facilitate discussions among the parties identified herein.  \nNeither this document nor such discussions are intended to create, nor will either or both be \ndeemed to create, a legally binding or enforceable offer or agreement of any type or nature, \nunless and until a definitive written agreement is executed and delivered by each of th e parties \nhereto. \n \nThis document shall be kept confidential pursuant to the terms of the Confidentiality \nAgreement entered into by the parties and, if applicable, its affiliates with respect to the subject \nmatter hereof.]\n\n-i- \n112923184_5 TABLE OF CONTENTS \nARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION .................................... 2  \nSection 1.01  Definit

In [1]:
import numpy as np
# The `langchain.embeddings` path is a deprecated alias; the class lives in
# langchain_community, which this notebook already uses for FakeEmbeddings.
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}  # run the model on CPU
encode_kwargs = {"normalize_embeddings": True}  # ask the encoder for normalized vectors

# Embedding client used by the later cells to embed the document chunks.
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
import numpy as np
# Embed all chunks in a single batched call. embed_documents() is the method
# meant for corpus texts; calling embed_query() once per chunk runs the model
# one text at a time.
embedded_docs = hf.embed_documents(line_list)

np.array(embedded_docs).shape  # (number of chunks, embedding dimension)

(65, 768)

Fake Embeddings

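If hardware constraints make it impractical to run a real embedding model, we can use LangChain's `FakeEmbeddings` instead. It produces placeholder vectors of the requested size, which is enough to test the rest of a pipeline but carries no semantic meaning.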

In [5]:
from langchain_community.embeddings import FakeEmbeddings

# NOTE(review): FakeEmbeddings presumably generates random vectors, so the
# values shown below would change between runs unless a seed is set — confirm.
fake_embeddings = FakeEmbeddings(size = 300) # embedding size

# embed_query takes a single string; its result is stored but not displayed in this cell.
fake_embedding_record = fake_embeddings.embed_query("This is a random text")
# embed_documents takes a list of strings; the displayed result is a list of vectors.
fake_embedding_records = fake_embeddings.embed_documents(["This is a random text"])
fake_embedding_records

[[-0.16965905908042958,
  1.5939186768826148,
  -0.7133848358361978,
  0.6664052620711756,
  -1.2059056236153183,
  -0.023488316255508266,
  1.1549833497192237,
  1.621098862084171,
  -1.8785527193021985,
  0.9143707827955454,
  0.7961616202367885,
  -0.4509841261236601,
  -0.4022375700116447,
  -0.802605656955823,
  0.2925978621627828,
  0.19811393881047945,
  0.043413212558926856,
  0.0008839694283831814,
  -0.4969221718647262,
  -0.3818215590096675,
  -1.1417235540882416,
  0.7832409702037648,
  0.04920227141733389,
  -0.9445591779066456,
  -0.21293952590640589,
  -0.1314604618954993,
  0.23223352466820546,
  -0.6512801933006795,
  1.1494704679397605,
  -0.8836998884764613,
  1.025430916924119,
  0.2603584901835216,
  -1.3584422558060683,
  1.6382429308951105,
  -0.6627835903564737,
  0.188055630474813,
  -0.3962292472506285,
  -2.1120082159872307,
  -0.21848370861654173,
  -0.13967536795945998,
  -0.929026872946699,
  0.46264884741971113,
  -0.4633234953084111,
  0.0166966898807399