In [1]:
### If on Ubuntu, run in CLI:
# sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev

In [2]:
%pip install -r requirements.txt

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [3]:
### If the model download doesn't work from the notebook, run these commands in a CLI (without the leading `!`)
# !pip install -U huggingface_hub
# !export HF_ENDPOINT=https://hf-mirror.com
# !huggingface-cli download --resume-download BAAI/bge-large-zh-v1.5 --local-dir /home/models/bge-large-zh-v1.5
# model_path = '/home/models/bge-large-zh-v1.5'

In [4]:
### Preprocess pdf documents
import pdftotext
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document

# modified from https://stackoverflow.com/questions/77045559/langchain-load-with-string
def get_text_chunks_langchain(text, chunk_size=1000, chunk_overlap=50, separator="\n"):
    """Split a raw string into a list of LangChain ``Document`` objects.

    The result has the same shape as ``docs = loader.load()``: one
    ``Document`` per chunk, with the chunk text stored in ``page_content``.

    Args:
        text: The raw text to split.
        chunk_size: Chunk size limit passed to ``CharacterTextSplitter``,
            measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, passed to
            ``CharacterTextSplitter``.
        separator: The string the splitter breaks the text on.

    Returns:
        A list of ``Document`` objects, one per chunk produced by the splitter.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]

def load_pdf(filepath):
    """Read a PDF file and return its text as a list of chunked documents.

    Args:
        filepath: Path to the PDF file; it is opened in binary mode.

    Returns:
        Whatever ``get_text_chunks_langchain`` returns for the concatenated
        text of all pages.
    """
    with open(filepath, "rb") as f:
        pdf = pdftotext.PDF(f)
        # Join once instead of growing a string with += on every page.
        text = "".join(page for page in pdf)
    return get_text_chunks_langchain(text)


""" Unit test """
pdf_files = ["./data/CMA_ES.pdf"]
documents = []

for pdf_file in pdf_files:
    # Accumulate the chunks of every file; assigning inside the loop would
    # keep only the chunks of the last PDF in the list.
    documents.extend(load_pdf(pdf_file))

# The embedding cell further down reads the chunk list through the name `docs`.
docs = documents

print(docs[0])

page_content='arXiv:1604.00772v2 [cs.LG] 10 Mar 2023\nThe CMA Evolution Strategy: A Tutorial\nNikolaus Hansen\nInria\nResearch centre Saclay–Île-de-France\nContents\nNomenclature\n2\n0\nPreliminaries\n0.1 Eigendecomposition of a Positive Definite Matrix . . . . . . . . . . . . . . .\n0.2 The Multivariate Normal Distribution . . . . . . . . . . . . . . . . . . . . .\n0.3 Randomized Black Box Optimization . . . . . . . . . . . . . . . . . . . . .\n0.4 Hessian and Covariance Matrices . . . . . . . . . . . . . . . . . . . . . . . .\n3\n4\n5\n6\n7\n1\nBasic Equation: Sampling\n8\n2\nSelection and Recombination: Moving the Mean\n8\n3\nAdapting the Covariance Matrix\n3.1 Estimating the Covariance Matrix From Scratch . . . . . . . . . . . . . . . .\n3.2 Rank-µ-Update . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n3.3 Rank-One-Update . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n3.3.1 A Different Viewpoint . . . . . . . . . . . . . . . . . . . . . . . . .

In [6]:
### Load model from local files
from transformers import AutoModel, AutoTokenizer

# Alternative location: the --local-dir used by the huggingface-cli download command above.
# model_path = '/home/models/bge-large-zh-v1.5'
# Directory containing the BGE checkpoint, given relative to the working directory.
model_path = './bge-large-zh-v1.5'

# Load the model and its tokenizer from the same local directory.
model = AutoModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

  return self.fget.__get__(instance, owner)()


In [None]:
### Embed documents into vectordb
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from FlagEmbedding import BGEM3FlagModel
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Build the embedding function from the local BGE checkpoint directory.
embeddings = HuggingFaceBgeEmbeddings(model_name=model_path)

# Add the PDF chunks to a Chroma vector store and expose it as a retriever.
vector_store = Chroma(embedding_function=embeddings)
vector_store.add_documents(docs)
retriever = vector_store.as_retriever()

# Smoke test: run one query and print every chunk the retriever returns.
query = "What is the main topic of the document?"
# `invoke` replaces the deprecated `get_relevant_documents` call.
retrieved_docs = retriever.invoke(query)
for doc in retrieved_docs:
    print(doc)

In [9]:
# for doc in retrieved_docs:
#     print(doc)

page_content='recherches, Université Paris-Sud, 2010.\n[18] Hansen N. Injecting External Solutions Into CMA-ES. CoRR, arXiv:1110.4181, 2011.\n[19] Hansen N, Auger A. Principled design of continuous stochastic search: From theory\nto practice. In Y Borenstein and A Moraglio, eds.: Theory and Principled Methods for\nDesigning Metaheustics. Springer, pages 145–180, 2014.\n[20] Hansen N, Atamna A, Auger A. How to Assess Step-Size Adaptation Mechanisms in\nRandomised Search. In Parallel Problem Solving from Nature – PPSN XIII, pages 60–69.\nSpringer, 2014.\n[21] Hansen N, Kern S. Evaluating the CMA evolution strategy on multimodal test functions.\nIn Xin Yao et al., editors, Parallel Problem Solving from Nature – PPSN VIII, pages\n282–291. Springer, 2004.\n[22] Hansen N, Niederberger SPN, Guzzella L, Koumoutsakos P. A method for handling\nuncertainty in evolutionary optimization with an application to feedback control of combustion. IEEE Transactions on Evolutionary Computation, 13(1):180–