In [9]:
# PDFs to ingest. Each entry pairs a short identifier ('name') with the
# location of the PDF on disk ('path').
docs_list = [
    {'name': collection, 'path': pdf_path}
    for collection, pdf_path in (
        (
            'brass_birmingham',
            './example_pdf/Brass_Birmingham_Reference_Sheet_EN.pdf',
        ),
        (
            'pandemic_legacy_season_1',
            './example_pdf/Pandemic_Legacy_rules_English_no_spoilers.pdf',
        ),
    )
]

In [10]:
from langchain.embeddings import OllamaEmbeddings
# Embedding client handed to the Qdrant vector store when the PDF chunks
# are indexed.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [11]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
import os
from dotenv import load_dotenv

# Load settings from a .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)

url = "https://183e03de-cba4-4c31-9a62-be25dc87e60e.europe-west3-0.gcp.cloud.qdrant.io"
# Raises KeyError if QDRANT_KEY is not defined, so a missing key is noticed
# before any document is loaded.
api_key = os.environ["QDRANT_KEY"]

# The splitter configuration does not depend on the document being processed,
# so it is built once and reused for every PDF.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=150,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)

# For each configured PDF: load it, normalise the metadata, split it into
# chunks and upload the chunks to a Qdrant collection named after the entry.
for doc in docs_list:
    print('loading', doc['name'])
    loader = PyMuPDFLoader(doc['path'])
    documents = loader.load()

    # Replace the loader's metadata with just the PDF's file name, taken from
    # the "source" entry the loader recorded.
    for document in documents:
        document.metadata = {
            "document_name": os.path.basename(document.metadata["source"])
        }

    print(f"# of documents loaded (pre-chunking) = {len(documents)}")

    # split the documents into chunks
    doc_splits = text_splitter.split_documents(documents)

    # Add chunk number to metadata (position of the chunk within this PDF)
    for idx, split in enumerate(doc_splits):
        split.metadata["chunk"] = idx

    print(f"# of documents = {len(doc_splits)}")

    # NOTE(review): re-running this cell presumably adds the same chunks to an
    # already existing collection again — confirm, and consider recreating the
    # collection if the cell must be idempotent.
    qdrant = QdrantVectorStore.from_documents(
        doc_splits,
        embeddings,
        url=url,
        prefer_grpc=True,
        api_key=api_key,
        collection_name=doc['name'],
    )
    print('qdrant finished')
    print('')

loading brass_birmingham
# of documents loaded (pre-chunking) = 2
# of documents = 8
qdrant finished

loading pandemic_legacy_season_1
# of documents loaded (pre-chunking) = 16
# of documents = 107
qdrant finished

