# PDF Ingestion Pipeline

## 1. Instructions for running on a local machine
1. Create a virtual environment (Recommended)
2. Make sure you have installed all dependencies listed in `requirements.txt`
3. In addition, reading PDFs and running OCR requires Poppler, Tesseract, and Tesseract-lang. If you use Homebrew, you can run the following commands:

```bash
brew install poppler
brew install tesseract-lang
```

4. Make sure your PDF files are placed in the `data` folder

## 2. Instructions for Google Colab
TODO

In [None]:
import os
import pprint
from dotenv import load_dotenv
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from langchain_core.documents import Document
from langchain_milvus.vectorstores import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
# Load settings from a local .env file into the process environment; the
# os.environ.get(...) lookups in later cells read the database endpoint and
# API key from there.
load_dotenv()

## BƯỚC 1: CHUYỂN FILE PDF THÀNH DẠNG DỮ LIỆU SỐ HOÁ

In [None]:
# Notebook-wide settings shared by the cells below.

# Name of the vector-store collection the documents are written to and
# searched in.
collection_name = "s4v_python_oh_bkai"

# Name of the Hugging Face embedding model used for both indexing and search.
embedding_model_name = "bkai-foundation-models/vietnamese-bi-encoder"

# PDF to ingest, relative to the notebook's working directory.
# Alternative input: "../data/tai_lieu_vat_ly_9.pdf"
file_path = "../data/mục_lục.pdf"

In [None]:
# Parse the PDF at `file_path` into a list of document elements.
# NOTE(review): `mode` is passed through unchanged from the original cell;
# confirm that partition_pdf actually consumes it.
elements = partition_pdf(
    filename=file_path,
    strategy="hi_res",
    languages=["vie", "equ"],
    mode="elements",
    # infer_table_structure=True,  (currently disabled)
)

# Group the parsed elements into title-delimited chunks of at most 3000
# characters, carrying a 200-character overlap; sections may span pages.
elements_chunked = chunk_by_title(
    elements=elements,
    multipage_sections=True,
    max_characters=3000,
    overlap=200,
)

In [None]:
# Wrap each chunk in a LangChain Document.
# The page content is prefixed with the source file's title (the file name
# without its extension). os.path.splitext strips only the final extension,
# so a name such as "report.v2.pdf" yields "report.v2" rather than being cut
# at the first dot.
docs = [
    Document(
        page_content=(
            f"File title {os.path.splitext(element.metadata.filename)[0]}: "
            f"{str(element)}"
        ),
        metadata=element.metadata.to_dict(),
    )
    for element in elements_chunked
]

In [None]:
# Reduce every document's metadata to a small, fixed set of fields; all other
# keys are dropped before upload.
kept_metadata_keys = ("source", "file_directory", "filename",
                      "page_number", "category")
for doc in docs:
    doc.metadata = {
        name: content
        for name, content in doc.metadata.items()
        if name in kept_metadata_keys
    }

In [None]:
# Build the embedding model named by `embedding_model_name`.
# Alternative model that was tried: 'keepitreal/vietnamese-sbert'
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

In [None]:
# Preview one chunk as a sanity check before uploading.
# The second chunk is shown when available (as before); a PDF that produced a
# single chunk falls back to the first one instead of raising IndexError.
if docs:
    preview_doc = docs[1] if len(docs) > 1 else docs[0]
    pprint.pprint(preview_doc.page_content)
    # .get() prints None instead of failing if the chunk has no page number.
    pprint.pprint(preview_doc.metadata.get("page_number"))
else:
    print("No chunks were produced from the PDF; nothing to preview.")

## BƯỚC 2: UPLOAD LÊN KHO LƯU TRỮ (DATABASE)
Using defaults

In [None]:
# Upload every document in `docs` to the Milvus collection named by
# `collection_name`, embedding each one with `embedding_model`.
#
# The endpoint has historically been read from the misspelled variable
# "DATABSE_PUBLIC_ENDPOINT". That name is still checked first so existing
# .env files keep working; the correctly spelled "DATABASE_PUBLIC_ENDPOINT"
# is accepted as a fallback.
URI = (os.environ.get("DATABSE_PUBLIC_ENDPOINT")
       or os.environ.get("DATABASE_PUBLIC_ENDPOINT"))
if not URI:
    # Fail early with an actionable message instead of handing uri=None to
    # the vector store.
    raise RuntimeError(
        "Milvus endpoint is not configured: set DATABASE_PUBLIC_ENDPOINT "
        "(or the legacy DATABSE_PUBLIC_ENDPOINT) in the environment or .env file."
    )
vector_db = Milvus.from_documents(
    documents=docs,
    embedding=embedding_model,
    collection_name=collection_name,
    connection_args={
        "uri": URI,
        "token": os.environ.get("DATABASE_API_KEY"),
        "secure": True,
    },
    # drop_old=True
)

## BƯỚC 3: TÌM ĐOẠN VĂN GẦN NHẤT VỚI CÂU HỎI

In [None]:
# Re-create the embedding model and connect to the existing collection so
# this step can be run without repeating the upload above.
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# The legacy, misspelled variable name is checked first so existing .env
# files keep working; the correctly spelled name is accepted as a fallback.
search_uri = (os.environ.get("DATABSE_PUBLIC_ENDPOINT")
              or os.environ.get("DATABASE_PUBLIC_ENDPOINT"))
if not search_uri:
    # Fail early with an actionable message instead of handing uri=None to
    # the vector store.
    raise RuntimeError(
        "Milvus endpoint is not configured: set DATABASE_PUBLIC_ENDPOINT "
        "(or the legacy DATABSE_PUBLIC_ENDPOINT) in the environment or .env file."
    )
vector_db = Milvus(
    embedding_function=embedding_model,
    collection_name=collection_name,
    connection_args={
        "uri": search_uri,
        "token": os.environ.get("DATABASE_API_KEY"),
        "secure": True,
    },
)

In [None]:
# The question to look up (Vietnamese for "uniform circular motion").
cau_hoi = "Chuyển động tròn đều"

# Fetch the 5 closest stored passages; each result is a (document, score) pair.
cac_doan_van = vector_db.similarity_search_with_score(query=cau_hoi, k=5)

In [None]:
# Print the page number and text of every hit, followed by a separator line.
# The similarity score is unpacked but not displayed.
for hit, _score in cac_doan_van:
    print("Trang", hit.metadata["page_number"])
    print(hit.page_content, "====================", sep="\n")

## OPTIONAL: MANAGING COLLECTIONS

In [None]:
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType

def _first_env(*names):
    """Return the first non-empty environment value among *names*, else None."""
    for name in names:
        value = os.environ.get(name)
        if value:
            return value
    return None


# Collection managed by this optional cell (distinct from `collection_name`
# used by the ingestion steps above).
managed_collection = "s4v_python_oh"

# The original variable names are tried first; the names used by the other
# cells of this notebook are accepted as fallbacks so one .env file serves
# every cell.
client = MilvusClient(
    uri=_first_env("PUBLIC_ENDPOINT",
                   "DATABSE_PUBLIC_ENDPOINT",
                   "DATABASE_PUBLIC_ENDPOINT"),
    token=_first_env("API_KEY", "DATABASE_API_KEY"),
)

# try/finally guarantees the connection is closed even when one of the calls
# below raises.
try:
    # NOTE(review): dimension=768 presumably matches the embedding model's
    # output size; confirm. Also confirm that create_collection honours the
    # index_type / index_name keyword arguments, since the index is created
    # explicitly again further down.
    client.create_collection(
        collection_name=managed_collection,
        dimension=768,
        metric_type="COSINE",
        index_type="HNSW",
        index_name="vector_index",
        enable_dynamic_field=True
    )

    # Show the resulting collection description for inspection.
    res = client.describe_collection(
        collection_name=managed_collection
    )

    pprint.pprint(res)

    # Create index
    index_params = MilvusClient.prepare_index_params()

    index_params.add_index(
        field_name="vector",
        metric_type="COSINE",
        index_type="HNSW",
        index_name="vector_index",
    )

    client.create_index(
        collection_name=managed_collection,
        index_params=index_params
    )
finally:
    client.close()