# PDF Ingestion Pipeline

## 1. Instructions for running on a local machine
1. Create a virtual environment (Recommended)
2. Make sure you have installed all dependencies listed in `requirements.txt`
3. In addition, for reading PDFs and OCR, we'll need Poppler, Tesseract, and Tesseract-lang. If using Homebrew, you can run the following commands (the `tesseract-lang` formula installs Tesseract itself as a dependency):

```bash
brew install poppler
brew install tesseract-lang
```

4. Make sure your PDF files are placed in the `data` folder

## 2. Instructions for Google Colab
TODO

In [None]:
import os
import pprint
from dotenv import load_dotenv
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from langchain_core.documents import Document
from langchain_milvus.vectorstores import Milvus
# Load variables from a local .env file (if one is found) into os.environ, so
# the settings read later via os.environ.get (PUBLIC_ENDPOINT, API_KEY) can be
# supplied that way.
load_dotenv()

## STEP 1: READ PDF, CHUNK, AND CONVERT TO VECTOR

In [None]:
# GLOBALS
# Path of the PDF to ingest. It is a relative path, so it resolves against the
# notebook's current working directory.
file_path = "../data/test.pdf"

In [None]:
# Partition the PDF into elements and immediately group them into
# title-delimited chunks; `elements` ends up holding the chunks.
elements = chunk_by_title(
    partition_pdf(
        filename=file_path,
        # NOTE(review): `mode` is not among partition_pdf's documented
        # parameters and is probably ignored -- confirm before relying on it.
        mode="elements",
        strategy="hi_res",
        # Tesseract language codes: "vie" (Vietnamese) and "equ" (equations).
        languages=["vie", "equ"],
    )
)

In [None]:
# Sanity check: show the first chunk's text followed by its metadata dict.
print(elements[0], elements[0].metadata.to_dict(), sep="\n")

In [None]:
# Wrap each chunk in a LangChain Document: the chunk's string form becomes the
# page content and its metadata is carried along as a plain dict.
docs = [
    Document(page_content=str(chunk), metadata=chunk.metadata.to_dict())
    for chunk in elements
]

In [None]:
# Trim metadata: keep only a fixed whitelist of keys on every document and
# drop everything else. Keys missing from a document are simply not added.
kept_keys = ("source", "file_directory", "filename", "page_number", "category")
for doc in docs:
    doc.metadata = {
        name: value
        for name, value in doc.metadata.items()
        if name in kept_keys
    }

In [None]:
# Number of documents (one per chunk) that will be embedded and stored.
len(docs)

In [None]:
# Preview the text of the first two documents, each preceded by a separator.
for doc in docs[:2]:
    print("=========")
    # pprint shows the content as a string literal, so escapes such as \n
    # stay visible instead of being rendered.
    pprint.pprint(doc.page_content)


In [None]:
# NOTE(review): newer LangChain releases appear to deprecate this import path
# in favour of the separate `langchain_huggingface` package -- confirm against
# the version pinned in requirements.txt.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model handed to the Milvus vector store in Step 2.
embedding_model = HuggingFaceEmbeddings(
    model_name="keepitreal/vietnamese-sbert",
)

In [None]:
# Embed the first document's text and show the vector length.
# NOTE(review): presumably this must equal the `dimension` (768) passed to
# create_collection in the optional collection-management cell -- confirm.
len(embedding_model.embed_query(docs[0].page_content))

In [None]:
# Show the first document's metadata after trimming, i.e. what will be stored
# alongside its vector.
docs[0].metadata

## STEP 2: UPSERT TO DATABASE
Using defaults

In [None]:
# Milvus endpoint taken from the environment; os.environ.get yields None when
# PUBLIC_ENDPOINT is unset, and that value is passed through unchecked.
URI = os.environ.get("PUBLIC_ENDPOINT")

# Embed every document with `embedding_model` and store it in the collection.
# NOTE(review): drop_old=True asks the store to drop an existing collection of
# the same name first, so re-running this cell discards earlier data -- confirm
# that this is intended outside of experimentation.
vector_db = Milvus.from_documents(
    documents=docs,
    embedding=embedding_model,
    collection_name="s4v_python_oh",
    connection_args=dict(
        uri=URI,
        token=os.environ.get("API_KEY"),
        secure=True,
    ),
    drop_old=True,
)

## STEP 3: SIMILARITY SEARCH

In [None]:
# Sample Vietnamese query (roughly "What is question 1?").
test_query = "Câu 1 là gì thế?"
# Ask the vector store for the 2 documents most similar to the query.
retrieved_docs = vector_db.similarity_search(test_query, k=2)

In [None]:
# Display the documents returned by the similarity search.
retrieved_docs

In [None]:
# Display the text of the first returned document (raises IndexError if the
# search returned nothing).
retrieved_docs[0].page_content

## OPTIONAL: MANAGING COLLECTIONS

In [None]:
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType

# Open a direct pymilvus client using the same environment settings as Step 2.
client = MilvusClient(
    uri=os.environ.get("PUBLIC_ENDPOINT"),
    token=os.environ.get("API_KEY"),
)

# Everything that talks to the server runs inside try/finally so the client is
# closed even when one of the calls raises (previously close() was skipped on
# any error, leaving the connection open).
try:
    # NOTE(review): this is the same collection name that Step 2 writes to.
    # NOTE(review): `index_type` / `index_name` may not be honoured by
    # create_collection's quick-setup path -- confirm against the pymilvus
    # version in use.
    client.create_collection(
        collection_name="s4v_python_oh",
        # presumably must equal the embedding vector length -- TODO confirm
        dimension=768,
        metric_type="COSINE",
        index_type="HNSW",
        index_name="vector_index",
        enable_dynamic_field=True
    )

    # Print the collection description returned by the server.
    res = client.describe_collection(
        collection_name="s4v_python_oh"
    )

    pprint.pprint(res)

    # Create index
    # NOTE(review): if create_collection already built an index on "vector",
    # this call may conflict with it -- confirm.
    index_params = MilvusClient.prepare_index_params()

    index_params.add_index(
        field_name="vector",
        metric_type="COSINE",
        index_type="HNSW",
        index_name="vector_index",
    )

    client.create_index(
        collection_name="s4v_python_oh",
        index_params=index_params
    )
finally:
    client.close()