In [2]:
import os

import pathway as pw
from pathway.xpacks.llm.parsers import UnstructuredParser
from pathway.xpacks.llm.splitters import TokenCountSplitter

In [3]:
# Read every file under the summarized-PDF folder as raw bytes; the resulting
# table has a single `data` column of type `bytes`.
files = pw.io.fs.read(
    path="../Data/Summarized_PDFs/",
    format="binary",
    mode="streaming",
    autocommit_duration_ms=50,
)
print("Files table schema : ", files)

# UnstructuredParser takes each document in raw binary form and returns, per row,
# a list of tuples: the extracted text plus a JSON object with its metadata.
parser = UnstructuredParser(chunking_mode="elements")
documents = files.select(elements=parser(pw.this.data))
print("Documents table schema : ", documents)

Files table schema :  <pathway.Table schema={'data': <class 'bytes'>}>
Documents table schema :  <pathway.Table schema={'elements': list[tuple[str, pathway.internals.json.Json]]}>


**Before flattening** (each letter stands for one `(text, metadata)` tuple):

| id | elements         |
| -- | ---------------- |
| 1  | \["a", "b", "c"] |
| 2  | \["d", "e"]      |

In [4]:
# Flatten the per-file list: every tuple in `elements` becomes its own row.
documents = documents.flatten(documents.elements)
print("Document schema after flattening : ", documents)

Document schema after flattening :  <pathway.Table schema={'elements': tuple[str, pathway.internals.json.Json]}>


**After flattening** : 

| id | elements |
| -- | -------- |
| 1  | "a"      |
| 1  | "b"      |
| 1  | "c"      |
| 2  | "d"      |
| 2  | "e"      |


In [5]:
# Unpack each (text, metadata) tuple into two named columns.
documents = documents.select(
    text=pw.this.elements[0],
    metadata=pw.this.elements[1],
)
print("Refined Documents schema : ", documents)

Refined Documents schema :  <pathway.Table schema={'text': <class 'str'>, 'metadata': <class 'pathway.internals.json.Json'>}>


In [6]:
# TokenCountSplitter is imported in the notebook's top import cell so that all
# dependencies are visible in one place.
#
# Split the text of every row into chunks, with the chunk size bounded by
# `min_tokens` / `max_tokens`. The splitter returns data in the same format as
# UnstructuredParser: for each row, a list of tuples, where each tuple holds the
# chunk text and a JSON object with associated metadata.
splitter = TokenCountSplitter(min_tokens=100, max_tokens=300)

# Only the `chunk` column is selected here; the `metadata` column of `documents`
# is not carried over, and the per-row list of chunks is left unflattened.
texts = documents.select(chunk=splitter(pw.this.text))
print("Schema of the document table after splitting: ", texts)

Schema of the document table after splitting:  <pathway.Table schema={'chunk': list[tuple[str, pathway.internals.json.Json]]}>


A strategy for more accurate retrieval: **Reranking**

**Reference** : https://pathway.com/developers/user-guide/llm-xpack/overview#rerankers