In [76]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
import re
import os

data_path = '../../data/UNVXIS.05 Enable Internal Sales Orders.pdf'

loader = PyPDFLoader(data_path)

pages = []
async for page in loader.alazy_load():
    pages.append(page)

def preprocess_content(content):
    # Remove the specific phrase from the content
    return re.sub(r'Enable Internal Sales Orders \(Internal Sales Orders\) \d+ \(\d+\)', '', content)

def split_by_headers(documents, header_pattern, specific_headers):
    split_docs = []
    last_seen_header = None
    for doc in documents:
        # Preprocess the content to remove the specific phrase
        content = preprocess_content(doc.page_content)
        chunks = re.split(f'({header_pattern})', content)
        chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
        for chunk in chunks:
            if chunk in specific_headers:
                last_seen_header = chunk
            else:
                split_docs.append(
                    Document(
                        page_content=chunk,
                        metadata={**doc.metadata, 'header': last_seen_header}
                    )
                )
    return split_docs

# Define the specific headers you want to capture
specific_headers = ["Overview", "Tasks", "Setup", "Additional Information"]

# Define the header pattern (adjust the pattern based on your headers)
header_pattern = r'\b(?:' + '|'.join(specific_headers) + r')\b'

# Split the document by headers
docs = split_by_headers(pages, header_pattern, specific_headers)

# Print the first chunk's content and metadata
docs = docs[2:]

for doc in docs:
    print(doc.page_content)
    print(doc.metadata)
    print('-------------------')

Concept 
 
Process  
This configuration process explains how the system must be set up to enable internal sales 
orders. One major consideration is the Goods-in-transit (GIT) management. As explained in 
the detailed steps, this is controlled by a parameter in ‘Settings – Cost Accounting’ 
(CAS900), which is set Off. 
The project team should decide whether to activate this, as it will have an impact on internal 
sales orders accounting and “external”, or “regular” customer orders and purchase orders. 
Although the Goods-in-transit functionality has not been activated, the process descriptions 
assume that it has been. 
Input  
 
Output  
The settings and basic data are in place for using internal sales orders. 
The project team has chosen whether to activate the Goods-in-transit (GIT) functionality. 
Depending on this, GIT will be managed or not during the internal sales orders process.
{'source': '../../data/UNVXIS.05 Enable Internal Sales Orders.pdf', 'page': 2, 'header': 'Overview'}

In [77]:
from dotenv import load_dotenv
load_dotenv()

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
ASTRA_DB_API_KEY = os.getenv('ASTRA_DB_API_KEY')
ASTRA_DB_ENDPOINT = os.getenv('ASTRA_DB_ENDPOINT')
ASTRA_DB_KEYSPACE = os.getenv('ASTRA_DB_KEYSPACE')

model = ChatOpenAI(model='gpt-4o')

embeddings = OpenAIEmbeddings()

In [78]:
from langchain_community.vectorstores import AstraDB

vectorstore = AstraDB(
    embedding=embeddings,
    token=ASTRA_DB_API_KEY,
    api_endpoint=ASTRA_DB_ENDPOINT,
    collection_name="test_collection",

)

INFO     [astrapy.core.db] ASTRA_DB_KEYSPACE is not set. Defaulting to 'default_keyspace'
INFO     [astrapy.core.db] ASTRA_DB_KEYSPACE is not set. Defaulting to 'default_keyspace'


In [79]:
import uuid

doc_ids = [str(uuid.uuid4()) for _ in docs]

In [80]:
vectorstore.add_documents(documents=docs, doc_ids=doc_ids)



['9a4ddeacba6a4a33a62d7ed523a1af2e',
 '1c51b182d9cb4b44af10d7d1c45ed88c',
 'c02789597b9d4bfcaf5b8fc7986dd3d7',
 '07b20e908f9c44d3ba15cc3308018c30',
 '05f5ca27e062487eb49194f5dd80ba20',
 '651b3dbdbdcf40a394a5ab9574971625',
 '56523be538cd4148826d3e257b4ff09d',
 'b7b5db8f64324e8793e426c87895fc3d',
 '979941a9ca154bef81221b61c7aa9b61',
 '650bf24851f54619bd26aa7c1ecb24c9',
 '70d7a38621ef4557b030bce04c5b3a21']

In [81]:
import os
from astrapy import DataAPIClient

# Initialize the client and get a "Database" object
client = DataAPIClient(ASTRA_DB_API_KEY)
database = client.get_database(ASTRA_DB_ENDPOINT)
collection = database.get_collection("test_collection")

# Retrieve all documents from the collection
documents = collection.find({})

# Print the retrieved documents
for doc in documents:
    print(doc)
    print('-------------------')

INFO     [astrapy.cursors] creating iterator on 'test_collection'
INFO     [astrapy.cursors] finished creating iterator on 'test_collection'
INFO     [astrapy.collection] command=find on 'test_collection'
INFO     [astrapy.collection] finished command=find on 'test_collection'
{'_id': 'b7b5db8f64324e8793e426c87895fc3d', 'content': 'o PP10 224: Ordered not Received (2470) \no PP10 909: Goods in Transit (1430)', 'metadata': {'source': '../../data/UNVXIS.05 Enable Internal Sales Orders.pdf', 'page': 9, 'header': 'Tasks'}}
-------------------
{'_id': '1c51b182d9cb4b44af10d7d1c45ed88c', 'content': 'Manage General Settings \n1. Activate Internal Sales Orders \n‘Feature List. Open’ (CMS975) \n\uf0b7 Activate toggles: NCR 11589 and NCR 11604 \n \n2. Configure Stock Messages \n‘Order Init Stock Trans Qualif’ (MHS860) \n\uf0b7 Generate all with F14. \n\uf0b7 Qualifier 29 will be used. \n \n‘Stock Msg Partner. Open’ (MMS865) \n\uf0b7 Stock message partner: \nWhs Msg Partner \ntype \nMsg type \n I

In [82]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The source file path of the document",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page number of the document",
        type="integer",
    ),
    AttributeInfo(
        name="header",
        description="The header of the document page. One of ['overview', 'tasks', 'setup', 'additional information']",
        type="string",
    ),
]

document_content_description = "Content of the headers in the metadata"

retriever = SelfQueryRetriever.from_llm(
    model, vectorstore, document_content_description, metadata_field_info, verbose=True
)

results = retriever.invoke(
    "Can you retrieve all the documents for the tasks?"
)

# Print the results
for result in results:
    print(result.page_content)
    print(result.metadata)
    print('-------------------')

In [83]:
retriever.invoke(
    "Can you retrieve the tasks"
)

[Document(metadata={'source': '../../data/UNVXIS.05 Enable Internal Sales Orders.pdf', 'page': 10, 'header': 'Setup'}, page_content='This is a setup process itself.'),
 Document(metadata={'source': '../../data/UNVXIS.05 Enable Internal Sales Orders.pdf', 'page': 2, 'header': 'Overview'}, page_content='Concept \n \nProcess  \nThis configuration process explains how the system must be set up to enable internal sales \norders. One major consideration is the Goods-in-transit (GIT) management. As explained in \nthe detailed steps, this is controlled by a parameter in ‘Settings – Cost Accounting’ \n(CAS900), which is set Off. \nThe project team should decide whether to activate this, as it will have an impact on internal \nsales orders accounting and “external”, or “regular” customer orders and purchase orders. \nAlthough the Goods-in-transit functionality has not been activated, the process descriptions \nassume that it has been. \nInput  \n \nOutput  \nThe settings and basic data are in pl

In [84]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
)
output_parser = StructuredQueryOutputParser.from_components()
query_constructor = prompt | model | output_parser

query_constructor.invoke(
    {
        "query": "Can you retrieve all the documents for the tasks?"
    }
)

StructuredQuery(query=' ', filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='header', value='tasks'), limit=None)

In [85]:
print(prompt.format(query="dummy question"))

Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

```json
{
    "query": string \ text string to compare to document contents
    "filter": string \ logical condition statement for filtering documents
}
```

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: `comp(attr, val)`:
- `comp` (eq | ne | gt | gte | lt | lte | contain | like | in | nin): comparator
- `attr` (string):  name of attribute to apply the comparison to
- `val` (string): is the comparison value

A logical operation statement takes the form `op(statement1, statement2, ...)`:
- `op` (and | or | not

In [86]:
from browser_use import Agent
import asyncio
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import asyncio

from browser_use import Agent, Browser, Controller

async def main():
    browser = Browser()
    async with await browser.new_context() as context:
        model = ChatOpenAI(model='gpt-4o')

        # Initialize browser agent
        agent1 = Agent(
            task='Open an online code editor programiz.',
            llm=model,
            browser_context=context,
        )
        executor = Agent(
            task='Executor. Execute the code written by the coder and suggest some updates if there are errors.',
            llm=model,
            browser_context=context,
        )

        coder = Agent(
            task='Coder. Your job is to write and complete code. You are an expert coder. Code a simple calculator. Write the code on the coding interface after agent1 has opened the link.',
            llm=model,
            browser_context=context,
        )
        await agent1.run()
        await executor.run()
        await coder.run()

asyncio.run(main())


NameError: name '__file__' is not defined