In [4]:
from bs4 import BeautifulSoup
from typing import List
import re

def search_html_with_parents(html_content: str, search_terms: List[str], max_token_length: int, parent_depth: int = 1) -> List[str]:
    """
    Search for the provided terms in the given HTML content and return a list of relevant elements, including parent elements.

    :param html_content: A string containing the HTML content.
    :param search_terms: A list of search terms, ranked in order of estimated relevance.
    :param max_token_length: The maximum token length for the final list.
    :param parent_depth: The depth of parent elements to include.
    :return: A list of HTML elements that contain the search terms, along with their parent elements.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Dictionary to hold search results: {term: [elements]}
    search_results = {term: [] for term in search_terms}

    for term in search_terms:
        # Find all elements containing the term
        for element in soup.find_all(text=re.compile(re.escape(term), re.IGNORECASE)):
            current_element = element.parent
            # Traverse up to the specified parent depth
            for _ in range(parent_depth):
                if current_element.parent is not None:
                    current_element = current_element.parent
            search_results[term].append(str(current_element))

    # Populate the final list with elements, prioritizing earlier terms
    final_list = []
    total_tokens = 0

    for term in search_terms:
        for element in search_results[term]:
            element_tokens = len(element.split())
            if total_tokens + element_tokens > max_token_length:
                return final_list  # Return the list if adding the element would exceed the token limit
            final_list.append(element)
            total_tokens += element_tokens

    return final_list

# Example usage with parent depth:
html_content = "<div><h1>Cuba</h1><p>The capital of Cuba is Havana.</p></div>"
search_terms = ['capital']
max_token_length = 50  # Example token length limit
parent_depth = 1  # Include parent element

search_html_with_parents(html_content, search_terms, max_token_length, parent_depth)


  for element in soup.find_all(text=re.compile(re.escape(term), re.IGNORECASE)):


['<div><h1>Cuba</h1><p>The capital of Cuba is Havana.</p></div>']

In [29]:
search_terms = ['управление']


search_html_with_parents(html_content, search_terms, max_token_length=1500, parent_depth=1)

  for element in soup.find_all(text=re.compile(re.escape(term), re.IGNORECASE)):


['<li><a class="styles_navigationLink__HQEEj" href="/account/management" id="management"><svg class="styles_navigationIcon__zDAIG" fill="none" height="21" viewbox="0 0 19 21" width="19" xmlns="http://www.w3.org/2000/svg"><path d="M12.5 5.5C12.5 3.566 10.934 2 9 2a3.499 3.499 0 0 0-3.5 3.5C5.5 7.434 7.066 9 9 9s3.5-1.566 3.5-3.5zm2 0c0 3.039-2.461 5.5-5.5 5.5a5.499 5.499 0 0 1-5.5-5.5C3.5 2.461 5.961 0 9 0s5.5 2.461 5.5 5.5zM2 17.643c0 .943-.08.857.456.857h13.588c.536 0 .456.086.456-.857C16.5 15.355 13.196 14 9.25 14S2 15.355 2 17.643zm-2 0C0 13.763 4.299 12 9.25 12s9.25 1.763 9.25 5.643c0 2.016-.781 2.857-2.456 2.857H2.456C.78 20.5 0 19.66 0 17.643z" fill="#000"></path></svg>Управление</a></li>',
 '<tr class="ant-table-row ant-table-row-level-0" data-row-key="8"><td class="ant-table-cell">Управление мотивацией</td></tr>',
 '<tr class="ant-table-row ant-table-row-level-0" data-row-key="26"><td class="ant-table-cell">Управление WB Coins</td></tr>',
 '<tr class="ant-table-row ant-table-ro

In [None]:
import requests

url = 'https://timconnors.co/posts/ai-scraper'

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'If-None-Match': '"1624-AiNYryHu/xm0p2/7RYkXDmG5+w4"',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}

# cookies = {
#     '_wbauid': '4539604231702381935',
#     'CrmToken': '402f57dd0a0627d6c644806308f0ecab966d7cd383c015820432dc17913553c0c6bc45047335765ee6ee1a347faa3b59a75c5b7b5e22f2a259f19588',
# }

response = requests.get(url, headers=headers, verify=False)

# Print the response text (HTML content of the page)
print(response.text)


In [17]:
response.content

b''

In [None]:
soup = BeautifulSoup(response.content, 'html.parser')

# Extract text content
text_content = soup.get_text()

print(text_content)

In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Set up Chrome options
options = Options()
options.headless = True  # Run in headless mode (no browser UI)

# Initialize the WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

try:
    # Navigate to a URL in the domain to set cookies
    driver.get('http://crm-front.alljobswb.svc.k8s.stage-dp/')

    # Add cookies from the curl command
    driver.add_cookie({'name': '_wbauid', 'value': '4539604231702381935'})
    driver.add_cookie({'name': 'CrmToken', 'value': '402f57dd0a0627d6c644806308f0ecab966d7cd383c015820432dc17913553c0c6bc45047335765ee6ee1a347faa3b59a75c5b7b5e22f2a259f19588'})

    # Now navigate to the desired page
    url = 'http://crm-front.alljobswb.svc.k8s.stage-dp/account'
    driver.get(url)
    
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "profile"))
    )

    # Get the page source
    html_content = driver.page_source
    print(html_content)

finally:
    # Close the browser
    driver.quit()


<html><head><style rc-util-key="@ant-design-icons">
.anticon {
  display: inline-block;
  color: inherit;
  font-style: normal;
  line-height: 0;
  text-align: center;
  text-transform: none;
  vertical-align: -0.125em;
  text-rendering: optimizeLegibility;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

.anticon > * {
  line-height: 1;
}

.anticon svg {
  display: inline-block;
}

.anticon::before {
  display: none;
}

.anticon .anticon-icon {
  display: block;
}

.anticon[tabindex] {
  cursor: pointer;
}

.anticon-spin::before,
.anticon-spin {
  display: inline-block;
  -webkit-animation: loadingCircle 1s infinite linear;
  animation: loadingCircle 1s infinite linear;
}

@-webkit-keyframes loadingCircle {
  100% {
    -webkit-transform: rotate(360deg);
    transform: rotate(360deg);
  }
}

@keyframes loadingCircle {
  100% {
    -webkit-transform: rotate(360deg);
    transform: rotate(360deg);
  }
}
</style><meta name="viewport" content="width=device-w

In [22]:
!pip show selenium

Name: selenium
Version: 4.8.0
Summary: 
Home-page: https://www.selenium.dev
Author: 
Author-email: 
License: Apache 2.0
Location: /home/valuamba/.asdf/installs/python/3.11.0/lib/python3.11/site-packages
Requires: certifi, trio, trio-websocket, urllib3
Required-by: undetected-chromedriver


In [None]:
prompt = """
You are automated web-crawler working as part of a product that helps blind people use websites. You have been
provided with a numbered list og HTML elements. Given a directive, your job is to identify the single element that
is most relevant to the directive. Return the number of the element, wrapped in curved parentheses.

[Example 1]
We are given the following elements:
{
  1: '<th scope="row" class="infobox-label"><div style=";">&nbsp;<a href="/wiki/President_of_the_United_States"
  title="President of the United States"></a> </div></th>',
  2: '<th scope="row" class="infobox-label"><div style=";">&nbsp;<a href="/wiki/Vice_President_of_the_United_States"
  title="Vice President of the United States">Vice President</a> </div></th>'
  3: '<tr><th scope="row" class="infobox-label"><a href="/wiki/Left-_and_right-hand_traffic" title="Left- and right-hand traffic">
  Driving side</a></th><td class="infobox-data">right<sup id="cite_ref-drive_23_0" class="reference"><a 
  href="#cite_note_drive-23">[h]</a></sup></td></tr>'
}
And the following directive
"Find an element that relates to the driving side in the United States"

In this case, we can see that the third element contains the information we`re looking for, so we should return:
'(3)'

Keep in mind that the innerText of an element is not the only way in which it can relate to a directive. Sometimes the most relevant
element will be a link to a new page whose title seems relevant.

You must always return a number. If you don't find an element that is directly relevant, think abstractly, and consider which
element may be directionally similar to the directive.

For example, let's take Example 1 again, but with a new directive:
"Find information about the population of Washington D.C."

In this case, none of the elements are directly relevant, but the first element is directionally similar, because the President of
the United States lives in Washington D.C. So we should return:
'(1)'
"""

In [33]:
from dotenv import load_dotenv
import os

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")


In [34]:
from openai import OpenAI

client = OpenAI(api_key=openai_api_key)

In [42]:
assistant = client.beta.assistants.create(
    name="Terrific Travels",
    instructions="You are a travel agent who specializes in world travel, on all seven continents.  You'll be provided with data indicating travel background and preferences.  Your job is to suggest itineraries for travel, and give me tips about things like best time to travel, what to pack, etc.",
    model="gpt-4-1106-preview",
    tools=[{"type": "retrieval"}])

In [43]:
thread = client.beta.threads.create()


In [44]:
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="I'd like help planning a new trip based on criteria for Amber. I'd prefer to visit a country I haven't been to yet. What would you suggest?",
)

In [45]:
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

In [46]:
keep_retrieving_run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id
    )
print(f"Run status: {keep_retrieving_run.status}")

Run status: completed


In [47]:
all_messages = client.beta.threads.messages.list(
    thread_id=thread.id
)

# Print the messages from the user and the assistant
print("###################################################### \n")
print(f"USER: {message.content[0].text.value}")
print(f"ASSISTANT: {all_messages.data[0].content[0].text.value}")

###################################################### 

USER: I'd like help planning a new trip based on criteria for Amber. I'd prefer to visit a country I haven't been to yet. What would you suggest?
ASSISTANT: It seems there are no documents uploaded that provide information on the countries Amber has visited. To suggest a destination she hasn't been to yet, I will need that information. Could you please upload a list of countries Amber has already visited or provide me with that information here? Once I have it, I can certainly help with planning a new trip based on her travel preferences and background.


In [50]:
from openai import ChatCompletionToolParam


client.chat.completions.create?

ImportError: cannot import name 'ChatCompletionToolParam' from 'openai' (/home/valuamba/projs/components_agent_sales/venv/lib/python3.11/site-packages/openai/__init__.py)

In [51]:
!grep -r "ChatCompletionToolParam"

smart_parser.ipynb:     "evalue": "cannot import name 'ChatCompletionToolParam' from 'openai' (/home/valuamba/projs/components_agent_sales/venv/lib/python3.11/site-packages/openai/__init__.py)",
smart_parser.ipynb:      "Cell \u001b[0;32mIn[50], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopenai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatCompletionToolParam\n\u001b[1;32m      4\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39mrun_line_magic(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpinfo\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclient.chat.completions.create\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
smart_parser.ipynb:      "\u001b[0;31mImportError\u001b[0m: cannot import name 'ChatCompletionToolParam' from 'openai' (/home/valuamba/projs/components_agent_sales/venv/lib/python3.11/site-packages/openai/__init__.py)"
smart_parser.ipynb:    "from openai import ChatCompletionToolPara

In [52]:
def pretty_print_conversation(messages):
    role_to_color = {
        "system": "red",
        "user": "green",
        "assistant": "blue",
        "tool": "magenta",
    }
    
    for message in messages:
        if message["role"] == "system":
            print(colored(f"system: {message['content']}\n", role_to_color[message["role"]]))
        elif message["role"] == "user":
            print(colored(f"user: {message['content']}\n", role_to_color[message["role"]]))
        elif message["role"] == "assistant" and message.get("function_call"):
            print(colored(f"assistant: {message['function_call']}\n", role_to_color[message["role"]]))
        elif message["role"] == "assistant" and not message.get("function_call"):
            print(colored(f"assistant: {message['content']}\n", role_to_color[message["role"]]))
        elif message["role"] == "tool":
            print(colored(f"function ({message['name']}): {message['content']}\n", role_to_color[message["role"]]))


In [None]:
import psycopg2

dbname = 'your_database_name'
user = 'your_username'
password = 'your_password'
host = 'your_host'

conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
cur = conn.cursor()

sql_query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
"""

cur.execute(sql_query)
table_names = cur.fetchall()

for table in table_names:
    print(table[0])

cur.close()
conn.close()

In [54]:
import json
import psycopg2
import pgvector
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
import pandas as pd

PGVECTOR_CONNECTION_STRING='postgresql://admin:5tgb%25TGB@localhost:45048/famaga'


In [56]:
db_connection = psycopg2.connect(PGVECTOR_CONNECTION_STRING)
db_cursor = db_connection.cursor()
db_connection.autocommit = True

In [57]:
sql_query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
"""

db_cursor.execute(sql_query)
table_names = db_cursor.fetchall()

for table in table_names:
    print(table[0])

embeddings
detail_brands
details
details_info


In [58]:
def get_table_names(conn):
    """Return a list of table names from the PostgreSQL database."""
    table_names = []
    with conn.cursor() as cur:
        cur.execute("""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'public' AND table_type = 'BASE TABLE';
        """)
        table_names = [table[0] for table in cur.fetchall()]
    return table_names

def get_column_names(conn, table_name):
    """Return a list of column names for a given table in the PostgreSQL database."""
    column_names = []
    with conn.cursor() as cur:
        cur.execute("""
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = 'public' AND table_name = %s;
        """, (table_name,))
        column_names = [col[0] for col in cur.fetchall()]
    return column_names

def get_database_info(conn):
    """Return a list of dicts containing the table name and columns for each table in the database."""
    table_dicts = []
    table_names = get_table_names(conn)
    for table_name in table_names:
        column_names = get_column_names(conn, table_name)
        table_dicts.append({"table_name": table_name, "column_names": column_names})
    return table_dicts

In [59]:
get_database_info(db_connection)

[{'table_name': 'embeddings',
  'column_names': ['id', 'embedding', 'text', 'created_at']},
 {'table_name': 'detail_brands', 'column_names': ['brand_id', 'name']},
 {'table_name': 'details',
  'column_names': ['brand_id', 'name', 'metadata', 'embedding']},
 {'table_name': 'details_info',
  'column_names': ['id',
   'part_number',
   'model_number',
   'brand_id',
   'title',
   'description']}]

In [75]:
database_schema_dict = get_database_info(db_connection)
database_schema_string = "\n".join(
    [
        f"Table: {table['table_name']}\nColumns: {', '.join(table['column_names'])}"
        for table in database_schema_dict
    ]
)
print(database_schema_string)

Table: embeddings
Columns: id, embedding, text, created_at
Table: detail_brands
Columns: brand_id, name
Table: details
Columns: brand_id, name, metadata, embedding
Table: details_info
Columns: id, part_number, model_number, brand_id, title, description


In [63]:
tools = [
        {
            "type": "function",
            "function": {
                "name": "ask_database",
                "description": "Ask database question",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "sql_request": {
                            "type": "string",
                            "description": f"The SQL request that was made for DB Schema: {database_schema_string}",
                        }
                    },
                    "required": ["location"],
                },
            },
        }
    ]

In [80]:
# messages = [{"role": "user", "content": f"How many unique brands at details info table?\n\n{database_schema_string}"}]

messages = [{"role": "user", "content": f"Please use semantic search to find 'Endress+Hauser' brand name at table?\n\n{database_schema_string}"}]



response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=messages,
        tools=tools,
        tool_choice="auto",  # auto is default, but we'll be explicit
    )

In [81]:
response_message = response.choices[0].message
response_message

ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_omf3BCwzY6OPtpvNKCvgrvJD', function=Function(arguments='{\n  "sql_request": "SELECT brand_id, name FROM detail_brands WHERE name LIKE \'%Endress+Hauser%\'"\n}', name='ask_database'), type='function')])

In [82]:
tool_calls = response_message.tool_calls
function_args = json.loads(tool_calls[0].function.arguments)
function_args['sql_request']

"SELECT brand_id, name FROM detail_brands WHERE name LIKE '%Endress+Hauser%'"

In [83]:
db_cursor.execute(function_args['sql_request'])
result = db_cursor.fetchone()

result

(126, 'Endress+Hauser')