In [34]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
from typing import List
import re

In [35]:
from dotenv import load_dotenv
import os
from openai import OpenAI


load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=openai_api_key)

In [45]:
def extract_html_from_web_page(url: str):
    options = Options()
    options.headless = True  # Run in headless mode (no browser UI)

    # Initialize the WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Navigate to a URL in the domain to set cookies
        driver.get(url)

        # Add cookies from the curl command
        driver.add_cookie({'name': '_wbauid', 'value': '4539604231702381935'})
        driver.add_cookie({'name': 'CrmToken', 'value': '402f57dd0a0627d6c644806308f0ecab966d7cd383c015820432dc17913553c0c6bc45047335765ee6ee1a347faa3b59a75c5b7b5e22f2a259f19588'})

        # Now navigate to the desired page
        # url = 'http://crm-front.alljobswb.svc.k8s.stage-dp/account'
        # driver.get(url)

        # element = WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.ID, "profile"))
        # )

        # Get the page source
        html_content = driver.page_source
        return html_content

    finally:
        # Close the browser
        driver.quit()
    

In [46]:
detail_page_url_1 = 'https://www.endress.com/en/field-instruments-overview/level-measurement/Float-switch-Liquifloat-FTS20?t.tabId=product-overview'
detail_page_url_2 = 'https://www.galco.com/cd1-k-400-30-131427.html'
detail_page_url_3 = 'https://www.nriparts.com/products/masterflex-7017-20-miscellaneous/268086'
detail_page_url_4 = 'https://www.palmindustrial.com/products/cole-parmer-masterflex-7017-20-peristaltic-pump-head'
detail_page_url_5 = 'https://www.walkerindustrial.com/Bircher-Reglomat-212174-p/212174.htm'

detail_page_url = detail_page_url_1


detail_page_content = extract_html_from_web_page(detail_page_url)
detail_page_content[:4000]

'<html lang="en"><head>\n  <meta http-equiv="x-ua-compatible" content="ie=edge">\n  <title>Float switch - Liquifloat FTS20 | Endress+Hauser</title>\n  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">\n  <meta name="description" content="The Float switch FTS20 is a simple and cost-effective solution for point level detection in appropriate fluids. It is used in tanks and vessels as a pump protector or level alarm in open basins, e. g. in sewerage treatment plants. The float switch has two output options, a NAMUR switching signal or a change-over contact.">\n  <script>\n  //<![CDATA[\n  var domain = \'endress.com\';\n  if (location.hostname.substr(location.hostname.length - domain.length, domain.length) === domain) {\n    document.domain = domain;\n  }\n  //-->\n  </script>\n    <meta name="gwt:property" content="baseUrl=/nebp-resources/js/">\n<script>\n//<![CDATA[\ndocument.nebpContext = {\n  backendBaseURL: "https://portal.endress.com/webapp/nebp-s

In [None]:
detail_page_content

In [7]:
def pre_cleaning(soup):
    for script in soup(["script", "nav", "head", "style"]):  # remove all javascript code
        script.decompose()

In [53]:
soup = BeautifulSoup(detail_page_content, 'html.parser')

# pre_cleaning(soup)

In [48]:

def search_html_with_parents(soup, search_terms: List[str], max_token_length: int, parent_depth: int = 1) -> List[str]:
    """
    Search for the provided terms in the given HTML content and return a list of relevant elements, including parent elements.

    :param html_content: A string containing the HTML content.
    :param search_terms: A list of search terms, ranked in order of estimated relevance.
    :param max_token_length: The maximum token length for the final list.
    :param parent_depth: The depth of parent elements to include.
    :return: A list of HTML elements that contain the search terms, along with their parent elements.
    """
    # Dictionary to hold search results: {term: [elements]}
    search_results = {term: [] for term in search_terms}

    for term in search_terms:
        # Find all elements containing the term
        for element in soup.find_all(text=re.compile(re.escape(term), re.IGNORECASE)):
            current_element = element.parent
            # Traverse up to the specified parent depth
            for _ in range(parent_depth):
                if current_element.parent is not None:
                    current_element = current_element.parent
            search_results[term].append(str(current_element))

    # Populate the final list with elements, prioritizing earlier terms
    final_list = []
    total_tokens = 0

    for term in search_terms:
        for element in search_results[term]:
            element_tokens = len(element.split())
            if total_tokens + element_tokens > max_token_length:
                return final_list  # Return the list if adding the element would exceed the token limit
            final_list.append(element)
            total_tokens += element_tokens

    return final_list

In [49]:
from urllib.parse import urljoin

def extract_pdf_links(soup, url):
    all_links = soup.find_all('a', href=True)
    pdf_links = [link['href'] for link in all_links if link['href'].lower().endswith('.pdf')]        
    pdf_links = [urljoin(url, link) for link in pdf_links]    
    pdf_links = list(dict.fromkeys(pdf_links))
    return pdf_links

In [24]:
extract_pdf_links(soup, detail_page_url)

['https://bdih-download.endress.com/files/DLA/005056A500261ED98AD8C38FB68AB332/KA00180FA3_1719.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261EDBB8E104F635F12ACF/XA02429FEN_0121.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261EEDB3D93231997DE0E0/FA00001F00EN2522.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261EECB8E5C30F3045CCF9/FA00001Fes%20Catalogo%20Nivel_24.22.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261EDDAEB9613659243E9F/FA00001F00IT2522.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261EDDB3D92A9836B17B0A/FA00001F00DE2522.pdf',
 'https://bdih-download.endress.com/files/DLA/0200030000091ED785BBCB2AC220053F/CP01266Z.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261ED8BB97BC80E3035109/EG-01033%20f.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261EED81837733718911E5/EU_01044_01.22.pdf',
 'https://bdih-download.endress.com/files/DLA/005056A500261EDAA9AFA754A7DFE612/

In [79]:
import re
from bs4 import BeautifulSoup

def extract_contact_info(soup):
    # Define regex patterns for email and phone number
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    phone_pattern = r'\s*(?:\+?(\d{1,3})[-. ]?)?(?:\((\d{1,4})\)[-.\s]*|(\d{1,4})[-.\s]*)?(\d{1,4})[-.\s]*(\d{1,4})[-.\s]*(\d{1,4})\s*'
    
    # Compile regex patterns
    email_regex = re.compile(email_pattern)
    
    matches = re.findall(phone_pattern, text, re.MULTILINE)
    phone_numbers = [''.join(match) for match in matches]

    # Parse HTML content using BeautifulSoup
    # Extract text content from HTML
    text_content = soup.get_text()
    # text_content = soup.find('footer').get_text()

    # Find email addresses and phone numbers using regex
    emails = email_regex.findall(text_content)

    return emails, phone_numbers

In [86]:
extract_contact_info(soup)

(['info@endress.com'],
 ['12125551234',
  '02079460991',
  '0312345678',
  '4930123456',
  '0612345678',
  '819012345678',
  '270111234567',
  '5550199',
  '74951234567',
  '1523456789',
  '331234567',
  '0212345678',
  '912223456789',
  '0301234567',
  '5215512345678',
  '0800123456',
  '861012345678',
  '351211234567',
  '5551234',
  '6431234567',
  '41617157700'])

In [85]:
footer = soup.find('footer')

footer.get_text()

phone_pattern = r'\b(?:\+\d{1,2}\s?)?(?:\(\d{1,4}\)\s?)?[0-9\-]+\b'
phone_regex = re.compile(phone_pattern)


In [None]:
soup.get_text()

In [73]:
import re

def find_phone_numbers(text):
    # Regex pattern to match phone numbers
    pattern = r'\s*(?:\+?(\d{1,3})[-. ]?)?(?:\((\d{1,4})\)[-.\s]*|(\d{1,4})[-.\s]*)?(\d{1,4})[-.\s]*(\d{1,4})[-.\s]*(\d{1,4})\s*'

    # Finding all matches
    matches = re.findall(pattern, text, re.MULTILINE)

    # Formatting matched numbers for output
    phone_numbers = [''.join(match) for match in matches]

    return phone_numbers

# Example text
text = '''
Offering

Products

Solutions

Services

Industries

Company

+1-212-555-1234 (International format with country code for the USA)
020 7946 0991 (UK landline format without country code)
(03) 1234 5678 (Australian landline format with area code)
+49 30 123456 (German format with country code and area code)
06-12345678 (Dutch mobile format without country code)
+81-90-1234-5678 (Japanese mobile format with country code)
+27 (0)11 123 4567 (South African format with country code and area code)
555-0199 (North American local number without area code)
+7 495 123-45-67 (Russian format with country code and area code)
15 2345 6789 (Argentinian mobile format without country code)
+33 1 23 45 67 89 (French format with country code and area code)
(02) 1234 5678 (Australian landline format with a different area code)
+91-22-23456789 (Indian format with country code and area code)
030 1234567 (Italian landline format without country code)
+52 1 55 1234 5678 (Mexican mobile format with country code and area code)
0800 123 456 (UK freephone number format)
+86 10 1234 5678 (Chinese format with country code and area code)
+351 21 123 4567 (Portuguese format with country code and area code)
555.1234 (North American local number in an alternative format without area code)
+64 3-123 4567 (New Zealand format with country code and area code)

Switzerland

Tel.
                              +41 61 715 7700

info@endress.com

Copyright © Endress+Hauser Group Services AG

Imprint

Terms of use

Data Protection 

Legal - GTC
'''

# Finding phone numbers in the text
phone_numbers = find_phone_numbers(text)

# Output the phone numbers
for idx, number in enumerate(phone_numbers):
    print(f'{idx} {number}')


0 12125551234
1 02079460991
2 0312345678
3 4930123456
4 0612345678
5 819012345678
6 270111234567
7 5550199
8 74951234567
9 1523456789
10 331234567
11 0212345678
12 912223456789
13 0301234567
14 5215512345678
15 0800123456
16 861012345678
17 351211234567
18 5551234
19 6431234567
20 41617157700


In [81]:

# Example usage with parent depth:
popular_currency_symbols = ["$", "€", "£", "¥", "₹", "₽", "₩", "₣", "C$", "A$", "R$", "₺", "RM", "฿", "₱", "S$", "HK$", "NZ$", "kr", "zł"]
popular_currency_codes = ["USD", "EUR", "GBP", "JPY", "INR", "RUB", "KRW", "CHF", "CAD", "AUD", "BRL", "TRY", "MYR", "THB", "PHP", "SGD", "HKD", "NZD", "SEK", "NOK", "DKK", "PLN"]


search_terms = popular_currency_symbols + popular_currency_codes

search_terms = ['(\(\d{3}\)\s*[-\.\s]\s*\d{3}\s*[-\.\s]??\s*\d{4}|\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})']
max_token_length = 50  # Example token length limit
parent_depth = 1

matched_elements = search_html_with_parents(soup, search_terms, max_token_length, parent_depth)
matched_elements = list(set(matched_elements))
matched_elements

  for element in soup.find_all(text=re.compile(re.escape(term), re.IGNORECASE)):


[]

In [82]:
import re
from bs4 import BeautifulSoup
from typing import List

def search_html_for_phone_numbers(soup, regex_pattern: str, max_token_length: int, parent_depth: int = 1) -> List[str]:
    """
    Search for phone numbers in the given HTML content and return a list of relevant elements, including parent elements.

    :param soup: BeautifulSoup object containing the HTML content.
    :param regex_pattern: Regex pattern to match phone numbers.
    :param max_token_length: The maximum token length for the final list.
    :param parent_depth: The depth of parent elements to include.
    :return: A list of HTML elements that contain phone numbers, along with their parent elements.
    """
    # Find all elements containing phone numbers
    phone_number_elements = soup.find_all(text=re.compile(regex_pattern))

    final_list = []
    total_tokens = 0

    for element in phone_number_elements:
        current_element = element.parent
        # Traverse up to the specified parent depth
        for _ in range(parent_depth):
            if current_element.parent is not None:
                current_element = current_element.parent
        element_str = str(current_element)
        element_tokens = len(element_str.split())

        if total_tokens + element_tokens <= max_token_length:
            final_list.append(element_str)
            total_tokens += element_tokens

    return final_list


In [100]:
pattern = '(\(\d{3}\)\s*[-\.\s]\s*\d{3}\s*[-\.\s]??\s*\d{4}|\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
phone_pattern = r'\s*(?:\+?(\d{1,3})[-. ]?)?(?:\((\d{1,4})\)[-.\s]*|(\d{1,4})[-.\s]*)?(\d{1,4})[-.\s]*(\d{1,4})[-.\s]*(\d{1,4})\s*'
phone_pattern = '7700'

max_token_length = 50  # Example token length limit
parent_depth = 3

matched_elements = search_html_with_parents(el, phone_pattern, max_token_length, parent_depth)
matched_elements

  for element in soup.find_all(text=re.compile(re.escape(term), re.IGNORECASE)):


['<address class="eh-address">\n<p>Endress+Hauser AG</p>\n<p>Switzerland</p>\n<ul class="eh-link--list vertical">\n<li class="eh-link--list-item">\n<a class="eh-link eh-link--01" href="tel:+41617157700">Tel.\n                              +41 61 715 7700</a>\n</li>\n<li class="eh-link--list-item">\n<a class="eh-link eh-link--01" href="mailto:info@endress.com">info@endress.com</a>\n</li>\n</ul>\n</address>']

In [97]:
el = soup.find('footer')

el

<footer class="eh-footer" id="footer" style="padding-bottom: 160px;">
<div class="eh-center-page">
<div class="eh-grid eh-grid--no-spacing eh-footer--layout-grid">
<div class="eh-cell eh-cell--12-col">
<section class="eh-secondary-navigation eh-no-p-h eh-no-p-t">
<div class="eh-grid eh-grid--no-spacing">
<div class="eh-p-l eh-p-t eh-cell eh-cell--3-col eh-cell--12-col-small marker-footer-nav">
<label class="eh-label eh-label--01">Offering</label>
<ul class="eh-link--list vertical">
<li class="eh-link--list-item">
<a class="eh-link eh-link--01" href="/en/field-instruments-overview">Products</a>
</li><li class="eh-link--list-item">
<a class="eh-link eh-link--01" href="/en/process-solutions">Solutions</a>
</li><li class="eh-link--list-item">
<a class="eh-link eh-link--01" href="/en/instrumentation-services">Services</a>
</li><li class="eh-link--list-item">
<a class="eh-link eh-link--01" href="/en/industry-expertise">Industries</a>
</li>
</ul>
</div>
<div class="eh-p-l eh-p-t eh-cell eh-ce

In [79]:
matched_elements_str = '\n'.join([f'{idx}: \'{m}\'' for idx, m in enumerate(matched_elements)])
print(matched_elements_str)

0: '<span class="price-wrapper" data-price-amount="1754.5000" data-price-type="finalPrice" id="product-price-7364554">
<span class="price">$1,754.50</span>
</span>'
1: '<div class="modal-content" data-role="content" id="modal-content-29"><div class="modal-component" data-bind="css: modalClass, hasFocus: focused">
<!-- ko if: state() || $data.modal --><!-- /ko -->
</div></div>'


In [80]:
system_message = """
You are automated web-crawler working as part of a product that helps blind people use websites. You have been
provided with a numbered list og HTML elements. Given a directive, your job is to identify the single element that
is most relevant to the directive. Return the number of the element, wrapped in curved parentheses.

[Example 1]
We are given the following elements:
{
  1: '<th scope="row" class="infobox-label"><div style=";">&nbsp;<a href="/wiki/President_of_the_United_States"
  title="President of the United States"></a> </div></th>',
  2: '<th scope="row" class="infobox-label"><div style=";">&nbsp;<a href="/wiki/Vice_President_of_the_United_States"
  title="Vice President of the United States">Vice President</a> </div></th>'
  3: '<tr><th scope="row" class="infobox-label"><a href="/wiki/Left-_and_right-hand_traffic" title="Left- and right-hand traffic">
  Driving side</a></th><td class="infobox-data">right<sup id="cite_ref-drive_23_0" class="reference"><a 
  href="#cite_note_drive-23">[h]</a></sup></td></tr>'
}
And the following directive
"Find an element that relates to the driving side in the United States"

In this case, we can see that the third element contains the information we`re looking for, so we should return:
'(3)'

Keep in mind that the innerText of an element is not the only way in which it can relate to a directive. Sometimes the most relevant
element will be a link to a new page whose title seems relevant.

You must always return a number. If you don't find an element that is directly relevant, think abstractly, and consider which
element may be directionally similar to the directive.

For example, let's take Example 1 again, but with a new directive:
"Find information about the population of Washington D.C."

In this case, none of the elements are directly relevant, but the first element is directionally similar, because the President of
the United States lives in Washington D.C. So we should return:
'(1)'
"""

user_request = """
As an automated web-crawler, please find relevant price and manufacturer details from the following elements:
{{
  {matched_elements}
}}
""".format(matched_elements=matched_elements_str)

In [81]:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "assistant", "content": system_message},
        {"role": "user", "content": user_request}
    ],
    stream=False,
)
# response

In [82]:
print(response.choices[0].message.content)

The directive to find price and manufacturer details is most relevant to the first element, thus return:
'(0)'


In [50]:
print(user_request)


As an automated web-crawler, please find relevant price and manufacturer details from the following elements:
{
  0: '<div class="eh-price-widget--price">
<span class="eh-price-widget--price-prefix eh-label eh-font-metrics--14-22">from</span>
<span class="eh-price-widget--price-formatted eh-label eh-font-metrics--20-26">€59.00</span>
</div>'
1: '<div class="eh-price-widget--price">
<span class="eh-price-widget--price-prefix eh-label eh-font-metrics--14-22">from</span>
<span class="eh-price-widget--price-formatted eh-label eh-font-metrics--20-26">€67.00</span>
</div>'
2: '<div class="eh-quickselector--product-price-scale-price-value eh-font-bold">
<span>€67.00</span>
</div>'
3: '<div class="eh-quickselector--product-price-scale-price-value eh-font-bold">
<span>€63.00</span>
</div>'
4: '<div class="eh-quickselector--product-price-scale-price-value eh-font-bold">
<span>€59.00</span>
</div>'
}



In [98]:
print(soup.get_text().replace('  ', '').replace('\n\n', ''))


NEED HELP? CALL US AT 949.446.6757Log inorCreate account
Cart
0US DollarEuroBritish Pound SterlingCanadian DollarAustralian Dollar  USD  US Dollar  Euro  British Pound Sterling  Canadian Dollar  Australian Dollar
Search
MenuUS DollarEuroBritish Pound SterlingCanadian DollarAustralian Dollar  USD  US Dollar  Euro  British Pound Sterling  Canadian Dollar  Australian DollarCart 0FIND YOUR PART
+-
PLC's
AUTOMATION
COMPUTER PARTS
CONNECTORS
ELECTRICAL
CIRCUIT BREAKERS
SENSORS
SWAGELOK
FITTINGS
VALVES
LAB EQUIPMENT
TEST EQUIPMENT
PNEUMATICS
POWER SUPPLIES
PUMPS
BEARINGS
NETWORKINGABOUT PALM INDUSTRIAL
PRODUCT GUARANTEE
Sell Your Surplus
OUR PARENT COMPANYLog in
Create account
SearchNEED HELP? CALL US AT 949.446.6757
FIND YOUR PART
PLC's
AUTOMATION
COMPUTER PARTS
CONNECTORS
ELECTRICAL
CIRCUIT BREAKERS
SENSORS
SWAGELOK
FITTINGS
VALVES
LAB EQUIPMENT
TEST EQUIPMENT
PNEUMATICS
POWER SUPPLIES
PUMPS
BEARINGS
NETWORKING
ABOUT PALM INDUSTRIAL
PRODUCT GUARANTEE
Sell Your Surplus
OUR PARENT COMPANYcol

In [102]:
# Regular expression pattern to match URLs ending in .pdf
pdf_regex = re.compile('.+\.pdf$')

# Find all links that match the regex pattern
pdf_links = [link.get('href') for link in soup.find_all('a', href=pdf_regex)]

print(pdf_links)

['/v/vspfiles/pdf/datasheet/Bircher/SpotScan-Datasheet.pdf', '/v/vspfiles/pdf/datasheet/Bircher/SpotScan-Usermanual.pdf']


In [103]:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "assistant", "content": system_message},
        {"role": "user", "content": user_request}
    ],
    stream=False,
)
# response

In [104]:
response

ChatCompletion(id='chatcmpl-8ZvhLbXfH0Un0BLrmyevGAIUIzeKo', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content="'(0)'", role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1703573963, model='gpt-4-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=3, prompt_tokens=632, total_tokens=635))