In [14]:
import requests
from bs4 import BeautifulSoup

def fetch_text_from_url(url, timeout=10):
    """Download a web page and extract its title, visible text and heading outline.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        A 3-tuple ``(title, clean_text, markdown_outline)``:

        * ``title`` -- the text of the ``<title>`` tag, or ``"No title found"``
          when the tag is missing or has no single text value.
        * ``clean_text`` -- text of the ``<main>`` element (or of the whole
          document when there is no ``<main>``), one non-empty, stripped line
          per row.
        * ``markdown_outline`` -- every ``h1``..``h6`` heading rendered as a
          Markdown heading of the same level, joined with newlines.

        If the request fails, the tuple is ``(error_message, "", "")`` so that
        callers unpacking three values keep working and still see the error.
    """
    # A browser-like User-Agent is sent because, without it, the site treats
    # the request as a web crawler and answers with HTTP 503.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/117.0.0.0 Safari/537.36"
        )
    }

    # Only the network call is guarded: parsing problems should not be
    # reported as "Error fetching URL content".
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Same arity as the success path; the message travels in the title slot.
        return f"Error fetching URL content: {e}", "", ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Convert the headings to a Markdown outline ("h2" -> "## text").
    outline = []
    for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        level = int(heading.name[1])  # Heading level, 1 to 6
        text = heading.get_text(strip=True)
        outline.append(f"{'#' * level} {text}")
    markdown_outline = "\n".join(outline)

    # Prefer the <main> element; fall back to the whole document otherwise.
    main_content = soup.find('main')
    if main_content:
        text_content = main_content.get_text(separator='\n')
        print("--------get main---------")
    else:
        text_content = soup.get_text(separator='\n')

    # Drop blank lines and surrounding whitespace.
    clean_text = '\n'.join(line.strip() for line in text_content.splitlines() if line.strip())

    # `.string` is None when <title> is empty or contains nested tags, so the
    # fallback must cover that case as well as a missing <title>.
    if soup.title and soup.title.string:
        title = soup.title.string
    else:
        title = "No title found"

    return title, clean_text, markdown_outline

# Alternative test page:
# url = "https://en.wikipedia.org/wiki/Camera"
url = "https://www.amazon.co.uk/s?k=amazon+camera&adgrpid=59911676550&hvadid=578425084723&hvdev=c&hvlocphy=9046400&hvnetw=g&hvqmt=e&hvrand=7231952894446958386&hvtargid=kwd-490591599792&hydadcr=3954_2155624&tag=googhydr-21&ref=pd_sl_3ytx88w35c_e"
title, clean_text, markdown_outline = fetch_text_from_url(url)  # TODO: store these in the database for the history view
print(title)  # Print the page title

# Write the cleaned text to a file
with open('benchMark.txt', 'w', encoding='utf-8') as file:
    file.write(clean_text)   # saved for use in future benchmarks


Amazon.co.uk : amazon camera


### extract text info from a website

In [15]:
import openai
from openai import AzureOpenAI
from config import api_base, api_key

# Azure OpenAI settings. `api_base` and `api_key` are imported from the local
# `config` module, so they are used directly rather than re-assigned here.
deployment_name = "gpt-35-turbo-16k"
api_version = "2023-06-01-preview"

client = AzureOpenAI(azure_endpoint=api_base, api_key=api_key, api_version=api_version)

# Build the prompt from the values extracted from the page. Each part is
# truncated (500 / 5000 / 500 characters) to bound the prompt size, and each
# part ends with a newline so the next section marker starts on its own line
# instead of being glued to the end of the previous text.
prompt = "Following text is extracted from a website, including its title, main context, outline. Please help me analysis the following content and return a summery that less than 50 words:\n ##################extracted text##################\n"
prompt = prompt + "#######title#######\n" + title[:500] + "\n"
prompt = prompt + "#######main context#######\n" + clean_text[:5000] + "\n"
prompt = prompt + "#######outline#######\n" + markdown_outline[:500]

response = client.chat.completions.create(
        model=deployment_name, 
        messages=[
            {"role": "system", "content": "You are a helpful webpage analysis assistant."}, 
            {"role": "user", "content": prompt}
        ],
        max_tokens=500
    )   

# The message content is optional in the API response; treat a missing value
# as an empty summary instead of failing on `None.strip()`.
summary_text = response.choices[0].message.content or ""

# Remove every "\n" from the reply as well as leading/trailing whitespace.
mapped_value = summary_text.strip().replace("\n", "")
print(mapped_value) 

# TODO:
# - link to the database
# - try using JavaScript (get info directly from the user's web page) -- together with the HTML 'main' tag
# - try LangChain
# - grouping
# - frontend


The webpage is about Amazon.co.uk's selection of cameras. It includes information on cookie preferences and privacy notice. There are 124 results for "amazon camera" with options such as Blink Mini and Ring Indoor Camera, both with high customer reviews and discounts.
