In [37]:
import requests
from bs4 import BeautifulSoup

def fetch_text_from_url(url):
    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/117.0.0.0 Safari/537.36"
            )
        }
        # 发送带有 User-Agent 的 HTTP 请求 -- 如果没有headers，会被判断为爬虫(Web Crawler)而返回503
        response = requests.get(url, headers=headers)
        print(response.status_code)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.text, 'html.parser')   # Parse the HTML content using BeautifulSoup
    
        # Convert to Markdown outline
        headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])   # only need to focusing the first three level
        outline = []
        for heading in headings:
            level = int(heading.name[1])  # Get the heading level (1 to 6)
            text = heading.get_text(strip=True)
            outline.append(f"{'#' * level} {text}")
        markdown_outline = "\n".join(outline)

        # Try to extract text from the <main> tag
        main_content = soup.find('main')
        if main_content:
            text_content = main_content.get_text(separator='\n')
            print("--------get main---------")
        else:
            # Fallback to extracting all text if <main> is not present
            text_content = soup.get_text(separator='\n')
        clean_text = '\n'.join(line.strip() for line in text_content.splitlines() if line.strip())

        title = soup.title.string if soup.title else "No title found"
        
        return title, clean_text, markdown_outline
    except requests.exceptions.RequestException as e:
        print("----!!!error!!!----")
        return f"Error fetching URL content: {e}"

url = "https://en.wikipedia.org/wiki/Camera"
# url = "https://www.amazon.co.uk/s?k=amazon+camera&crid=2J5OAN2PC30VG&sprefix=amazon+camera%2Caps%2C69&ref=nb_sb_noss_1"
title, clean_text, markdown_outline = fetch_text_from_url(url)  # --------- will store into the data base for history view
print(title)  # Print the first 1000 characters for brevity
print(markdown_outline)

# Write the cleaned text to a file
with open('benchMark.txt', 'w', encoding='utf-8') as file:
    file.write(clean_text)   # future bench marks  


200
--------get main---------
Camera - Wikipedia
## Contents
# Camera
## History
### 19th century
### 20th century
### 21st century
## Mechanics
### Exposure control
#### Aperture
#### Shutter
#### Light meter
### Lens
### Viewfinder
### Film and sensor
### Camera accessories
#### Flash
#### Other accessories
## Primary types
### Single-lens reflex (SLR) camera
### Large-format camera
#### Plate camera
### Medium-format camera
#### Twin-lens reflex camera
### Compact cameras
#### Instant camera
#### Subminiature camera
#### Folding camera
#### Box camera
### Rangefinder camera
### Motion picture cameras
#### Professional video camera
#### Camcorders
### Digital camera
#### Camera phone
## See also
## Footnotes
## References
## Further reading
## External links


### extract text info from a website

In [38]:
import openai
from openai import AzureOpenAI
from config import api_base, api_key

api_base = api_base
api_key = api_key
deployment_name = "gpt-35-turbo-16k"  
api_version = "2023-06-01-preview"

client = AzureOpenAI(azure_endpoint=api_base, api_key=api_key, api_version=api_version)

prompt = "Following text is extracted from a website, including its title, main context, and outline. Please help me analysis the following content and return a summery that less than 50 words:\n ##################extracted text##################\n"
prompt = prompt + "#######title#######\n" + title[:500]
prompt = prompt + "#######main context#######\n" + clean_text[:5000]
prompt = prompt + "#######outline#######\n" + markdown_outline[:500]

response = client.chat.completions.create(
        model=deployment_name, 
        messages=[
            {"role": "system", "content": "You are a helpful webpage analysis assistant."}, 
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        stream=True  # 使用流式返回
    )   

# 处理流式数据
mapped_value = ""
for chunk in response:
    # print(chunk)
    if len(chunk.choices) >= 1:
        choice = chunk.choices[0]
        delta = getattr(choice, "delta", {})  # 获取delta属性
        # print(delta)
        if delta!={}:
            content = getattr(delta, "content", None)  # 获取content部分
            if content != None:
                mapped_value += content
                mapped_value = mapped_value.strip().replace("\n", "")
                print(content, end="", flush=True)   # 显示流式输出

#去除回复中的所有\n以及结尾的空格
mapped_value = mapped_value.strip().replace("\n", "")
print("\nFinal Output:", mapped_value)



Summary: The Wikipedia article on cameras discusses the history, mechanics, and types of cameras. It highlights the evolution of camera technology and the impact of smartphones on visual content creation and consumption. The article also includes an outline of the content covered.
Final Output: Summary: The Wikipedia article on cameras discusses the history, mechanics, and types of cameras. It highlights the evolution of camera technology and the impact of smartphones on visual content creation and consumption. The article also includes an outline of the content covered.


## tencent Hunyuan API try

### text-to-text

In [8]:
# sk-NUTAvhpexZBv93vaLslKhoDRa5EGIcmbMTgJA2LbaKtUZiYc

# need to execute "pip install tencentcloud-sdk-python"
import json
import types
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.hunyuan.v20230901 import hunyuan_client, models

import config
import importlib
# 强制重新加载模块
importlib.reload(config)
from config import tencent_secretId, tencent_secretKey

# print(type(tencent_secretId), tencent_secretId)
# print(type(tencent_secretKey), tencent_secretKey)

try:
    # 实例化一个认证对象，入参需要传入腾讯云账户 SecretId 和 SecretKey，此处还需注意密钥对的保密
    # 代码泄露可能会导致 SecretId 和 SecretKey 泄露，并威胁账号下所有资源的安全性。以下代码示例仅供参考，建议采用更安全的方式来使用密钥，请参见：https://cloud.tencent.com/document/product/1278/85305
    # 密钥可前往官网控制台 https://console.cloud.tencent.com/cam/capi 进行获取
    cred = credential.Credential(tencent_secretId, tencent_secretKey)
    # 实例化一个http选项，可选的，没有特殊需求可以跳过
    httpProfile = HttpProfile()
    httpProfile.endpoint = "hunyuan.tencentcloudapi.com"

    # 实例化一个client选项，可选的，没有特殊需求可以跳过
    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    # 实例化要请求产品的client对象,clientProfile是可选的
    client = hunyuan_client.HunyuanClient(cred, "", clientProfile)

    # 实例化一个请求对象,每个接口都会对应一个request对象
    req = models.ChatCompletionsRequest()
    params = {
        "Model": "hunyuan-turbo",
        "Messages": [
            {
                "Role": "user",
                "Content": "早上好"
            }
        ]
    }
    req.from_json_string(json.dumps(params))

    # 返回的resp是一个ChatCompletionsResponse的实例，与请求对象对应
    resp = client.ChatCompletions(req)
    # 输出json格式的字符串回包
    if isinstance(resp, types.GeneratorType):  # 流式响应
        for event in resp:
            print(event)
    else:  # 非流式响应
        print(resp)


except TencentCloudSDKException as err:
    print(err)

{"Created": 1733268768, "Usage": {"PromptTokens": 16, "CompletionTokens": 9, "TotalTokens": 25}, "Note": "以上内容为AI生成，不代表开发者立场，请勿删除或修改本标记", "Id": "9271a714-0de1-4b2c-8aed-7c7b3e18ff66", "Choices": [{"FinishReason": "stop", "Delta": null, "Message": {"Role": "assistant", "Content": "早上好！有什么需要帮助的吗？", "Contents": null, "ToolCallId": null, "ToolCalls": null, "FileIDs": null}, "Index": 0}], "ErrorMsg": null, "ModerationLevel": null, "SearchInfo": null, "Replaces": null, "RequestId": "9271a714-0de1-4b2c-8aed-7c7b3e18ff66"}


### imgae-to-text

#### 1. make use of AWS S3 -- for cloud storage

In [7]:
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError

import importlib
# 强制重新加载模块
importlib.reload(config)
import config
from config import aws_access_key, aws_secret_key, region_name, bucket_name

# S3 配置
file_path = "camera.webp" # 替换为本地文件路径
object_key = "camera.webp"    # S3 中的文件名

try:
    # 创建 S3 客户端
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        region_name=region_name
    )

    # 上传文件
    s3_client.upload_file(file_path, bucket_name, object_key)
    print(f"File uploaded successfully to S3 bucket: {bucket_name}/{object_key}")

    # 生成文件的访问 URL
    file_url = f"https://{bucket_name}.s3.{region_name}.amazonaws.com/{object_key}"
    print(f"File URL: {file_url}")

except FileNotFoundError:
    print("Error: The file was not found.")
except NoCredentialsError:
    print("Error: Credentials not available.")
except PartialCredentialsError:
    print("Error: Incomplete credentials provided.")
except Exception as e:
    print(f"Error: {e}")


File uploaded successfully to S3 bucket: ningbobucket/camera.webp
File URL: https://ningbobucket.s3.eu-north-1.amazonaws.com/camera.webp


#### use online model

有尝试使用腾讯混元LLM，但是文件上传api还没有开放，无法获得必填参数文件id

In [None]:
from tencentcloud.common import credential
from tencentcloud.iai.v20200303 import iai_client, models

# 设置参数
cred = credential.Credential(tencent_secretId, tencent_secretKey)
client = iai_client.IaiClient(cred, "ap-guangzhou")

# 请求图像分析
req = models.DetectFaceRequest()
req.Image = "camera.webp"
resp = client.DetectFace(req)
print(resp.to_json_string())


NameError: name 'tencent_secretId' is not defined

#### use local model

In [3]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# 加载模型和处理器
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# 加载图像
# image = Image.open("camera.webp")
image = Image.open("j16.jpg")

# 生成描述
inputs = processor(image, return_tensors="pt")
outputs = model.generate(**inputs)
print(processor.decode(outputs[0], skip_special_tokens=True))


a fighter jet flying through the sky


In [52]:
import torch
print(torch.__version__)  # 如果 PyTorch 安装成功，显示版本号

2.5.1+cu118
