In [1]:
import requests

def get_web_info(url, timeout=10):
    """
    Fetch a web page and return its decoded text.

    The HTTP status code is printed but not enforced, so the body of an
    error page (4xx/5xx) is still returned as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page body as ``str``, or ``None`` if the request failed (the
        error is printed).
    """
    try:
        # Bound the wait so a stalled server cannot hang the notebook cell
        response = requests.get(url, timeout=timeout)

        # Report the request status
        print(f"状态码: {response.status_code}")

        # Decode the body with the encoding detected from its content
        response.encoding = response.apparent_encoding

        # Hand back the decoded page
        return response.text

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None
        print(f"请求失败: {e}")
        return None

# Usage example: download the weather portal's front page
url = 'http://www.weather.com.cn/'
html_text = get_web_info(url=url)

if not html_text:
    print("网页获取失败！")
else:
    print("网页获取成功！")
    print(f"网页内容长度: {len(html_text)} 字符")
    # Show only the first 500 characters as a preview
    print(html_text[:500])

状态码: 200
网页获取成功！
网页内容长度: 139689 字符
<!DOCTYPE html>
<html>
<head>
    <link rel="dns-prefetch" href="https://i.tq121.com.cn" />
    <meta charset="utf-8" />
    <title>天气网</title>
    <link rel="icon" href="https://www.weather.com.cn/m2/i/favicon.ico?v=3" type="image/x-icon" />
    <meta http-equiv="Content-Security-Policy" content="default-src 'self' https://*.weather.com.cn; script-src 'self' http://*.tq121.com.cn http://*.i8tq.com http://*.weather.com.cn http://*.baidu.com http://3gimg.qq.com   'unsafe-inline' 'unsafe-ev


In [2]:
def get_web_info_enhanced(url, timeout=10, headers=None):
    """
    Download a page using a browser-like User-Agent and a request timeout.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.
        headers: Optional request headers. When ``None``, a desktop Chrome
            User-Agent header is sent instead.

    Returns:
        The decoded page body, or ``None`` when a
        ``requests.exceptions.RequestException`` occurs. That includes the
        HTTP error ``raise_for_status()`` raises for 4xx/5xx responses.
    """
    # Pretend to be a regular browser unless the caller supplies headers
    request_headers = headers if headers is not None else {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        resp = requests.get(url, headers=request_headers, timeout=timeout)
        # Turn HTTP error statuses into exceptions handled below
        resp.raise_for_status()

        # Decode the body with the encoding detected from its content
        resp.encoding = resp.apparent_encoding
        print(f"✅ 成功获取网页: {url}")
        print(f"   状态码: {resp.status_code}")
        print(f"   编码: {resp.encoding}")
        print(f"   内容长度: {len(resp.text)} 字符")

        return resp.text

    except requests.exceptions.RequestException as err:
        print(f"❌ 请求失败: {err}")
        return None

# Try the enhanced fetcher against a small JSON test endpoint
test_url = 'https://httpbin.org/json'
html_content = get_web_info_enhanced(url=test_url)

✅ 成功获取网页: https://httpbin.org/json
   状态码: 200
   编码: ascii
   内容长度: 429 字符


In [5]:
def get_local_info(file_path):
    """
    Read a local HTML file as UTF-8 text.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's content as ``str``, or ``None`` if the file is missing or
        cannot be read (a message describing the problem is printed).
    """
    try:
        # The context manager closes the handle even if read() fails
        with open(file_path, mode='r', encoding='utf-8') as handle:
            page_source = handle.read()

        print(f"✅ 成功读取本地文件: {file_path}")
        print(f"   文件内容长度: {len(page_source)} 字符")
    except FileNotFoundError:
        # A missing file gets its own message, separate from other failures
        print(f"❌ 文件未找到: {file_path}")
        return None
    except Exception as err:
        print(f"❌ 读取文件失败: {err}")
        return None
    else:
        return page_source

# Usage example: load the sample page from disk
# NOTE(review): absolute, machine-specific path; the cell only works where this file exists
local_file = 'D:/PythonProject/Practice/html/index.html'
local_html = get_local_info(file_path=local_file)

if local_html:
    # Heading line followed by the whole file content
    print("本地文件内容预览:", local_html, sep="\n")

✅ 成功读取本地文件: D:/PythonProject/Practice/html/index.html
   文件内容长度: 532 字符
本地文件内容预览:
<!DOCTYPE html>
<html>
	<head>
		<meta charset="utf-8">
		<title>网页中的黄鹤楼</title>
	</head>
	<body style="text-align:center;">
		<div >
			<h2>黄鹤楼风采</h2></br>
			<p> <img src="img1.jpg"> </p></br>
			<h3>黄鹤楼实习合影</h3></br>
			<p> <img src="img2.jpg"></p></br>
			<h4>《黄鹤楼送孟浩然之广陵》</br> 李白</h4>
			<p>故人西辞黄鹤楼，烟花三月下扬州。孤帆远影碧空尽，唯见长江天际流。</p></br>
			<p> <img src="img3.jpg"></p>
			<h4>《黄鹤楼》崔颢</h4></br>
			<p> <img src="img4.jpg"></p>
			<p>
				<audio controls>
					<source src="music.mp3" >
				</audio>
			</p>
		</div>
	</body>
</html>


In [6]:
from bs4 import BeautifulSoup

def prettify_html(html_text):
    """
    Parse HTML and produce an indented rendering of it.

    Args:
        html_text: HTML markup to parse.

    Returns:
        A ``(prettified_text, soup)`` tuple: the output of ``prettify()``
        and the ``BeautifulSoup`` object it came from. ``(None, None)`` is
        returned if parsing or formatting raises; the error is printed.
    """
    try:
        # Build the parse tree with the 'html.parser' backend
        document_tree = BeautifulSoup(html_text, 'html.parser')

        # Re-serialise the tree in indented form
        pretty_markup = document_tree.prettify()

        print("✅ 网页内容结构化完成")
        outcome = (pretty_markup, document_tree)
    except Exception as err:
        print(f"❌ 结构化失败: {err}")
        outcome = (None, None)
    return outcome

# Run the formatter on the HTML loaded from the local file, if there is any
if local_html:
    pretty_html, soup_object = prettify_html(html_text=local_html)

    if pretty_html:
        # Heading line followed by the first 500 characters as a preview
        print("结构化后的网页内容预览:", pretty_html[:500], sep="\n")

✅ 网页内容结构化完成
结构化后的网页内容预览:
<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   网页中的黄鹤楼
  </title>
 </head>
 <body style="text-align:center;">
  <div>
   <h2>
    黄鹤楼风采
   </h2>
   <p>
    <img src="img1.jpg"/>
   </p>
   <h3>
    黄鹤楼实习合影
   </h3>
   <p>
    <img src="img2.jpg"/>
   </p>
   <h4>
    《黄鹤楼送孟浩然之广陵》
    李白
   </h4>
   <p>
    故人西辞黄鹤楼，烟花三月下扬州。孤帆远影碧空尽，唯见长江天际流。
   </p>
   <p>
    <img src="img3.jpg"/>
   </p>
   <h4>
    《黄鹤楼》崔颢
   </h4>
   <p>
    <img src="img4.jpg"/>
   </p>
   <p>
    <audio


In [7]:
def extract_hhl_info(html_content):
    """
    提取黄鹤楼网页特定信息的函数
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    print("=== 黄鹤楼网页信息提取结果 ===\n")

    # 1. 提取所有标题
    print("1. 网页中的标题:")
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4'])
    for heading in headings:
        print(f"   - {heading.name}: {heading.get_text().strip()}")

    print("\n" + "="*50 + "\n")

    # 2. 提取所有段落文本
    print("2. 网页中的诗歌内容:")
    paragraphs = soup.find_all('p')
    for i, p in enumerate(paragraphs, 1):
        text = p.get_text().strip()
        if text and not p.find('img') and not p.find('audio'):
            print(f"   {i}. {text}")

    print("\n" + "="*50 + "\n")

    # 3. 提取所有图片
    print("3. 网页中的图片:")
    images = soup.find_all('img')
    for i, img in enumerate(images, 1):
        src = img.get('src', '无src属性')
        print(f"   图片{i}: {src}")

    print("\n" + "="*50 + "\n")

    # 4. 提取多媒体内容
    print("4. 网页中的多媒体:")
    audio_elements = soup.find_all('audio')
    for i, audio in enumerate(audio_elements, 1):
        source = audio.find('source')
        if source:
            src = source.get('src', '无音频源')
            print(f"   音频{i}: {src}")

    return {
        'headings': headings,
        'paragraphs': paragraphs,
        'images': images,
        'audio': audio_elements
    }

# Run the extraction on the locally loaded page, if it was read successfully
if local_html:
    extracted_data = extract_hhl_info(html_content=local_html)

=== 黄鹤楼网页信息提取结果 ===

1. 网页中的标题:
   - h2: 黄鹤楼风采
   - h3: 黄鹤楼实习合影
   - h4: 《黄鹤楼送孟浩然之广陵》 李白
   - h4: 《黄鹤楼》崔颢


2. 网页中的诗歌内容:
   3. 故人西辞黄鹤楼，烟花三月下扬州。孤帆远影碧空尽，唯见长江天际流。


3. 网页中的图片:
   图片1: img1.jpg
   图片2: img2.jpg
   图片3: img3.jpg
   图片4: img4.jpg


4. 网页中的多媒体:
   音频1: music.mp3


In [8]:
class WebScraper:
    """
    General-purpose page fetcher and analyser.

    Fetching is delegated to the module-level helpers ``get_local_info`` and
    ``get_web_info_enhanced``; parsing uses BeautifulSoup with the
    ``html.parser`` backend.
    """

    def __init__(self):
        # NOTE(review): this session and its User-Agent are configured here,
        # but get_web_content does not route its requests through it.
        self.session = requests.Session()
        # Default request headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_web_content(self, url, is_local=False):
        """
        Return the HTML text for ``url``.

        Args:
            url: A web address, or a file path when ``is_local`` is true.
            is_local: Read from disk instead of the network.

        Returns:
            Whatever the delegated helper returns (text, or ``None`` on
            failure).
        """
        if is_local:
            return get_local_info(url)
        else:
            return get_web_info_enhanced(url)

    def analyze_page(self, html_content):
        """
        Collect basic structural statistics about a page.

        Args:
            html_content: HTML markup; a falsy value yields ``None``.

        Returns:
            ``None`` for empty input, otherwise a dict with:
            ``'title'`` (whitespace-stripped title text, or ``'无标题'`` when
            the page has no title or it is empty), ``'headings'`` (count per
            level ``'h1'``..``'h6'``), ``'links'`` (up to 10 dicts with
            ``'text'`` and ``'url'``), ``'images'`` (up to 5 ``src``
            values), ``'text_length'`` and ``'tag_count'``.
        """
        if not html_content:
            return None

        soup = BeautifulSoup(html_content, 'html.parser')

        # get_text(strip=True) rather than .string: .string keeps the
        # surrounding newlines/indentation of the <title> element and is None
        # when the element is empty or has nested children.
        title_text = soup.title.get_text(strip=True) if soup.title else ''

        analysis = {
            'title': title_text or '无标题',
            'headings': {},
            'links': [],
            'images': [],
            'text_length': len(soup.get_text()),
            'tag_count': len(soup.find_all())
        }

        # Count headings per level
        for i in range(1, 7):
            headings = soup.find_all(f'h{i}')
            analysis['headings'][f'h{i}'] = len(headings)

        # Links: only anchors that actually carry an href
        links = soup.find_all('a', href=True)
        for link in links[:10]:  # keep the first 10 links only
            analysis['links'].append({
                'text': link.get_text(strip=True),
                'url': link['href']
            })

        # Images
        images = soup.find_all('img')
        for img in images[:5]:  # keep the first 5 images only
            analysis['images'].append(img.get('src', '无src'))

        return analysis

    def generate_report(self, analysis):
        """
        Render an analysis dict as a plain-text report.

        Args:
            analysis: Dict produced by ``analyze_page``; a falsy value yields
                the placeholder text ``"无分析数据"``.

        Returns:
            The report as one newline-joined string. It lists at most 5 links
            and 3 images.
        """
        if not analysis:
            return "无分析数据"

        report = []
        report.append("=== 网页分析报告 ===")
        report.append(f"网页标题: {analysis['title']}")
        report.append(f"文本总长度: {analysis['text_length']} 字符")
        report.append(f"HTML标签总数: {analysis['tag_count']} 个")

        report.append("\n--- 标题统计 ---")
        for level, count in analysis['headings'].items():
            report.append(f"{level.upper()}: {count} 个")

        report.append("\n--- 前5个链接 ---")
        for i, link in enumerate(analysis['links'][:5], 1):
            report.append(f"{i}. {link['text']} -> {link['url']}")

        report.append("\n--- 图片资源 ---")
        for i, img in enumerate(analysis['images'][:3], 1):
            report.append(f"{i}. {img}")

        return "\n".join(report)

# 使用示例
def main(url='https://books.toscrape.com/', local_path=None):
    """
    Demo driver: analyse an optional local HTML file, then an online page,
    printing a report for each one that could be analysed.

    Args:
        url: Address of the online page to analyse.
        local_path: Path of a local HTML file to analyse first. ``None``
            (the default) skips the local step.
    """
    scraper = WebScraper()

    # Optional local analysis, enabled by passing a path
    if local_path is not None:
        print("正在分析本地网页...")
        local_content = scraper.get_web_content(local_path, is_local=True)
        local_analysis = scraper.analyze_page(local_content)

        if local_analysis:
            report = scraper.generate_report(local_analysis)
            print(report)

        print("\n" + "="*60 + "\n")

    # Online analysis is best-effort: any failure is reported, not raised
    try:
        print("正在分析在线网页（示例）...")
        online_content = scraper.get_web_content(url)
        online_analysis = scraper.analyze_page(online_content)

        if online_analysis:
            report = scraper.generate_report(online_analysis)
            print(report)
    except Exception as e:
        print(f"在线网页分析跳过: {e}")

# Run the demo only when this code is executed directly (not when imported)
if __name__ == "__main__":
    main()

正在分析在线网页（示例）...
✅ 成功获取网页: https://books.toscrape.com/
   状态码: 200
   编码: utf-8
   内容长度: 51274 字符
=== 网页分析报告 ===
网页标题: 
    All products | Books to Scrape - Sandbox

文本总长度: 8851 字符
HTML标签总数: 541 个

--- 标题统计 ---
H1: 1 个
H2: 0 个
H3: 20 个
H4: 0 个
H5: 0 个
H6: 0 个

--- 前5个链接 ---
1. Books to Scrape -> index.html
2. Home -> index.html
3. Books -> catalogue/category/books_1/index.html
4. Travel -> catalogue/category/books/travel_2/index.html
5. Mystery -> catalogue/category/books/mystery_3/index.html

--- 图片资源 ---
1. media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg
2. media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg
3. media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg
