In [8]:
import requests
import os
from bs4 import BeautifulSoup

In [13]:
# 目标网址
base_url = "https://www.deanza.edu/hefas/"

In [10]:
# 创建存储目录
output_dir = "HEFAS_Knowledge"
os.makedirs(output_dir, exist_ok=True)

In [15]:
def get_soup(url):
    """ 获取网页的 BeautifulSoup 对象 """
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return BeautifulSoup(response.text, "html.parser")
    else:
        print(f"❌ Failed to fetch {url} (Status: {response.status_code})")
        return None

def clean_text(text):
    """ 清理文本，去除多余空格和换行 """
    return " ".join(text.split()).strip()

def extract_main_content(soup, url):
    """ 提取网页主要内容，并处理格式 """
    title = clean_text(soup.title.text) if soup.title else "Untitled"

    # 提取所有段落、标题、列表
    content = []
    
    # 提取主标题
    content.append(f"# {title}\n")

    # 提取所有子标题 <h1> - <h6>
    for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        level = heading.name[1]  # 获取 h1-h6 的级别
        content.append(f"{'#' * int(level)} {clean_text(heading.text)}\n")

    # 提取段落 <p>
    for paragraph in soup.find_all("p"):
        text = clean_text(paragraph.text)
        if text:
            content.append(text + "\n")

    # 提取列表 <ul> <li>
    for ul in soup.find_all("ul"):
        for li in ul.find_all("li"):
            text = clean_text(li.text)
            if text:
                content.append(f"- {text}")

    # 提取链接 <a>
    links = []
    for a in soup.find_all("a", href=True):
        link_text = clean_text(a.text) or "Link"
        href = a["href"]
        if not href.startswith("http"):  # 处理相对路径
            href = "https://www.deanza.edu" + href
        links.append(f"- [{link_text}]({href})")

    if links:
        content.append("\n## Related Links\n" + "\n".join(links))

    return title, "\n".join(content)

def save_markdown(title, content):
    """ 保存内容为 Markdown 格式 """
    filename = f"{output_dir}/{title.replace(' ', '_').replace('/', '-')}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"✅ Saved: {filename}")

def scrape_page(url):
    """ 爬取单个网页并存为 Markdown """
    soup = get_soup(url)
    if soup:
        title, content = extract_main_content(soup, url)
        save_markdown(title, content)

def scrape_all_pages():
    """ 爬取 HEFAS 主页面 + 其所有子页面 """
    soup = get_soup(base_url)
    if not soup:
        return

    # 爬取主页面
    scrape_page(base_url)

    # 获取所有子链接（同域名下的页面）
    sub_links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/hefas/") and not href.startswith("http"):
            full_url = "https://www.deanza.edu" + href
            sub_links.add(full_url)

    print(f"🔍 Found {len(sub_links)} sub-pages")

    # 爬取所有子页面
    for link in sub_links:
        scrape_page(link)


In [16]:
# 运行爬取
scrape_all_pages()

✅ Saved: HEFAS_Knowledge/HEFAS.md
🔍 Found 15 sub-pages
✅ Saved: HEFAS_Knowledge/Important_Legislation.md
✅ Saved: HEFAS_Knowledge/Untitled.md
✅ Saved: HEFAS_Knowledge/Undocumented_Student_Week_of_Action.md
✅ Saved: HEFAS_Knowledge/Volunteering.md
✅ Saved: HEFAS_Knowledge/Resources.md
✅ Saved: HEFAS_Knowledge/HEFAS_Interns.md
✅ Saved: HEFAS_Knowledge/Members.md
✅ Saved: HEFAS_Knowledge/Internships.md
✅ Saved: HEFAS_Knowledge/HEFAS.md
✅ Saved: HEFAS_Knowledge/UndocuSTEM_Program.md
✅ Saved: HEFAS_Knowledge/UndocuSol.md
✅ Saved: HEFAS_Knowledge/Donations.md
✅ Saved: HEFAS_Knowledge/Legal_Services.md
✅ Saved: HEFAS_Knowledge/HEFAS_Annual_Summit.md
✅ Saved: HEFAS_Knowledge/Meet_With_Us.md
