In [6]:
import time, json, requests
from collections import deque, OrderedDict
from urllib.parse import urljoin, urlparse, urldefrag
from bs4 import BeautifulSoup

# Entry point of the crawl and the domain suffix used by is_same_domain().
START_URL = "https://www.musashino-u.ac.jp/"
DOMAIN_ROOT = "musashino-u.ac.jp"

# Upper bound on the number of pages recorded by crawl().
MAX_PAGES = 2000

# HTTP headers sent by fetch() with every request.
HEADERS = {"User-Agent": "Mozilla/5.0 (Assignment Crawler)"}

# URL path suffixes that is_html_like() treats as non-HTML resources.
NON_HTML_EXT = (
    # documents and images
    ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp",
    # archives
    ".zip", ".rar", ".7z", ".tar", ".gz",
    # audio / video
    ".mp4", ".mp3", ".mov", ".avi",
    # office files
    ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    # web assets and plain data
    ".css", ".js", ".json", ".xml", ".txt", ".csv",
)

def is_same_domain(url: str) -> bool:
    try:
        return urlparse(url).netloc.lower().endswith(DOMAIN_ROOT)
    except Exception:
        return False

def is_html_like(url: str) -> bool:
    """Return False when the URL path ends with a suffix in NON_HTML_EXT, else True.

    The comparison is case-insensitive and looks only at the path component,
    so query strings and fragments do not affect the result.
    """
    path = urlparse(url).path.lower()
    if path.endswith(NON_HTML_EXT):
        return False
    return True

def normalize(base: str, href: str) -> str | None:
    if not href or href.startswith(("mailto:", "tel:", "javascript:")):
        return None
    try:
        abs_url = urljoin(base, href)
    except ValueError:
        return None
    abs_url, _ = urldefrag(abs_url)
    return abs_url

def fetch(url: str):
    """GET *url* with the crawler's HEADERS and a 10-second timeout.

    When the response's apparent_encoding is non-empty it replaces the
    response's encoding, so that .text is decoded with the detected charset.

    Returns:
        The requests response object, or None if the request raised a
        requests.RequestException.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        detected = response.apparent_encoding
        if detected:
            response.encoding = detected
        return response
    except requests.RequestException:
        return None

def crawl(start_url: str, max_pages: int | None = None, delay: float = 1.0):
    """Breadth-first crawl of same-domain HTML pages, collecting their titles.

    Args:
        start_url: URL the crawl starts from.
        max_pages: Maximum number of pages to record; defaults to MAX_PAGES.
        delay: Seconds to sleep before each request (default 1.0).

    Returns:
        An OrderedDict mapping each fetched page's URL (the final URL after
        any redirects) to its whitespace-normalised <title> text, or "" when
        the page has no title, in the order the pages were fetched.
    """
    limit = MAX_PAGES if max_pages is None else max_pages
    q = deque([start_url])
    seen = {start_url}
    url_title = OrderedDict()

    while q and len(url_title) < limit:
        cur = q.popleft()
        # Already recorded as the redirect target of an earlier request.
        if cur in url_title:
            continue
        if not is_same_domain(cur) or not is_html_like(cur):
            continue

        time.sleep(delay)
        resp = fetch(cur)
        if resp is None or resp.status_code >= 400:
            continue

        # Work with the post-redirect URL: it is the page's real address and
        # the correct base for resolving its relative links.
        page_url = resp.url or cur
        if page_url != cur:
            if page_url in url_title or not is_same_domain(page_url):
                continue
            seen.add(page_url)

        # Extension-less URLs can still serve PDFs, images, etc.; skip
        # anything that declares a non-HTML content type.
        content_type = resp.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            continue

        soup = BeautifulSoup(resp.text, "html.parser")
        title = soup.title.get_text(strip=True) if soup.title else ""
        # Collapse internal runs of whitespace/newlines to single spaces.
        url_title[page_url] = " ".join(title.split())

        for a in soup.find_all("a", href=True):
            nxt = normalize(page_url, a["href"])
            if not nxt or not is_same_domain(nxt) or not is_html_like(nxt):
                continue
            if nxt not in seen:
                seen.add(nxt)
                q.append(nxt)

    return url_title

# Run the crawl, echo the results to stdout with a marker every `batch`
# entries, then save them as a tab-separated text file and as JSON.
url_title = crawl(START_URL)

batch = 200
count = 0
for count, (k, v) in enumerate(url_title.items(), start=1):
    print(f'{k} : {v}')
    if not count % batch:
        print(f'--- printed {count} items ---')

print(f"\nTotal collected: {len(url_title)} (limit {MAX_PAGES})")

# One "url<TAB>title" line per page.
with open("mu_sitemap.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{k}\t{v}\n" for k, v in url_title.items())

# ensure_ascii=False keeps non-ASCII titles readable in the JSON file.
with open("mu_sitemap.json", "w", encoding="utf-8") as f:
    json.dump(url_title, f, ensure_ascii=False, indent=2)

print("Saved: mu_sitemap.txt, mu_sitemap.json")


https://www.musashino-u.ac.jp/ : 武蔵野大学
https://ef.musashino-u.ac.jp/donation/ : ご寄付のお願い | 学校法人武蔵野大学
https://www.musashino-u.ac.jp/access.html : 交通アクセス | 武蔵野大学
https://www.musashino-u.ac.jp/admission/request.html : 資料請求 | 入試情報 | 武蔵野大学
https://www.musashino-u.ac.jp/contact.html : お問い合わせ | 武蔵野大学
https://www.musashino-u.ac.jp/prospective-students.html : 武蔵野大学で学びたい方 | 武蔵野大学
https://www.musashino-u.ac.jp/students.html : 在学生の方 | 武蔵野大学
https://www.musashino-u.ac.jp/alumni.html : 卒業生の方 | 武蔵野大学
https://www.musashino-u.ac.jp/parents.html : 保護者の方 | 武蔵野大学
https://www.musashino-u.ac.jp/business.html : 企業・研究者の方 | 武蔵野大学
https://www.musashino-u.ac.jp/guide/ : 大学案内 | 武蔵野大学
https://www.musashino-u.ac.jp/guide/profile/ : 大学紹介 | 大学案内 | 武蔵野大学
https://www.musashino-u.ac.jp/guide/activities/ : 大学の取り組み | 大学案内 | 武蔵野大学
https://www.musashino-u.ac.jp/guide/campus/ : キャンパス | 大学案内 | 武蔵野大学
https://www.musashino-u.ac.jp/guide/facility/ : 附置機関・センター・附属施設 | 大学案内 | 武蔵野大学
https://www.musashino-u.ac.jp/guide/information/ : 