In [None]:
import os
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

In [None]:
def safe_get(url, retries=3, timeout=10, delay=2):
    """Fetch ``url`` with ``requests.get``, retrying when the request raises.

    :param url: URL to request.
    :param retries: Maximum number of attempts.
    :param timeout: Timeout in seconds passed to ``requests.get``.
    :param delay: Seconds to wait between two consecutive attempts.
    :return: The response of the first attempt that does not raise, or
        ``None`` when every attempt raised
        ``requests.exceptions.RequestException``.

    The HTTP status code is not inspected here; a 4xx/5xx response is
    returned like any other, so callers must check it themselves.
    """
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Try {attempt+1}/{retries} Failure：{e}")
            # Only pause when another attempt follows; sleeping after the
            # last failure would just delay the caller for no benefit.
            if attempt + 1 < retries:
                time.sleep(delay)
    print(f"Finall failure: {url}")
    return None

In [None]:
def crawl_single_detail_page(base_url, detail_url, save_folder):
    """Scrape one item detail page: save its images and return its metadata.

    The item name is read from ``<div class="detail_h">`` and the description
    and images from ``<div class="detail_text">``. Each image is written to
    ``save_folder`` as ``<name><ext>`` (single image) or ``<name>_<n><ext>``
    (several images), where ``<name>`` is the item name reduced to
    alphanumerics and ``_-（）()``.

    :param base_url: Base URL used to resolve relative image ``src`` values.
    :param detail_url: Absolute URL of the detail page.
    :param save_folder: Existing directory in which images are written.
    :return: ``{'url', 'name', 'description'}``. When the page cannot be
        fetched or parsed, ``name`` and ``description`` are ``None`` so the
        URL can be reviewed later. A failed image download is reported but
        does not discard the metadata that was already extracted.
    """
    # Local import keeps this block self-contained; needed to isolate the
    # path component of an image URL when deriving the file extension.
    from urllib.parse import urlparse

    try:
        # Use the file's retry helper instead of a single bare request.
        response = safe_get(detail_url, timeout=10)
        if response is None:
            raise ConnectionError("detail page could not be fetched")
        # Do not parse an HTTP error page as if it were an item page.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the item's name
        name_tag = soup.find('div', class_='detail_h')
        if not name_tag:
            raise ValueError("name is invalid!")
        name = name_tag.text.strip()
        # NOTE(review): file names depend only on the item name, so two items
        # with the same name overwrite each other's images.
        safe_name = "".join(c for c in name if c.isalnum() or c in "_-（）()").strip()

        # Extract images and description text
        detail_text_div = soup.find('div', class_='detail_text')
        if not detail_text_div:
            raise ValueError("description is invalid!")

        description = detail_text_div.get_text(separator='\n', strip=True)
        # Skip <img> tags without a usable src instead of failing the item.
        sources = [img.get('src') for img in detail_text_div.find_all('img')]
        sources = [src for src in sources if src]

        # Download and save all images
        for i, src in enumerate(sources):
            img_url = urljoin(base_url, src)
            # Take the extension from the URL path only, so a query string
            # or fragment never ends up in the file name.
            ext = os.path.splitext(urlparse(img_url).path)[-1] or '.jpg'
            if len(sources) == 1:
                img_filename = f"{safe_name}{ext}"
            else:
                img_filename = f"{safe_name}_{i+1}{ext}"
            img_path = os.path.join(save_folder, img_filename)

            try:
                img_response = safe_get(img_url, timeout=10)
                if img_response is None:
                    raise ConnectionError("image could not be fetched")
                # Avoid saving an HTTP error body as an image file.
                img_response.raise_for_status()
                with open(img_path, 'wb') as f:
                    f.write(img_response.content)
            except (requests.exceptions.RequestException, OSError) as e:
                # One broken image must not throw away the item's metadata.
                print(f"Image download failed: {img_url}, Reason: {e}")

        print(f"finished: {name}")
        return {'url': detail_url, 'name': name, 'description': description}

    except Exception as e:
        print(f"Error: {detail_url}, Reason: {e}")
        # Even on failure, return the URL with None for name and description for later review
        return {'url': detail_url, 'name': None, 'description': None}


In [None]:
def crawl_all_pages_and_details(base_url, first_page, save_folder='data_museum/image'):
    """
    Crawl through all pagination pages, extract detail page links on each page,
    and immediately crawl content from each detail page.

    If a result page cannot be loaded, crawling stops early and the rows
    collected so far are still returned.

    :param base_url: The base URL of the website
    :param first_page: The initial pagination path to start crawling
    :param save_folder: Folder to save downloaded images (created if missing)
    :return: A DataFrame with one row per detail page and the columns
             ``url``, ``name`` and ``description``
    """
    os.makedirs(save_folder, exist_ok=True)
    current_url = urljoin(base_url, first_page)
    data = []
    visited = set()  # guards against a "next" link that cycles back

    while True:
        if current_url in visited:
            print(f"Page already processed, stopping: {current_url}")
            break
        visited.add(current_url)

        print(f"The current result page being processed is: {current_url}")
        # Retry helper with a timeout: a stalled connection can no longer
        # hang the crawl, and a network error no longer loses collected rows.
        response = safe_get(current_url, timeout=10)
        if response is None or not response.ok:
            print(f"Could not load result page, stopping early: {current_url}")
            break
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract detail page links from <p class="p"> and crawl them immediately
        for p_tag in soup.find_all('p', class_='p'):
            a_tag = p_tag.find('a', href=True)
            if a_tag and '/zggd/info_21.aspx?itemid=' in a_tag['href']:
                full_url = urljoin(base_url, a_tag['href'])
                item_data = crawl_single_detail_page(base_url, full_url, save_folder)
                if item_data:
                    data.append(item_data)

        # Handle pagination: find the "next page" link
        next_page_tag = soup.find('a', class_='a_next')
        # .get() tolerates an <a class="a_next"> that has no href attribute.
        next_href = next_page_tag.get('href') if next_page_tag else None
        if not next_href or next_href == 'javascript:void(0);':
            print("Reached the last page.")
            break

        current_url = urljoin(base_url, next_href)

        time.sleep(2)  # Prevent sending requests too frequently

    return pd.DataFrame(data)


In [50]:
# Root of the collection section; relative links are resolved against it.
base_url = 'https://www.chinasilkmuseum.com/zggd/'

# One [first list-page path, dynasty label] pair per dynasty section.
# The `era` query value is the dynasty name written as %uXXXX code points.
first_pages = [
    ['list_21.aspx?lcid=&jishu=&era=%u968B%u5510&title=&storage=', 'SuiTang'],  # 隋唐
    ['list_21.aspx?lcid=&jishu=&era=%u5B8B%u4EE3&title=&storage=', 'Song'],     # 宋代
    ['list_21.aspx?lcid=&jishu=&era=%u8FBD%u4EE3&title=&storage=', 'Liao'],     # 辽代
    ['list_21.aspx?lcid=&jishu=&era=%u5143%u4EE3&title=&storage=', 'Yuan'],     # 元代
    ['list_21.aspx?lcid=&jishu=&era=%u660E%u4EE3&title=&storage=', 'Ming'],     # 明代
    ['list_21.aspx?lcid=&jishu=&era=%u6E05%u4EE3&title=&storage=', 'Qing'],     # 清代
]

# Per-dynasty aliases referring to the same list objects held in first_pages.
SuiTang_url, Song_url, Liao_url, Yuan_url, Ming_url, Qing_url = first_pages

In [None]:
# Column layout of the exported metadata table.
METADATA_COLUMNS = ['url', 'name', 'description', 'dynasty']

# Collect one frame per dynasty and concatenate once at the end, instead of
# growing a DataFrame with pd.concat inside the loop.
dynasty_frames = []

# Iterate through each entry page (dynasty section)
for first_page in first_pages:
    print(f"\nBegin to process: {first_page[1]}")
    page_df = crawl_all_pages_and_details(base_url, first_page[0])
    # Empty results are skipped so they cannot disturb the column layout.
    if not page_df.empty:
        # assign() returns a new frame, leaving the crawler's result untouched.
        dynasty_frames.append(page_df.assign(dynasty=first_page[1]))

if dynasty_frames:
    all_data_df = pd.concat(dynasty_frames, ignore_index=True).reindex(columns=METADATA_COLUMNS)
else:
    # pd.concat raises on an empty list; fall back to an empty table.
    all_data_df = pd.DataFrame(columns=METADATA_COLUMNS)

# Save all collected data to CSV
# Create the output directory explicitly rather than relying on the crawler.
os.makedirs('data_museum', exist_ok=True)
all_data_df.to_csv('data_museum/metadata_silk_museum.csv', index=False, encoding='utf-8-sig')



Begin to process：SuiTang
The current result page being processed is: https://www.chinasilkmuseum.com/zggd/list_21.aspx?lcid=&jishu=&era=%u968B%u5510&title=&storage=
finished: 唐绿地团窠联珠对山羊纹锦
finished: 唐代团窠联珠对狮纹锦
finished: 唐代团窠联珠对孔雀纹锦
finished: 唐黄地团窠联珠对马纹锦
finished: 唐蓝绿色地团窠联珠对格力芬纹锦
finished: 唐团窠对羊纹锦
finished: 唐代联珠兽纹锦
finished: 唐代大窠宝花纹绫
finished: 唐花鸟纹锦
finished: 唐对鸟纹锦
finished: 唐联珠对孔雀纹锦
finished: 唐墨绘腰带
The current result page being processed is: https://www.chinasilkmuseum.com/zggd/list_21.aspx?era=%u968B%u5510&page=2
finished: 唐红综色纬锦
finished: 唐彩绘小花罗
finished: 唐黄地对鸟纹锦边饰
finished: 唐大红绢覆面
finished: 唐团窠对鸟衔授带锦
finished: 唐红地对鸟纹锦
finished: 唐宝花纹锦
finished: 唐小菱格纹绫
finished: 唐黄地团窠尖瓣对鸟纹锦
finished: 唐红地花瓣联珠对鹿纹锦
finished: 唐红地对雕纹纬锦
finished: 唐绿色菱格小花纹绫
The current result page being processed is: https://www.chinasilkmuseum.com/zggd/list_21.aspx?era=%u968B%u5510&page=3
finished: 唐综红地带式联珠纹锦
finished: 唐团窠纹锦
finished: 唐代联珠对鸟纹锦
finished: 唐团窠纹锦
finished: 唐代团窠对鹿纹锦袖头
finished: 唐代对鹿纹锦
finished: 唐代团窠对鹿纹锦
finished