# 将 HTML 网页转换为 JSON 格式

In [7]:
import json
from bs4 import BeautifulSoup

In [12]:
import os

In [13]:
def save_json(obj, fpath):
    """Serialize ``obj`` as JSON into ``../data/json/<fpath>``.

    Args:
        obj: Any JSON-serializable object.
        fpath: File name (or relative path) placed under ``../data/json/``.

    The output directory is created on demand, including any missing
    parent directories. Non-ASCII characters are written verbatim as
    UTF-8 instead of being turned into ASCII escape sequences.
    """
    out_dir = '../data/json/'
    # makedirs + exist_ok also works when '../data' itself is missing and
    # avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, fpath), 'w', encoding='utf-8') as file:
        json.dump(obj, file, ensure_ascii=False)

## 主页解析

In [9]:
def extract_navbar(soup):
    """Collect the navigation bar entries of a parsed page.

    Looks up the first ``<nav class="navbar">`` in ``soup`` and returns one
    dict per ``<li class="nav-item">`` found inside it. Each dict may hold:

    * ``'text'`` / ``'href'`` - taken from the item's ``<a class="nav-link">``
    * ``'dropdown'`` - a list of ``{'text', 'href'}`` dicts, one for every
      ``<a class="dropdown-item">`` under the item's
      ``<ul class="dropdown-menu">``

    An item with neither a link nor a dropdown menu yields an empty dict.
    Returns an empty list when the page has no navbar.
    """
    nav = soup.find('nav', class_='navbar')
    if not nav:
        return []

    entries = []
    for li in nav.find_all('li', class_='nav-item'):
        entry = {}

        anchor = li.find('a', class_='nav-link')
        if anchor:
            entry['text'] = anchor.get_text(strip=True)
            entry['href'] = anchor.get('href')

        submenu = li.find('ul', class_='dropdown-menu')
        if submenu:
            # Lazily look up the link of each <li>; rows without one are skipped.
            sub_anchors = (
                sub_li.find('a', class_='dropdown-item')
                for sub_li in submenu.find_all('li')
            )
            entry['dropdown'] = [
                {'text': sub.get_text(strip=True), 'href': sub.get('href')}
                for sub in sub_anchors
                if sub
            ]

        entries.append(entry)

    return entries

def extract_page(soup):
    """Collect the cards shown in the page body.

    Looks up the first ``<div class="page">`` in ``soup`` and returns one
    dict per ``<div class="card">`` found inside it. Each dict may hold:

    * ``'header'`` - stripped text of the card's ``<div class="card-header">``
    * ``'body'`` - a list of ``{'text', 'href'}`` dicts, one for every
      ``<a>`` inside the card's ``<div class="card-body">``

    A card with neither part yields an empty dict. Returns an empty list
    when the page container is missing.
    """
    container = soup.find('div', class_='page')
    if not container:
        return []

    cards = []
    for card_div in container.find_all('div', class_='card'):
        entry = {}

        header_div = card_div.find('div', class_='card-header')
        if header_div:
            entry['header'] = header_div.get_text(strip=True)

        body_div = card_div.find('div', class_='card-body')
        if body_div:
            entry['body'] = [
                {'text': anchor.get_text(strip=True), 'href': anchor.get('href')}
                for anchor in body_div.find_all('a')
            ]

        cards.append(entry)

    return cards

def html_to_json(html_content):
    """Convert a home page's HTML into a pretty-printed JSON string.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, and
    the results of ``extract_navbar`` and ``extract_page`` are stored under
    the ``'navbar'`` and ``'page'`` keys respectively.

    Returns:
        str: JSON text indented by four spaces, with non-ASCII characters
        kept as-is (``ensure_ascii=False``).
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    payload = {
        'navbar': extract_navbar(parsed),
        'page': extract_page(parsed),
    }
    return json.dumps(payload, indent=4, ensure_ascii=False)

In [5]:
# Load the previously saved home page HTML; html_content holds its full text.
# A context manager is used so the file handle is closed right after reading.
with open('../data/site/cn.html', 'r', encoding='utf-8') as html_file:
    html_content = html_file.read()

In [11]:
# Convert the loaded HTML; html_to_json returns JSON *text* (a str), not a dict.
json_output = html_to_json(html_content)

In [14]:
# json_output is already JSON text, and save_json() serializes whatever it is
# given. Passing the string directly would encode it a second time and leave
# home.json holding one big quoted string, so decode it back to objects first.
save_json(json.loads(json_output), 'home.json')