# 解析技能页面

包含主页面以及子项

In [3]:
import json
from bs4 import BeautifulSoup

In [82]:
import os
import pandas as pd

In [77]:
def save_json(obj, fpath):
    """Serialize *obj* as UTF-8 JSON into the ``../data/json/`` directory.

    Args:
        obj: Any JSON-serializable object.
        fpath: File name (relative to ``../data/json/``) to write.

    Raises:
        OSError: If the output directory or file cannot be created.
        TypeError: If *obj* contains values ``json.dump`` cannot serialize.
    """
    out_dir = '../data/json/'
    # makedirs also creates missing parent directories (``../data``) and, with
    # exist_ok=True, is a no-op when the directory is already there.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, fpath), 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(obj, file, ensure_ascii=False)

In [12]:
# Names of the skill index pages to process; each one is used as the base
# name of a saved ``<name>.html`` page and of the ``<name>.json`` output.
skill_page_name = [
    "Active_Skill",
    "Support_Skill",
    "Passive_Skill",
    "Activation_Medium_Skill",
    "Noble_Support_Skill",
    "Magnificent_Support_Skill",
]

## 1. 技能页面解析
https://tlidb.com/cn/Active_Skill

### 1.1 抓取所有tag

In [67]:
def parse_skill_tags(fpath):
    """Return the text of every tag button found on a saved skill index page.

    Args:
        fpath: Path to the saved HTML file.

    Returns:
        list[str]: Text content of each ``<span>`` whose class attribute is
        ``btn btn-sm border mb-1``, in document order.
    """
    # Read inside a context manager so the handle is closed promptly, and
    # decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the saved pages are UTF-8 - confirm.
    with open(fpath, encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    return [tag.text for tag in soup.find_all('span', class_='btn btn-sm border mb-1')]

### 1.2 抓取技能名称

In [68]:
def parse_skill_fname(fpath):
    """Return the link targets of the skill entries on a saved skill index page.

    Args:
        fpath: Path to the saved HTML file.

    Returns:
        list[str]: The ``href`` value of every ``<a>`` nested inside a
        ``<div>`` whose class attribute is ``flex-grow-1 mx-2 my-1``, in
        document order.
    """
    # Read inside a context manager so the handle is closed promptly, and
    # decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the saved pages are UTF-8 - confirm.
    with open(fpath, encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    # Each skill entry lives in a "flex-grow-1 mx-2 my-1" div; collect the
    # href of every anchor inside those divs.
    divs = soup.find_all('div', class_='flex-grow-1 mx-2 my-1')
    return [a['href'] for div in divs for a in div.find_all('a', href=True)]

### 1.3 抓取技能详情

In [78]:
def parse_skill_detail(fpath):
    """Parse one saved skill detail page into a plain dictionary.

    Args:
        fpath: Path to the saved HTML file of a skill detail page.

    Returns:
        dict with the keys ``skill_name``, ``skill_tag``,
        ``skill_hero_binding``, ``skill_weapon_restrict_description``,
        ``skill_description_text``, ``skill_tooltip`` and ``skill_detail``.
        ``skill_detail`` maps each table header to the list of cell texts in
        that column, or is ``{'null': ''}`` when the page has no detail table.

    Raises:
        ValueError: If the page contains no ``card ui_item popupItem`` div.
    """
    # Read inside a context manager so the handle is closed promptly, and
    # decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the saved pages are UTF-8 - confirm.
    with open(fpath, encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    # The skill summary lives in the "card ui_item popupItem" div.
    card = soup.find('div', class_='card ui_item popupItem')
    if card is None:
        # Fail with the offending file name rather than an opaque
        # AttributeError on None further down.
        raise ValueError(f'no skill card found in {fpath!r}')

    # Skill name from the card title.
    card_title = card.find('h5', class_='card-title').text.strip()

    # Tag labels attached to the skill.
    tags = card.find_all('span', class_='border p-1 mb-1 tag')
    tag_text = [tag.text.strip() for tag in tags]

    # Optional weapon restriction text; empty string when the node is absent.
    weapon_restrict_node = card.find('div', {'data-block': 'weapon_restrict_description'})
    weapon_restrict_description = weapon_restrict_node.text.strip() if weapon_restrict_node else ''

    # All explicitMod lines joined into one newline-separated description.
    explicit_mods = card.find_all('div', class_='explicitMod')
    explicit_mod_text = '\n'.join([mod.text.strip() for mod in explicit_mods])

    # Every <e data-bs-toggle="tooltip"> element: its text and tooltip title.
    tooltips = []
    for e_tag in card.find_all('e', attrs={'data-bs-toggle': 'tooltip'}):
        tooltip_text = e_tag.text.strip()
        tooltip_title = e_tag.get('data-bs-title', '')
        tooltips.append({'text': tooltip_text, 'data-bs-title': tooltip_title})

    # Buttons with class "btn btn-sm mb-1 border" (searched page-wide, not
    # just inside the card): the nested link gives the hero name and href,
    # and the text after the first ':' in the button is the unlock level.
    buttons = soup.find_all('button', class_='btn btn-sm mb-1 border')
    hero_binding = []

    for button in buttons:
        a_tag = button.find('a')
        text = a_tag.text.strip()
        unlock_level = button.text.strip().split(':')[1].strip()
        href = a_tag['href']
        hero_binding.append({'hero_name': text, 'unlock_level': unlock_level, 'href': href})

    # The detail table, if any, sits inside the first "card mb-2" div.
    table_div = soup.find('div', class_='card mb-2')
    # A "card mb-2" div without a <table> is treated like a missing div.
    table = table_div.find('table') if table_div is not None else None

    if table is not None:
        # Column headers.
        headers = [th.text.strip() for th in table.find('thead').find_all('th')]

        # Body rows, stored column-wise: header -> list of cell texts.
        rows = table.find('tbody').find_all('tr')
        detail_data = {header: [] for header in headers}

        for row in rows:
            cols = row.find_all('td')
            for i, col in enumerate(cols):
                detail_data[headers[i]].append(col.text)
    else:
        # Placeholder kept so the output schema always has a 'skill_detail' dict.
        detail_data = {'null': ''}

    # Assemble the result record.
    result = {
        'skill_name': card_title,
        'skill_tag': tag_text,
        'skill_hero_binding': hero_binding,
        'skill_weapon_restrict_description': weapon_restrict_description,
        'skill_description_text': explicit_mod_text,
        'skill_tooltip': tooltips,
        'skill_detail': detail_data
    }
    return result

### 1.4 分别抽取数据并存为json

In [79]:
# Directory holding the locally saved HTML pages.
base_dir = '../data/site/cn/'

for skill_key in skill_page_name:
    # The index page of a category supplies both its tag list and the names
    # of the detail pages to parse.
    fpath = os.path.join(base_dir, f'{skill_key}.html')
    skill_data = {
        'skill_main_name': skill_key,
        'skill_all_tags': parse_skill_tags(fpath),
        'skill_detail_node': [],
    }

    for skill_name in parse_skill_fname(fpath):
        detail_data = parse_skill_detail(os.path.join(base_dir, f'{skill_name}.html'))
        # Record which page the entry came from alongside the parsed fields.
        detail_data['skill_fname'] = skill_name
        skill_data['skill_detail_node'].append(detail_data)

    # One JSON file per skill category.
    save_json(skill_data, f'{skill_key}.json')

## 2. tooltips页面解析

In [65]:
def parse_tooltips_page(fpath):
    """Parse the table of a saved tooltips page into column-wise lists.

    Args:
        fpath: Path to the saved HTML file.

    Returns:
        dict[str, list[str]]: Maps each table header to the list of cell
        texts found in that column, in row order.

    Raises:
        ValueError: If the page has no ``card mb-2`` div containing a table.
    """
    # Read inside a context manager so the handle is closed promptly, and
    # decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the saved pages are UTF-8 - confirm.
    with open(fpath, encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    # The table sits inside the first "card mb-2" div.
    div = soup.find('div', class_='card mb-2')
    table = div.find('table') if div is not None else None
    if table is None:
        # Fail with the offending file name rather than an opaque
        # AttributeError on None further down.
        raise ValueError(f'no tooltips table found in {fpath!r}')

    # Column headers.
    headers = [th.text.strip() for th in table.find('thead').find_all('th')]

    # Body rows, stored column-wise: header -> list of cell texts.
    rows = table.find('tbody').find_all('tr')
    detail_data = {header: [] for header in headers}

    for row in rows:
        cols = row.find_all('td')
        for i, col in enumerate(cols):
            detail_data[headers[i]].append(col.text)
    return detail_data

In [80]:
# Parse the saved Hyperlink page into a dict of table header -> column values.
tooltips_data = parse_tooltips_page('../data/site/cn/Hyperlink.html')

In [81]:
# Persist the parsed tooltips table as tooltips.json (plain string literal:
# the name has no placeholders, so no f-string is needed).
save_json(tooltips_data, 'tooltips.json')

In [83]:
# Build a DataFrame with one column per key of tooltips_data.
tooltips_df = pd.DataFrame(tooltips_data)

In [85]:
# Remove the 'manual_rule_id' column in place before exporting.
del tooltips_df['manual_rule_id']

In [86]:
# Write the remaining columns to an Excel workbook.
tooltips_df.to_excel('../data/tips.xlsx')