In [122]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from io import StringIO

def html_to_soup(url):
	"""Download a PRTS mobile-wiki page and parse it.

	``url`` is the site-relative path that gets appended to
	``https://m.prts.wiki``. Returns the parsed ``BeautifulSoup`` document,
	or ``None`` when the request times out (timeout tuple ``(5, 5)``).
	"""
	page_url = 'https://m.prts.wiki' + url
	try:
		response = requests.get(page_url, timeout=(5, 5))
	except requests.exceptions.Timeout:
		return None
	return BeautifulSoup(response.text, 'html.parser')

# soup -> df
def make_df(soup):
	"""Parse the story script embedded in a page into a DataFrame.

	The script is read from ``<script id="datas_txt" type="csv">``. Every
	non-blank line becomes one record:

	* ``[Cmd(key=val, ...)] text`` -> ``command`` (lower-cased), ``text`` and
	  one column per parameter;
	* ``[cmd="value"] text`` -> the value is stored in a column named after
	  the lower-cased command;
	* anything else -> ``command`` is ``None`` and the line is the ``text``.

	Rows that have ``options`` but no text are expanded into one row per
	``;``-separated option (the option becomes ``text``), and
	``option_number`` records how many options the row had.

	Raises:
		ValueError: if the page has no ``datas_txt`` script block.
	"""
	script = soup.find('script', {'id': 'datas_txt', 'type': 'csv'})
	if script is None or script.string is None:
		raise ValueError('datas_txt script block not found in page')
	lines = [line.strip() for line in script.string.splitlines() if line.strip()]
	pattern = re.compile(
		r'^\[(?P<cmd>\w+)'
		r'(?:\((?P<args>.*?)\)|=(?P<val>[^]]+))?'
		r'\](?:\s*(?P<text>.*))?$'
	)
	records = []
	for ln in lines:
		m = pattern.match(ln)
		if m is None:
			records.append({'command': None, 'text': ln})
			continue

		cmd = m.group('cmd').lower()
		args = m.group('args')
		val = m.group('val')
		text = m.group('text') or ''

		params = {}
		if args:
			# Parse "(key=val, ...)"; split only on commas that start a new key.
			for part in re.split(r',\s*(?=\w+=)', args):
				k, sep, v = part.partition('=')
				if not sep:
					# A chunk without '=' carries no key/value pair; skip it
					# instead of failing on tuple unpacking.
					continue
				params[k] = v.strip().strip('"')
		elif val is not None:
			# The '[cmd="..."]' form uses the command name as the key.
			params[cmd] = val.strip('"')

		records.append({'command': cmd, 'text': text, **params})

	df = pd.DataFrame(records)
	# Literal (non-regex) replacement: the placeholders contain '.', '{' and '}'.
	df['text'] = df['text'].str.replace('Dr.{@nickname}', '博士', regex=False)
	df['text'] = df['text'].str.replace('{@nickname}博士', '博士', regex=False)
	if 'name' not in df.columns:
		df['name'] = ''

	# Move the player's choices from `options` into `text`, one row per choice.
	if 'options' in df.columns:
		df['option_number'] = df['options'].fillna('').apply(
			lambda x: len(x.split(';')) if x else 0
		)
		# Rows to expand: empty/NaN text together with non-empty options.
		mask = (df['text'].isna() | df['text'].eq('')) & df['options'].notna() & df['options'].ne('')

		to_exp = df[mask].copy()
		to_exp['text'] = to_exp['options'].str.split(';')
		to_exp = to_exp.explode('text')

		to_keep = df[~mask].copy()

		# Exploded rows share their parent's index label, so the sort must be
		# stable to keep the choices in their original order.
		df = (
			pd.concat([to_keep, to_exp], axis=0)
			.sort_index(kind='mergesort')
			.reset_index(drop=True)
		)
	return df


# soup -> dict mapping character id to image URL
def char_img_dict(soup):
	"""Build a ``{lower-cased name: html}`` dict from the ``datas_char`` script.

	The script content is read as a header-less two-column CSV whose columns
	are labelled ``name`` and ``html``.
	"""
	raw_csv = soup.find('script', {'id': 'datas_char', 'type': 'csv'}).string
	table = pd.read_csv(StringIO(raw_csv), header=None)
	table.columns = ['name', 'html']
	lowered_names = table['name'].str.lower()
	return dict(zip(lowered_names, table['html']))

In [123]:
from openai import OpenAI
import json
from typing import List
from pydantic import BaseModel
import os

# Shared API client used by translate() and translate_title().
client = OpenAI()
# Structured-output schema handed to client.responses.parse() as text_format.
# Documented with comments rather than docstrings on purpose: a pydantic class
# docstring is emitted as the schema description.
class TranslateAllModel(BaseModel):
	# title: str
	# One translated row.
	class TranslateModel(BaseModel):
		# `index` is the row number translate() uses to merge results back.
		index: int
		# Source-language speaker name and its translation.
		name: str
		jp_name:str
		# Source-language line and its translation.
		text: str
		jp_text: str
	# All rows of one request.
	translation: List[TranslateModel]

def translate(df,mini):
	"""Translate the ``name``/``decision`` rows of *df* and merge the result back.

	Args:
		df: script DataFrame with at least ``command``, ``name`` and ``text``.
		mini: truthy -> model ``gpt-4.1-mini``, otherwise ``gpt-4.1``.

	Returns:
		A copy of *df* with an ``index`` column (the row label) merged with the
		model output on ``index``. In the result ``text_orig`` is the original
		text, ``text`` is the text echoed back by the model, ``jp_text`` /
		``jp_name`` are the translations and ``text_match`` tells whether the
		echoed text equals the original. Rows whose echoed text differs are
		printed.
	"""
	# Work on a copy so the caller's frame does not silently gain a column.
	df = df.copy()
	df["index"] = df.index
	targets = df.loc[df['command'].isin(['name', 'decision']), ['index', 'name', 'text']]

	model = 'gpt-4.1-mini' if mini else 'gpt-4.1'

	chunk_size = 80
	processed = []
	for start in range(0, len(targets), chunk_size):
		chunk = targets.iloc[start:start + chunk_size]
		# to_json() already returns a JSON document; wrapping it in json.dumps()
		# again would send a quoted, backslash-escaped string instead of a list.
		prompt = chunk.to_json(orient='records',force_ascii=False)
		response = client.responses.parse(
			model=model,
			input=[
				{"role": "system", "content":
					"""あなたは優秀な翻訳者です。以下のリスト形式の JSON を読み込み、各オブジェクトの 'name',“text” を日本語に翻訳し、キー “jp_name”, “jp_text” として追加してください。出力は同じリスト形式の JSON のまま返してください。
					# 注意:「博士」は「ドクター」と訳すこと。入力が「？」など記号のみの場合，そのまま出力せよ。阿米娅のセリフは敬体にしなさい。"""},
				{"role": "user",   "content": prompt},
			],
			text_format=TranslateAllModel,
		)
		processed_chunk = pd.DataFrame(response.output_parsed.dict()['translation'])
		processed.append(processed_chunk)

	if processed:
		trans_df = pd.concat(processed, ignore_index=True)
	else:
		# Nothing to translate: pd.concat([]) would raise, so merge an empty
		# frame with the expected columns instead.
		trans_df = pd.DataFrame({
			'index': pd.Series(dtype='int64'),
			'text': pd.Series(dtype='object'),
			'jp_text': pd.Series(dtype='object'),
			'jp_name': pd.Series(dtype='object'),
		})
	# Undo escaping the model may echo back. Literal replacement: as a regex
	# these patterns would be invalid or mean something else.
	trans_df['text'] = trans_df['text'].str.replace('\\\\', '\\', regex=False)
	trans_df['text'] = trans_df['text'].str.replace('\\"', '"', regex=False)

	merged = (
		df.merge(
			trans_df[["index", "text", "jp_text",'jp_name']],  # echoed source text and translations
			on="index",
			how="left",
			suffixes=("_orig", "")
		)
		.assign(
			text_match=lambda d: d["text"] == d["text_orig"]  # echoed text identical to the original?
		)
	)
	# Report rows where the model returned text that differs from the source.
	mismatched = merged[~merged["text_match"]][["index", "text",'text_orig']]
	reported = mismatched[mismatched["text"].notna() & mismatched["text"].str.strip().ne("")]
	if not reported.empty:
		print(reported)
	return merged

def text_html(pp,icon='',translation=True):
	"""Render one ``name`` row as a ``<li class="dialogue">`` element.

	Args:
		pp: row (mapping or Series) with ``command``, ``name``, ``text`` and,
			when *translation* is true, ``jp_name`` / ``jp_text``.
		icon: HTML for the speaker icon; ``None`` is treated as no icon.
		translation: also emit the translated name and text.

	Returns:
		The HTML string, or ``''`` when the row is not a ``name`` command.
		Missing (None/NaN) names, texts and translations are omitted instead
		of being rendered as "nan"/"None".
	"""
	if pp['command'] != 'name':
		return ''
	# icon_html() returns None for unknown characters; never print "None".
	icon = icon or ''

	speaker = ''
	if pd.notna(pp['name']) and pp['name']:
		speaker += f'{icon}\n<em class="name">{pp["name"]}</em>\n'
		if translation and pd.notna(pp['jp_name']):
			speaker += f'<em class="translation">{pp["jp_name"]}</em>\n'
	html = f'<div class="character">{speaker}</div>'

	if pd.notna(pp['text']) and pp['text']:
		text = f'<p>{pp["text"]}</p>\n'
		if translation and pd.notna(pp['jp_text']):
			text += f'<p class="translation">{pp["jp_text"]}</p>\n'
		html += f"""<div class="text">\n{text}</div>"""
	return f"""<li class="dialogue">\n{html}\n</li>\n"""

def icon_html(pp,char_dict):
	"""Return the icon ``<div>`` for the character referenced by a row.

	The character id is read from ``name``, or from ``name<N>`` when the row's
	``focus`` value N is greater than 1. Anything after ``#`` is dropped, the
	id is lower-cased, and a few ids are remapped before the lookup.

	Args:
		pp: a pandas Series for one script row.
		char_dict: mapping of character id -> image URL.

	Returns:
		The HTML string, or ``None`` when the id field is missing/NaN or the
		id is not in *char_dict*.
	"""
	field = 'name'
	if 'focus' in pp.index and pd.notna(pp['focus']):
		focus = int(pp['focus'])
		if focus > 1:
			field += str(focus)
	# The focused slot may not exist on this row (or be NaN): no icon then,
	# rather than a KeyError / AttributeError.
	if field not in pp.index or pd.isna(pp[field]):
		return None

	aliases = {'char_136_hsguma':'char_136_hsguma_1',
		'char_1504_cqbw':'char_1504_cqbw_1',
		'char_2006_weiywfmzuki_1':"char_2006_fmzuki_1"}
	cha_id = pp[field].split('#')[0].lower()
	cha_id = aliases.get(cha_id, cha_id)

	if cha_id not in char_dict:
		return None
	url = char_dict[cha_id]
	return f'<div class="icon" style="--img-url: url(\'{url}\')" data-src="{url}"></div>'

def img_back_html(image_id, df_datas_back):
	"""Return a full-width ``<img>`` tag for *image_id*.

	The source URL is the ``html`` value of the first row of *df_datas_back*
	whose ``name`` equals *image_id*.
	"""
	matching_rows = df_datas_back[df_datas_back['name'] == image_id]
	src = matching_rows.html.iloc[0]
	return f'<img src="{src}" width="100%" height="auto">'

from collections import defaultdict
# df -> html (page body)
def _focus_value(raw):
	"""Return a ``focus`` parameter as an int, or None when absent/non-numeric."""
	try:
		return int(float(raw))
	except (TypeError, ValueError):
		return None


def make_html(df,translation=True):
	"""Render the script DataFrame as the HTML body of a story page.

	Background and character image tables are read from the fixed file
	``soup/soup14-8-BEG.txt``. Rows are handled by ``command``:

	* ``decision``  - choices, grouped in one ``<div>``/``<li class="decision">``;
	* ``predicate`` - opens/closes the branch belonging to one choice;
	* ``character`` / ``blocker`` - update or clear the current speaker icon;
	* ``name``      - a dialogue line (see ``text_html``);
	* ``image`` / ``background`` - an ``<img>`` tag.

	Args:
		df: DataFrame from ``make_df`` (plus ``jp_text``/``jp_name`` when
			*translation* is true).
		translation: also emit translated text.

	Returns:
		The concatenated HTML string.
	"""
	with open('soup/soup14-8-BEG.txt','r') as f:
		h = f.read()
	soup = BeautifulSoup(h,'html.parser')
	datas_back = soup.find('script', {'id': 'datas_back', 'type': 'csv'}).string
	df_datas_back = pd.read_csv(StringIO(datas_back), header=None)
	df_datas_back.columns = ['name', 'html']
	char_dict = char_img_dict(soup)

	b = ''
	nokori_div = 0  # choices of the current decision still to be emitted
	# Choice texts of the current decision, looked up by the predicate rows.
	opts_list = []
	opts_list_jp = []
	icon = ''

	for i in range(df.shape[0]):
		pp = df.iloc[i]
		c = pp['command']

		if c == 'decision':
			if nokori_div == 0:
				# First row of a run of choices.
				nokori_div = pp['option_number']
				b += '<div class="decision">'
				b += '<li class="decision">'
			b += f'<p class="decision">「{pp["text"]}」</p>'
			if translation:
				b += f'<p class="translation">「{pp["jp_text"]}」</p>'
			opts_list.append(pp["text"])
			if translation:
				opts_list_jp.append(pp["jp_text"])
			nokori_div -= 1
			if nokori_div == 0:
				# Last choice (also the only one of a single-choice decision).
				b += '</li>'
		elif c == 'predicate':
			if len(opts_list) == 1:
				# A single choice has no branches; just close the decision.
				b += '</div>'
				opts_list = []
				opts_list_jp = []
			else:
				ref = pp.get('references')
				if isinstance(ref, str) and ';' not in ref:
					ref = int(ref)
					if ref > 1:
						b += '</div>'  # close the previous branch
					b += '<div class="predicate">'
					b += f'<p>「{opts_list[ref-1]}」</p>'
					if translation:
						b += f'<p class="translation">「{opts_list_jp[ref-1]}」</p>'
				else:
					# Branches re-join; only act on the last of consecutive predicates.
					if i < df.shape[0]-1 and df.iloc[i+1]['command'] == 'predicate':
						continue
					b += '</div></div>'
					opts_list = []
					opts_list_jp = []
		elif c == 'character':
			if pd.notna(pp['name']):
				icon = icon_html(pp,char_dict)
			else:
				icon = ''
			# Parameters are parsed as strings, so compare focus numerically.
			if 'focus' in pp.index and _focus_value(pp['focus']) == -1:
				icon = ""
		elif c == 'blocker':
			icon = ''
		elif c == 'name':
			b += text_html(pp,icon,translation)
		elif c in ('image','background'):
			image_id = pp.get('image')
			if pd.notna(image_id):
				if c == 'background':
					image_id = 'bg_' + image_id
				b += img_back_html(image_id, df_datas_back)+'\n'
	return b

In [124]:
def make_filename(url, extension='html'):
	"""Derive an output file name from a wiki path.

	Path segments from the third one on are used: the first is cut at its
	first ``_`` and, if a second exists, it is appended after ``-``.
	``.html``, ``.csv`` or ``.txt`` is added for those extensions; any other
	value adds no suffix.
	"""
	segments = url.split('/')[2:]
	stem = segments[0].split('_')[0]
	if len(segments) > 1:
		stem = f'{stem}-{segments[1]}'
	for known in ('html', 'csv', 'txt'):
		if extension == known:
			stem += '.' + known
	return stem

def translate_title(title):
	"""Translate a chapter title to Japanese through the API.

	A trailing ``行动前`` / ``行动后`` is removed before the request and
	`` 戦闘前`` / `` 戦闘後`` is appended to the model's answer instead.
	"""
	appendix = ''
	for marker, replacement in (('行动前', ' 戦闘前'), ('行动后', ' 戦闘後')):
		if title.endswith(marker):
			title = title[:-3]
			appendix = replacement
			break
	messages = [
		{
			'role': 'system',
			'content': 'あなたは中国語の翻訳家です。中国語の小説のタイトルを与えるので，日本語に訳してください。英語のみのときや記号のみの場合はそのまま出力せよ。そのほかの出力は絶対にするな',
		},
		{'role': 'user', 'content': title},
	]
	response = client.responses.create(model="gpt-4.1", input=messages)
	return response.output_text + appendix

def make_html_all(url,title,next_page='',translation=True,mini=False,update_html=False, force_translate=False):
	"""Build and save the HTML page for one story chapter.

	Steps: load the script from ``./soup/soup<name>.txt`` (fetching and
	caching it when absent or empty), obtain translations (from an existing
	CSV, or via ``translate`` and then saved as CSV), render the body with
	``make_html`` and write ``./html/<name>.html``.

	Args:
		url: site-relative wiki path of the chapter.
		title: chapter title; its translation is appended in the page title.
		next_page: href for the "next page" button.
		translation: translate the text when no CSV is reused.
		mini: use the mini model; CSVs are then written to ``./csv/mini/``.
		update_html: rebuild even when the HTML file already exists.
		force_translate: ignore any existing CSV.

	Returns:
		None. Returns early when the HTML exists (and *update_html* is false)
		or when the page could not be fetched.
	"""
	if not update_html:
		if os.path.exists('./html/'+make_filename(url)):
			return

	soup_filename = make_filename(url,'txt')
	soup_path = './soup/soup'+soup_filename
	soup = None
	if os.path.exists(soup_path):
		with open(soup_path,'r') as f:
			h = f.read()
		if h: # reuse the cached script when it is not empty
			soup = BeautifulSoup(h,'html.parser')
	if soup is None:
		print(title+' fetching..')
		soup = html_to_soup(url)
		if not soup:
			# Stop here: everything below needs the parsed script.
			print('timeout')
			return
		os.makedirs('./soup', exist_ok=True)
		with open(soup_path,'w') as f:
			f.write(soup.find('script', {'id': 'datas_txt', 'type': 'csv'}).prettify())

	csv_filename = make_filename(url,'csv')
	exist_csv = ''
	if mini:
		csv_path = './csv/mini/'+csv_filename
		# Prefer a full-model CSV, then fall back to an earlier mini CSV.
		for candidate in ('./csv/'+csv_filename, './html/'+csv_filename, csv_path):
			if os.path.exists(candidate):
				exist_csv = candidate
				break
	else:
		csv_path = './csv/'+csv_filename
		if os.path.exists(csv_path):
			exist_csv = csv_path

	df = make_df(soup)

	if exist_csv and not force_translate:
		# Reuse stored translations, matched on the source text.
		df_t = pd.read_csv(exist_csv)
		df_t_valid = df_t[df_t['text'].notna()]
		# Keep only the first row for texts that occur more than once.
		df_t_unique = df_t_valid.drop_duplicates(subset='text')
		df = df.merge(
			df_t_unique[['text', 'jp_text', 'jp_name']],
			on='text',
			how='left'
		)
	else:
		# No usable CSV: translate and store the result.
		print(title+' fetched. now translating..')
		if translation:
			df = translate(df,mini)
		os.makedirs(os.path.dirname(csv_path), exist_ok=True)
		df.to_csv(csv_path, index=False)

	body = make_html(df,translation=translation)
	title = f'{title}　-　{translate_title(title)}'
	html = (f"""<!DOCTYPE html>
	<html lang="cn">
	<head>
	 <meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1">
	 <script src="https://kit.fontawesome.com/c07fe94aba.js" crossorigin="anonymous"></script>
	 <link href="https://fonts.googleapis.com/css2?family=Noto+Sans:wght@400;700&display=swap" rel="stylesheet">
	 <title>{title}</title>
	 <link rel="stylesheet" href="../css/styles.css">
</head><body><h2 class="title">{title}</h2>
<ul>{body}</ul>  <div class="button-container">
   <button class="back-home" onclick="location.href='../index.html'" aria-label="ホーム">
    <i class="fa-solid fa-house"></i>
   </button>
  <button class="next-page" onclick="location.href='{next_page}'">
   <i class="fa-solid fa-angle-right"></i>
  </button>
  </div>
 <div class="overlay">
   <img class="overlay-img" src="" alt="元画像">
  </div>
<script>
	const overlay = document.querySelector('.overlay');
    const img = overlay.querySelector('.overlay-img');

    document.body.addEventListener('click', e => {{
      if (e.target.matches('.icon')) {{
        img.src = e.target.dataset.src;
        overlay.classList.add('show');
      }}
      else if (e.target === overlay) {{
        overlay.classList.remove('show');
      }}    }});
        function fitTextToWidth(el, maxWidth) {{
    // 現在のフォントサイズを取得
    let style = window.getComputedStyle(el);
	let fontSize = parseFloat(style.fontSize);
	// 幅がオーバーしている限り、1pxずつ小さくする
while (el.scrollWidth > maxWidth && fontSize > 8) {{
fontSize -= 1;
el.style.fontSize = fontSize + 'px';
}}
}}

// ページ読み込み後／ウィンドウリサイズ時に実行
function adjustAll() {{
document.querySelectorAll('.character').forEach(char => {{
	const maxW = char.clientWidth;
char.querySelectorAll('em.name, em.translation').forEach(el => {{
// 初期サイズにリセットしてから再計算
el.style.fontSize = '';
fitTextToWidth(el, maxW);
}});
}});
}}

window.addEventListener('load', adjustAll);
window.addEventListener('resize', adjustAll);
</script>
</body>
</html>""")
	soup = BeautifulSoup(html, 'html.parser')
	html = soup.prettify()
	filename = make_filename(url)
	os.makedirs('./html', exist_ok=True)
	with open('./html/'+filename,'w',encoding='utf-8') as f:
		f.write(html)
	print(f"⇒ {title} saved to {filename}")

In [128]:
# Scratch dictionary.
d = {}

In [129]:
# Scratch: store 10 under key 1 of the dict `d`.
d[1]=10

In [130]:
# Scratch: assign 3 to key 1 of `d`, replacing any value stored there.
d[1]=3

In [126]:
# Collect [title, href] pairs for the links found in the tables of a.html,
# skipping links whose text contains '一览'.
with open('a.html', 'r', encoding='utf-8') as f:
	soup = BeautifulSoup(f, 'html.parser')
tables = soup.find_all('table')
links = []
for table in tables:
	for pagelink in table.find_all('a'):
		if 'href' in pagelink.attrs:
			# `.string` is None for anchors with nested markup, which would make
			# the `in` test and re.sub() fail; get_text() always returns a str.
			title = pagelink.get_text()
			if '一览' in title:
				continue
			link = pagelink['href']
			# Drop all whitespace, including full-width spaces.
			title = re.sub(r'[\s\u3000]', '',title)
			links.append([title,link])
print(links)

In [127]:
# Build one page (first entry) whose "next" button targets the second entry.
aa = (
	['3-6决定行动后', '/w/3-6_%E5%86%B3%E5%AE%9A/END'],
	['3-7轰鸣行动前', '/w/3-7_%E8%BD%B0%E9%B8%A3/BEG'],
)
a = aa[0]
n = make_filename(aa[1][1])
make_html_all(
	a[1],
	a[0],
	next_page=n,
	translation=True,
	update_html=True,
	force_translate=False,
)

<script id="datas_txt" type="csv">[HEADER(key="title_test", is_skippable=true, fit_mode="BLACK_MASK")] 
[stopmusic]
[Dialog]
[playMusic(intro="$dignified_intro", key="$dignified_loop", volume=0.4)]
[Delay(time=1)]
[Blocker(a=1, r=0, g=0, b=0, fadetime=1, block=true)]
[Background(image="bg_corridor",screenadapt="coverall")]
[Blocker(a=0, r=0, g=0, b=0, fadetime=3, block=true)]
6:30 p.m. 
[Dialog]
[PlaySound(key="$dooropenquite", volume=0.6)]
[delay(time=2)]
[PlaySound(key="$d_gen_walk_n")]
[delay(time=2)]
[Character]
[name="PRTS"]  编号00000-00002，接入权限-8。
[name="PRTS"]  Dr.{@nickname}，欢迎访问罗德岛综合生物处理室，已依据生物数据对您的意图进行判断。
[name="PRTS"]  另外，系统检测到您的心情不佳。
[Decision(options="少烦我。;......;该怎么样才算是好心情？", values="1;2;3")]
[Predicate(references="1")]
[Character]
[name="PRTS"]  Dr.{@nickname}表现出一定的攻击性。
[name="PRTS"]  请放心，系统不会因此电击你，不用太过顾忌系统对你的检测。
[Predicate(references="2")]
[Character]
[name="PRTS"]  Dr.{@nickname}陷入了沉默。
[name="PRTS"]  无论是无声抗议还是不愿进行沟通，我认为这都是一种孤独性精神障碍症的表现。
[name="PRTS"]  当然，请便，系统会平等对待所有人。


In [52]:
links =  [
	['7-132:00:00', '/w/7-1_32:00:00/NBT'], ['7-2别离之夜行动前', '/w/7-2_%E5%88%AB%E7%A6%BB%E4%B9%8B%E5%A4%9C/BEG'], ['7-2别离之夜行动后', '/w/7-2_%E5%88%AB%E7%A6%BB%E4%B9%8B%E5%A4%9C/END'], ['7-3变节之刃行动前', '/w/7-3_%E5%8F%98%E8%8A%82%E4%B9%8B%E5%88%83/BEG'], ['7-3变节之刃行动后', '/w/7-3_%E5%8F%98%E8%8A%82%E4%B9%8B%E5%88%83/END'], ['7-4并肩之约-1行动前', '/w/7-4_%E5%B9%B6%E8%82%A9%E4%B9%8B%E7%BA%A6-1/BEG'], ['7-5并肩之约-2行动后', '/w/7-5_%E5%B9%B6%E8%82%A9%E4%B9%8B%E7%BA%A6-2/END'], ['7-6遗忘之地行动前', '/w/7-6_%E9%81%97%E5%BF%98%E4%B9%8B%E5%9C%B0/BEG'], ['7-6遗忘之地行动后', '/w/7-6_%E9%81%97%E5%BF%98%E4%B9%8B%E5%9C%B0/END'], ['7-726:37:14', '/w/7-7_26:37:14/NBT'], ['7-8沉默者之怒-1行动前', '/w/7-8_%E6%B2%89%E9%BB%98%E8%80%85%E4%B9%8B%E6%80%92-1/BEG'], ['7-9沉默者之怒-2行动后', '/w/7-9_%E6%B2%89%E9%BB%98%E8%80%85%E4%B9%8B%E6%80%92-2/END'], ['7-10暗淡者之火行动前', '/w/7-10_%E6%9A%97%E6%B7%A1%E8%80%85%E4%B9%8B%E7%81%AB/BEG'], ['7-10暗淡者之火行动后', '/w/7-10_%E6%9A%97%E6%B7%A1%E8%80%85%E4%B9%8B%E7%81%AB/END'], ['7-13感染者之盾-1行动前', '/w/7-13_%E6%84%9F%E6%9F%93%E8%80%85%E4%B9%8B%E7%9B%BE-1/BEG'], ['7-17感染者之盾-2行动后', '/w/7-17_%E6%84%9F%E6%9F%93%E8%80%85%E4%B9%8B%E7%9B%BE-2/END'], ['7-18爱国者之死行动前', '/w/7-18_%E7%88%B1%E5%9B%BD%E8%80%85%E4%B9%8B%E6%AD%BB/BEG'], ['7-18爱国者之死行动后', '/w/7-18_%E7%88%B1%E5%9B%BD%E8%80%85%E4%B9%8B%E6%AD%BB/END'], ['7-1911:15:38', '/w/7-19_11:15:38/NBT'], ['7-20??:??:??', '/w/7-20_%3F%3F:%3F%3F:%3F%3F/NBT'] ,['R8-1昨日，谷壳将裂行动前', '/w/R8-1_%E6%98%A8%E6%97%A5%EF%BC%8C%E8%B0%B7%E5%A3%B3%E5%B0%86%E8%A3%82/BEG'], ['R8-1昨日，谷壳将裂行动后', '/w/R8-1_%E6%98%A8%E6%97%A5%EF%BC%8C%E8%B0%B7%E5%A3%B3%E5%B0%86%E8%A3%82/END'], ['M8-1今日，血色满溢', '/w/M8-1_%E4%BB%8A%E6%97%A5%EF%BC%8C%E8%A1%80%E8%89%B2%E6%BB%A1%E6%BA%A2/NBT'], ['R8-3麦秆，极易燃烧行动前', '/w/R8-3_%E9%BA%A6%E7%A7%86%EF%BC%8C%E6%9E%81%E6%98%93%E7%87%83%E7%83%A7/BEG'], ['R8-3麦秆，极易燃烧行动后', '/w/R8-3_%E9%BA%A6%E7%A7%86%EF%BC%8C%E6%9E%81%E6%98%93%E7%87%83%E7%83%A7/END'], ['M8-2失语，产自多言', '/w/M8-2_%E5%A4%B1%E8%AF%AD%EF%BC%8C%E4%BA%A7%E8%87%AA%E5%A4%9A%E8%A8%80/NBT'], ['R8-4火种，一触即灭行动前', 
'/w/R8-4_%E7%81%AB%E7%A7%8D%EF%BC%8C%E4%B8%80%E8%A7%A6%E5%8D%B3%E7%81%AD/BEG'], ['R8-4火种，一触即灭行动后', '/w/R8-4_%E7%81%AB%E7%A7%8D%EF%BC%8C%E4%B8%80%E8%A7%A6%E5%8D%B3%E7%81%AD/END'], ['M8-3死亡，召之即来', '/w/M8-3_%E6%AD%BB%E4%BA%A1%EF%BC%8C%E5%8F%AC%E4%B9%8B%E5%8D%B3%E6%9D%A5/NBT'], ['R8-5寒冷，来自知觉行动前', '/w/R8-5_%E5%AF%92%E5%86%B7%EF%BC%8C%E6%9D%A5%E8%87%AA%E7%9F%A5%E8%A7%89/BEG'], ['R8-5寒冷，来自知觉行动后', '/w/R8-5_%E5%AF%92%E5%86%B7%EF%BC%8C%E6%9D%A5%E8%87%AA%E7%9F%A5%E8%A7%89/END'], ['M8-4意志，片缕幻影', '/w/M8-4_%E6%84%8F%E5%BF%97%EF%BC%8C%E7%89%87%E7%BC%95%E5%B9%BB%E5%BD%B1/NBT'], ['R8-6战场，蔓延不止行动前', '/w/R8-6_%E6%88%98%E5%9C%BA%EF%BC%8C%E8%94%93%E5%BB%B6%E4%B8%8D%E6%AD%A2/BEG'], ['R8-6战场，蔓延不止行动后', '/w/R8-6_%E6%88%98%E5%9C%BA%EF%BC%8C%E8%94%93%E5%BB%B6%E4%B8%8D%E6%AD%A2/END'], ['M8-5厄运，等候已久', '/w/M8-5_%E5%8E%84%E8%BF%90%EF%BC%8C%E7%AD%89%E5%80%99%E5%B7%B2%E4%B9%85/NBT'], ['R8-8人心，向背无常行动前', '/w/R8-8_%E4%BA%BA%E5%BF%83%EF%BC%8C%E5%90%91%E8%83%8C%E6%97%A0%E5%B8%B8/BEG'], ['R8-8人心，向背无常行动后', '/w/R8-8_%E4%BA%BA%E5%BF%83%EF%BC%8C%E5%90%91%E8%83%8C%E6%97%A0%E5%B8%B8/END'], ['M8-6再见，只为再见行动前', '/w/M8-6_%E5%86%8D%E8%A7%81%EF%BC%8C%E5%8F%AA%E4%B8%BA%E5%86%8D%E8%A7%81/BEG'], ['M8-6再见，只为再见行动后', '/w/M8-6_%E5%86%8D%E8%A7%81%EF%BC%8C%E5%8F%AA%E4%B8%BA%E5%86%8D%E8%A7%81/END'], ['R8-9相逢，总是离别行动前', '/w/R8-9_%E7%9B%B8%E9%80%A2%EF%BC%8C%E6%80%BB%E6%98%AF%E7%A6%BB%E5%88%AB/BEG'], ['R8-9相逢，总是离别行动后', '/w/R8-9_%E7%9B%B8%E9%80%A2%EF%BC%8C%E6%80%BB%E6%98%AF%E7%A6%BB%E5%88%AB/END'], ['M8-7恶言，报应不爽行动前', '/w/M8-7_%E6%81%B6%E8%A8%80%EF%BC%8C%E6%8A%A5%E5%BA%94%E4%B8%8D%E7%88%BD/BEG'], ['M8-7恶言，报应不爽行动后', '/w/M8-7_%E6%81%B6%E8%A8%80%EF%BC%8C%E6%8A%A5%E5%BA%94%E4%B8%8D%E7%88%BD/END'], ['R8-11落雪，浸黑国土行动前', '/w/R8-11_%E8%90%BD%E9%9B%AA%EF%BC%8C%E6%B5%B8%E9%BB%91%E5%9B%BD%E5%9C%9F/BEG'], ['R8-11落雪，浸黑国土行动后', '/w/R8-11_%E8%90%BD%E9%9B%AA%EF%BC%8C%E6%B5%B8%E9%BB%91%E5%9B%BD%E5%9C%9F/END'], ['M8-8苏醒，浮出梦乡行动前', '/w/M8-8_%E8%8B%8F%E9%86%92%EF%BC%8C%E6%B5%AE%E5%87%BA%E6%A2%A6%E4%B9%A1/BEG'], ['M8-8苏醒，浮出梦乡行动后', 
'/w/M8-8_%E8%8B%8F%E9%86%92%EF%BC%8C%E6%B5%AE%E5%87%BA%E6%A2%A6%E4%B9%A1/END'], ['JT8-1恨火，流向原野行动前', '/w/JT8-1_%E6%81%A8%E7%81%AB%EF%BC%8C%E6%B5%81%E5%90%91%E5%8E%9F%E9%87%8E/BEG'], ['JT8-1恨火，流向原野行动后', '/w/JT8-1_%E6%81%A8%E7%81%AB%EF%BC%8C%E6%B5%81%E5%90%91%E5%8E%9F%E9%87%8E/END'], ['JT8-2睁眼，便是日暮行动前', '/w/JT8-2_%E7%9D%81%E7%9C%BC%EF%BC%8C%E4%BE%BF%E6%98%AF%E6%97%A5%E6%9A%AE/BEG'], ['JT8-2睁眼，便是日暮行动后', '/w/JT8-2_%E7%9D%81%E7%9C%BC%EF%BC%8C%E4%BE%BF%E6%98%AF%E6%97%A5%E6%9A%AE/END'], ['JT8-3昂首，足践烈焰行动前', '/w/JT8-3_%E6%98%82%E9%A6%96%EF%BC%8C%E8%B6%B3%E8%B7%B5%E7%83%88%E7%84%B0/BEG'], ['JT8-3昂首，足践烈焰行动后', '/w/JT8-3_%E6%98%82%E9%A6%96%EF%BC%8C%E8%B6%B3%E8%B7%B5%E7%83%88%E7%84%B0/END'], ['END8-1尾声，抑或开始', '/w/END8-1_%E5%B0%BE%E5%A3%B0%EF%BC%8C%E6%8A%91%E6%88%96%E5%BC%80%E5%A7%8B/NBT'], ['EG-1燃烧的片段1', '/w/EG-1_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B51/NBT'], ['EG-2燃烧的片段2', '/w/EG-2_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B52/NBT'], ['EG-3燃烧的片段3', '/w/EG-3_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B53/NBT'], ['EG-4燃烧的片段4', '/w/EG-4_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B54/NBT'], ['EG-5燃烧的片段5', '/w/EG-5_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B55/NBT'], ]
# Build every page in `links`; each page's "next" button targets the following
# entry. Failures are reported per title so one bad page does not stop the run.
for i, (title, url) in enumerate(links):
	if i < len(links) - 1:
		next_p = make_filename(links[i + 1][1])
	else:
		# Last page: no successor. Without this reset the previous value would
		# be reused (the page would link to itself), or be undefined when
		# `links` has a single entry.
		next_p = ''
	try:
		make_html_all(url,title,next_page=next_p,translation=True,mini=True,update_html=False,force_translate=True)
	except Exception as e:
		print(title,e)

20
40
60
80
100
120
140


In [113]:
# Scratch: fetch one story page, parse it into a DataFrame, then display the
# raw datas_txt script tag.
soup = html_to_soup('/w/7-2_%E5%88%AB%E7%A6%BB%E4%B9%8B%E5%A4%9C/BEG')
# soup = html_to_soup('/w/W2G/BEG')
df = make_df(soup)
soup.find('script', {'id': 'datas_txt', 'type': 'csv'})

In [84]:
# Scratch: display the datas_txt script tag of the current `soup`.
soup.find('script', {'id': 'datas_txt', 'type': 'csv'})

index=2 name='' text='ああ、君か。' japanese_text='ああ、君か。'


In [114]:
def relink_home(text):
	"""Point bare ``index.html`` references at ``../index.html``.

	References already written as ``../index.html`` are left untouched, so
	running the rewrite twice does not produce ``../../index.html``.
	"""
	return re.sub(r'(?<!\.\./)index\.html', '../index.html', text)


# Rewrite the home link in every page under ./html; print progress every 20
# directory entries.
i = 0
if not os.path.isdir('./html'):
	print('./html not found')
html_files = os.listdir('./html') if os.path.isdir('./html') else []
for file in html_files:
	if file.endswith('.html'):
		# Always work on the file inside ./html; opening the bare name for
		# writing would create or truncate a file in the working directory.
		path = os.path.join('./html', file)
		with open(path, 'r', encoding='utf-8') as f:
			text = f.read()
		new_text = relink_home(text)
		if new_text != text:
			with open(path, 'w', encoding='utf-8') as f:
				f.write(new_text)
	i += 1
	if i % 20 == 0:
		print(i)

TranslateAllModel(translation=[TranslateModel(index=2, name='', text='哦，是你。', jp_text='ああ、あなたか。'), TranslateModel(index=6, name='', text='离我们上一次见面，已经过去了很久。', jp_text='私たちが最後に会ってから、もう長い時間が経った。'), TranslateModel(index=7, name='', text='这段时间里......你一直徘徊在悬崖的边缘。', jp_text='この間ずっと……あなたは崖っぷちをさまよっていた。'), TranslateModel(index=9, name='', text='你可能已经忘记了你的身份，但你还记得那个名字，这就够了。', jp_text='あなたは自分の正体をもう忘れたかもしれないが、その名前だけは覚えている、それで十分だ。'), TranslateModel(index=10, name='', text='——好了，别在这里逗留太久。', jp_text='――さあ、ここに長く留まらないように。'), TranslateModel(index=11, name='', text='毕竟，你既不是我的客人，也不应该出现在这里。', jp_text='結局、あなたは私の客でもないし、ここにいるべきでもない。'), TranslateModel(index=12, name='', text='她需要你。', jp_text='彼女はあなたを必要としている。'), TranslateModel(index=14, name='', text='12月23日。', jp_text='12月23日。'), TranslateModel(index=15, name='', text='你可能记不清这一天对你来说，究竟意味着什么。', jp_text='この日があなたにとって何を意味するのか、もう思い出せないかもしれない。'), TranslateModel(index=16, name='', text='这会让你陷入十分危险的处境。', jp_text='それはあなたをとても危険な状況に陥れるだろう。'), TranslateModel(index=17, name

In [134]:
# Keep only the name/text columns of the dialogue rows (row labels move into
# an `index` column), preview them, and serialise them as JSON records.
a = df.loc[df['command'] == 'name', ['name', 'text']].reset_index()
a.head()
b = a.to_json(orient='records', force_ascii=False)

Unnamed: 0,index,name,text,jp_text
0,2,,哦，是你。,ああ、あなたか。
1,6,,离我们上一次见面，已经过去了很久。,私たちが最後に会ってから、もう長い時間が経った。
2,7,,这段时间里......你一直徘徊在悬崖的边缘。,この間ずっと……あなたは崖っぷちをさまよっていた。
3,9,,你可能已经忘记了你的身份，但你还记得那个名字，这就够了。,あなたは自分の正体をもう忘れたかもしれないが、その名前だけは覚えている、それで十分だ。
4,10,,——好了，别在这里逗留太久。,――さあ、ここに長く留まらないように。


In [161]:
import json
# 2. Build the prompt handed to the model.
#    The instruction is to translate the text field into Japanese and add it
#    under the jp_text key.
prompt = json.dumps(jb, ensure_ascii=False)
from typing import List

from pydantic import BaseModel

# Structured-output schema for the response (same fields as the class defined
# in the earlier cell).
class TranslateAllModel(BaseModel):
	# title: str
	# One translated row.
	class TranslateModel(BaseModel):
		index: int
		name: str
		jp_name:str
		text: str
		jp_text: str
	translation: List[TranslateModel]


# 3. Call the API (client.responses.parse) and parse the reply into the schema.
response = client.responses.parse(
	model="gpt-4.1",
	input=[
		{"role": "system", "content": "あなたは優秀な翻訳者です。以下のリスト形式の JSON を読み込み、各オブジェクトの 'name',“text” を日本語に翻訳し、キー “jp_name”, “jp_text” として追加してください。出力は同じリスト形式の JSON のまま返してください。注意：「博士」は「ドクター」と訳すこと。入力が「？」など記号のみの場合，そのまま出力せよ。"},
		{"role": "user",   "content": prompt},
	],
	text_format=TranslateAllModel,
)

# response.output_text
# Display the parsed rows as a list of dicts.
response.output_parsed.dict()['translation']


In [163]:
# Keep the parsed translation rows (list of dicts) for the following cells.
q = response.output_parsed.dict()['translation']

Unnamed: 0,index,text,text_orig
0,0,,初始引导
1,1,,
2,2,哦，是你。,哦，是你。
3,3,,
4,4,,


In [170]:
# Turn the translation rows into a DataFrame and preview it.
d_q = pd.DataFrame(q)
d_q.head()

In [96]:
# Add df's row labels (0, 1, 2, ...) as an `index` column.
df["index"] = df.index
trans_df = d_q

# Left-join the translations on `index`; df's own text column becomes
# `text_orig`, the joined one keeps the name `text`.
merged = (
	df
	.merge(
		trans_df[["index", "text", "jp_text"]],  # bring in source text (text) and translation (jp_text)
		on="index",
		how="left",
		suffixes=("_orig","", )
	)
	.assign(
		text_match=lambda d: d["text"] == d["text_orig"]  # is text equal to text_orig?
	)
)


Unnamed: 0,index,name,text,jp_text
107,254,阿米娅,虽然失去了记忆，但博士确实曾与我们......,記憶をなくしていても、博士は確かに私たちと……


In [27]:
# Preview the first five rows of the joined text columns.
merged[['index','text','text_orig']].head(5)

[0]
[1]
[2]


In [35]:
# Rows whose joined text differs from the original ...
m = merged[~merged["text_match"]][["index", "text",'text_orig']]
# ... restricted to rows that actually have non-blank joined text.
k = m[m["text"].notna()& m["text"].str.strip().ne("")]
if k.size:
	# The printed label '不一致' means "mismatch".
	print('不一致')
	print(k)

Unnamed: 0,command,text,is_tutorial,is_skippable,is_autoable,fit_mode,deny_auto_switch_scene,key,volume,delay,...,vibrato,randomness,width,height,fadeout,options,values,references,stageId,waitForSignal
0,HEADER,初始引导,True,True,True,BLACK_MASK,True,,,,...,,,,,,,,,,
1,PlayMusic,,,,,,,$babel_loop,0.8,0.2,...,,,,,,,,,,
2,name,哦，是你。,,,,,,,,,...,,,,,,,,,,
3,Image,,,,,,,,,,...,,,,,,,,,,
4,ImageTween,,,,,,,,,,...,,,,,,,,,,


In [59]:
# Display the rows of df_j whose `index` column equals 254.
df_j[df_j['index']==254]

In [60]:
def func(x, lst=None):
	"""Append *x* to *lst* and return the list.

	A new list is created on every call in which *lst* is ``None``.
	"""
	target = [] if lst is None else lst
	target.append(x)
	return target

# Each call runs without `lst`, so func starts from a new list every time.
for i in range(3):
	print(func(i))

In [61]:
# Scratch: parse a locally saved page (kkk.html) and preview the resulting DataFrame.
with open('kkk.html', 'r', encoding='utf-8') as f:
	soup = BeautifulSoup(f,'html.parser')
df = make_df(soup)
df.head()

Unnamed: 0,index,name,text,option_number
2,2,,哦，是你。,0
6,6,,离我们上一次见面，已经过去了很久。,0
7,7,,这段时间里......你一直徘徊在悬崖的边缘。,0
9,9,,你可能已经忘记了你的身份，但你还记得那个名字，这就够了。,0
10,10,,——好了，别在这里逗留太久。,0
...,...,...,...,...
299,299,阿米娅,希望博士能在战斗中，把过去的感觉找回来。,0
301,301,阿米娅,————也许连你自己都可能还不太相信......,0
302,302,阿米娅,但是我相信你。,0
304,304,阿米娅,——我相信你，一定可以的。,0


In [None]:
# Number of ';'-separated choices per row (0 when there are no options).
df['option_number'] = df['options'].fillna('').apply(
	lambda x: len(x.split(';')) if x else 0
)
# Move the ';'-separated data in `options` into `text`.
# 1. Rows to explode: text is NaN or empty, and options is non-empty.
mask = (df['text'].isna() | df['text'].eq('')) & df['options'].notna() & df['options'].ne('')

# 2. Copy those rows, split the options and explode to one row per choice.
to_exp = df[mask].copy()
to_exp['text'] = to_exp['options'].str.split(';')
to_exp = to_exp.explode('text')

# 3. All other rows are copied unchanged.
to_keep = df[~mask].copy()

# 4. Concatenate and restore the original order. Without the sort the exploded
#    rows would all end up after the kept ones; the sort is stable so choices
#    sharing one index label keep their order.
df = (
	pd.concat([to_keep, to_exp], axis=0)
	.sort_index(kind='mergesort')
	.reset_index(drop=True)
)


In [None]:
# Add the row label as a column and keep the rows that carry dialogue or
# choices. make_df() lower-cases command names, so the filter must use
# 'decision' (as translate() does); 'Decision' would never match.
df["index"] = df.index
p = df[df['command'].isin(['name', 'decision'])]
p = p[['index','name','text','option_number']]

In [None]:
# Display the selected rows.
p

In [None]:

# Scratch version of the page-rendering loop. It walks the rows of `p` with a
# manually advanced cursor `i` and appends HTML to `b`.
# NOTE(review): `i`, `b`, `translation`, `icon`, `watched_img`, `char_dict` and
# `df_datas_back` are not defined in this cell — presumably notebook state from
# other cells; confirm before running.
while i<p.shape[0]:
	pp = p.iloc[i]
	bb = ''
	if pp['command'] == 'decision':
		# Emit all choices of this decision inside one <li>.
		opt_n = pp['option_number']
		opt = []
		bb += '<li class="decision">\n'
		for x in range(opt_n):
			pp= p.iloc[i+x]
			opt.append(pp['text'])
			bb += f'<p class="decision">「{pp["text"]}」</p>\n'
			if translation:
				bb += f'<p class="translation">「{pp["jp_text"]}」</p>\n'
		bb += '</li>\n'
		# Skip past the choice rows just consumed.
		i += opt_n
		if i>=p.shape[0]:
			break
		pred = 0  # flag: set once a predicate <div> has been opened
		while True and opt_n>1 and i<p.shape[0]: # with two or more choices, write the predicate branches as well
			pp = p.iloc[i]

			if pp['command'] == 'predicate':
				# Close the previously opened branch, if any.
				if pred: bb += '</div>\n'
				# A references value containing ';' ends the branch section.
				if ';' in pp['references']:
					break
				bb += '<div class="predicate">\n'
				pred = 1
				# references is 1-based into the collected choices.
				t = opt[int(pp['references'])-1]
				bb+= f'<p>「{t}」</p>\n'
				if translation:
					bb += f'<p class="translation">「{pp["jp_text"]}」</p>\n'
			else:
				# icon = ''
				# if pp['command'] == 'character' and pd.notna(pp['name']):
				# 	icon = icon_html(pp,char_dict)
				# i += 1
				# if i>=p.shape[0]:	break
				bb += text_html(pp,icon=icon, translation=translation)
			i+= 1
		b += f"""<div class="decision">\n{bb}</div>\n"""

	# Background / image rows
	elif p.iloc[i]['command'] == 'image' or p.iloc[i]['command']== 'background':
		image = p.iloc[i]['image']
		if p.iloc[i]['command'] == 'background':
			image = 'bg_'+image

		if pd.notna(image) and image not in watched_img: # avoid emitting the same image twice
			watched_img.append(image)
			# print(image)
			b += img_back_html(image, df_datas_back)+'\n'

	else:
		# Any other row: compute the icon from a character row, then render
		# the FOLLOWING row with it.
		icon = ''
		if pp['command']== 'character' and pd.notna(pp['name']):
			icon = icon_html(pp,char_dict)
		i += 1
		if i>=p.shape[0]:	break
		pp = p.iloc[i]
		b += text_html(pp,icon,translation)

	i += 1