In [122]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from io import StringIO

def html_to_soup(url):
	"""Download a page from the mobile PRTS wiki and parse it.

	Args:
		url: Site-relative path (e.g. ``/w/W2G/BEG``) appended to
			``https://m.prts.wiki``.

	Returns:
		The page parsed by ``BeautifulSoup`` with ``html.parser``, or ``None``
		when the request times out (5 s connect / 5 s read).  Other
		``requests`` errors are not caught here.
	"""
	target = 'https://m.prts.wiki' + url
	try:
		response = requests.get(target, timeout=(5, 5))
	except requests.exceptions.Timeout:
		return None
	return BeautifulSoup(response.text, 'html.parser')

# soup -> df
def make_df(soup):
	"""Parse the story script embedded in a page into a DataFrame.

	The script lives in ``<script id="datas_txt" type="csv">``.  Every
	non-blank line becomes one row:

	* ``[Cmd(key=val, ...)] text`` -> ``command`` (lower-cased), ``text`` and
	  one column per key;
	* ``[cmd="val"] text``        -> ``command``, ``text`` and a column named
	  after the (lower-cased) command holding ``val``;
	* anything else               -> ``command`` is None, ``text`` is the line.

	Rows whose ``text`` is empty but which carry an ``options`` parameter are
	expanded into one row per ``;``-separated option (the option goes into
	``text``); ``option_number`` records how many options the row had.

	Args:
		soup: Parsed page exposing ``find`` (a ``BeautifulSoup`` object).

	Returns:
		pandas.DataFrame with at least ``command``, ``text`` and ``name``.

	Raises:
		ValueError: if the page has no ``datas_txt`` script or it is empty.
	"""
	script = soup.find('script', {'id': 'datas_txt', 'type': 'csv'})
	if script is None or script.string is None:
		raise ValueError('datas_txt script block not found in page')

	stripped = (line.strip() for line in script.string.splitlines())
	lines = [line for line in stripped if line]

	pattern = re.compile(
		r'^\[(?P<cmd>\w+)'
		r'(?:\((?P<args>.*?)\)|=(?P<val>[^]]+))?'
		r'\](?:\s*(?P<text>.*))?$'
	)
	records = []
	for ln in lines:
		m = pattern.match(ln)
		if not m:
			# Plain narration line without a [command] prefix.
			records.append({'command': None, 'text': ln})
			continue

		cmd = m.group('cmd').lower()
		args = m.group('args')
		val = m.group('val')
		text = m.group('text') or ''

		params = {}
		if args:
			# Parse "(key=val, ...)"; only split on commas that start a new key.
			for part in re.split(r',\s*(?=\w+=)', args):
				if '=' not in part:
					# Fragment without a key (e.g. blank args): nothing to record.
					continue
				k, v = part.split('=', 1)
				params[k.strip()] = v.strip().strip('"')
		elif val is not None:
			# [cmd="..."] form: store the value under the command name.
			params[cmd] = val.strip('"')

		records.append({'command': cmd, 'text': text, **params})

	if records:
		df = pd.DataFrame(records)
	else:
		df = pd.DataFrame(columns=['command', 'text'])

	# Literal replacements: regex=False keeps '.' and '{' from being read as a
	# pattern regardless of the pandas version's default.
	df['text'] = df['text'].str.replace('Dr.{@nickname}', '博士', regex=False)
	df['text'] = df['text'].str.replace('{@nickname}博士', '博士', regex=False)
	if 'name' not in df.columns:
		df['name'] = ''

	# Move the player's own lines from `options` into `text`.
	if 'options' in df.columns:
		df['option_number'] = df['options'].fillna('').apply(
			lambda x: len(x.split(';')) if x else 0
		)
		# Rows to expand: empty/missing text AND non-empty options.
		mask = (df['text'].isna() | df['text'].eq('')) & df['options'].notna() & df['options'].ne('')

		to_exp = df[mask].copy()
		to_exp['text'] = to_exp['options'].str.split(';')
		to_exp = to_exp.explode('text')

		to_keep = df[~mask].copy()

		# Exploded rows share one index label, so the sort must be stable to
		# keep the options in their original left-to-right order.
		df = (
			pd.concat([to_keep, to_exp], axis=0)
			.sort_index(kind='mergesort')
			.reset_index(drop=True)
		)
	return df


# soup,df -> キャラのurlのdict
def char_img_dict(soup):
	"""Build a lookup from lower-cased character id to its image URL.

	Reads the two-column, header-less CSV stored in
	``<script id="datas_char" type="csv">`` of the parsed page.

	Args:
		soup: Parsed page exposing ``find``.

	Returns:
		dict mapping the first CSV column (lower-cased) to the second column.
	"""
	raw_csv = soup.find('script', {'id': 'datas_char', 'type': 'csv'}).string
	table = pd.read_csv(StringIO(raw_csv), header=None)
	table.columns = ['name', 'html']
	keys = table['name'].str.lower()
	return dict(zip(keys, table['html']))

In [123]:
from openai import OpenAI
import json
from typing import List
from pydantic import BaseModel
import os

# Module-level OpenAI client; translate() and translate_title() call it.
client = OpenAI()
# Structured-output schema handed to client.responses.parse(text_format=...)
# in translate().  Documented with comments rather than docstrings so the
# class definitions themselves stay unchanged.
class TranslateAllModel(BaseModel):
	# title: str
	# One translated record; translate() sends index/name/text and asks the
	# model to add jp_name/jp_text.
	class TranslateModel(BaseModel):
		index: int  # row number of the source line (the "index" field sent in the request)
		name: str  # speaker name as sent
		jp_name:str  # Japanese rendering of `name`
		text: str  # source text echoed back by the model
		jp_text: str  # Japanese rendering of `text`
	# All records of one request chunk.
	translation: List[TranslateModel]

def translate(df,mini,chunk_size=80):
	"""Translate the dialogue and choice rows of a story frame into Japanese.

	Rows whose ``command`` is ``name`` or ``decision`` are sent to the OpenAI
	Responses API in chunks; the parsed answers are merged back on the row
	number.

	Args:
		df: Frame produced by ``make_df`` (needs ``command``, ``name``,
			``text``).  It is not modified.
		mini: Truthy selects ``gpt-4.1-mini``, otherwise ``gpt-4.1``.
		chunk_size: Maximum number of rows per API request.

	Returns:
		A new frame with every input row plus:
		``index`` (row number), ``text_orig`` (the input text), ``text`` (the
		source text as echoed by the model, NaN for untranslated rows),
		``jp_text``, ``jp_name`` and ``text_match`` (``text == text_orig``).
		Rows where the echoed text differs from the original are printed.

	Raises:
		ValueError: if ``chunk_size`` is smaller than 1.
		RuntimeError: if the API returns no parsed output for a chunk.
	"""
	if chunk_size < 1:
		raise ValueError('chunk_size must be at least 1')

	# Work on a copy so the caller's frame does not silently gain a column.
	df = df.assign(index=df.index)
	targets = df.loc[df['command'].isin(['name', 'decision']), ['index', 'name', 'text']]

	model = 'gpt-4.1-mini' if mini else 'gpt-4.1'

	processed = []
	for start in range(0, len(targets), chunk_size):
		chunk = targets.iloc[start:start + chunk_size]
		# to_json already yields the JSON list; encoding it a second time with
		# json.dumps would wrap it in a quoted string full of backslash escapes.
		prompt = chunk.to_json(orient='records', force_ascii=False)
		response = client.responses.parse(
			model=model,
			input=[
				{"role": "system", "content":
					"""あなたは優秀な翻訳者です。以下のリスト形式の JSON を読み込み、各オブジェクトの 'name',“text” を日本語に翻訳し、キー “jp_name”, “jp_text” として追加してください。出力は同じリスト形式の JSON のまま返してください。
					# 注意:「博士」は「ドクター」と訳すこと。入力が「？」など記号のみの場合，そのまま出力せよ。阿米娅のセリフは敬体にしなさい。"""},
				{"role": "user",   "content": prompt},
			],
			text_format=TranslateAllModel,
		)
		parsed = response.output_parsed
		if parsed is None:
			raise RuntimeError(f'no parsed translation returned for rows starting at {start}')
		processed.append(pd.DataFrame(parsed.dict()['translation']))

	if processed:
		trans_df = pd.concat(processed, ignore_index=True)
	else:
		# Nothing to translate (no dialogue rows): merge against an empty,
		# correctly typed table so the result keeps the usual columns.
		trans_df = pd.DataFrame({
			'index': pd.Series(dtype='int64'),
			'text': pd.Series(dtype='object'),
			'jp_text': pd.Series(dtype='object'),
			'jp_name': pd.Series(dtype='object'),
		})

	merged = (
		df.merge(
			trans_df[["index", "text", "jp_text", 'jp_name']],  # echoed source text and translations
			on="index",
			how="left",
			suffixes=("_orig", "")
		)
		.assign(
			text_match=lambda d: d["text"] == d["text_orig"]  # did the model echo the source verbatim?
		)
	)
	# Report rows where the model altered the source text.
	m = merged[~merged["text_match"]][["index", "text", 'text_orig']]
	k = m[m["text"].notna() & m["text"].astype(str).str.strip().ne("")]
	if k.size:
		print(k)
	return merged

def _has_value(value):
	"""Return True for a cell that is neither missing (None/NaN) nor empty."""
	return value is not None and not pd.isna(value) and value != ''


def text_html(pp,icon='',translation=True):
	"""Render one dialogue row as an ``<li class="dialogue">`` element.

	Args:
		pp: Row (Series or mapping) with ``command``, ``name``, ``text`` and,
			when ``translation`` is on, ``jp_name`` / ``jp_text``.
		icon: Ready-made icon markup placed before the speaker name; ``None``
			is treated as no icon.
		translation: Whether to add the Japanese name/text lines.

	Returns:
		The HTML string, or ``''`` when the row's command is not ``name``.
		Missing (NaN) or empty cells are skipped instead of being printed as
		the literal text ``nan``.
	"""
	if pp['command'] != 'name':
		return ''

	# NOTE(review): values are inserted without HTML escaping, as before.
	icon = icon or ''
	speaker = ''
	if _has_value(pp['name']):
		speaker += f'{icon}\n<em class="name">{pp["name"]}</em>\n'
		if translation and _has_value(pp.get('jp_name')):
			speaker += f'<em class="translation">{pp["jp_name"]}</em>\n'
	item = f'<div class="character">{speaker}</div>'

	if _has_value(pp['text']):
		lines = f'<p>{pp["text"]}</p>\n'
		if translation and _has_value(pp.get('jp_text')):
			lines += f'<p class="translation">{pp["jp_text"]}</p>\n'
		item += f"""<div class="text">\n{lines}</div>"""
	return f"""<li class="dialogue">\n{item}\n</li>\n"""

# Character ids whose image is registered under a different key in the
# character table.
_CHAR_ID_ALIASES = {
	'char_136_hsguma': 'char_136_hsguma_1',
	'char_1504_cqbw': 'char_1504_cqbw_1',
	'char_2006_weiywfmzuki_1': "char_2006_fmzuki_1",
}


def icon_html(pp,char_dict):
	"""Return the icon ``<div>`` for the character a ``character`` row focuses on.

	The character id is read from ``name`` or, when ``focus`` is an integer
	N > 1, from ``nameN``.  Anything after ``#`` in the id is dropped and the
	id is lower-cased before lookup.

	Args:
		pp: pandas Series for the row.
		char_dict: Mapping of character id to image URL.

	Returns:
		The icon markup, or ``None`` when the id column is absent or empty or
		the id is not in ``char_dict``.
	"""
	column = 'name'
	if 'focus' in pp.index and pd.notna(pp['focus']):
		focus = int(pp['focus'])
		if focus > 1:
			column += str(focus)

	raw_id = pp.get(column)
	if not isinstance(raw_id, str):
		# Column missing or NaN: there is no character to show.
		return None

	cha_id = raw_id.split('#')[0].lower()
	cha_id = _CHAR_ID_ALIASES.get(cha_id, cha_id)
	if cha_id not in char_dict:
		return None
	url = char_dict[cha_id]
	return f'<div class="icon" style="--img-url: url(\'{url}\')" data-src="{url}"></div>'

def img_back_html(image_id, df_datas_back):
	"""Return a full-width ``<img>`` tag for a background/image id.

	Args:
		image_id: Value to look up in the ``name`` column.
		df_datas_back: Frame with ``name`` and ``html`` columns.

	Returns:
		``<img>`` markup whose ``src`` is the ``html`` value of the first row
		matching ``image_id``.

	Raises:
		IndexError: if no row matches.
	"""
	matches = df_datas_back.loc[df_datas_back['name'] == image_id, 'html']
	src = matches.iloc[0]
	return f'<img src="{src}" width="100%" height="auto">'

from collections import defaultdict
# soup,df -> html（本文）
def _decision_paragraphs(row, translation):
	"""Return the ``<p>`` markup for one player choice and, optionally, its translation."""
	markup = f'<p class="decision">「{row["text"]}」</p>'
	if translation:
		markup += f'<p class="translation">「{row["jp_text"]}」</p>'
	return markup


def _focus_is_minus_one(row):
	"""Return True when the row's ``focus`` equals -1, whether stored as text or number."""
	if 'focus' not in row.index or pd.isna(row['focus']):
		return False
	try:
		return float(row['focus']) == -1
	except (TypeError, ValueError):
		return False


def make_html(df,translation=True):
	"""Render a parsed (and optionally translated) story frame as HTML list items.

	Background and character image tables are read from the saved page
	``soup/soup14-8-BEG.txt``.

	Row handling, by ``command``:

	* ``decision``  - consecutive option rows are grouped inside one
	  ``<div class="decision"><li class="decision">``;
	* ``predicate`` - opens a ``<div class="predicate">`` headed by the chosen
	  option, or closes the open blocks when it references several options;
	* ``character`` / ``blocker`` - update or clear the current speaker icon;
	* ``name``      - a dialogue line, rendered by ``text_html``;
	* ``image`` / ``background`` - an ``<img>`` from the background table.

	Args:
		df: Story frame; needs ``command``, ``text``, ``name`` and, for
			decisions, ``option_number`` (plus ``jp_text`` / ``jp_name`` when
			``translation`` is on).
		translation: Whether to emit the Japanese lines.

	Returns:
		The concatenated HTML fragment (the ``<ul>`` body).
	"""
	p = df
	with open('soup/soup14-8-BEG.txt','r') as f:
		h = f.read()
	soup = BeautifulSoup(h,'html.parser')
	datas_back = soup.find('script', {'id': 'datas_back', 'type': 'csv'}).string
	df_datas_back = pd.read_csv(StringIO(datas_back), header=None)
	df_datas_back.columns = ['name', 'html']
	char_dict = char_img_dict(soup)

	parts = []
	remaining = 0      # option rows still expected in the current decision group
	opts_list = []     # option texts of the current decision, for predicate headers
	opts_list_jp = []  # their translations
	icon = ''
	n_rows = p.shape[0]

	for i in range(n_rows):
		pp = p.iloc[i]
		c = pp['command']

		if c == 'decision':
			if remaining == 0:
				# First option of a new group: open the wrappers.
				remaining = pp['option_number']
				parts.append('<div class="decision">')
				parts.append('<li class="decision">')
			parts.append(_decision_paragraphs(pp, translation))
			opts_list.append(pp["text"])
			if translation:
				opts_list_jp.append(pp["jp_text"])
			remaining -= 1
			if remaining == 0:
				# Last option (including a single-option group): close the item.
				parts.append('</li>')
		elif c == 'predicate':
			if len(opts_list) == 1:
				# Single choice: there are no branches, just close the block.
				parts.append('</div>')
				opts_list = []
				opts_list_jp = []
			else:
				ref = pp.get('references')
				if isinstance(ref, str) and ';' not in ref:
					ref = int(ref)
					if ref > 1:
						parts.append('</div>')  # close the previous branch
					parts.append('<div class="predicate">')
					parts.append(f'<p>「{opts_list[ref-1]}」</p>')
					if translation:
						parts.append(f'<p class="translation">「{opts_list_jp[ref-1]}」</p>')
				else:
					# Joined reference ("1;2;3") ends the branches; when several
					# predicates follow each other only the last one closes.
					if i < n_rows-1 and p.iloc[i+1]['command'] == 'predicate':
						continue
					parts.append('</div></div>')
					opts_list = []
					opts_list_jp = []
		elif c == 'character':
			if pd.notna(pp['name']):
				# icon_html returns None for unknown characters; show no icon then.
				icon = icon_html(pp, char_dict) or ''
			else:
				icon = ''
			if _focus_is_minus_one(pp):
				icon = ''
		elif c == 'blocker':
			icon = ''
		elif c == 'name':
			parts.append(text_html(pp, icon, translation))
		elif c in ('image', 'background'):
			image_id = pp.get('image')
			if pd.notna(image_id):
				if c == 'background':
					image_id = 'bg_' + image_id
				parts.append(img_back_html(image_id, df_datas_back) + '\n')
	return ''.join(parts)

In [124]:
def make_filename(url, extension='html'):
	"""Derive a local file name from a wiki page path.

	``/w/<stage>_<title>/<part>[/<sub>...]`` becomes
	``<stage>-<part>[-<sub>...].<extension>``: the first path segment is cut
	at its first ``_`` and every following segment is appended with ``-``, so
	pages such as ``.../END/SP1`` and ``.../END/SP2`` get distinct names.

	Args:
		url: Site-relative path starting with ``/w/``.
		extension: File extension without the dot; empty for none.

	Returns:
		The file name (no directory).

	Raises:
		ValueError: if the path has no page segment after ``/w/``.
	"""
	segments = url.split('/')[2:]
	if not segments or not segments[0]:
		raise ValueError(f'cannot derive a file name from {url!r}')

	stem = '-'.join([segments[0].split('_')[0], *segments[1:]])
	if extension:
		stem += '.' + extension
	return stem

def translate_title(title):
	"""Translate a Chinese story title into Japanese via the OpenAI API.

	A trailing ``行动前`` / ``行动后`` marker is removed before the request and
	replaced by the fixed suffix `` 戦闘前`` / `` 戦闘後`` on the result.

	Args:
		title: Chinese title.

	Returns:
		The model's output text followed by the suffix (if any).
	"""
	appendix = ''
	for marker, label in (('行动前', ' 戦闘前'), ('行动后', ' 戦闘後')):
		if title.endswith(marker):
			title = title[:-3]
			appendix = label
			break

	system_message = {
		'role': 'system',
		'content': 'あなたは中国語の翻訳家です。中国語の小説のタイトルを与えるので，日本語に訳してください。英語のみのときや記号のみの場合はそのまま出力せよ。そのほかの出力は絶対にするな',
	}
	user_message = {'role':'user','content':title}
	response = client.responses.create(
		model="gpt-4.1",
		input=[system_message, user_message]
	)
	return response.output_text + appendix

def make_html_all(url,title,next_page='',translation=True,mini=False,update_html=False, force_translate=False):
	"""Build and save the bilingual HTML page for one story URL.

	Steps: reuse or fetch the page script (cached under ``./soup/``), parse it
	with ``make_df``, attach translations (from an existing CSV, or freshly
	via ``translate`` and then saved as CSV), render the body with
	``make_html`` and write the page to ``./html/``.

	Args:
		url: Site-relative wiki path of the story page.
		title: Chinese page title; its Japanese translation is appended.
		next_page: Target of the "next page" button.
		translation: Whether to translate and show Japanese text.
		mini: Use the smaller model and the ``./csv/mini/`` output folder.
		update_html: Rebuild even if the HTML file already exists.
		force_translate: Ignore any existing CSV and translate again.

	Returns:
		None.  Returns early when the HTML already exists (and
		``update_html`` is false) or when fetching the page timed out.
	"""
	if not update_html and os.path.exists('./html/'+make_filename(url)):
		return

	soup_path = './soup/soup'+make_filename(url,'txt')
	soup = None
	if os.path.exists(soup_path):
		with open(soup_path,'r',encoding='utf-8') as f:
			h = f.read()
		if h:  # reuse the cached script unless the cache file is empty
			soup = BeautifulSoup(h,'html.parser')
	if soup is None:
		print(title+' fetching..')
		soup = html_to_soup(url)
		if not soup:
			# Stop here: parsing a missing page below would only crash.
			print('timeout')
			return
		script = soup.find('script', {'id': 'datas_txt', 'type': 'csv'})
		if script is not None:
			with open(soup_path,'w',encoding='utf-8') as f:
				f.write(script.prettify())

	csv_filename = make_filename(url,'csv')
	exist_csv = ''
	if mini:
		# NOTE(review): an existing ./csv/mini/ file is never reused here — confirm intent.
		csv_path = './csv/mini/'+csv_filename
		if os.path.exists('./csv/'+csv_filename):
			exist_csv = './csv/'+csv_filename
		elif os.path.exists('./html/'+csv_filename):
			exist_csv = './html/'+csv_filename
	else:
		csv_path = './csv/'+csv_filename
		if os.path.exists(csv_path):
			exist_csv = csv_path

	df = make_df(soup)

	if exist_csv and not force_translate:
		# Reuse stored translations, matched on the source text.
		df_t = pd.read_csv(exist_csv)
		df_t_valid = df_t[df_t['text'].notna()]
		# Keep only the first row for each distinct text.
		df_t_unique = df_t_valid.drop_duplicates(subset='text')
		df = df.merge(
			df_t_unique[['text', 'jp_text', 'jp_name']],
			on='text',
			how='left'
		)
	else:
		# No usable CSV: translate now and store the result.
		print(title+' fetched. now translating..')
		if translation:
			df = translate(df,mini)
		df.to_csv(csv_path, index=False)

	body = make_html(df,translation=translation)
	title = f'{title}　-　{translate_title(title)}'
	html = (f"""<!DOCTYPE html>
	<html lang="cn">
	<head>
	 <meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1">
	 <script src="https://kit.fontawesome.com/c07fe94aba.js" crossorigin="anonymous"></script>
	 <link href="https://fonts.googleapis.com/css2?family=Noto+Sans:wght@400;700&display=swap" rel="stylesheet">
	 <title>{title}</title>
	 <link rel="stylesheet" href="../css/styles.css">
</head><body><h2 class="title">{title}</h2>
<ul>{body}</ul>  <div class="button-container">
   <button class="back-home" onclick="location.href='../index.html'" aria-label="ホーム">
    <i class="fa-solid fa-house"></i>
   </button>
  <button class="next-page" onclick="location.href='{next_page}'">
   <i class="fa-solid fa-angle-right"></i>
  </button>
  </div>
 <div class="overlay">
   <img class="overlay-img" src="" alt="元画像">
  </div>
<script>
	const overlay = document.querySelector('.overlay');
    const img = overlay.querySelector('.overlay-img');

    document.body.addEventListener('click', e => {{
      if (e.target.matches('.icon')) {{
        img.src = e.target.dataset.src;
        overlay.classList.add('show');
      }}
      else if (e.target === overlay) {{
        overlay.classList.remove('show');
      }}    }});
        function fitTextToWidth(el, maxWidth) {{
    // 現在のフォントサイズを取得
    let style = window.getComputedStyle(el);
	let fontSize = parseFloat(style.fontSize);
	// 幅がオーバーしている限り、1pxずつ小さくする
while (el.scrollWidth > maxWidth && fontSize > 8) {{
fontSize -= 1;
el.style.fontSize = fontSize + 'px';
}}
}}

// ページ読み込み後／ウィンドウリサイズ時に実行
function adjustAll() {{
document.querySelectorAll('.character').forEach(char => {{
	const maxW = char.clientWidth;
char.querySelectorAll('em.name, em.translation').forEach(el => {{
// 初期サイズにリセットしてから再計算
el.style.fontSize = '';
fitTextToWidth(el, maxW);
}});
}});
}}

window.addEventListener('load', adjustAll);
window.addEventListener('resize', adjustAll);
</script>
</body>
</html>""")
	page = BeautifulSoup(html, 'html.parser')
	html = page.prettify()
	filename = make_filename(url)
	with open('./html/'+filename,'w',encoding='utf-8') as f:
		f.write(html)
	print(f"⇒ {title} saved to {filename}")

In [128]:
# Scratch cells: create an empty dict, then assign key 1 twice
# (the second assignment overwrites the first, leaving d == {1: 3}).
d = dict()

In [129]:
d[1]=10

In [130]:
d[1]=3

In [131]:
# Collect [title, href] pairs for every link found inside a <table> of the
# saved index page a.html, skipping links whose text contains '一览'.
with open('a.html', 'r', encoding='utf-8') as f:
	soup = BeautifulSoup(f, 'html.parser')
tables = soup.find_all('table')
links = []
for table in tables:
	for pagelink in table.find_all('a'):
		if 'href' not in pagelink.attrs:
			continue
		# get_text() also works for anchors with nested tags, where
		# .string is None and the substring test would raise TypeError.
		title = pagelink.get_text()
		if '一览' in title:
			continue
		link = pagelink['href']
		title = re.sub(r'[\s\u3000]', '', title)  # drop all whitespace, incl. full-width spaces
		links.append([title, link])
print(links)

[['采购中心', '/w/%E9%87%87%E8%B4%AD%E4%B8%AD%E5%BF%83/%E5%89%A7%E6%83%85'], ['EP09前情提要', '/w/EP09/ENTRY'], ['EP10序曲', '/w/EP10/ENTRY'], ['EP11序曲', '/w/EP11/ENTRY'], ['EP12序曲', '/w/EP12/ENTRY'], ['15-17“她”行动后分支1', '/w/15-17_%E2%80%9C%E5%A5%B9%E2%80%9D/END/SP1'], ['15-17“她”行动后分支2', '/w/15-17_%E2%80%9C%E5%A5%B9%E2%80%9D/END/SP2'], ['序章·上', '/w/W2G/BEG'], ['序章·下', '/w/G2H/END'], ['0-1坍塌行动前', '/w/0-1_%E5%9D%8D%E5%A1%8C/BEG'], ['0-1坍塌行动后', '/w/0-1_%E5%9D%8D%E5%A1%8C/END'], ['0-2守卫行动前', '/w/0-2_%E5%AE%88%E5%8D%AB/BEG'], ['0-2守卫行动后', '/w/0-2_%E5%AE%88%E5%8D%AB/END'], ['0-4混战行动前', '/w/0-4_%E6%B7%B7%E6%88%98/BEG'], ['0-6强击行动后', '/w/0-6_%E5%BC%BA%E5%87%BB/END'], ['0-7感染行动前', '/w/0-7_%E6%84%9F%E6%9F%93/BEG'], ['0-7感染行动后', '/w/0-7_%E6%84%9F%E6%9F%93/END'], ['0-8狩猎行动前', '/w/0-8_%E7%8B%A9%E7%8C%8E/BEG'], ['0-9临光行动后', '/w/0-9_%E4%B8%B4%E5%85%89/END'], ['0-10困境行动前', '/w/0-10_%E5%9B%B0%E5%A2%83/BEG'], ['0-11突围行动后', '/w/0-11_%E7%AA%81%E5%9B%B4/END'], ['1-1孤岛行动前', '/w/1-1_%E5%AD%A4%E5%B2%9B/BEG'], ['1-1孤岛行动后

In [127]:
# One-off run: rebuild the page for the first entry and point its
# "next page" button at the file name derived from the second entry.
aa =  ['3-6决定行动后', '/w/3-6_%E5%86%B3%E5%AE%9A/END'], ['3-7轰鸣行动前', '/w/3-7_%E8%BD%B0%E9%B8%A3/BEG']
a = aa[0]  # [title, url] of the page to build
n =aa[1][1]  # url of the following page
n=make_filename(n)  # html file name of the following page
make_html_all(a[1],a[0],next_page=n,translation=True, update_html=True,force_translate=False)

<script id="datas_txt" type="csv">[HEADER(key="title_test", is_skippable=true, fit_mode="BLACK_MASK")] 
[stopmusic]
[Dialog]
[playMusic(intro="$dignified_intro", key="$dignified_loop", volume=0.4)]
[Delay(time=1)]
[Blocker(a=1, r=0, g=0, b=0, fadetime=1, block=true)]
[Background(image="bg_corridor",screenadapt="coverall")]
[Blocker(a=0, r=0, g=0, b=0, fadetime=3, block=true)]
6:30 p.m. 
[Dialog]
[PlaySound(key="$dooropenquite", volume=0.6)]
[delay(time=2)]
[PlaySound(key="$d_gen_walk_n")]
[delay(time=2)]
[Character]
[name="PRTS"]  编号00000-00002，接入权限-8。
[name="PRTS"]  Dr.{@nickname}，欢迎访问罗德岛综合生物处理室，已依据生物数据对您的意图进行判断。
[name="PRTS"]  另外，系统检测到您的心情不佳。
[Decision(options="少烦我。;......;该怎么样才算是好心情？", values="1;2;3")]
[Predicate(references="1")]
[Character]
[name="PRTS"]  Dr.{@nickname}表现出一定的攻击性。
[name="PRTS"]  请放心，系统不会因此电击你，不用太过顾忌系统对你的检测。
[Predicate(references="2")]
[Character]
[name="PRTS"]  Dr.{@nickname}陷入了沉默。
[name="PRTS"]  无论是无声抗议还是不愿进行沟通，我认为这都是一种孤独性精神障碍症的表现。
[name="PRTS"]  当然，请便，系统会平等对待所有人。


In [136]:
links =  [
	['0-1坍塌行动前', '/w/0-1_%E5%9D%8D%E5%A1%8C/BEG'], ['0-1坍塌行动后', '/w/0-1_%E5%9D%8D%E5%A1%8C/END'], ['0-2守卫行动前', '/w/0-2_%E5%AE%88%E5%8D%AB/BEG'], ['0-2守卫行动后', '/w/0-2_%E5%AE%88%E5%8D%AB/END'], ['0-4混战行动前', '/w/0-4_%E6%B7%B7%E6%88%98/BEG'], ['0-6强击行动后', '/w/0-6_%E5%BC%BA%E5%87%BB/END'], ['0-7感染行动前', '/w/0-7_%E6%84%9F%E6%9F%93/BEG'], ['0-7感染行动后', '/w/0-7_%E6%84%9F%E6%9F%93/END'], ['0-8狩猎行动前', '/w/0-8_%E7%8B%A9%E7%8C%8E/BEG'], ['0-9临光行动后', '/w/0-9_%E4%B8%B4%E5%85%89/END'], ['0-10困境行动前', '/w/0-10_%E5%9B%B0%E5%A2%83/BEG'], ['0-11突围行动后', '/w/0-11_%E7%AA%81%E5%9B%B4/END'], ['1-1孤岛行动前', '/w/1-1_%E5%AD%A4%E5%B2%9B/BEG'], ['1-1孤岛行动后', '/w/1-1_%E5%AD%A4%E5%B2%9B/END'], ['1-3狂奔行动前', '/w/1-3_%E7%8B%82%E5%A5%94/BEG'], ['1-3狂奔行动后', '/w/1-3_%E7%8B%82%E5%A5%94/END'], ['1-4先兆行动前', '/w/1-4_%E5%85%88%E5%85%86/BEG'], ['1-6灾难行动前', '/w/1-6_%E7%81%BE%E9%9A%BE/BEG'], ['1-7暴君行动前', '/w/1-7_%E6%9A%B4%E5%90%9B/BEG'], ['1-7暴君行动后', '/w/1-7_%E6%9A%B4%E5%90%9B/END'], ['1-8意志行动前', '/w/1-8_%E6%84%8F%E5%BF%97/BEG'], ['1-10残留行动后', '/w/1-10_%E6%AE%8B%E7%95%99/END'], ['1-12代价行动前', '/w/1-12_%E4%BB%A3%E4%BB%B7/BEG'], ['1-12代价行动后', '/w/1-12_%E4%BB%A3%E4%BB%B7/END'], ['TR-11战术阻滞行动前', '/w/TR-11_%E6%88%98%E6%9C%AF%E9%98%BB%E6%BB%9E/BEG'], ['2-1龙门之行行动后', '/w/2-1_%E9%BE%99%E9%97%A8%E4%B9%8B%E8%A1%8C/END'], ['2-2兵不接刃行动前', '/w/2-2_%E5%85%B5%E4%B8%8D%E6%8E%A5%E5%88%83/BEG'], ['2-2兵不接刃行动后', '/w/2-2_%E5%85%B5%E4%B8%8D%E6%8E%A5%E5%88%83/END'], ['2-3无罪推定行动前', '/w/2-3_%E6%97%A0%E7%BD%AA%E6%8E%A8%E5%AE%9A/BEG'], ['2-3无罪推定行动后', '/w/2-3_%E6%97%A0%E7%BD%AA%E6%8E%A8%E5%AE%9A/END'], ['2-4企鹅物流行动前', '/w/2-4_%E4%BC%81%E9%B9%85%E7%89%A9%E6%B5%81/BEG'], ['2-4企鹅物流行动后', '/w/2-4_%E4%BC%81%E9%B9%85%E7%89%A9%E6%B5%81/END'], ['2-5高空坠物行动前', '/w/2-5_%E9%AB%98%E7%A9%BA%E5%9D%A0%E7%89%A9/BEG'], ['2-5高空坠物行动后', '/w/2-5_%E9%AB%98%E7%A9%BA%E5%9D%A0%E7%89%A9/END'], ['2-6握紧扶手行动前', '/w/2-6_%E6%8F%A1%E7%B4%A7%E6%89%B6%E6%89%8B/BEG'], ['2-6握紧扶手行动后', '/w/2-6_%E6%8F%A1%E7%B4%A7%E6%89%B6%E6%89%8B/END'], ['2-7注意卫生行动前', 
'/w/2-7_%E6%B3%A8%E6%84%8F%E5%8D%AB%E7%94%9F/BEG'], ['2-7注意卫生行动后', '/w/2-7_%E6%B3%A8%E6%84%8F%E5%8D%AB%E7%94%9F/END'], ['2-8不做约定行动前', '/w/2-8_%E4%B8%8D%E5%81%9A%E7%BA%A6%E5%AE%9A/BEG'], ['2-8不做约定行动后', '/w/2-8_%E4%B8%8D%E5%81%9A%E7%BA%A6%E5%AE%9A/END'], ['2-9操作暗箱行动前', '/w/2-9_%E6%93%8D%E4%BD%9C%E6%9A%97%E7%AE%B1/BEG'], ['2-9操作暗箱行动后', '/w/2-9_%E6%93%8D%E4%BD%9C%E6%9A%97%E7%AE%B1/END'], ['2-10病入膏肓行动前', '/w/2-10_%E7%97%85%E5%85%A5%E8%86%8F%E8%82%93/BEG'], ['2-10病入膏肓行动后', '/w/2-10_%E7%97%85%E5%85%A5%E8%86%8F%E8%82%93/END'], ['3-1会合行动前', '/w/3-1_%E4%BC%9A%E5%90%88/BEG'], ['3-1会合行动后', '/w/3-1_%E4%BC%9A%E5%90%88/END'], ['3-2记忆行动前', '/w/3-2_%E8%AE%B0%E5%BF%86/BEG'], ['3-2记忆行动后', '/w/3-2_%E8%AE%B0%E5%BF%86/END'], ['3-3回旋行动前', '/w/3-3_%E5%9B%9E%E6%97%8B/BEG'], ['3-3回旋行动后', '/w/3-3_%E5%9B%9E%E6%97%8B/END'], ['3-4龟裂行动前', '/w/3-4_%E9%BE%9F%E8%A3%82/BEG'], ['3-4龟裂行动后', '/w/3-4_%E9%BE%9F%E8%A3%82/END'], ['3-5呼叫行动前', '/w/3-5_%E5%91%BC%E5%8F%AB/BEG'], ['3-5呼叫行动后', '/w/3-5_%E5%91%BC%E5%8F%AB/END'], ['3-6决定行动前', '/w/3-6_%E5%86%B3%E5%AE%9A/BEG'], ['3-6决定行动后', '/w/3-6_%E5%86%B3%E5%AE%9A/END'], ['3-7轰鸣行动前', '/w/3-7_%E8%BD%B0%E9%B8%A3/BEG'], ['3-7轰鸣行动后', '/w/3-7_%E8%BD%B0%E9%B8%A3/END'], ['3-8黄昏行动前', '/w/3-8_%E9%BB%84%E6%98%8F/BEG'], ['3-8黄昏行动后', '/w/3-8_%E9%BB%84%E6%98%8F/END'], ['4-1免费拥抱行动前', '/w/4-1_%E5%85%8D%E8%B4%B9%E6%8B%A5%E6%8A%B1/BEG'], ['4-1免费拥抱行动后', '/w/4-1_%E5%85%8D%E8%B4%B9%E6%8B%A5%E6%8A%B1/END'], ['4-2雨中漫步行动前', '/w/4-2_%E9%9B%A8%E4%B8%AD%E6%BC%AB%E6%AD%A5/BEG'], ['4-2雨中漫步行动后', '/w/4-2_%E9%9B%A8%E4%B8%AD%E6%BC%AB%E6%AD%A5/END'], ['4-3人工制冷行动前', '/w/4-3_%E4%BA%BA%E5%B7%A5%E5%88%B6%E5%86%B7/BEG'], ['4-3人工制冷行动后', '/w/4-3_%E4%BA%BA%E5%B7%A5%E5%88%B6%E5%86%B7/END'], ['4-4不要恐慌行动前', '/w/4-4_%E4%B8%8D%E8%A6%81%E6%81%90%E6%85%8C/BEG'], ['4-4不要恐慌行动后', '/w/4-4_%E4%B8%8D%E8%A6%81%E6%81%90%E6%85%8C/END'], ['4-5官僚主义行动前', '/w/4-5_%E5%AE%98%E5%83%9A%E4%B8%BB%E4%B9%89/BEG'], ['4-5官僚主义行动后', '/w/4-5_%E5%AE%98%E5%83%9A%E4%B8%BB%E4%B9%89/END'], ['4-6少见多怪行动前', 
'/w/4-6_%E5%B0%91%E8%A7%81%E5%A4%9A%E6%80%AA/BEG'], ['4-6少见多怪行动后', '/w/4-6_%E5%B0%91%E8%A7%81%E5%A4%9A%E6%80%AA/END'], ['4-7各取所需行动前', '/w/4-7_%E5%90%84%E5%8F%96%E6%89%80%E9%9C%80/BEG'], ['4-7各取所需行动后', '/w/4-7_%E5%90%84%E5%8F%96%E6%89%80%E9%9C%80/END'], ['4-8应激反应行动前', '/w/4-8_%E5%BA%94%E6%BF%80%E5%8F%8D%E5%BA%94/BEG'], ['4-8应激反应行动后', '/w/4-8_%E5%BA%94%E6%BF%80%E5%8F%8D%E5%BA%94/END'], ['4-9彻入骨髓行动前', '/w/4-9_%E5%BD%BB%E5%85%A5%E9%AA%A8%E9%AB%93/BEG'], ['4-9彻入骨髓行动后', '/w/4-9_%E5%BD%BB%E5%85%A5%E9%AA%A8%E9%AB%93/END'], ['4-10灯火将熄行动前', '/w/4-10_%E7%81%AF%E7%81%AB%E5%B0%86%E7%86%84/BEG'], ['4-10灯火将熄行动后', '/w/4-10_%E7%81%AF%E7%81%AB%E5%B0%86%E7%86%84/END'], ['5-1冤家易结行动前', '/w/5-1_%E5%86%A4%E5%AE%B6%E6%98%93%E7%BB%93/BEG'], ['5-1冤家易结行动后', '/w/5-1_%E5%86%A4%E5%AE%B6%E6%98%93%E7%BB%93/END'], ['5-2有口难言行动前', '/w/5-2_%E6%9C%89%E5%8F%A3%E9%9A%BE%E8%A8%80/BEG'], ['5-2有口难言行动后', '/w/5-2_%E6%9C%89%E5%8F%A3%E9%9A%BE%E8%A8%80/END'], ['5-3义胆凡躯行动前', '/w/5-3_%E4%B9%89%E8%83%86%E5%87%A1%E8%BA%AF/BEG'], ['5-3义胆凡躯行动后', '/w/5-3_%E4%B9%89%E8%83%86%E5%87%A1%E8%BA%AF/END'], ['5-4没人在家行动前', '/w/5-4_%E6%B2%A1%E4%BA%BA%E5%9C%A8%E5%AE%B6/BEG'], ['5-4没人在家行动后', '/w/5-4_%E6%B2%A1%E4%BA%BA%E5%9C%A8%E5%AE%B6/END'], ['5-6疑兵之计行动前', '/w/5-6_%E7%96%91%E5%85%B5%E4%B9%8B%E8%AE%A1/BEG'], ['5-6疑兵之计行动后', '/w/5-6_%E7%96%91%E5%85%B5%E4%B9%8B%E8%AE%A1/END'], ['5-7生死与共行动前', '/w/5-7_%E7%94%9F%E6%AD%BB%E4%B8%8E%E5%85%B1/BEG'], ['5-7生死与共行动后', '/w/5-7_%E7%94%9F%E6%AD%BB%E4%B8%8E%E5%85%B1/END'], ['5-9孽生恶物行动前', '/w/5-9_%E5%AD%BD%E7%94%9F%E6%81%B6%E7%89%A9/BEG'], ['5-9孽生恶物行动后', '/w/5-9_%E5%AD%BD%E7%94%9F%E6%81%B6%E7%89%A9/END'], ['5-10长夜终尽行动前', '/w/5-10_%E9%95%BF%E5%A4%9C%E7%BB%88%E5%B0%BD/BEG'], ['5-10长夜终尽行动后', '/w/5-10_%E9%95%BF%E5%A4%9C%E7%BB%88%E5%B0%BD/END'], ['5-11棋胜后着', '/w/5-11_%E6%A3%8B%E8%83%9C%E5%90%8E%E7%9D%80/NBT'], ['6-1僵局行动前', '/w/6-1_%E5%83%B5%E5%B1%80/BEG'], ['6-1僵局行动后', '/w/6-1_%E5%83%B5%E5%B1%80/END'], ['6-2一些误会行动前', '/w/6-2_%E4%B8%80%E4%BA%9B%E8%AF%AF%E4%BC%9A/BEG'], ['6-2一些误会行动后', 
'/w/6-2_%E4%B8%80%E4%BA%9B%E8%AF%AF%E4%BC%9A/END'], ['6-3同时走失行动前', '/w/6-3_%E5%90%8C%E6%97%B6%E8%B5%B0%E5%A4%B1/BEG'], ['6-3同时走失行动后', '/w/6-3_%E5%90%8C%E6%97%B6%E8%B5%B0%E5%A4%B1/END'], ['6-4溃烂的疮疤行动前', '/w/6-4_%E6%BA%83%E7%83%82%E7%9A%84%E7%96%AE%E7%96%A4/BEG'], ['6-4溃烂的疮疤行动后', '/w/6-4_%E6%BA%83%E7%83%82%E7%9A%84%E7%96%AE%E7%96%A4/END'], ['6-5解决谁？行动前', '/w/6-5_%E8%A7%A3%E5%86%B3%E8%B0%81%EF%BC%9F/BEG'], ['6-5解决谁？行动后', '/w/6-5_%E8%A7%A3%E5%86%B3%E8%B0%81%EF%BC%9F/END'], ['6-6别说过去的事', '/w/6-6_%E5%88%AB%E8%AF%B4%E8%BF%87%E5%8E%BB%E7%9A%84%E4%BA%8B/NBT'], ['6-7断弦行动前', '/w/6-7_%E6%96%AD%E5%BC%A6/BEG'], ['6-7断弦行动后', '/w/6-7_%E6%96%AD%E5%BC%A6/END'], ['6-8只是从天而降！行动前', '/w/6-8_%E5%8F%AA%E6%98%AF%E4%BB%8E%E5%A4%A9%E8%80%8C%E9%99%8D%EF%BC%81/BEG'], ['6-8只是从天而降！行动后', '/w/6-8_%E5%8F%AA%E6%98%AF%E4%BB%8E%E5%A4%A9%E8%80%8C%E9%99%8D%EF%BC%81/END'], ['6-9换一个角度行动前', '/w/6-9_%E6%8D%A2%E4%B8%80%E4%B8%AA%E8%A7%92%E5%BA%A6/BEG'], ['6-10解开铃铛行动后', '/w/6-10_%E8%A7%A3%E5%BC%80%E9%93%83%E9%93%9B/END'], ['6-11“这种事”行动前', '/w/6-11_%E2%80%9C%E8%BF%99%E7%A7%8D%E4%BA%8B%E2%80%9D/BEG'], ['6-11“这种事”行动后', '/w/6-11_%E2%80%9C%E8%BF%99%E7%A7%8D%E4%BA%8B%E2%80%9D/END'], ['6-12冰原之雪行动前', '/w/6-12_%E5%86%B0%E5%8E%9F%E4%B9%8B%E9%9B%AA/BEG'], ['6-13没有火,没有光', '/w/6-13_%E6%B2%A1%E6%9C%89%E7%81%AB,%E6%B2%A1%E6%9C%89%E5%85%89/NBT'], ['6-14冰原之霜行动后', '/w/6-14_%E5%86%B0%E5%8E%9F%E4%B9%8B%E9%9C%9C/END'], ['6-15不错的回忆行动前', '/w/6-15_%E4%B8%8D%E9%94%99%E7%9A%84%E5%9B%9E%E5%BF%86/BEG'], ['6-15不错的回忆行动后', '/w/6-15_%E4%B8%8D%E9%94%99%E7%9A%84%E5%9B%9E%E5%BF%86/END'], ['6-16黑兔子,白兔子行动前', '/w/6-16_%E9%BB%91%E5%85%94%E5%AD%90,%E7%99%BD%E5%85%94%E5%AD%90/BEG'], ['6-17冬逝行动后', '/w/6-17_%E5%86%AC%E9%80%9D/END'], ['6-18只有你知道', '/w/6-18_%E5%8F%AA%E6%9C%89%E4%BD%A0%E7%9F%A5%E9%81%93/NBT'], ]
# For every story page, load its script (cached copy if present, otherwise
# fetched) and print the first line that does not start with '[', '<' or a
# space.
for i, (title, url) in enumerate(links):
	# File name of the following page; the last page has no successor.
	if i < len(links)-1:
		next_p = make_filename(links[i+1][1])
	else:
		next_p = ''
	soup_filename = make_filename(url,'txt')
	soup_path = './soup/soup'+soup_filename
	flag = 1
	if os.path.exists(soup_path):
		with open(soup_path,'r') as f:
			h = f.read()
		if h: # reuse the previously saved script unless the file is empty
			soup = BeautifulSoup(h,'html.parser')
			flag = 0
	if flag:
		print(title+' fetching..')
		soup = html_to_soup(url)
	if soup is None:
		# Fetch timed out: nothing to inspect for this page.
		continue
	a = soup.find('script', {'id': 'datas_txt', 'type': 'csv'})
	if a is None:
		continue
	# A separate loop variable keeps the outer index `i` intact; empty lines
	# are skipped because indexing their first character would fail.
	for line in str(a).split('\n'):
		if line and line[0] not in '[< ':
			print(title)
			print(line)
			break

	# try:
	# 	make_html_all(url,title,next_page=next_p,translation=True,mini=True,update_html=False,force_translate=True)
	# except Exception as e:
	# 	print(title,e)
	# # 	exit()

3-5呼叫行动后
————
4-2雨中漫步行动后
被剥夺了身份的人。感染者。你们早已不是人。
5-1冤家易结行动前
哗啦——
5-2有口难言行动前
敲门声
5-4没人在家行动后
玩具要摆放整齐。
5-10长夜终尽行动前
数分钟前
6-1僵局行动前
937年
6-2一些误会行动前
咦......Rosmontis，你在做什么？
6-3同时走失行动后
有多少人死在你面前了？
6-5解决谁？行动后
与此同时，贫民区的另一处
6-6别说过去的事
我根本不知道他遭遇过什么。
6-7断弦行动前
他将弩矢搭在弩上，上弦。
6-8只是从天而降！行动前
9：00 A.M. 天气/阴
6-11“这种事”行动前
呜，呜......
6-13没有火,没有光
......？
6-15不错的回忆行动前
12:21 P.M.
6-15不错的回忆行动后
15:22 P.M.
6-16黑兔子,白兔子行动前
......要不要看到最后......
6-17冬逝行动后
霜星的手指摸索着你的面庞。
6-18只有你知道
龙门保卫行动已经结束。重复一遍，龙门保卫行动已经结束。


In [113]:
# Scratch cell: fetch one story page, parse it, and display its raw script tag.
soup = html_to_soup('/w/7-2_%E5%88%AB%E7%A6%BB%E4%B9%8B%E5%A4%9C/BEG')
# soup = html_to_soup('/w/W2G/BEG')
df = make_df(soup)
soup.find('script', {'id': 'datas_txt', 'type': 'csv'})

In [84]:
# Display the story script tag of the current `soup`.
soup.find('script', {'id': 'datas_txt', 'type': 'csv'})

index=2 name='' text='ああ、君か。' japanese_text='ああ、君か。'


In [114]:
# Rewrite every generated page in ./html so links to 'index.html' point one
# directory up ('../index.html'), printing progress every 20 directory entries.
i = 0
for file in os.listdir('./html'):
	if file.endswith('.html'):
		# Work on the file inside ./html only.  Opening the bare name in 'w'
		# mode would create or truncate a same-named file in the working
		# directory.
		path = os.path.join('./html', file)
		with open(path, 'r', encoding='utf-8') as f:
			text = f.read()
		# Skip occurrences already prefixed with '../' so that running the
		# cell twice does not produce '../../index.html'.
		new_text = re.sub(r'(?<!\.\./)index\.html', '../index.html', text)
		if new_text != text:
			with open(path, 'w', encoding='utf-8') as f:
				f.write(new_text)
	i += 1
	if i % 20 == 0:
		print(i)

TranslateAllModel(translation=[TranslateModel(index=2, name='', text='哦，是你。', jp_text='ああ、あなたか。'), TranslateModel(index=6, name='', text='离我们上一次见面，已经过去了很久。', jp_text='私たちが最後に会ってから、もう長い時間が経った。'), TranslateModel(index=7, name='', text='这段时间里......你一直徘徊在悬崖的边缘。', jp_text='この間ずっと……あなたは崖っぷちをさまよっていた。'), TranslateModel(index=9, name='', text='你可能已经忘记了你的身份，但你还记得那个名字，这就够了。', jp_text='あなたは自分の正体をもう忘れたかもしれないが、その名前だけは覚えている、それで十分だ。'), TranslateModel(index=10, name='', text='——好了，别在这里逗留太久。', jp_text='――さあ、ここに長く留まらないように。'), TranslateModel(index=11, name='', text='毕竟，你既不是我的客人，也不应该出现在这里。', jp_text='結局、あなたは私の客でもないし、ここにいるべきでもない。'), TranslateModel(index=12, name='', text='她需要你。', jp_text='彼女はあなたを必要としている。'), TranslateModel(index=14, name='', text='12月23日。', jp_text='12月23日。'), TranslateModel(index=15, name='', text='你可能记不清这一天对你来说，究竟意味着什么。', jp_text='この日があなたにとって何を意味するのか、もう思い出せないかもしれない。'), TranslateModel(index=16, name='', text='这会让你陷入十分危险的处境。', jp_text='それはあなたをとても危険な状況に陥れるだろう。'), TranslateModel(index=17, name

In [134]:
# Scratch cell: serialise the dialogue rows of `df` as a JSON records string.
a = df[df['command']=='name']
# keep only name and text (reset_index also adds the old row number as 'index')
a = a[['name', 'text']].reset_index()
a.head()
b = a.to_json(orient='records',force_ascii=False)

Unnamed: 0,index,name,text,jp_text
0,2,,哦，是你。,ああ、あなたか。
1,6,,离我们上一次见面，已经过去了很久。,私たちが最後に会ってから、もう長い時間が経った。
2,7,,这段时间里......你一直徘徊在悬崖的边缘。,この間ずっと……あなたは崖っぷちをさまよっていた。
3,9,,你可能已经忘记了你的身份，但你还记得那个名字，这就够了。,あなたは自分の正体をもう忘れたかもしれないが、その名前だけは覚えている、それで十分だ。
4,10,,——好了，别在这里逗留太久。,――さあ、ここに長く留まらないように。


In [161]:
import json
# 2. Build the prompt passed to the model:
#    it is asked to translate the text field into Japanese and add a jp_text key.
# NOTE(review): `jb` is not defined in the visible cells — presumably a JSON string built earlier; confirm.
prompt = json.dumps(jb, ensure_ascii=False)
from typing import List

from pydantic import BaseModel

# Structured-output schema for the response (same fields as the earlier definition in this notebook).
class TranslateAllModel(BaseModel):
	# title: str
	class TranslateModel(BaseModel):
		index: int
		name: str
		jp_name:str
		text: str
		jp_text: str
	translation: List[TranslateModel]


# 3. Call the API (client.responses.parse with the schema as text_format)
response = client.responses.parse(
	model="gpt-4.1",
	input=[
		{"role": "system", "content": "あなたは優秀な翻訳者です。以下のリスト形式の JSON を読み込み、各オブジェクトの 'name',“text” を日本語に翻訳し、キー “jp_name”, “jp_text” として追加してください。出力は同じリスト形式の JSON のまま返してください。注意：「博士」は「ドクター」と訳すこと。入力が「？」など記号のみの場合，そのまま出力せよ。"},
		{"role": "user",   "content": prompt},
	],
	text_format=TranslateAllModel,
)

# response.output_text
# Display the parsed records as a list of dicts.
response.output_parsed.dict()['translation']


In [163]:
# Keep the parsed translation records (list of dicts) for the following cells.
q = response.output_parsed.dict()['translation']

Unnamed: 0,index,text,text_orig
0,0,,初始引导
1,1,,
2,2,哦，是你。,哦，是你。
3,3,,
4,4,,


In [170]:
# Turn the translation records into a DataFrame and preview it.
d_q = pd.DataFrame(q)
d_q.head()

In [96]:
# add df's row number (0, 1, 2, ...) as an 'index' column
df["index"] = df.index
trans_df = d_q

# Left-join the translations onto df by row number; df's own text column is
# renamed text_orig, the model's echoed text keeps the name text.
merged = (
	df
	.merge(
		trans_df[["index", "text", "jp_text"]],  # bring in source text (text) and translation (jp_text)
		on="index",
		how="left",
		suffixes=("_orig","", )
	)
	.assign(
		text_match=lambda d: d["text"] == d["text_orig"]  # whether text and text_orig are identical
	)
)


Unnamed: 0,index,name,text,jp_text
107,254,阿米娅,虽然失去了记忆，但博士确实曾与我们......,記憶をなくしていても、博士は確かに私たちと……


In [27]:
# Preview the echoed text next to the original text for the first five rows.
merged[['index','text','text_orig']].head(5)

[0]
[1]
[2]


In [35]:
# Rows where the echoed text differs from the original ...
m = merged[~merged["text_match"]][["index", "text",'text_orig']]
# ... restricted to rows that actually have non-blank echoed text.
k = m[m["text"].notna()& m["text"].str.strip().ne("")]
if k.size:
	print('不一致')
	print(k)

Unnamed: 0,command,text,is_tutorial,is_skippable,is_autoable,fit_mode,deny_auto_switch_scene,key,volume,delay,...,vibrato,randomness,width,height,fadeout,options,values,references,stageId,waitForSignal
0,HEADER,初始引导,True,True,True,BLACK_MASK,True,,,,...,,,,,,,,,,
1,PlayMusic,,,,,,,$babel_loop,0.8,0.2,...,,,,,,,,,,
2,name,哦，是你。,,,,,,,,,...,,,,,,,,,,
3,Image,,,,,,,,,,...,,,,,,,,,,
4,ImageTween,,,,,,,,,,...,,,,,,,,,,


In [59]:
# Show the row(s) of df_j whose 'index' value is 254.
# NOTE(review): df_j is not defined in the visible cells.
df_j[df_j['index']==254]

In [60]:
def func(x, lst=None):
	"""Append ``x`` to ``lst`` and return the list.

	A new list is created on each call when ``lst`` is not given, so calls
	without ``lst`` do not share state.
	"""
	target = [] if lst is None else lst
	target.append(x)
	return target

# Each call prints a fresh one-element list: [0], [1], [2].
for i in range(3):
	result = func(i)
	print(result)

In [61]:
# Scratch cell: parse the saved page kkk.html and preview the resulting frame.
with open('kkk.html', 'r', encoding='utf-8') as f:
	soup = BeautifulSoup(f,'html.parser')
df = make_df(soup)
df.head()

Unnamed: 0,index,name,text,option_number
2,2,,哦，是你。,0
6,6,,离我们上一次见面，已经过去了很久。,0
7,7,,这段时间里......你一直徘徊在悬崖的边缘。,0
9,9,,你可能已经忘记了你的身份，但你还记得那个名字，这就够了。,0
10,10,,——好了，别在这里逗留太久。,0
...,...,...,...,...
299,299,阿米娅,希望博士能在战斗中，把过去的感觉找回来。,0
301,301,阿米娅,————也许连你自己都可能还不太相信......,0
302,302,阿米娅,但是我相信你。,0
304,304,阿米娅,——我相信你，一定可以的。,0


In [None]:
# Number of ';'-separated options per row (0 when the row has none).
df['option_number'] = df['options'].fillna('').apply(
	lambda x: len(x.split(';')) if x else 0
)
# move the semicolon-separated values in options into text
# 1. mask of rows to explode (text is NaN or empty, and options is non-empty)
mask = (df['text'].isna() | df['text'].eq('')) & df['options'].notna() & df['options'].ne('')

# 2. copy the rows to explode, then split & explode
to_exp = df[mask].copy()
to_exp['text'] = to_exp['options'].str.split(';')
to_exp = to_exp.explode('text')

# 3. copy the remaining rows unchanged
to_keep = df[~mask].copy()

# 4. concatenate and renumber
# NOTE: there is no sort_index() here, so the exploded rows end up after all kept rows.
df= pd.concat([to_keep, to_exp], axis=0).reset_index(drop=True)


In [None]:
# Record the row number, then keep only dialogue/choice rows and four columns.
df["index"] = df.index
# NOTE(review): make_df lower-cases command names, so 'Decision' would not match its output — confirm.
p = df[df['command'].isin(['name', 'Decision'])]
p = p[['index','name','text','option_number']]

In [None]:
# Display the selection.
p

In [None]:

# Scratch rendering loop: walks the rows of `p` by position `i` and appends
# HTML to `b`.  It relies on names set by other cells (i, p, b, translation,
# icon, char_dict, watched_img, df_datas_back).
while i<p.shape[0]:
	pp = p.iloc[i]
	bb = ''
	if pp['command'] == 'decision':
		# Collect the option_number consecutive option rows into one <li>.
		opt_n = pp['option_number']
		opt = []
		bb += '<li class="decision">\n'
		for x in range(opt_n):
			pp= p.iloc[i+x]
			opt.append(pp['text'])
			bb += f'<p class="decision">「{pp["text"]}」</p>\n'
			if translation:
				bb += f'<p class="translation">「{pp["jp_text"]}」</p>\n'
		bb += '</li>\n'
		i += opt_n
		if i>=p.shape[0]:
			break
		pred = 0  # flag: a predicate block is currently open
		while True and opt_n>1 and i<p.shape[0]: # with two or more options, repeat the chosen option at each predicate
			pp = p.iloc[i]

			if pp['command'] == 'predicate':
				if pred: bb += '</div>\n'
				# A joined reference ("1;2") ends the branch section.
				if ';' in pp['references']:
					break
				bb += '<div class="predicate">\n'
				pred = 1
				t = opt[int(pp['references'])-1]
				bb+= f'<p>「{t}」</p>\n'
				if translation:
					bb += f'<p class="translation">「{pp["jp_text"]}」</p>\n'
			else:
				# icon = ''
				# if pp['command'] == 'character' and pd.notna(pp['name']):
				# 	icon = icon_html(pp,char_dict)
				# i += 1
				# if i>=p.shape[0]:	break
				bb += text_html(pp,icon=icon, translation=translation)
			i+= 1
		b += f"""<div class="decision">\n{bb}</div>\n"""

	# background / image rows
	elif p.iloc[i]['command'] == 'image' or p.iloc[i]['command']== 'background':
		image = p.iloc[i]['image']
		if p.iloc[i]['command'] == 'background':
			image = 'bg_'+image

		if pd.notna(image) and image not in watched_img: # avoid showing the same image twice
			watched_img.append(image)
			# print(image)
			b += img_back_html(image, df_datas_back)+'\n'

	else:
		# Any other row: refresh the icon from a character row, then render the next row.
		icon = ''
		if pp['command']== 'character' and pd.notna(pp['name']):
			icon = icon_html(pp,char_dict)
		i += 1
		if i>=p.shape[0]:	break
		pp = p.iloc[i]
		b += text_html(pp,icon,translation)

	i += 1