In [16]:
import requests,re,os,json
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from openai import OpenAI
from typing import List
from pydantic import BaseModel
def html_to_soup(url):
	try:
		res = requests.get('https://m.prts.wiki'+url, timeout=(5, 5))
	except requests.exceptions.Timeout:
		return None
	soup = BeautifulSoup(res.text, 'html.parser')
	return soup

# soup -> df
def make_df(soup):
	script = soup.find('script', {'id': 'datas_txt', 'type': 'csv'})
	lines = [line.strip() for line in script.string.splitlines() if line.strip()]
	records = []
	pattern = re.compile(
		r'^\[(?P<cmd>\w+)'
		r'(?:\((?P<args>.*?)\)|=(?P<val>[^]]+))?'
		r'\](?:\s*(?P<text>.*))?$'
	)
	for ln in lines:
		m = pattern.match(ln)
		if m:
			cmd   = m.group('cmd').lower()
			args  = m.group('args')
			val   = m.group('val')
			text  = m.group('text') or ''

			params = {}
			if args:
				for part in re.split(r',\s*(?=\w+=)', args):
					k, v = part.split('=', 1)
					params[k] = v.strip().strip('"')
			elif val is not None:
				params[cmd] = val.strip('"')

			records.append({'command': cmd, 'text': text, **params})
		else:
			records.append({'command': None, 'text': ln})
	# DataFrame に
	df = pd.DataFrame(records)
	df['text'] = df['text'].str.replace('Dr.{@nickname}', '博士')
	df['text'] = df['text'].str.replace('Dr. {@nickname}', '博士')
	df['text'] = df['text'].str.replace('{@nickname}博士', '博士')
	if 'name' not in df.columns:
		df['name'] = ''


	if 'options' in df.columns:
		df['option_number'] = (
			df['options']
			.fillna('')
			.apply(lambda x: len(x.split(';')) if x.strip() else 0)
		)
		df = df.reset_index().rename(columns={'index':'orig_idx'})
		mask = ((df['text'].eq('') | df['text'].isna()) &
		        df['options'].astype(str).str.strip().ne(''))
		df_expand = (
			df[mask]
			.assign(
				text         = lambda d: d['options'].str.split(';'),
				option_value = lambda d: d['values'].str.split(';'),
				# 順序を数字リストで保持
				option_order = lambda d: d['options']
				.str.split(';')
				.apply(lambda lst: list(range(len(lst))) if isinstance(lst, list) else [])
			).explode(['text','option_value','option_order'])
		)

		df_no = df[~mask].copy()
		df_no['option_order'] = pd.NA  # ソート用に同じ列を用意

		df = (
			pd.concat([df_expand, df_no], ignore_index=True)
			.sort_values(['orig_idx','option_order'], kind='stable')
			.reset_index(drop=True)
			# 不要列を落とす
			.drop(['orig_idx','option_order','options'], axis=1)
		)

	keep_cols = ['command','text','name','option_number','option_value','references','focus','image','index','values','name1','name2','name3','name4']
	df = df[[col for col in keep_cols if col in df.columns]]
	return df

# soup,df -> キャラのurlのdict
def char_img_dict(soup):
	datas_char= soup.find('script', {'id': 'datas_char', 'type': 'csv'}).string
	datas_char =  pd.read_csv(StringIO(datas_char), header=None)
	datas_char.columns = ['name', 'html']
	datas_char['name'] = datas_char['name'].str.lower()
	datas_char = dict(zip(datas_char['name'], datas_char['html']))
	return datas_char

client = OpenAI()
class TranslateAllModel(BaseModel):
	class TranslateModel(BaseModel):
		index: int
		name: str
		jp_name:str
		text: str
		jp_text: str
	translation: List[TranslateModel]

def translate(df,mini):
	df["index"] = df.index
	p = df[pd.notna(df['text'])]
	p = p[['index','name','text']]

	if mini:model = 'gpt-4.1-mini'
	else:model = 'gpt-4.1'

	chunk_size = 50
	chunks = [p.iloc[i : i + chunk_size] for i in range(0, len(p), chunk_size)]
	processed = []
	for chunk in chunks:
		p = chunk.to_json(orient='records',force_ascii=False)
		prompt = json.dumps(p, ensure_ascii=False)
		response = client.responses.parse(
			model=model,
			input=[
				{"role": "system", "content":
					"""あなたは優秀な翻訳者です。以下のリスト形式の JSON を読み込み、各オブジェクトの 'name',“text” を日本語に翻訳し、キー “jp_name”, “jp_text” として追加してください。出力は同じリスト形式の JSON のまま返してください。
					# 注意:「博士」は「ドクター」と訳すこと。入力が「？」など記号のみの場合，そのまま出力せよ。阿米娅のセリフは敬体にしなさい。"""},
				{"role": "user",   "content": prompt},
			],
			text_format=TranslateAllModel,
		)
		processed_chunk = pd.DataFrame(response.output_parsed.dict()['translation'])
		processed.append(processed_chunk)

	# 処理結果を結合
	trans_df =  pd.concat(processed, ignore_index=True)
	trans_df['text'] = trans_df['text'].str.replace('\\\\', '\\')
	trans_df['text'] = trans_df['text'].str.replace('\\"', '"')

	merged = (df.merge(
		trans_df[["index", "text", "jp_text",'jp_name']],  # 原文(text), 翻訳(jp_text)を取り込む
		on="index",
		how="left",
		suffixes=("_orig", ""))
	          .assign(text_match=lambda d: d["text"] == d["text_orig"]  # text と text_orig が同じか　->　text_match列に
	                  )
	          )
	m = merged[~merged["text_match"]][["index", "text",'text_orig']]
	k = m[m["text"].notna()& m["text"].str.strip().ne("")]
	if k.size:
		# print('不一致')
		print(k)
	# return None
	return merged

def make_filename(url, extension='html'):
	filename_list = url.split('/')[2:]
	filename = filename_list[0].split('_')[0]
	if len(filename_list)>1:
		filename += '-'+filename_list[1]
	if extension=='html':
		filename += '.html'
	if extension=='csv':
		filename += '.csv'
	if extension=='txt':
		filename += '.txt'
	return filename

def url_to_translatedCSV(url,title,mini,force_translate=False):
	soup_filename = make_filename(url,'txt')
	soup_path = './soup/soup'+soup_filename
	flag = 1
	soup = ''
	if os.path.exists(soup_path):
		with open(soup_path,'r') as f:
			h = f.read()
		if h: # 以前記録したのが空でなければ，それを使う
			soup = BeautifulSoup(h,'html.parser')
			flag = 0
	if flag:
		soup = html_to_soup(url)
		if soup:
			with open(soup_path,'w') as f:
				f.write(soup.find('script', {'id': 'datas_txt', 'type': 'csv'}).prettify())
	csv_filename = make_filename(url,'csv')
	exist_csv = ''
	if mini:
		csv_path = './csv/mini/'+csv_filename
		if os.path.exists('./csv/'+csv_filename):
			exist_csv = './csv/'+csv_filename
		elif os.path.exists(csv_path):
			exist_csv = csv_path
	else:
		csv_path = './csv/'+csv_filename
		if os.path.exists(csv_path):
			exist_csv = csv_path
	if not soup:
		print('timeout')
		return
	elif not exist_csv or force_translate:
		df = make_df(soup)
		df = translate(df,mini)
		df.to_csv(csv_path, index=False)

In [17]:
def text_html(pp,icon='',translation=True):
	bbb = ''
	if pp['command'] == 'name':
		if pd.notna(pp['name']) and pp['name']!='':
			bbb += f'{icon}\n<em class="name">{pp["name"]}</em>\n'
			if translation and pp['jp_name']!='':
				bbb += f'<em class="translation">{pp["jp_name"]}</em>\n'

		bbb = f'<div class="character">{bbb}</div>'
		text = ''
		if pd.notna(pp['text'])and pp['text']!='':
			text += f'<p>{pp["text"]}</p>\n'
			if translation and pp['jp_text']!='':
				text += f'<p class="translation">{pp["jp_text"]}</p>\n'
			bbb += f"""<div class="text">\n{text}</div>"""
		return f"""<li class="dialogue">\n{bbb}\n</li>\n"""
	else:
		return bbb

def icon_html(pp,char_dict):
	a =  'name'
	if 'focus' in pp.index and pd.notna(pp['focus']):
		f = int(pp['focus'])
		if f>1:
			a += str(f)
	d = {'char_136_hsguma':'char_136_hsguma_1',
	     'char_1504_cqbw':'char_1504_cqbw_1',
	     'char_2006_weiywfmzuki_1':"char_2006_fmzuki_1"}
	cha_id = pp[a].split('#')[0].lower()

	if cha_id in d.keys():
		cha_id = d[cha_id]
	if cha_id in char_dict.keys():
		url = char_dict[cha_id]
		a = f'<div class="icon" style="--img-url: url(\'{url}\')" data-src="{url}"></div>'
		return a
	else: return None

def img_back_html(image_id, df_datas_back):
	t = df_datas_back[df_datas_back['name']==image_id].html.iloc[0]
	return f'<img src="{t}" width="100%" height="auto">'

from collections import defaultdict
# soup,df -> html（本文）
def make_html(df,translation=True):
	p = df
	b = ''
	with open('soup/soup14-8-BEG.txt','r') as f:
		h = f.read()
	soup = BeautifulSoup(h,'html.parser')
	datas_back = soup.find('script', {'id': 'datas_back', 'type': 'csv'}).string
	df_datas_back=pd.read_csv(StringIO(datas_back), header=None)
	df_datas_back.columns = ['name', 'html']
	char_dict = char_img_dict(soup)

	# watched_img = []

	opts_dict = defaultdict(str)
	opts_dict_jp = defaultdict(str)
	icon = ''

	for i in range(p.shape[0]):
		pp = p.iloc[i]
		c = pp['command']
		if c == 'decision':
			op_id = int(pp['option_value'])
			if op_id==1: # 連続するdecisionのうち最初のもの
				b += '<div class="decision"><li class="decision">'
				b += f'<p class="decision">「{pp["text"]}」</p>'
				if translation: b += f'<p class="translation">「{pp["jp_text"]}」</p>'

			elif op_id == int(pp['option_number']):
				b += f'<p class="decision">「{pp["text"]}」</p>'
				if translation: b += f'<p class="translation">「{pp["jp_text"]}」</p>'
				b += '</li>'
			else:
				b += f'<p class="decision">「{pp["text"]}」</p>'
				if translation: b+= f'<p class="translation">「{pp["jp_text"]}」</p>'
			opts_dict[op_id] = pp['text']
			opts_dict_jp[op_id] = pp['jp_text']
			opts_dict['option_number']=int(pp['option_number'])

		elif c=='predicate':
			if opts_dict['option_number']==1:
				b += '</div>'
				opts_dict = defaultdict(str)
				opts_dict_jp = defaultdict(str)
			else:
				ref = pp['references']
				if type(ref)==str and ';' not in ref:
					ref = int(ref)
					if ref>1: b += '</div>'
					b += '<div class="predicate">'
					b +=  f'<p>「{opts_dict[ref]}」</p>'
					if translation: b += f'<p class="translation">「{opts_dict_jp[ref]}」</p>'

				else:  # ; が含まれる時

					b += '</div></div>'
					opts_dict = defaultdict(str)
					opts_dict_jp = defaultdict(str)
		elif c=='character':
			if pd.notna(pp['name']) and pp['name']!='':
				icon = icon_html(pp,char_dict)
			else:
				icon = ''
			if  ('focus' in pp.index) and pp['focus']==-1:
				icon = ""
		elif c=='blocker':
			icon=''
		elif c=='name':
			b += text_html(pp,icon,translation)
		elif c in ('image','background'):
			image_id = pp['image']
			if pd.notna(image_id):
				if c=='background':
					image_id = 'bg_' + image_id
				b += img_back_html(image_id, df_datas_back)+'\n'
		elif pd.notna(pp['text']) and pp['text']!='' and c!='header':
			b += f'<p>{pp["text"]}</p>\n'
			if translation and pp['jp_text']!='':
				b+= f'<p class="translation">{pp["jp_text"]}</p>\n'
	return b

In [18]:
def translate_title(title):
	appendix = ''
	if '行动前' == title[-3:]:
		title = title[:-3]
		appendix = ' 戦闘前'
	elif '行动后' == title[-3:]:
		title = title[:-3]
		appendix = ' 戦闘後'
	response = client.responses.create(
		model="gpt-4.1-mini",
		input=[{
			'role': 'system',
			'content': 'あなたは中国語の翻訳家です。中国語の小説のタイトルを与えるので，日本語に訳してください。英語のみのときや記号のみの場合はそのまま出力せよ。そのほかの出力は絶対にするな',
		},
			{'role':'user','content':title}]
	)
	return response.output_text + appendix

def make_html_all(url,title,next_page='',translation=True,mini=False,update_html=False, force_translate=False):
	if not update_html:
		if os.path.exists('./html/'+make_filename(url)):
			# print('html already exists')
			return

	soup_filename = make_filename(url,'txt')
	soup_path = './soup/soup'+soup_filename
	flag = 1
	if os.path.exists(soup_path):
		with open(soup_path,'r') as f:
			h = f.read()
		if h: # 以前記録したのが空でなければ，それを使う
			soup = BeautifulSoup(h,'html.parser')
			flag = 0
	if flag:
		print(title+' fetching..')
		soup = html_to_soup(url)
		if soup:
			with open(soup_path,'w') as f:
				f.write(soup.find('script', {'id': 'datas_txt', 'type': 'csv'}).prettify())

	csv_filename = make_filename(url,'csv')
	exist_csv = ''
	if mini:
		csv_path = './csv/mini/'+csv_filename
		if os.path.exists('./csv/'+csv_filename):
			exist_csv = './csv/'+csv_filename
		elif os.path.exists('./html/'+csv_filename):
			exist_csv = './html/'+csv_filename
	else:
		csv_path = './csv/'+csv_filename
		if os.path.exists(csv_path):
			exist_csv = csv_path

	df = make_df(soup)
	# csv がある時

	if exist_csv and not force_translate:
		df_t = pd.read_csv(exist_csv)
		df_t_valid = df_t[df_t['text'].notna()]
		# ２．同じ text が複数ある場合は最初の行だけ残す
		df_t_unique = df_t_valid.drop_duplicates(subset='text')
		# 方法１：merge で一度に両方追加
		df = df.merge(
			df_t_unique[['text', 'jp_text', 'jp_name']],
			on='text',
			how='left'
		)
	# csvないとき　翻訳のcsvを新規作成する
	else:
		if not soup:
			print('timeout')
			return
		print(title+' fetched. now translating..')
		if translation:
			df = translate(df,mini)
		df.to_csv(csv_path, index=False)

	body = make_html(df,translation=translation)
	title = f'{title}　-　{translate_title(title)}'
	html = (f"""<!DOCTYPE html>
	<html lang="zh-CN">
	<head>
	 <meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1">
	 <script src="https://kit.fontawesome.com/c07fe94aba.js" crossorigin="anonymous"></script>
	 <link href="https://fonts.googleapis.com/css2?family=Noto+Sans:wght@400;700&display=swap" rel="stylesheet">
	 <title>{title}</title>
	 <link rel="stylesheet" href="../css/styles.css">
</head><body><h2 class="title">{title}</h2>
<ul>{body}</ul>  <div class="button-container">
   <button class="back-home" onclick="location.href='../index.html'" aria-label="ホーム">
    <i class="fa-solid fa-house"></i>
   </button>
  <button class="next-page" onclick="location.href='{next_page}'">
   <i class="fa-solid fa-angle-right"></i>
  </button>
  </div>
 <div class="overlay">
   <img class="overlay-img" src="" alt="元画像">
  </div>
<script>
	const overlay = document.querySelector('.overlay');
    const img = overlay.querySelector('.overlay-img');

    document.body.addEventListener('click', e => {{
      if (e.target.matches('.icon')) {{
        img.src = e.target.dataset.src;
        overlay.classList.add('show');
      }}
      else if (e.target === overlay) {{
        overlay.classList.remove('show');
      }}    }});
        function fitTextToWidth(el, maxWidth) {{
    // 現在のフォントサイズを取得
    let style = window.getComputedStyle(el);
	let fontSize = parseFloat(style.fontSize);
	// 幅がオーバーしている限り、1pxずつ小さくする
while (el.scrollWidth > maxWidth && fontSize > 8) {{
fontSize -= 1;
el.style.fontSize = fontSize + 'px';
}}
}}

// ページ読み込み後／ウィンドウリサイズ時に実行
function adjustAll() {{
document.querySelectorAll('.character').forEach(char => {{
	const maxW = char.clientWidth;
char.querySelectorAll('em.name, em.translation').forEach(el => {{
// 初期サイズにリセットしてから再計算
el.style.fontSize = '';
fitTextToWidth(el, maxW);
}});
}});
}}

window.addEventListener('load', adjustAll);
window.addEventListener('resize', adjustAll);
</script>
</body>
</html>""")
	soup = BeautifulSoup(html, 'html.parser')
	for el in soup.select('.translation'):
		el['lang'] = 'jp'
	html = soup.prettify()
	filename = make_filename(url)
	with open('./html/'+filename,'w',encoding='utf-8') as f:
		f.write(html)
	print(f"⇒ {title} saved to {filename}")

In [4]:
links =  [ ['7-18爱国者之死行动后', '/w/7-18_%E7%88%B1%E5%9B%BD%E8%80%85%E4%B9%8B%E6%AD%BB/END'], ['7-1911:15:38', '/w/7-19_11:15:38/NBT'], ['7-20??:??:??', '/w/7-20_%3F%3F:%3F%3F:%3F%3F/NBT'] ]
# ,['R8-1昨日，谷壳将裂行动前', '/w/R8-1_%E6%98%A8%E6%97%A5%EF%BC%8C%E8%B0
for i in range(len(links)):
	title, url = links[i]
	if i<len(links)-1:
		_,next_p = links[i+1]
		next_p = make_filename(next_p)
	try:
		make_html_all(url,title,next_page=next_p,translation=True,mini=False,update_html=True,force_translate=True)
	except Exception as e:
		print(title,e)
# # 	exit()

7-18爱国者之死行动后 fetched. now translating..
7-18爱国者之死行动后 'name2'
7-1911:15:38 fetched. now translating..
7-1911:15:38 'name2'
7-20??:??:?? fetched. now translating..
⇒ 7-20??:??:??　-　7-20??:??:?? saved to 7-20-NBT.html
R8-1昨日，谷壳将裂行动前 fetched. now translating..
R8-1昨日，谷壳将裂行动前 'name2'
R8-1昨日，谷壳将裂行动后 fetched. now translating..
R8-1昨日，谷壳将裂行动后 'name2'
M8-1今日，血色满溢 fetched. now translating..
M8-1今日，血色满溢 'name2'
R8-3麦秆，极易燃烧行动前 fetched. now translating..
R8-3麦秆，极易燃烧行动前 'name2'
R8-3麦秆，极易燃烧行动后 fetched. now translating..


KeyboardInterrupt: 

In [21]:
d = [['END8-1尾声，抑或开始', '/w/END8-1_%E5%B0%BE%E5%A3%B0%EF%BC%8C%E6%8A%91%E6%88%96%E5%BC%80%E5%A7%8B/NBT'], ['EG-1燃烧的片段1', '/w/EG-1_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B51/NBT'], ['EG-2燃烧的片段2', '/w/EG-2_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B52/NBT'], ['EG-3燃烧的片段3', '/w/EG-3_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B53/NBT'], ['EG-4燃烧的片段4', '/w/EG-4_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B54/NBT'], ['EG-5燃烧的片段5', '/w/EG-5_%E7%87%83%E7%83%A7%E7%9A%84%E7%89%87%E6%AE%B55/NBT'],]
for title,url in d:
	print(title)
	url_to_translatedCSV(url,title,mini=True,force_translate=False)

END8-1尾声，抑或开始
      index                                               text  \
185     185                                     ……现在我们成了他们的助手。   
187     187              霍克，回去以后要一起喝杯咖啡吗？我可以和你讲讲上次我们在哥伦比亚的行动……   
197     197                               那这次……我们可能真的光彩地做了件好事。   
207     207                                           啊，那应该是……   
208     208                      叔叔，我们要带走塔露拉。啊，它飞走了……我们已经带走她了。   
210     210                                  ……什么？你们，你们带走了塔露拉？   
214     214                                            不行……不行。   
219     219                                       小猫，现学现卖可不对……   
235     235                                    ……说什么！你不可以再这么说。   
238     238      但是，记住，无论是塔露拉，还是那个魔王……哪怕一次，只要有一次让我们知道你们在为非作歹……   
245     245                     以后……我会成为你们说的那个，能认出别人是好人还是坏人的人。   
258     258                                 ……如果我有女儿，应该也有你这么大。   
259     259         小猫，如果另一个女孩，那个魔王……那个卡特斯。如果她真的能做好，真的能走下去的话……   
573     573  这里的景色我永远不会忘记，晖洁。只要看到它，我就会想到他们......我的妹妹，我没有血缘关...

In [9]:
'我决定反着走。我要剥掉他们彬彬有礼的皮囊，拆开他们引以为豪的一层接一层的贵族城市，把真相告诉'=='我决定反着走。我要剥掉他们彬彬有礼的皮囊，拆开他们引以为豪的一层接一层的贵族城市，把真相告诉'

True