In [None]:
# Chapter という文字列が含まれる行を抽出して別ファイルに保存する
# 抽出結果を書き込んだファイルを確認し、文中にChapterが含まれる行があるか確認する

import os
import re
from tqdm import tqdm

# ディレクトリの指定
source_directory = './texts'
check_directory = './check'

# 旧約を先にする
paths = sorted(os.listdir(source_directory), reverse=True)

for path in paths:
	print(f"\n---- {path} ----\n")
	with open(f'{source_directory}/{path}', mode='r', encoding='utf-8') as f:
		lines = f.readlines()
		with open(f'{check_directory}/{path.replace("bible", "chapters")}', mode='w', encoding='utf-8') as g:
			for line in tqdm(lines):
				if (re.search(r'Chapter', line)): g.write(line)


---- old.bible.txt ----



100%|██████████| 108346/108346 [00:00<00:00, 1202036.90it/s]



---- new.bible.txt ----



100%|██████████| 29549/29549 [00:00<00:00, 1634175.30it/s]


In [None]:
# 旧約と新約で形式が少し違うため、形式をそろえる。
# Chapterごとに分類できるように無駄な文字を削除する
#
# ex)
# 1. Matther Chapter 1	 		-> Matther Chapter
# 2. Matther Chapter 2~ 		-> None
# 3. ~:~				   		-> None
# 4. THE BOOK OF ~		 		-> None
# 5. \n					   		-> None

import os
import re
from tqdm import tqdm

# ディレクトリの指定
source_directory = './texts'
target_directory = './clean'

# 旧約を先にする
paths = sorted(os.listdir(source_directory), reverse=True)

pattern12 = re.compile(r'^[0-9]?\s?[A-Za-z]+ Chapter [0-9]+$')
pattern3  = re.compile(r'^[0-9]+:[0-9]+.\s')
pattern45 = re.compile(r'^[THE BOOK OF [A-Z]+|\n')

for path in paths:
	print(f"\n---- {path} ----\n")

	with open(f'{source_directory}/{path}', mode='r', encoding='utf-8') as f:
		lines = f.readlines()

		with open(f'{target_directory}/{path}', mode='w', encoding='utf-8') as g:
			for line in tqdm(lines):
				if (re.match(pattern12, line)):
					match (line.strip().split()[-1]):
						case '1':
							line = re.sub(r'[0-9]+ ', '', line)
							line = re.sub('Chapter 1', 'Chapter', line)
							g.write(line)
						case _:
							continue
				elif (re.match(pattern3, line)): g.write(re.sub(pattern3, '', line))
				elif (re.match(pattern45, line)): continue
				else: g.write(line)


---- old.bible.txt ----



100%|██████████| 108347/108347 [00:00<00:00, 186862.88it/s]



---- new.bible.txt ----



100%|██████████| 29550/29550 [00:00<00:00, 125355.44it/s]


In [113]:
# 章ごとに別々のファイルに抽出する。

import os
import re
from tqdm import tqdm
from shutil import rmtree

# ディレクトリの指定
source_directory = './clean'

# 旧約を先にする
paths = sorted(os.listdir(source_directory), reverse=True)

for path in paths:
	if path.split('.')[-1] != 'txt': continue
	
	print(f"\n---- {path} ----\n")
	if (os.path.exists(f"{source_directory}/{path.split('.')[0]}")):
		rmtree(f"{source_directory}/{path.split('.')[0]}")
		os.mkdir(f"{source_directory}/{path.split('.')[0]}")

	with open(f'{source_directory}/{path}', mode='r', encoding='utf-8') as f:
		lines = f.readlines()

	target_directory = None
	for line in tqdm(lines):
		if (re.search(r'[A-Za-z]+ Chapter$', line)):
			target_directory = f"{source_directory}/{path.split('.')[0]}/{line.split()[0]}.txt"
		elif target_directory is None: continue
		else:
			with open(target_directory, mode='a', encoding='utf-8') as g:
				g.write(line)


---- old.bible.txt ----



100%|██████████| 68802/68802 [00:19<00:00, 3556.44it/s]



---- new.bible.txt ----



100%|██████████| 18503/18503 [00:05<00:00, 3437.96it/s]
