In [1]:
import urllib.request
import re

In [2]:
# Maps code point -> True for every character that should be written to chars.txt.
chars = dict()

In [3]:
# Code points listed here are put back into chars after the forbidden-character
# filter has run, so they reach chars.txt even when
# langdata/jpn/forbidden_characters covers them.
# NOTE(review): the original note promised the same for half-width katakana and
# full-width alphanumerics, but that exemption is commented out in the
# filtering cell - confirm which behaviour is intended.
add_list = [0x25b2, 0x25b3]  # U+25B2 ▲, U+25B3 △


In [4]:
# Collect the Unicode code point of every entry in the mapping table downloaded
# from x0213.org (sjis-0213-2004-std.txt) into chars.
with urllib.request.urlopen('http://x0213.org/codetable/sjis-0213-2004-std.txt') as f:
    for line in f.read().decode('ascii').splitlines():
        # startswith() is safe on empty lines, where line[0] would raise IndexError.
        if line.startswith('#'):
            continue
        # Only the first "U+XXXX" on the line is used. 4-6 hex digits are accepted
        # so that a code point above U+FFFF is read in full instead of being
        # truncated to its first four digits (which would record a wrong character).
        m = re.search(r'U\+([0-9a-f]{4,6})', line, flags=re.I)
        if m:
            code = int(m.group(1), base=16)
            # Ignore control characters and the ASCII space (code points <= 0x20).
            if code > 0x20:
                chars[code] = True

In [5]:
# Build del_list: every code point in chars that falls inside a range listed in
# the forbidden_characters file. A range is written at the end of a line either
# as a single value ("0xAAAA") or as a span ("0xAAAA-0xBBBB").
del_list = {}
with open('./jpn/forbidden_characters') as f:
    for line in f:
        m = re.search(r'0x([0-9a-f]{2,4})(-0x([0-9a-f]{2,4}))?\s*$', line, flags=re.I)
        if not m:
            # Nothing to compare against on this line. Scanning chars anyway would
            # raise NameError if it is the first line, and otherwise just repeat
            # the previous line's range.
            continue
        range_start = int(m.group(1), base=16)
        # A single value is treated as a one-element range.
        range_end = int(m.group(3), base=16) if m.group(2) else range_start
        for c in chars:
            if range_start <= c <= range_end:
                # NOTE: an exemption for half-width katakana (U+FF61-U+FF9F) and
                # full-width alphanumerics (U+FF01-U+FF5D) used to be applied here
                # but is currently disabled, so those characters are removed too.
                del_list[c] = True

In [6]:
# Drop the forbidden code points from chars. pop() with a default keeps this
# cell safe to re-run: a plain `del` raises KeyError once the key is already gone.
for c in del_list:
    chars.pop(c, None)

In [7]:
# Put the explicitly allowed code points (back) into chars.
chars.update(dict.fromkeys(add_list, True))

In [8]:
# Write one "0x<hex>,<character>" line per code point, in ascending order.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# default encoding, which may be unable to represent some of these characters.
with open('chars.txt', 'w', encoding='utf-8') as wf:
    for code in sorted(chars):
        print("0x%x,%s" % (code, chr(code)), file=wf)

In [9]:
# Sanity check: show the character for code point 0x25b2 (the first add_list entry).
print("%c" % 0x25b2)

▲


この文字コード一覧（chars.txt）を基準として、単語辞書をもとに学習用テキストを作成

create_training_text.py

In [20]:
import glob
import random
import sys
import textwrap
from collections import Counter

In [21]:
def read_chars(filename):
    """Load a character list and return a Counter with every count set to 0.

    Args:
        filename: Path to a UTF-8 text file holding one "0x<hex>,<char>" entry
            per line (the format written to chars.txt). Only the hexadecimal
            field before the first comma is read.

    Returns:
        collections.Counter keyed by integer code point, each initialised to 0.

    Raises:
        ValueError: If the first field of a non-blank line is not hexadecimal.
    """
    count = Counter()
    # Explicit UTF-8 so decoding does not depend on the platform default encoding.
    with open(filename, encoding='utf-8') as chars:
        for line in chars:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            count[int(line.split(',')[0], base=16)] = 0
    return count

def read_all_words(dir_s):
    """Return the distinct first-column entries of every CSV file in a directory.

    Args:
        dir_s: Directory that is searched (non-recursively) for "*.csv" files,
            each of which is read as UTF-8.

    Returns:
        A list of the text before the first comma of each line, with duplicates
        removed and first-seen order kept.
    """
    seen = {}
    for csv_path in glob.glob(dir_s + '/*.csv'):
        with open(csv_path, encoding='utf-8') as handle:
            # A dict is used as an ordered set: re-inserting a key keeps its position.
            seen.update((row.split(',')[0], True) for row in handle)
    return list(seen)

def main(count_required=20, seed=None):
    """Build training_bs.txt from the dictionary words and report unused characters.

    Reads the allowed characters from 'chars.txt' and the candidate words from
    'mecab-ipadic-neologd/seed', shuffles the words and greedily keeps each word
    whose rarest character has so far been emitted fewer than ``count_required``
    times. Words containing a character that is not in chars.txt are skipped and
    reported on stderr.

    Args:
        count_required: Target number of occurrences per character; a word is
            used only while at least one of its characters is below this count.
        seed: Optional seed for the shuffle. When given, the word order (and so
            the generated text) is reproducible; when None the global random
            state is used, as before.

    Side effects:
        Writes 'training_bs.txt' (selected words concatenated and wrapped at 40
        columns) and 'unused_chars.txt' (characters that never appeared), both
        UTF-8, and prints the total word count to stdout.
    """
    chars = read_chars('chars.txt')
    words = read_all_words('mecab-ipadic-neologd/seed')
    print("Total words %d" % len(words))

    # A private generator keeps a seeded run from disturbing the global random state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(words)

    # Collect the chosen words in a list and join once at the end; repeated
    # string concatenation over the whole dictionary is needlessly expensive.
    selected = []
    for word in words:
        # First character that is not part of the allowed character set, if any.
        unknown = next((c for c in word if ord(c) not in chars), None)
        if unknown is not None:
            print("skipped %s by %s" % (word, unknown), file=sys.stderr)
            continue
        codes = [ord(c) for c in word]
        if not codes:
            # An empty entry contributes no characters.
            continue
        # +1 is the count the rarest character would reach if this word were used.
        if min(chars[code] for code in codes) + 1 <= count_required:
            selected.append(word)
            # Record the usage of every character occurrence in the word.
            for code in codes:
                chars[code] += 1

    # Open the output only once the text is complete; `with` guarantees it is closed.
    with open('training_bs.txt', 'w', encoding='utf-8') as training:
        training.write("\n".join(textwrap.wrap(''.join(selected), width=40)))

    # Characters that were never used by any selected word.
    with open('unused_chars.txt', 'w', encoding='utf-8') as uc:
        for c in chars:
            if chars[c] == 0:
                print('0x%x,%s' % (c, chr(c)), file=uc)

In [22]:
# Load the de-duplicated first-column entries of every CSV under mecab-ipadic-neologd/seed.
words = read_all_words('mecab-ipadic-neologd/seed')

In [24]:
# Display how many distinct words were loaded.
len(words)

5146049

Total words 0


ここまで出来たら、training_bs.txtを使って学習用データをtesstrain.shで作成する。 <br>
以下のコマンドで、学習データがきちんと生成できるかを確認する。

In [None]:
# Generate the line training data with tesstrain.sh in the background.
# --training_text points at the training text produced above; this assumes
# training_bs.txt has been placed at ~/training_bs.txt.
# Append further font names to --fontlist as needed.
# NOTE: ~/tess/training_bs must already exist, otherwise the shell cannot open
# generate.log for the redirection and the command is never started.
nohup time bash ~/tess/tesseract/src/training/tesstrain.sh --fonts_dir /usr/share/fonts --lang jpn \
  --fontlist "TakaoGothic" "TakaoPGothic" "VL Gothic" "VL PGothic" "Noto Sans CJK JP Bold" "Noto Sans CJK JP" \
  --linedata_only \
  --training_text ~/training_bs.txt \
  --langdata_dir ~/tess/langdata \
  --noextract_font_properties --output_dir ~/tess/training_bs > ~/tess/training_bs/generate.log 2>&1 &