In [1]:
import random
import pandas as pd
from transformers import BertTokenizer

from dataset import get_valid_chinese_chars

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert_hanzis = get_valid_chinese_chars(tokenizer)

In [5]:
# separate the characters into train, val, test
index = list(range(len(bert_hanzis)))
random.shuffle(index)  # shuffle the index
train_split = int(len(bert_hanzis) * 0.8)  # 80% for train
val_split = int(len(bert_hanzis) * 0.9)  # 10% for val
train_df = pd.DataFrame({'hanzi': bert_hanzis[:train_split]})
val_df = pd.DataFrame({'hanzi': bert_hanzis[train_split:val_split]})
test_df = pd.DataFrame({'hanzi': bert_hanzis[val_split:]})

In [6]:
train_df['image_name'] = train_df['hanzi'] + '.jpg'
val_df['image_name'] = val_df['hanzi'] + '.jpg'
test_df['image_name'] = test_df['hanzi'] + '.jpg'

In [7]:
train_df['image_name'].to_csv('./data/train.csv', index=False)
val_df['image_name'].to_csv('./data/val.csv', index=False)
test_df['image_name'].to_csv('./data/test.csv', index=False)

# Generated Chinese Character Dataset

In [8]:
from PIL import ImageFont, ImageDraw, Image
from tqdm import tqdm
from pathlib2 import Path

In [9]:
font_name = 'STSONG'
font_tft_path = f'./data/font/{font_name}.TTF'
text_size = 64 # also the image size
font = ImageFont.truetype(font_tft_path, text_size)

In [10]:
def generate_hanzi_image(characters, text_size, font, save_dir):
    if not save_dir.exists():
        save_dir.mkdir(parents=True)
    for character in tqdm(characters):
        text_width, text_height = font.getsize(character)
        xmin, ymin, xmax, ymax = font.getmask(character).getbbox()
        offsetx, offsety = font.getoffset(character)
        canvas = Image.new('L', (text_width, text_height), (255))
        draw = ImageDraw.Draw(canvas)
        white = 0
        draw.text((0, 0), character, font=font, fill=white)
        x_gap = (text_size - (xmax - xmin)) / 2
        y_gap = (text_size - (ymax - ymin)) / 2
        canvas = canvas.crop((xmin + offsetx - x_gap, ymin + offsety - y_gap, xmax + offsetx + x_gap, ymax + offsety + y_gap))
        # check the size of canvas
        assert canvas.size == (text_size, text_size)
        canvas.save(str(save_dir / '{}.jpg'.format(character)))

In [11]:
dir = Path('./data/hanzi_img') / font_name
generate_hanzi_image(bert_hanzis, text_size, font, dir)

  text_width, text_height = font.getsize(character)
  offsetx, offsety = font.getoffset(character)
100%|██████████| 5610/5610 [00:03<00:00, 1441.84it/s]
