# Export embeddings for Hanzis

In [27]:
from pathlib2 import Path

In [41]:
# Locations used throughout the notebook, relative to the working directory:
# input data, exported embeddings, and model checkpoints respectively.
data_path, save_dir, model_dir = map(Path, ('data', 'embeddings', 'experiments'))

# Side length (in pixels) that glyph images are resized to before encoding.
wrap_size = 32

## Prepare the list of Hanzis

### The list of hanzi from the visual encoder's dataset

In [4]:
import pandas as pd

In [36]:
# Load the hanzi table that accompanies the visual encoder's dataset.
dataset_hanzi_df = pd.read_csv(data_path / 'common_hanzi.csv')
# Read the column from the frame loaded on the previous line; the name
# `hanzi_df` is not defined anywhere in this notebook, so using it raises
# NameError on a fresh kernel.
dataset_hanzis = dataset_hanzi_df['汉字'].values.tolist()

### The list of hanzi from the BERT tokenizer

In [17]:
from transformers import BertTokenizer
import re

In [15]:
def get_valid_chinese_chars(tokenizer):
    """Return the vocabulary tokens that consist of exactly one Chinese character.

    Args:
        tokenizer: any object exposing ``get_vocab()`` that returns a mapping
            whose keys are the token strings.

    Returns:
        list[str]: tokens that are a single character in the range
        U+4E00..U+9FFF, in the vocabulary's iteration order.
    """
    pattern = re.compile(r'[\u4e00-\u9fff]')
    vocab = tokenizer.get_vocab()
    # fullmatch (rather than match) rejects multi-character tokens that merely
    # *start* with a hanzi; callers treat every returned item as one character.
    return [token for token in vocab if pattern.fullmatch(token)]

In [22]:
# Load the pretrained 'bert-base-chinese' tokenizer and keep the vocabulary
# entries that get_valid_chinese_chars accepts as Chinese characters.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert_hanzis = get_valid_chinese_chars(tokenizer)

### The common set

In [23]:
# Characters present in both the dataset list and the BERT vocabulary.
# sorted() yields a list in a reproducible order; iterating a set of strings
# directly can give a different order on every interpreter run.
hanzis = sorted(set(dataset_hanzis) & set(bert_hanzis))

## Export visual embeddings

In [48]:
import torch
import pickle

from utils import load_best_checkpoint
from tqdm import tqdm
from PIL import Image
from torchvision import transforms

from model import create_AutoEncoder

In [32]:
# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the autoencoder, move it to the chosen device, and restore weights
# from the checkpoint directory.
# NOTE(review): the meaning of the `None` argument is defined in model.py,
# which is not visible here — confirm.
model = create_AutoEncoder(None)
model = model.to(device)
load_best_checkpoint(model_dir, model)

# The model is only used for inference below, so switch to evaluation mode:
# layers such as dropout / batch-norm (if the architecture has any) would
# otherwise run in training mode. nn.Module.eval() returns the module itself.
model = model.eval()

loading best checkpoint at experiments/best.pth.tar


In [51]:
# Preprocessing applied to each glyph image: resize to a wrap_size x wrap_size
# square, then convert to a tensor.
img_transforms = transforms.Compose([
    transforms.Resize((wrap_size, wrap_size)),
    transforms.ToTensor(),
])

In [52]:
# Encode every hanzi's glyph image with the autoencoder's encoder and collect
# the results in a dict keyed by the character.
embeddings = {}
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for hanzi in tqdm(hanzis):
        img_path = data_path / 'hanzi_img' / f'{hanzi}.jpg'
        # Close the image file as soon as the grayscale copy has been made,
        # instead of leaving the handle to the garbage collector.
        with Image.open(str(img_path)) as img_file:
            img = img_file.convert('L')
        # unsqueeze(0) adds a leading batch axis; squeeze(0) removes it again.
        img_tensor = img_transforms(img).unsqueeze(0).to(device)
        embedding = model.encoder(img_tensor)
        embedding = embedding.squeeze(0).detach().cpu().numpy()
        embeddings[hanzi] = embedding

# Make sure the output directory exists, and close the pickle file
# deterministically so the data is flushed to disk.
save_dir.mkdir(parents=True, exist_ok=True)
with open(save_dir / 'visual_embedding.pkl', 'wb') as pkl_file:
    pickle.dump(embeddings, pkl_file)

100%|██████████| 3487/3487 [00:04<00:00, 840.09it/s]


## Export BERT embeddings

In [74]:
from transformers import BertModel, BertTokenizer
from tqdm import tqdm

In [62]:
# Pretrained Chinese BERT, placed on the same device chosen earlier, together
# with its matching tokenizer. Note that `model` and `tokenizer` are rebound here.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese').to(device)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [75]:

# Look up each hanzi's row in BERT's word-embedding table and collect the
# vectors in a dict keyed by the character.
embeddings = {}
for hanzi in tqdm(hanzis):
    inputs = tokenizer(hanzi, return_tensors='pt')
    # Take the id at position 1 of the first (only) sequence; position 0 is
    # presumably the special start token added by the tokenizer — TODO confirm.
    hanzi_id = inputs['input_ids'][0][1].to(device)
    embedding = model.embeddings.word_embeddings(hanzi_id).detach().cpu().numpy()
    embeddings[hanzi] = embedding

# Make sure the output directory exists, and close the pickle file
# deterministically so the data is flushed to disk.
save_dir.mkdir(parents=True, exist_ok=True)
with open(save_dir / 'text_embedding.pkl', 'wb') as pkl_file:
    pickle.dump(embeddings, pkl_file)

100%|██████████| 3487/3487 [00:01<00:00, 2376.34it/s]
