# Export embeddings for Hanzis

In [6]:
from pathlib2 import Path

In [7]:
# Relative directories used throughout the notebook: glyph images are read
# from below data_path, the exported pickles are written into save_dir, and
# model_dir is handed to load_best_checkpoint.
data_path, save_dir, model_dir = (
    Path(name) for name in ('data', 'embeddings', 'experiments')
)

# Edge length passed to transforms.Resize for both height and width.
wrap_size = 32

## Prepare the list of Hanzis

### The list of Hanzi from the BERT tokenizer

In [8]:
from transformers import BertTokenizer
from dataset import get_valid_chinese_chars

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load the tokenizer published with the pretrained 'bert-base-chinese' model.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Characters to export embeddings for. get_valid_chinese_chars comes from the
# project's dataset module; presumably it filters the tokenizer vocabulary
# down to single Chinese characters — TODO confirm against dataset.py.
hanzis = get_valid_chinese_chars(tokenizer)

## Export visual embeddings

In [18]:
import torch
import pickle

from utils import load_best_checkpoint
from tqdm import tqdm
from PIL import Image
from torchvision import transforms

from model import create_AutoEncoder
import yaml

In [3]:
# Read the experiment configuration. The file is opened in a `with` block so
# the handle is closed as soon as parsing finishes instead of lingering for
# the lifetime of the kernel.
# NOTE(review): FullLoader is kept to preserve behaviour; it is intended for
# trusted, project-local YAML only.
with open('config/config.yml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [11]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build the autoencoder from the 'model' section of the config and move it to
# the selected device.
model = create_AutoEncoder(config['model'])
model = model.to(device)
# Presumably loads the best checkpoint found under model_dir into `model`
# (the return value is not used) — see utils.load_best_checkpoint.
load_best_checkpoint(model_dir, model)
# The model is only used for inference below. Switching to eval mode makes
# layers such as Dropout/BatchNorm (if the architecture has any) behave
# deterministically, so the exported embeddings do not depend on training-mode
# behaviour. eval() returns the module itself.
model = model.eval()

loading best checkpoint at experiments/best.pth.tar


In [12]:
# Preprocessing applied to every glyph image before it is fed to the encoder:
# resize to a wrap_size x wrap_size square, then convert the PIL image to a
# tensor.
img_transforms = transforms.Compose([
    transforms.Resize(size=(wrap_size, wrap_size)),
    transforms.ToTensor(),
])

In [19]:
# Export one visual embedding per character: load its rendered glyph image,
# run it through the autoencoder's encoder and store the result as a numpy
# array keyed by the character.

# Make sure the output directory exists so the final dump cannot fail on a
# fresh checkout.
save_dir.mkdir(parents=True, exist_ok=True)

# The glyph directory depends only on the configured font, so build it once
# instead of on every iteration.
glyph_dir = data_path / 'hanzi_img' / config['font']

embeddings = {}
with torch.no_grad():  # inference only; no autograd graph is needed
    for hanzi in tqdm(hanzis):
        img_path = glyph_dir / f'{hanzi}.jpg'
        # Close the image file deterministically; convert('L') returns an
        # independent single-channel copy that stays valid afterwards.
        with Image.open(str(img_path)) as img_file:
            img = img_file.convert('L')
        # unsqueeze(0) adds the leading batch dimension; it is removed again
        # with squeeze(0) after encoding.
        img_tensor = img_transforms(img).unsqueeze(0).to(device)
        embedding = model.Encoder(img_tensor)
        embedding = embedding.squeeze(0).detach().cpu().numpy()
        embeddings[hanzi] = embedding

# Write the {character: embedding} mapping; the `with` block guarantees the
# file is flushed and closed.
with open(save_dir / 'visual_embedding.pkl', 'wb') as pkl_file:
    pickle.dump(embeddings, pkl_file)

100%|██████████| 5610/5610 [00:05<00:00, 994.10it/s] 


## Export BERT embeddings

In [20]:
from transformers import BertModel, BertTokenizer
from tqdm import tqdm

In [21]:
# Load the pretrained Chinese BERT weights and place them on the same device
# as before. Note that `model` and `tokenizer` are rebound here, replacing the
# objects assigned to those names earlier in the notebook.
model = BertModel.from_pretrained('bert-base-chinese').to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

Downloading pytorch_model.bin: 100%|██████████| 412M/412M [00:06<00:00, 66.2MB/s] 
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:

# Export one text embedding per character: look up the character's row in
# BERT's word-embedding table and store it as a numpy array keyed by the
# character.

# Make sure the output directory exists so the final dump cannot fail on a
# fresh checkout.
save_dir.mkdir(parents=True, exist_ok=True)

embeddings = {}
for hanzi in tqdm(hanzis):
    inputs = tokenizer(hanzi, return_tensors='pt')
    # Row 0 is the only sequence in the batch; position 1 skips the leading
    # special token the tokenizer prepends ([CLS] for BERT) and selects the
    # id of the character itself.
    hanzi_id = inputs['input_ids'][0][1].to(device)
    # detach() drops the autograd link before converting to numpy.
    embedding = model.embeddings.word_embeddings(hanzi_id).detach().cpu().numpy()
    embeddings[hanzi] = embedding

# Write the {character: embedding} mapping; the `with` block guarantees the
# file is flushed and closed.
with open(save_dir / 'text_embedding.pkl', 'wb') as pkl_file:
    pickle.dump(embeddings, pkl_file)

100%|██████████| 5610/5610 [00:02<00:00, 2752.48it/s]
