In [1]:
!pip3 install conllu



## Фичи

В первую очередь используем иероглифы, которые, как известно, имеют в себе семантический компонент, и, таким образом, являются значимыми признаками. Список иероглифов извлечём с http://www.hanzicraft.com/lists/frequency - там 8943 наиболее распространённых иероглифов. (в Python своего встроенного списка нет)

In [0]:
from bs4 import BeautifulSoup
import requests
sess = requests.Session()

Возьмём четыре тысячи наиболее частых иероглифов (на сайте они выстроены по частоте), потому что все сразу брать смысла нет.

In [0]:
page = sess.get('http://www.hanzicraft.com/lists/frequency')
soup = BeautifulSoup(page.text, 'html.parser')
hanzi = [ch.text.strip().split('\n')[0] for ch in soup.findAll('li', class_='list')][:4000]

In [4]:
len(hanzi)

4000

In [0]:
hanzi_set = set(hanzi)

Загрузим данные.

In [0]:
from conllu import parse
import urllib

In [7]:
train_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/master/zh_gsd-ud-train.conllu'
urllib.request.urlretrieve(train_url, filename = 'zh_gsd-ud-train.conllu')

('zh_gsd-ud-train.conllu', <http.client.HTTPMessage at 0x7fdc02c6e940>)

In [0]:
with open("zh_gsd-ud-train.conllu", "r", encoding="utf-8") as f:
    cntr = f.read()
f.close()

In [0]:
chinese_train = parse(cntr)

In [10]:
chinese_train[0]

TokenList<看似, 簡單, ，, 只, 是, 二, 選, 一, 做, 決擇, ，, 但, 其實, 他們, 代表, 的, 是, 你, 周遭, 的, 親朋, 好友, ，, 試, 著, 給, 你, 不同, 的, 意見, ，, 但, 追根究底, ，, 最後, 決定, 的, 還是, 自己, 。>

Для обучения будем использовать bag of characters, поскольку для иероглифов это просто предельно важно.

In [0]:
chinese_train_words = [[w['form'] for w in i] for i in chinese_train]
chinese_train_postags = [[w['upostag'] for w in i] for i in chinese_train]
train_data = [(chinese_train_words[i], chinese_train_postags[i]) for i in range(len(chinese_train))]

In [12]:
chinese_train_words[3], chinese_train_postags[3], train_data[3]

(['懷孕', '期', '為', '421', '至', '457', '日', '。'],
 ['VERB', 'PART', 'AUX', 'NUM', 'CCONJ', 'NUM', 'NOUN', 'PUNCT'],
 (['懷孕', '期', '為', '421', '至', '457', '日', '。'],
  ['VERB', 'PART', 'AUX', 'NUM', 'CCONJ', 'NUM', 'NOUN', 'PUNCT']))

Для иероглифов используем bag of characters. В основном иероглифы в одном слове не будут встречаться дважды, и словарь иероглифов довольно ограничен, так что этот подход довольно эффективен.

In [0]:
def features_list(word):
    features = [0] * 4000
    for ch in word:
        if isinstance(ch, int) and len(features) == 4000: # в таком случае это не иероглиф, а число. достаточно единожды.
            features.append(1)     # для чисел отдельная категория и своя часть речи.
        elif (not isinstance(ch, int)) and len(features) == 4000:
            features.append(0)
            
        if ch in hanzi_set: # быстрее membership checking. если иероглиф есть в списке:
            ind = hanzi.index(ch) # соответсвтующая позиция - 1
            features[ind] = 1
    return features

Итого получаем на выходе вектор размерностью 4001: 4000 иероглифов + число/не число

In [0]:
def prepare_sequence(seq): # слова как вектор с тем, какие иероглифы в нём содержатся
    idxs = [features_list(w) for w in seq]
    return torch.tensor(idxs, dtype=torch.float)

In [0]:
def prepare_tags(seq, tags_ix): # тэги по маппингу
    idxs = [tags_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

Построим таблицу POS тэгов.

In [0]:
train_pos = [dic['upostag'] for dic in sum(chinese_train, [])] # какие части речи есть в тренировочном корпусе? их и метим

In [0]:
textlabels = list(set(train_pos))
tag_to_ix = {}
for i in range(len(textlabels)):
    tag_to_ix[textlabels[i]] = i

In [18]:
tag_to_ix

{'ADJ': 7,
 'ADP': 13,
 'ADV': 14,
 'AUX': 3,
 'CCONJ': 0,
 'DET': 6,
 'NOUN': 5,
 'NUM': 10,
 'PART': 1,
 'PRON': 4,
 'PROPN': 11,
 'PUNCT': 9,
 'SYM': 8,
 'VERB': 2,
 'X': 12}

Получим тестовые данные.

In [19]:
test_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Chinese-GSD/master/zh_gsd-ud-test.conllu'
urllib.request.urlretrieve(test_url, filename = 'zh_gsd-ud-test.conllu')

('zh_gsd-ud-test.conllu', <http.client.HTTPMessage at 0x7fdbf7bdef28>)

In [0]:
with open("zh_gsd-ud-test.conllu", "r", encoding="utf-8") as f: #https://github.com/UniversalDependencies/UD_Chinese-GSD
    cntst = f.read()
f.close()

In [0]:
chinese_test = parse(cntst)

In [0]:
chinese_test_words = [[w['form'] for w in i] for i in chinese_test]
chinese_test_postags = [[w['upostag'] for w in i] for i in chinese_test]
test_data = [(chinese_test_words[i], chinese_test_postags[i]) for i in range(len(chinese_test))]

In [23]:
chinese_test_words[2], chinese_test_postags[2], test_data[2]

(['杜鵑花',
  '為',
  '溫帶',
  '植物',
  '，',
  '台北',
  '雖然',
  '在',
  '亞',
  '熱帶',
  '，',
  '但',
  '冬季',
  '的',
  '東北',
  '季風',
  '卻',
  '使得',
  '杜鵑花',
  '在',
  '臺大',
  '宜然自得',
  '。'],
 ['NOUN',
  'AUX',
  'NOUN',
  'NOUN',
  'PUNCT',
  'PROPN',
  'ADP',
  'VERB',
  'PART',
  'NOUN',
  'PUNCT',
  'ADV',
  'NOUN',
  'PART',
  'NOUN',
  'NOUN',
  'ADV',
  'VERB',
  'NOUN',
  'VERB',
  'PROPN',
  'VERB',
  'PUNCT'],
 (['杜鵑花',
   '為',
   '溫帶',
   '植物',
   '，',
   '台北',
   '雖然',
   '在',
   '亞',
   '熱帶',
   '，',
   '但',
   '冬季',
   '的',
   '東北',
   '季風',
   '卻',
   '使得',
   '杜鵑花',
   '在',
   '臺大',
   '宜然自得',
   '。'],
  ['NOUN',
   'AUX',
   'NOUN',
   'NOUN',
   'PUNCT',
   'PROPN',
   'ADP',
   'VERB',
   'PART',
   'NOUN',
   'PUNCT',
   'ADV',
   'NOUN',
   'PART',
   'NOUN',
   'NOUN',
   'ADV',
   'VERB',
   'NOUN',
   'VERB',
   'PROPN',
   'VERB',
   'PUNCT']))

Приступим к питорчу

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm

In [0]:
EMBEDDING_DIM = 4001
HIDDEN_DIM = 64
VOCAB_SIZE = 4001

In [0]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        lstm_out, _ = self.lstm(sentence.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [0]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [28]:
with torch.no_grad(): # scores before training
    inputs = prepare_sequence(train_data[0][0])
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-2.6452, -2.6986, -2.6694, -2.8301, -2.6916, -2.6713, -2.6924, -2.6595,
         -2.8458, -2.8378, -2.6700, -2.7204, -2.6561, -2.6917, -2.6730],
        [-2.6313, -2.7044, -2.6788, -2.8350, -2.6839, -2.6624, -2.6960, -2.6734,
         -2.8481, -2.8448, -2.6687, -2.7158, -2.6709, -2.6760, -2.6656],
        [-2.6259, -2.7099, -2.6858, -2.8368, -2.6808, -2.6581, -2.6996, -2.6795,
         -2.8496, -2.8455, -2.6653, -2.7131, -2.6767, -2.6700, -2.6598],
        [-2.6185, -2.7222, -2.7185, -2.8488, -2.6978, -2.6541, -2.6962, -2.6770,
         -2.8507, -2.8258, -2.6432, -2.6856, -2.6823, -2.6739, -2.6628],
        [-2.6202, -2.7066, -2.7130, -2.8564, -2.6814, -2.6631, -2.7071, -2.6863,
         -2.8558, -2.8438, -2.6456, -2.6884, -2.6705, -2.6699, -2.6533],
        [-2.6141, -2.7175, -2.7030, -2.8527, -2.6984, -2.6710, -2.6900, -2.6947,
         -2.8595, -2.8321, -2.6419, -2.6778, -2.6766, -2.6848, -2.6459],
        [-2.6176, -2.7172, -2.7008, -2.8454, -2.6930, -2.6608, -2.6953, -2.6

In [0]:
epochs = 10

In [30]:
for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in tqdm(train_data):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence)
        targets = prepare_tags(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    print('Epoch {} has passed'.format(str(epoch+1)))

100%|██████████| 3997/3997 [01:06<00:00, 60.11it/s]
  0%|          | 7/3997 [00:00<01:09, 57.20it/s]

Epoch 1 has passed


100%|██████████| 3997/3997 [01:06<00:00, 60.47it/s]
  0%|          | 6/3997 [00:00<01:07, 59.23it/s]

Epoch 2 has passed


100%|██████████| 3997/3997 [01:04<00:00, 62.36it/s]
  0%|          | 7/3997 [00:00<01:09, 57.42it/s]

Epoch 3 has passed


100%|██████████| 3997/3997 [01:05<00:00, 62.05it/s]
  0%|          | 7/3997 [00:00<01:11, 55.95it/s]

Epoch 4 has passed


100%|██████████| 3997/3997 [01:05<00:00, 60.63it/s]
  0%|          | 7/3997 [00:00<01:12, 55.36it/s]

Epoch 5 has passed


100%|██████████| 3997/3997 [01:05<00:00, 61.03it/s]
  0%|          | 7/3997 [00:00<01:08, 58.08it/s]

Epoch 6 has passed


100%|██████████| 3997/3997 [01:05<00:00, 61.42it/s]
  0%|          | 7/3997 [00:00<01:09, 57.43it/s]

Epoch 7 has passed


100%|██████████| 3997/3997 [01:05<00:00, 60.91it/s]
  0%|          | 5/3997 [00:00<01:22, 48.41it/s]

Epoch 8 has passed


100%|██████████| 3997/3997 [01:06<00:00, 60.32it/s]
  0%|          | 6/3997 [00:00<01:12, 55.23it/s]

Epoch 9 has passed


100%|██████████| 3997/3997 [01:05<00:00, 59.47it/s]

Epoch 10 has passed





In [31]:
with torch.no_grad():
    inputs = prepare_sequence(train_data[0][0])
    tag_scores = model(inputs)

    print(tag_scores[0]) # максимальное значение в тензоре соответствует предсказанной POS

tensor([ -9.9266, -10.9005,  -0.0656,  -5.5113,  -8.1226,  -4.4142,  -5.6757,
         -3.6999,  -9.5186,  -9.9690,  -6.3908,  -5.2723,  -7.8330,  -5.6646,
         -4.8286])


In [32]:
tags_predicted = []
tags_true = []
for d in tqdm(test_data):
    with torch.no_grad():

        tags = prepare_tags(d[1], tag_to_ix)
        tags_true.append(tags.tolist())
        
        inputs = prepare_sequence(d[0])
        tag_scores = model(inputs).tolist()
        tags_encoded = [i.index(max(i)) for i in tag_scores] # максимальное значение в тензоре соответствует предсказанной POS
        tags_predicted.append(tags_encoded)

100%|██████████| 500/500 [00:02<00:00, 168.20it/s]


In [33]:
tags_predicted[0], tags_true[0]

([14, 9, 2, 1, 5, 14, 2, 1, 7, 5, 5], [14, 9, 4, 1, 5, 14, 2, 1, 7, 5, 9])

Для глобальной статистики мы можем и unnestн'уть списки. Это поможет определить, какие части речи определились правильно, а какие - нет.

In [0]:
tags_pred = sum(tags_predicted, [])

In [0]:
tags_actual = sum(tags_true, [])

In [0]:
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

In [37]:
f1_score(tags_actual, tags_pred, average='micro')

0.6704129204129204

In [38]:
accuracy_score(tags_actual, tags_pred)

0.6704129204129204

'PART': 0,
 'SYM': 1,
 'ADP': 2,
 'ADJ': 3,
 'NOUN': 4,
 'CCONJ': 5,
 'DET': 6,
 'VERB': 7,
 'PRON': 8,
 'X': 9,
 'NUM': 10,
 'PUNCT': 11,
 'ADV': 12,
 'AUX': 13,
 'PROPN': 14

In [39]:
confusion_matrix(tags_actual, tags_pred)

array([[ 140,    3,   11,    1,    0,    3,    0,    0,    0,   21,    0,
           0,    0,    5,    7],
       [   0,  900,   27,    2,    0,  218,    0,    4,    0,  160,   22,
          19,    0,    1,    6],
       [   1,   19, 1185,   23,    1,  236,    4,   11,    0,  139,   55,
          27,    0,   43,   25],
       [   0,    2,   44,  157,    0,    7,    0,    3,    0,   45,   12,
           1,    0,    0,   10],
       [   0,    1,    1,    0,  147,    4,    2,    0,    0,    4,    9,
           0,    0,    0,    0],
       [   0,   81,  253,    4,    3, 2452,   10,   15,    0,  216,   97,
         146,    0,   23,   12],
       [   0,    3,    6,    0,   14,   15,   60,    2,    0,    8,   26,
           0,    0,    2,    2],
       [   1,    3,   32,    1,    0,   75,    0,   88,    0,   16,   24,
           6,    0,    1,   25],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    3,    0,
           0,    0,    0,    0],
       [   0,   43,  135,    0,    0,

Для пяти эпох результаты очень хорошие. Это говорит об эффективности модели Bag of Characters для сильно основанного на иероглифах китайского языка!