In [None]:
import mindspore as ms
from mindspore import nn
from mindnlp.modules import CRF
from mindspore import ops

class BiLSTM_CRF(nn.Cell):
    """Sequence tagger: embedding -> bidirectional LSTM -> dense projection -> CRF.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector (LSTM input size).
        hidden_dim: input width of the dense projection. Each LSTM direction
            gets ``hidden_dim // 2`` units, so the value must be even for the
            concatenated bidirectional output to match the projection.
        num_tags: number of tag classes produced by the projection and used
            by the CRF.
        padding_idx: forwarded to ``nn.Embedding`` as ``padding_idx``.

    Raises:
        ValueError: if ``hidden_dim`` is smaller than 2 or odd.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags, padding_idx=0):
        super().__init__()
        # An odd hidden_dim would give an LSTM output of width hidden_dim - 1,
        # which no longer fits the Dense(hidden_dim, ...) layer below.
        if hidden_dim < 2 or hidden_dim % 2 != 0:
            raise ValueError(
                f"hidden_dim must be a positive even number, got {hidden_dim!r}"
            )
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Dense(hidden_dim, num_tags, 'he_uniform')
        self.crf = CRF(num_tags, batch_first=True)

    def construct(self, inputs, seq_length, tags=None):
        """Run the full pipeline and return the CRF layer's output.

        Args:
            inputs: token indices fed to the embedding layer.
            seq_length: per-sequence lengths, passed to both the LSTM and the CRF.
            tags: optional tag indices, forwarded to the CRF.

        Returns:
            Whatever ``self.crf(feats, tags, seq_length)`` returns.
            NOTE(review): presumably a loss when ``tags`` is given and decoded
            paths otherwise -- confirm against the mindnlp CRF documentation.
        """
        embedded = self.embedding(inputs)
        lstm_out, _ = self.lstm(embedded, seq_length=seq_length)
        emissions = self.hidden2tag(lstm_out)
        return self.crf(emissions, tags, seq_length)


In [None]:

# Sizes used by the toy model built in the next cell.
embedding_dim, hidden_dim = 16, 32

# Two hand-labelled sentences: whitespace-separated characters paired with
# one B/I/O tag per character.
training_data = [
    ("清 华 大 学 坐 落 于 首 都 北 京".split(), "B I I I O O O O O B I".split()),
    ("重 庆 是 一 个 魔 幻 城 市".split(), "B I O O O O O O O".split()),
]

# Index 0 is reserved for the padding token; every unseen character is given
# the next free index in order of first appearance.
word_to_idx = {'<pad>': 0}
for sentence, tags in training_data:
    for word in sentence:
        # setdefault leaves already-registered characters untouched.
        word_to_idx.setdefault(word, len(word_to_idx))

tag_to_idx = dict(B=0, I=1, O=2)

# Displayed as the cell output: vocabulary size including '<pad>'.
len(word_to_idx)

In [None]:

# Build the tagger from the toy vocabulary and tag set defined above.
model = BiLSTM_CRF(
    vocab_size=len(word_to_idx),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_tags=len(tag_to_idx),
)

# Plain SGD over every trainable parameter of the model.
optimizer = nn.SGD(
    model.trainable_params(),
    learning_rate=0.01,
    weight_decay=1e-4,
)

# Bundle network and optimizer into a single callable training cell.
train_one_step = nn.TrainOneStepCell(model, optimizer)



In [None]:

def prepare_sequence(seqs, word_to_idx, tag_to_idx):
    """Turn ``(tokens, tags)`` pairs into padded int64 index tensors.

    Every sentence is padded on the right up to the length of the longest
    sentence: token rows with ``word_to_idx['<pad>']`` and tag rows with
    ``tag_to_idx['O']``.

    Args:
        seqs: iterable of ``(tokens, tags)`` pairs; within a pair both
            sequences must have the same length.
        word_to_idx: mapping from token to index; needs a ``'<pad>'`` entry
            whenever padding is required.
        tag_to_idx: mapping from tag to index; needs an ``'O'`` entry
            whenever padding is required.

    Returns:
        A tuple ``(token_ids, tag_ids, lengths)`` of ``ms.Tensor`` objects
        created with dtype ``ms.int64``; ``lengths`` holds the unpadded
        length of each sentence.

    Raises:
        ValueError: if ``seqs`` is empty, or a pair has a different number of
            tokens and tags (which would otherwise yield ragged label rows).
        KeyError: if a token or tag is missing from its mapping.
    """
    # Materialise once so generators can be traversed more than once.
    seqs = list(seqs)
    if not seqs:
        raise ValueError("prepare_sequence() needs at least one (tokens, tags) pair")

    for position, (seq, tag) in enumerate(seqs):
        if len(seq) != len(tag):
            raise ValueError(
                f"pair {position}: {len(seq)} tokens but {len(tag)} tags"
            )

    max_len = max(len(seq) for seq, _ in seqs)

    seq_outputs, label_outputs, seq_length = [], [], []
    for seq, tag in seqs:
        seq_length.append(len(seq))
        idxs = [word_to_idx[w] for w in seq]
        labels = [tag_to_idx[t] for t in tag]

        missing = max_len - len(seq)
        if missing:
            # Padding entries are looked up only when actually needed.
            idxs.extend([word_to_idx['<pad>']] * missing)
            labels.extend([tag_to_idx['O']] * missing)

        seq_outputs.append(idxs)
        label_outputs.append(labels)

    return ms.Tensor(seq_outputs, ms.int64), \
            ms.Tensor(label_outputs, ms.int64), \
            ms.Tensor(seq_length, ms.int64)


In [None]:

# Encode the toy corpus into padded token ids, tag ids and sentence lengths.
data, label, seq_length = prepare_sequence(
    training_data,
    word_to_idx,
    tag_to_idx,
)
# Cell output: the shape of each of the three tensors, in the same order.
tuple(tensor.shape for tensor in (data, label, seq_length))


In [None]:

# Call compile() on the wrapped training cell with the same positional
# arguments (data, seq_length, label) that the training loop passes afterwards.
# NOTE(review): presumably this builds the graph ahead of time so the first
# training step is not slowed down by compilation -- confirm.
train_one_step.compile(data, seq_length, label)


In [None]:

from tqdm import tqdm

# Number of optimisation steps on the single toy batch.
steps = 500
with tqdm(total=steps) as progress:
    for step in range(steps):
        # Every step re-uses the same (data, seq_length, label) batch.
        loss = train_one_step(data, seq_length, label)
        # Show the latest loss next to the bar, then advance it by one step.
        progress.set_postfix(loss=loss)
        progress.update(1)


In [None]:
import re
from mindnlp.dataset import CoNLL2000Chunking
import mindspore.dataset as ds
import mindspore as ms
from mindspore.dataset import text,GeneratorDataset

# Load both splits returned by the CoNLL2000Chunking loader.
dataset_train, dataset_test = CoNLL2000Chunking()

# Column bookkeeping: the two columns kept from the loader output and the
# names they are given for the rest of the notebook.
columns_to_project = ["words", "chunk_tag"]
input_columns = ["words", "chunk_tag"]
output_columns = ["text", "label"]

# Keep only the selected columns, then rename them -- same steps for each split.
dataset_train = (
    dataset_train
    .project(columns=columns_to_project)
    .rename(input_columns=input_columns, output_columns=output_columns)
)
dataset_test = (
    dataset_test
    .project(columns=columns_to_project)
    .rename(input_columns=input_columns, output_columns=output_columns)
)

class TmpDataset:
    """Random-access source yielding the length of each ``"text"`` entry.

    The wrapped dataset is iterated once, at construction time, through
    ``create_dict_iterator()``; the length of every row's ``"text"`` value is
    cached so that instances can be indexed and measured with ``len()``.

    Args:
        dataset: object exposing ``create_dict_iterator()`` whose rows are
            dict-like and contain a ``"text"`` entry supporting ``len()``.
    """

    def __init__(self, dataset):
        self._dataset = dataset
        self._seq_length = []
        self._load()

    def _load(self):
        """(Re)build the cached list of ``"text"`` lengths."""
        # Rebuilding instead of appending keeps a repeated call idempotent.
        self._seq_length = [
            len(row["text"]) for row in self._dataset.create_dict_iterator()
        ]

    def __getitem__(self, index):
        """Return the cached length stored at ``index``."""
        return self._seq_length[index]

    def __len__(self):
        """Return the number of cached rows.

        Kept free of side effects: the previous debug print ran on every
        ``len()`` call and raised ``ValueError`` for an empty dataset.
        """
        return len(self._seq_length)

    def length_range(self):
        """Return ``(longest, shortest)`` cached length, or ``None`` if empty."""
        if not self._seq_length:
            return None
        return max(self._seq_length), min(self._seq_length)
    
# Attach a "seq_length" column to each split. shuffle=False keeps the
# generated rows in the order in which TmpDataset collected them.
dataset_train_seq_length = GeneratorDataset(
    TmpDataset(dataset_train), ["seq_length"], shuffle=False
)
dataset_train = dataset_train.zip(dataset_train_seq_length)

dataset_test_seq_length = GeneratorDataset(
    TmpDataset(dataset_test), ["seq_length"], shuffle=False
)
dataset_test = dataset_test.zip(dataset_test_seq_length)

# Debug helper (disabled): print the first row of the test split.
# itr = dataset_test.create_dict_iterator()
# for i in itr:
#     print(i)
#     break

def tag_idx(tags):
    """Map chunk tags to class ids according to their first character.

    Tags starting with ``B`` map to 0, with ``I`` to 1 and with ``O`` to 2.

    Args:
        tags: iterable of tag strings.

    Returns:
        A list of ids with exactly one entry per input tag, in input order.

    Raises:
        ValueError: if a tag starts with none of ``B``, ``I`` or ``O``.
            Such tags used to be skipped silently, which made the label list
            shorter than the token list it belongs to.
    """
    prefix_to_idx = {"B": 0, "I": 1, "O": 2}
    tag_idx_list = []
    for tag in tags:
        # tag[:1] is the first character, or an empty value for an empty tag.
        try:
            tag_idx_list.append(prefix_to_idx[tag[:1]])
        except KeyError:
            raise ValueError(
                f"unsupported chunk tag {tag!r}: expected a tag starting with 'B', 'I' or 'O'"
            ) from None
    return tag_idx_list

# vocab = dataset_train.build_vocab(columns=["text"],freq_range=None,top_k=None,
#                                   special_tokens=["<pad>","<unk>"],special_first=True)

# Vocabulary built from the training "text" column; the two special tokens are
# placed first (special_first=True).
vocab = ds.text.Vocab.from_dataset(
    dataset_train,
    columns=["text"],
    freq_range=None,
    top_k=None,
    special_tokens=["<pad>", "<unk>"],
    special_first=True,
)

# print(len(vocab.vocab()))
# vocab.ids_to_tokens()

# Column-level operations shared by both splits.
lookup_op = ds.text.Lookup(vocab, unknown_token="<unk>")
pad_text_op = ds.transforms.PadEnd([50], pad_value=vocab.tokens_to_ids('<pad>'))
pad_label_op = ds.transforms.PadEnd([50], pad_value=2)
type_cast_op = ds.transforms.TypeCast(ms.int64)


def _encode_split(dataset):
    """Apply the tag -> id, padding, lookup and cast steps to one split."""
    # NOTE(review): both PadEnd ops use a fixed length of 50 while the zipped
    # "seq_length" column is left as measured -- confirm no sentence is longer.
    dataset = dataset.map(operations=[tag_idx], input_columns=["label"])
    dataset = dataset.map(operations=[pad_label_op], input_columns=["label"])
    dataset = dataset.map(operations=[lookup_op, pad_text_op], input_columns=["text"])
    # NOTE(review): no input_columns is given for the cast -- confirm which
    # column(s) it is applied to.
    dataset = dataset.map(operations=[type_cast_op])
    return dataset


dataset_train = _encode_split(dataset_train)
dataset_test = _encode_split(dataset_test)

# dataset_test.bucket_batch_by_length(["text","label"],
#                                     bucket_boundaries=[10,20,30,40,50],bucket_batch_sizes=[32,16,8,4,2,1])

# dataset_test = dataset_test.batch(2)
# itr = dataset_test.create_dict_iterator()
# for i in itr:
#     print(i)
#     break




In [1]:
from tqdm import tqdm
from mindspore.dataset import text
from mindspore import nn,ops
from mindnlp.dataset import CoNLL2000Chunking, CoNLL2000Chunking_Process
from mindnlp.modules import CRF,RNNEncoder
from mindnlp.abc import Seq2vecModel
from mindnlp.engine.trainer import Trainer

# Load both splits from the CoNLL2000Chunking loader.
dataset_train, dataset_test = CoNLL2000Chunking()

# Vocabulary over the raw "words" column; special tokens are placed first.
vocab = text.Vocab.from_dataset(
    dataset_train,
    columns=["words"],
    freq_range=None,
    top_k=None,
    special_tokens=["<pad>", "<unk>"],
    special_first=True,
)

# Hand the training split to the library's processing helper.
dataset_train = CoNLL2000Chunking_Process(
    dataset=dataset_train,
    vocab=vocab,
    batch_size=64,
    max_len=80,
)

# Inspection snippets (disabled):
# data_itr = dataset_train.create_dict_iterator()

# print(next(data_itr)["text"])

# dataset_train.get_dataset_size()

class Head(nn.Cell):
    """Dense projection from encoder features to one score per tag.

    Args:
        hidden_dim: number of input channels of the dense layer.
        num_tags: number of output channels of the dense layer.
    """

    def __init__(self, hidden_dim, num_tags):
        super().__init__()
        self.hidden2tag = nn.Dense(
            in_channels=hidden_dim,
            out_channels=num_tags,
            weight_init='he_uniform',
        )

    def construct(self, context):
        """Return the dense layer applied to ``context``."""
        scores = self.hidden2tag(context)
        return scores

class BiLSTM_CRF(Seq2vecModel):
    """Tagger chaining an encoder, a projection head and a CRF layer.

    Args:
        encoder: module called on the token batch; it must return three
            values, of which only the first is used.
        head: module mapping the encoder output to per-tag scores.
        num_tags: number of tags handled by the CRF layer.
    """

    def __init__(self, encoder, head, num_tags):
        super().__init__(encoder, head)
        self.encoder = encoder
        self.head = head
        self.crf = CRF(num_tags, batch_first=True)

    def construct(self, text, seq_length, label=None):
        """Encode ``text``, score the tags and return the CRF layer's output.

        ``label`` and ``seq_length`` are passed on to the CRF together with
        the head's scores.
        """
        # NOTE(review): seq_length is given to the CRF only, not to the
        # encoder -- confirm that encoding the padded positions is intended.
        sequence_output, _, _ = self.encoder(text)
        emissions = self.head(sequence_output)
        return self.crf(emissions, label, seq_length)

# Model sizes.
embedding_dim, hidden_dim = 16, 32

# Encoder: embedding table (index 0 used as padding_idx) feeding a
# bidirectional LSTM with hidden_dim // 2 units per direction.
embedding = nn.Embedding(
    vocab_size=len(vocab.vocab()),
    embedding_size=embedding_dim,
    padding_idx=0,
)
lstm_layer = nn.LSTM(
    embedding_dim,
    hidden_dim // 2,
    bidirectional=True,
    batch_first=True,
)
encoder = RNNEncoder(embedding, lstm_layer)

# Projection head and full tagger, both built for 3 tag classes.
head = Head(hidden_dim=hidden_dim, num_tags=3)
net = BiLSTM_CRF(encoder=encoder, head=head, num_tags=3)

optimizer = nn.SGD(
    net.trainable_params(),
    learning_rate=0.01,
    weight_decay=1e-4,
)

# Differentiate the network output with respect to the optimizer's parameters
# only (no positional inputs).
grad_fn = ops.value_and_grad(net, grad_position=None, weights=optimizer.parameters)

def train_step(seqs, seq_length, label):
    """Run one optimisation step on a batch and return its loss.

    The loss and gradients come from ``grad_fn``; the gradients are handed to
    ``optimizer`` and the returned loss is tied to that update through
    ``ops.depend``.
    """
    loss, grads = grad_fn(seqs, seq_length, label)
    update = optimizer(grads)
    return ops.depend(loss, update)

# One pass over the training batches without a progress bar.
# NOTE(review): this pass already updates the weights through train_step;
# it looks like a warm-up / smoke run -- confirm it is intended.
for data, label, seq_length in dataset_train.create_tuple_iterator():
    loss = train_step(data, seq_length, label)

print(">>>>>>开始训练<<<<<<")
# `steps` counts full passes over dataset_train. The progress bar advances
# once per batch, so its total has to be passes * batches per pass; using
# `steps` alone made the bar overrun its total.
steps = 140
batches_per_pass = dataset_train.get_dataset_size()
with tqdm(total=steps * batches_per_pass) as t:
    for step in range(steps):
        for data, label, seq_length in dataset_train.create_tuple_iterator():
            loss = train_step(data, seq_length, label)
            t.set_postfix(loss=loss)
            t.update(1)