In [1]:
import argparse
import numpy as np
import mindspore.nn as nn
import mindspore.numpy as mnp
from mindspore import Tensor
import mindspore.dataset as ds
from mindspore.dataset import text
from mindspore.dataset.text import JiebaMode
from mindspore.dataset.text import NormalizeForm
from mindspore import load_checkpoint, load_param_into_net, save_checkpoint
from mindspore import set_context, PYNATIVE_MODE
import mindspore.ops as ops
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import Uniform, HeUniform
import mindspore
import csv
import re
from tqdm import tqdm
import json
import math

In [2]:
# Install the jieba tokenizer used for word segmentation below.
# The explicit %pip line magic is used instead of a bare `pip ...` so the cell
# does not depend on IPython automagic being enabled.
%pip install jieba

Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple
Note: you may need to restart the kernel to use updated packages.


In [3]:
import jieba

In [4]:
# Select the PyNative execution mode and the Ascend device target for every
# MindSpore operation executed after this cell.
set_context(device_target="Ascend", mode=PYNATIVE_MODE)

构建数据集加载器

In [5]:
class DataLoader():
    """Random-access collection of jieba-tokenised sentences read from a text file.

    The object implements ``__getitem__`` and ``__len__`` so it can be used as
    the source of ``ds.GeneratorDataset`` (see ``load_data``).

    Two file layouts are supported, selected by ``type_dataset``:

    * ``"train"``: one ``label,sentence`` pair per line.
    * ``"test"``: one raw sentence per line; every sample receives the
      placeholder label ``0``.
    """

    def __init__(self, path, type_dataset):
        """Read and tokenise the whole file at ``path``.

        Args:
            path: Path of the UTF-8 encoded text file.
            type_dataset: Either ``"train"`` or ``"test"``.

        Raises:
            ValueError: If ``type_dataset`` is neither ``"train"`` nor ``"test"``.
        """
        self.path = path
        self.datas, self.labels = [], []
        self.type_dataset = type_dataset
        self._load()

    @staticmethod
    def _tokenize(sentence):
        """Segment ``sentence`` with jieba (full mode, HMM enabled) into a list of words."""
        return list(jieba.cut(sentence=sentence, cut_all=True, HMM=True))

    def _load(self):
        """Fill ``self.datas`` (token lists) and ``self.labels`` (ints) from the file."""
        if self.type_dataset not in ("train", "test"):
            # Previously an unknown type silently produced an empty dataset.
            raise ValueError(
                "type_dataset must be 'train' or 'test', got {!r}".format(self.type_dataset))

        with open(self.path, "r", encoding='utf-8') as file:
            for line in file:
                if self.type_dataset == "train":
                    if not line.strip():
                        # A blank line carries no "label,sentence" pair.
                        continue
                    fields = line.split(",")
                    # NOTE(review): only the text between the first and the second
                    # comma is kept, so a sentence that itself contains an ASCII
                    # comma is truncated. build_vocab parses lines the same way;
                    # changing this would change the vocabulary, and presumably
                    # the ids an already-trained checkpoint relies on - confirm
                    # before altering.
                    sentence = fields[1].replace("\n", "")
                    self.datas.append(self._tokenize(sentence))
                    self.labels.append(int(fields[0]))
                else:
                    # Drop the trailing newline so test sentences are tokenised
                    # like the training ones (which have it removed). A blank
                    # line is passed through unchanged so the sample keeps its
                    # position and output rows stay aligned with the input file.
                    sentence = line.rstrip("\n") or line
                    self.datas.append(self._tokenize(sentence))
                    self.labels.append(0)

    def __getitem__(self, idx):
        """Return the ``(tokens, label)`` pair stored at position ``idx``."""
        return self.datas[idx], self.labels[idx]

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.datas)

加载数据集，转化为GeneratorDataset类型

In [6]:
def load_data(train_data_path, test_data_path):
    """Wrap the train and test files in MindSpore ``GeneratorDataset`` objects.

    Args:
        train_data_path: Path of the training file (read with type "train").
        test_data_path: Path of the test file (read with type "test").

    Returns:
        A ``(data_train, data_test)`` tuple. Both datasets expose the columns
        "review" and "label"; only the training dataset is shuffled.
    """
    column_names = ("review", "label")

    train_source = DataLoader(path=train_data_path, type_dataset="train")
    data_train = ds.GeneratorDataset(train_source,
                                     column_names=list(column_names),
                                     shuffle=True)

    test_source = DataLoader(path=test_data_path, type_dataset="test")
    data_test = ds.GeneratorDataset(test_source,
                                    column_names=list(column_names),
                                    shuffle=False)

    return data_train, data_test

构建词表

In [7]:
def build_vocab(data_path="/home/ma-user/work/mindcon_text_classification/train/data.txt"):
    """Build the word vocabulary from a ``label,sentence`` training file.

    Every sentence is segmented with jieba (full mode, HMM enabled). Distinct
    words receive consecutive ids in order of first appearance, starting at 1,
    followed by the two special tokens "<unk>" and "<pad>".

    Args:
        data_path: Path of the UTF-8 training file. Defaults to the location
            that was previously hard-coded in this function.

    Returns:
        A ``(vocab, size)`` tuple: the ``text.Vocab`` built from the
        word-to-id mapping and the number of entries, special tokens included.
    """
    tokens = []
    seen = set()  # O(1) membership test instead of scanning the token list
    with open(data_path, "r", encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # A blank line carries no "label,sentence" pair.
                continue
            # NOTE(review): only the text between the first and the second comma
            # is used, so a sentence containing an ASCII comma is truncated.
            # Kept as is because changing it would change the vocabulary, and
            # presumably the ids an already-trained checkpoint relies on.
            sentence = line.split(",")[1].replace("\n", "")
            for word in jieba.cut(sentence=sentence, cut_all=True, HMM=True):
                if word not in seen:
                    seen.add(word)
                    tokens.append(str(word))

    # Append the two special tokens after all regular words.
    tokens.append("<unk>")
    tokens.append("<pad>")

    # Ids are 1-based, so the largest id (that of "<pad>") equals len(tokens).
    # NOTE(review): a table sized with the returned length only has rows
    # 0..len(tokens)-1 - confirm that the "<pad>" id is a valid index for the
    # model that consumes this vocabulary.
    vocab_dict = {token: token_id for token_id, token in enumerate(tokens, start=1)}
    vocab = text.Vocab.from_dict(word_dict=vocab_dict)
    return vocab, len(tokens)

In [8]:
class RNN(nn.Cell):
    """Embedding -> LSTM -> dense -> sigmoid classifier over token-id sequences.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of output units of the final dense layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions. The dense
            layer input is sized accordingly.
        dropout: Dropout value forwarded to the LSTM; ``1 - dropout`` is
            passed to ``nn.Dropout``.
        pad_idx: Accepted for interface compatibility; not used by this
            implementation.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        # Remember the direction count so that the head matches the LSTM
        # instead of always assuming a bidirectional network.
        self.is_bidirectional = bool(bidirectional)
        fc_in_channels = hidden_dim * (2 if self.is_bidirectional else 1)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)
        weight_init = HeUniform(math.sqrt(5))
        bias_init = Uniform(1 / math.sqrt(fc_in_channels))
        self.fc = nn.Dense(fc_in_channels, output_dim, weight_init=weight_init, bias_init=bias_init)
        # NOTE(review): 1 - dropout is what an API whose first positional
        # argument is the keep probability expects - confirm against the
        # installed MindSpore version.
        self.dropout = nn.Dropout(1-dropout)
        self.sigmoid = ops.Sigmoid()
        # Removes axis 1 of the dense output.
        # NOTE(review): this presumably requires output_dim == 1 - confirm.
        self.squeeze = ops.Squeeze(1)

    def construct(self, inputs):
        """Compute sigmoid scores for a batch of token-id sequences.

        Args:
            inputs: Tensor of token ids; assumed to be (batch, seq_len) since
                the LSTM is created with ``batch_first=True`` - TODO confirm.

        Returns:
            The sigmoid of the dense layer output with axis 1 squeezed away.
        """
        embedded = self.dropout(self.embedding(inputs))
        _, (hidden, _) = self.rnn(embedded)
        if self.is_bidirectional:
            # The last two entries of the final hidden state are taken as the
            # two directions of the top layer and joined on the feature axis.
            last_hidden = mnp.concatenate((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        else:
            # Unidirectional: the last entry alone is the top layer's state.
            last_hidden = hidden[-1, :, :]
        last_hidden = self.dropout(last_hidden)
        output = self.sigmoid(self.fc(last_hidden))
        output = self.squeeze(output)
        return output

In [9]:
# Build the vocabulary once; build_vocab returns the Vocab object together
# with the number of tokens it contains (special tokens included).
vocab, vocab_len = build_vocab()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.354 seconds.
Prefix dict has been built successfully.


In [13]:
# Locations of the labelled training file and the unlabelled test file.
train_data_path = "/home/ma-user/work/mindcon_text_classification/train/data.txt"
test_data_path = "/home/ma-user/work/mindcon_text_classification/test/test.txt"

# Tokenise both files and wrap them as MindSpore datasets.
data_train, data_test = load_data(
    train_data_path=train_data_path,
    test_data_path=test_data_path,
)

In [10]:
# Vocabulary id of the padding token; handed to the network constructor.
pad_idx = vocab.tokens_to_ids('<pad>')

# Network architecture settings.
hidden_size = 256
num_layers = 2
bidirectional = True
output_size = 1
dropout = 0.5

# Optimisation settings (not referenced by the cells visible in this notebook).
lr = 1e-4
epochs = 30

# Checkpoint that holds the trained parameters.
ckpt_file_name = "/home/ma-user/work/model.ckpt"

# Build the network; 100 is the embedding dimension.
net = RNN(
    vocab_size=vocab_len,
    embedding_dim=100,
    hidden_dim=hidden_size,
    output_dim=output_size,
    n_layers=num_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=pad_idx,
)

In [11]:
def predict(data_test, vocab, ckpt_file_name, network=None, max_len=100):
    """Predict a label for every sample of the test dataset.

    The "review" column is converted from tokens to vocabulary ids, padded or
    truncated to ``max_len`` ids, and fed to the network one sample at a time.
    The checkpoint is loaded into the network on every call.

    Args:
        data_test: Dataset whose "review" column holds token sequences.
        vocab: Vocabulary used for the token-to-id lookup; must contain the
            tokens "<unk>" and "<pad>".
        ckpt_file_name: Path of the checkpoint file to load.
        network: Network to run. Defaults to the module-level ``net`` so that
            existing three-argument calls keep working.
        max_len: Length every id sequence is padded/truncated to.

    Returns:
        A list with one int per sample: the network output rounded with
        ``np.round``.
    """
    if network is None:
        # Backward-compatible fallback to the notebook-level model.
        network = net

    # Unknown words map to "<unk>"; short sequences are filled with "<pad>".
    lookup_op = ds.text.Lookup(vocab, unknown_token='<unk>')
    pad_op = ds.transforms.c_transforms.PadEnd([max_len], pad_value=vocab.tokens_to_ids('<pad>'))

    data_test = data_test.map(operations=[lookup_op],
                              input_columns=['review'])
    data_test = data_test.map(operations=[pad_op],
                              input_columns=['review'])
    data_test = data_test.batch(1, drop_remainder=True)

    # Load the trained parameters and switch to inference behaviour.
    param_dict = load_checkpoint(ckpt_file_name)
    load_param_into_net(network, param_dict)
    network.set_train(False)

    predictions = []
    # Passing the total lets tqdm show a real progress bar instead of a counter.
    for batch in tqdm(data_test.create_tuple_iterator(), total=data_test.get_dataset_size()):
        # batch[0] is the "review" column; the batch holds a single sample.
        scores = network(batch[0])
        rounded = np.round(scores.asnumpy())
        predictions.append(int(rounded[0]))
    return predictions

In [14]:
# Run inference over the whole test set; predict returns a list holding one
# int label per test sample.
prediction = predict(data_test, vocab, ckpt_file_name)

1000it [00:57, 17.36it/s]


In [None]:
# Write one predicted label per line, in the order the predictions were produced.
# The with-statement closes the file on exit, so no explicit close() is needed.
with open("/home/ma-user/work/result.txt", 'w', encoding='utf-8') as file:
    for i in prediction:
        file.write(str(i) + '\n')