In [1]:
import argparse
import numpy as np
import mindspore.nn as nn
import mindspore.numpy as mnp
from mindspore import Tensor
import mindspore.dataset as ds
from mindspore.dataset import text
from mindspore.dataset.text import JiebaMode
from mindspore.dataset.text import NormalizeForm
from mindspore import load_checkpoint, load_param_into_net, save_checkpoint
from mindspore import set_context, PYNATIVE_MODE
import mindspore.ops as ops
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import Uniform, HeUniform
import mindspore
import csv
import re
from tqdm import tqdm
import json
import math

In [2]:
pip install jieba

Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple
Note: you may need to restart the kernel to use updated packages.


In [3]:
import jieba

In [4]:
# Run MindSpore in PyNative mode and target an Ascend device.
# NOTE(review): the device target is hard-coded; change it when running on other hardware.
set_context(mode=PYNATIVE_MODE, device_target='Ascend')

在Notebook里面下载自己桶里的数据集

In [5]:
# import moxing as mox
# '''
# 这个是查询是否数据集创建成功的，里面参数就是 obs:// 然后就是桶里面的路径地址
# 成功返回True
# '''
# mox.file.exists('obs://mindcon4wilson/mindcon_text_classification.zip') 

In [6]:
# from modelarts.session import Session
# session = Session()
# session.obs.download_file(src_obs_file="obs://mindcon4wilson/mindcon_text_classification.zip", dst_local_dir="/home/ma-user/work/")
# '''
# 第一个参数就是数据集的具体位置，第二个就是你解压的目录，解压在work才会持久化的保存，不然你下次还得下载解压一遍
# '''

In [7]:
# #解压
# !unzip mindcon_text_classification.zip

In [8]:
class DataLoader():
    """Random-access dataset source over a text file.

    Reads the whole file once on construction, tokenizes every sentence with
    jieba (full mode, HMM enabled) and keeps the token lists and labels in
    memory so the object can be indexed by ``GeneratorDataset``.

    Args:
        path: Path of the UTF-8 text file to read.
        type_dataset: ``"train"`` or ``"test"``.
            * ``"train"``: each line is expected to look like ``label,text``
              (assumption based on how the lines are split — confirm against
              the data file). ``label`` is parsed with ``int``.
            * ``"test"``: each line is one sentence; the label is always 0.

    Raises:
        ValueError: if ``type_dataset`` is not ``"train"``/``"test"``, or a
            non-blank training line contains no comma.
    """
    def __init__(self, path, type_dataset):
        self.path = path
        self.datas, self.labels = [], []
        self.type_dataset = type_dataset
        self._load()

    @staticmethod
    def _tokenize(sentence):
        """Cut one sentence into a list of tokens with jieba."""
        return list(jieba.cut(sentence=sentence, cut_all=True, HMM=True))

    def _load(self):
        """Fill ``self.datas`` / ``self.labels`` from ``self.path``."""
        if self.type_dataset not in ("train", "test"):
            # Previously an unknown type silently produced an empty dataset.
            raise ValueError(
                f"type_dataset must be 'train' or 'test', got {self.type_dataset!r}")

        with open(self.path, "r", encoding='utf-8') as file:
            for line_no, raw_line in enumerate(file, start=1):
                # Drop the line terminator in both modes so train and test
                # sentences are tokenized the same way.
                line = raw_line.rstrip("\n")

                if self.type_dataset == "train":
                    if not line.strip():
                        continue  # tolerate blank lines
                    # Split on the FIRST comma only, so commas inside the
                    # review text do not truncate the sentence.
                    label, sep, sentence = line.partition(",")
                    if not sep:
                        raise ValueError(
                            f"{self.path}:{line_no}: expected 'label,text', got {line!r}")
                    label_id = int(label)
                    self.datas.append(self._tokenize(sentence))
                    self.labels.append(label_id)
                else:
                    # Keep exactly one sample per line so sample i still
                    # corresponds to line i of the test file.
                    self.datas.append(self._tokenize(line))
                    self.labels.append(0)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` of sample ``idx``."""
        return self.datas[idx], self.labels[idx]

    def __len__(self):
        """Number of loaded samples."""
        return len(self.datas)

In [9]:
def load_data(train_data_path, test_data_path):
    """Wrap the train and test files in MindSpore ``GeneratorDataset`` objects.

    Both datasets expose the columns ``"review"`` and ``"label"``. The training
    dataset is shuffled, the test dataset keeps file order.

    Returns:
        Tuple ``(data_train, data_test)``.
    """
    column_names = ("review", "label")

    train_source = DataLoader(path=train_data_path, type_dataset="train")
    data_train = ds.GeneratorDataset(train_source,
                                     column_names=list(column_names),
                                     shuffle=True)

    test_source = DataLoader(path=test_data_path, type_dataset="test")
    data_test = ds.GeneratorDataset(test_source,
                                    column_names=list(column_names),
                                    shuffle=False)

    return data_train, data_test

In [10]:
def build_vocab(data_path="/home/ma-user/work/mindcon_text_classification/train/data.txt"):
    """Build a MindSpore vocabulary from the training file.

    Every line is expected to look like ``label,text`` (assumption based on
    how the lines are split — confirm against the data file). The text part is
    tokenized with jieba (full mode, HMM enabled); each distinct token gets
    the next free id. ``"<unk>"`` and ``"<pad>"`` are appended at the end.

    Ids are contiguous and start at 0, so every id is strictly smaller than
    the returned vocabulary size and is a valid row index for an embedding
    table of that size.

    Args:
        data_path: UTF-8 training file; defaults to the original hard-coded
            location.

    Returns:
        Tuple ``(vocab, vocab_size)`` where ``vocab`` is a ``text.Vocab`` and
        ``vocab_size`` the number of entries including the two special tokens.
    """
    # dict gives O(1) membership tests and keeps insertion order.
    token_to_id = {}

    with open(data_path, "r", encoding='utf-8') as file:
        for line in file:
            # Split on the first comma only so commas inside the text are kept.
            _, sep, sentence = line.rstrip("\n").partition(",")
            if not sep:
                continue  # blank line or line without a label separator
            for word in jieba.cut(sentence=sentence, cut_all=True, HMM=True):
                word = str(word)
                if word not in token_to_id:
                    token_to_id[word] = len(token_to_id)

    # Guarded insert: if the corpus itself contains these strings, reuse the
    # existing id instead of creating a mismatch between ids and size.
    for special in ("<unk>", "<pad>"):
        if special not in token_to_id:
            token_to_id[special] = len(token_to_id)

    vocab = text.Vocab.from_dict(word_dict=token_to_id)
    return vocab, len(token_to_id)

In [11]:
def train_one_epoch(model, train_dataset, epoch=0):
    """Run one pass over ``train_dataset`` and show the running mean loss.

    Args:
        model: Callable training cell; invoked as ``model(column0, column1)``
            for every batch and expected to return the loss.
        train_dataset: Batched dataset providing ``get_dataset_size()`` and
            ``create_tuple_iterator()``.
        epoch: Epoch number shown in the progress bar description.
    """
    model.set_train()
    n_batches = train_dataset.get_dataset_size()
    running_loss = 0
    print('******************************training******************************')

    with tqdm(total=n_batches) as progress:
        progress.set_description('Epoch %i' % epoch)
        batches = train_dataset.create_tuple_iterator()
        for seen, batch in enumerate(batches, start=1):
            features, targets = batch[0], batch[1]
            step_loss = model(features, targets)
            running_loss += step_loss.asnumpy()
            # Mean loss over the batches processed so far.
            progress.set_postfix(loss=running_loss / seen)
            progress.update(1)

In [12]:
def binary_accuracy(preds, y_s):
    """Return the fraction of predictions in a batch that match the labels.

    Predictions are rounded with ``np.around`` and compared to the labels
    element by element.

    Args:
        preds: Predicted scores for the batch.
        y_s: Ground-truth labels for the batch.

    Returns:
        Accuracy of the batch as a Python float.
    """
    hits = 0
    n_pairs = 0
    # Round the scores to the nearest integer, then count matches.
    for guess, truth in zip(np.around(preds), y_s):
        n_pairs += 1
        if guess == truth:
            hits += 1
    return hits / n_pairs

In [13]:
def evaluate(model, test_dataset, criterion, epoch=0, total_epochs=30,
             log_path="/home/ma-user/work/log_file.txt"):
    """Evaluate ``model`` on a dataset and append the result to a log file.

    For every batch the model is called on column 0, the loss is computed
    against column 1 with ``criterion`` and the batch accuracy is computed
    with ``binary_accuracy``. Mean loss and accuracy are appended as one line
    to ``log_path``.

    Args:
        model: Network to evaluate; switched to inference mode with
            ``set_train(False)``.
        test_dataset: Batched dataset providing ``get_dataset_size()`` and
            ``create_tuple_iterator()``.
        criterion: Loss function called as ``criterion(predictions, labels)``.
        epoch: Epoch number used in the progress bar and the log line.
        total_epochs: Total number of epochs written to the log line
            (defaults to the previously hard-coded 30).
        log_path: File the log line is appended to (defaults to the
            previously hard-coded location).

    Returns:
        Summed batch loss divided by the number of batches reported by the
        dataset.

    Raises:
        ZeroDivisionError: if the dataset reports zero batches.
    """
    total = test_dataset.get_dataset_size()
    epoch_loss = 0
    epoch_acc = 0
    step_total = 0
    model.set_train(False)
    print('******************************evaluting******************************')

    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in test_dataset.create_tuple_iterator():
            predictions = model(i[0])
            loss = criterion(predictions, i[1])
            epoch_loss += loss.asnumpy()
            epoch_acc += binary_accuracy(predictions.asnumpy(), i[1].asnumpy())

            step_total += 1
            # Running means over the batches seen so far.
            t.set_postfix(loss=epoch_loss / step_total, acc=epoch_acc / step_total)
            t.update(1)

    # Append to the training log; the context manager closes the file even
    # if formatting or writing raises.
    with open(log_path, "a", encoding='utf-8') as logs_file:
        logs_file.write(f'Epoch {epoch} / {total_epochs}, loss: {epoch_loss / total}, acc: {epoch_acc / total}' + '\n')

    return epoch_loss / total

In [14]:
def data_preprocessing(vocab, data_train, max_len=100, batch_size=100):
    """Convert tokens to ids, pad, cast labels, split and batch a dataset.

    Steps applied to ``data_train``:
      1. ``review`` column: look every token up in ``vocab`` (unknown tokens
         map to ``'<unk>'``), then apply ``PadEnd`` to length ``max_len``
         using the id of ``'<pad>'`` as fill value.
      2. ``label`` column: cast to ``float32``.
      3. Split 80% / 20% into training and validation parts.
      4. Batch both parts with ``batch_size``, dropping incomplete batches.

    Args:
        vocab: ``text.Vocab`` containing ``'<unk>'`` and ``'<pad>'``.
        data_train: Dataset with the columns ``review`` and ``label``.
        max_len: Target length of every review after padding (default 100,
            the previously hard-coded value).
        batch_size: Batch size for both parts (default 100, the previously
            hard-coded value).

    Returns:
        Tuple ``(data_train, data_valid)`` of batched datasets.
    """
    lookup_op = ds.text.Lookup(vocab, unknown_token='<unk>')
    pad_op = ds.transforms.c_transforms.PadEnd([max_len],
                                  pad_value=vocab.tokens_to_ids('<pad>'))
    type_cast_op = ds.transforms.c_transforms.TypeCast(mindspore.float32)

    data_train = data_train.map(operations=[lookup_op, pad_op],
                                input_columns=['review'])
    data_train = data_train.map(operations=[type_cast_op],
                                input_columns=['label'])

    data_train, data_valid = data_train.split([0.8, 0.2])

    data_train = data_train.batch(batch_size, drop_remainder=True)
    data_valid = data_valid.batch(batch_size, drop_remainder=True)

    return data_train, data_valid

In [15]:
class RNN(nn.Cell):
    """LSTM sentence classifier with a sigmoid output.

    Pipeline: embedding -> dropout -> (stacked, optionally bidirectional)
    LSTM -> final hidden state -> dropout -> dense -> sigmoid -> squeeze of
    axis 1.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM.
        output_dim: Output size of the dense layer. Axis 1 of the output is
            squeezed, so this is expected to be 1.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM is bidirectional. The input size of
            the dense layer follows this flag (``2 * hidden_dim`` when True,
            ``hidden_dim`` otherwise).
        dropout: Value passed to ``nn.LSTM`` as ``dropout``; ``1 - dropout``
            is passed positionally to ``nn.Dropout``.
            NOTE(review): this assumes the first positional argument of
            ``nn.Dropout`` is the keep probability in the MindSpore version
            in use — confirm.
        pad_idx: Accepted for interface compatibility; currently not used by
            the network.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)
        # The classifier sees both directions concatenated when the LSTM is
        # bidirectional, otherwise only the single final hidden state.
        fc_in_dim = hidden_dim * 2 if bidirectional else hidden_dim
        weight_init = HeUniform(math.sqrt(5))
        bias_init = Uniform(1 / math.sqrt(fc_in_dim))
        self.fc = nn.Dense(fc_in_dim, output_dim, weight_init=weight_init, bias_init=bias_init)
        self.dropout = nn.Dropout(1-dropout)
        self.sigmoid = ops.Sigmoid()
        self.squeeze = ops.Squeeze(1)

    def construct(self, inputs):
        """Return one sigmoid score per input sequence.

        Args:
            inputs: Batch of token-id sequences (batch first).
        """
        embedded = self.dropout(self.embedding(inputs))
        _, (hidden, _) = self.rnn(embedded)
        if self.bidirectional:
            # presumably the last two entries are the two directions of the
            # top layer — TODO confirm against the nn.LSTM documentation.
            hidden = mnp.concatenate((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        else:
            hidden = hidden[-1, :, :]
        hidden = self.dropout(hidden)
        output = self.sigmoid(self.fc(hidden))
        output = self.squeeze(output)
        return output

In [16]:
# Build the vocabulary from the training file. `vocab_len` is later passed to
# RNN as the size of its embedding table.
vocab, vocab_len = build_vocab()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.354 seconds.
Prefix dict has been built successfully.


In [17]:
# Locations of the extracted train / test files.
# NOTE(review): absolute paths tied to this workspace; adjust when running elsewhere.
train_data_path = "/home/ma-user/work/mindcon_text_classification/train/data.txt"
test_data_path = "/home/ma-user/work/mindcon_text_classification/test/test.txt" 
# Wrap both files as GeneratorDataset objects (train shuffled, test in file order).
data_train, data_test = load_data(train_data_path, test_data_path)

In [18]:
# Map tokens to ids, pad, cast labels, then split into train / validation and batch.
# NOTE: this rebinds `data_train` to the processed dataset, so the cell is not
# idempotent — re-run the cell that calls load_data before running this one again.
data_train, data_valid = data_preprocessing(vocab, data_train)



In [19]:
def training_data(net, loss, data_train, data_valid, ckpt_file_name, lr, epochs):
    """Train ``net`` with Adam and keep the checkpoint with the lowest validation loss.

    Each epoch runs ``train_one_epoch`` on ``data_train`` followed by
    ``evaluate`` on ``data_valid``. Whenever the validation loss is lower than
    every previous one, the network is saved to ``ckpt_file_name``.

    Args:
        net: Network to train.
        loss: Loss cell used for both training and evaluation.
        data_train: Batched training dataset.
        data_valid: Batched validation dataset.
        ckpt_file_name: Destination path of the best checkpoint.
        lr: Learning rate for Adam.
        epochs: Number of epochs to run.
    """
    loss_net = nn.WithLossCell(net, loss)
    adam = nn.Adam(net.trainable_params(), learning_rate=lr)
    step_cell = nn.TrainOneStepCell(loss_net, adam)

    lowest_loss = float('inf')
    for epoch_idx in range(epochs):
        train_one_epoch(step_cell, data_train, epoch_idx)
        current_loss = evaluate(net, data_valid, loss, epoch_idx)

        # Nothing to save unless this epoch strictly improved on the best one.
        if not current_loss < lowest_loss:
            continue
        lowest_loss = current_loss
        save_checkpoint(net, ckpt_file_name)

In [20]:
# ---- Optimisation settings ----
lr = 0.0001   # learning rate handed to Adam in training_data
epochs = 30

# ---- Network shape ----
hidden_size = 256
num_layers = 2
bidirectional = True
output_size = 1
dropout = 0.5
pad_idx = vocab.tokens_to_ids('<pad>')

# Destination of the checkpoint with the lowest validation loss.
ckpt_file_name = "/home/ma-user/work/model.ckpt"

# Binary cross-entropy on the sigmoid output of the network.
loss = nn.BCELoss(reduction='mean')
net = RNN(vocab_size=vocab_len,
          embedding_dim=100,
          hidden_dim=hidden_size,
          output_dim=output_size,
          n_layers=num_layers,
          bidirectional=bidirectional,
          dropout=dropout,
          pad_idx=pad_idx)

In [None]:
# Train for `epochs` epochs; the checkpoint with the lowest validation loss is
# written to `ckpt_file_name`.
training_data(net, loss, data_train, data_valid, ckpt_file_name, lr, epochs)