diff --git a/nested_sequence/README.md b/nested_sequence/README.md
new file mode 100644
index 0000000000..f6a09ed22d
--- /dev/null
+++ b/nested_sequence/README.md
@@ -0,0 +1 @@
+[TBD]
diff --git a/nested_sequence/text_classification/README.md b/nested_sequence/text_classification/README.md
new file mode 100644
index 0000000000..dbc1b4a5d3
--- /dev/null
+++ b/nested_sequence/text_classification/README.md
@@ -0,0 +1,239 @@
+# Text Classification Based on Nested Sequences
+## Introduction
+Sequences are a primary input type in natural language processing: sentences are composed of words, and multiple sentences in turn form paragraphs. A paragraph can therefore be viewed as a nested sequence (also called a double-level sequence), each element of which is itself a sequence.
+
+The nested sequence is a very flexible data layout supported by PaddlePaddle. It helps us describe more complex language data such as paragraphs and multi-turn dialogues. With nested sequences as input, we can design a hierarchical network that encodes the input at both the word level and the sentence level, and thereby handle complex language-understanding tasks more effectively.
+
+This example demonstrates how to organize long text input (often reaching paragraph or full-document length) as a nested sequence in PaddlePaddle, and how to classify such long texts.
+
+## Model Overview
+We treat a piece of text as a sequence of sentences, and each sentence as a sequence of words.
+
+We first encode every sentence in the paragraph with a convolutional neural network; next, the sentence representation vectors are passed through a pooling layer to obtain a paragraph encoding vector; finally, the paragraph encoding vector is fed to the classifier (a fully connected layer with softmax activation) to produce the final prediction.
+
+**The model structure is shown in the figure below:**
+<p align="center">
+<img src="images/model.jpg" align="center"/><br/>
+Figure 1. Text classification model based on nested sequences
+</p>
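+
+Concretely, one sample of this kind is a two-level nested structure: a list of sentences, each of which is a list of word ids, plus an integer class label. A minimal sketch (the ids and sizes are made up for illustration):
+
+```python
+# One paragraph as a nested sequence: 3 sentences with 4 / 2 / 3 words.
+doc_ids = [[4, 7, 1, 0], [12, 3], [9, 9, 2]]
+label = 0  # e.g. "positive"
+
+# In the network (see network_conf.py) these two inputs are declared as:
+#   paddle.layer.data("word",
+#                     paddle.data_type.integer_value_sub_sequence(dict_dim))
+#   paddle.layer.data("label", paddle.data_type.integer_value(class_num))
+```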

+
+The PaddlePaddle implementation of this network structure is in `network_conf.py`.
+
+To process a nested (double-level) sequence, it first has to be decomposed into single-level sequences, each of which is then processed on its own. In PaddlePaddle, `recurrent_group` is the main tool for building hierarchical models over nested sequences. Here we use two nested `recurrent_group`s: the outer one decomposes the paragraph into sentences, so its `step` function receives a sequence of sentences; the inner one decomposes each sentence into words, so its `step` function receives individual (non-sequence) words.
+
+At the word level, a CNN takes the word embeddings as input and learns a representation of each sentence; at the paragraph level, the sentence representations are pooled into a paragraph representation.
+
+``` python
+nest_group = paddle.layer.recurrent_group(input=[paddle.layer.SubsequenceInput(emb),
+                                                 hidden_size],
+                                          step=cnn_cov_group)
+```
+
+Each single-level sequence obtained from the decomposition goes through a CNN that learns its vector representation. The CNN consists of the following parts:
+
+- **Convolution layer**: In text classification the convolution runs over the time dimension: the width of the convolution kernel equals the width of the matrix produced by the embedding layer, and convolving yields a "feature map". Using several kernels of different heights yields several feature maps. By default this example uses kernels of size 3 (red box in Figure 1) and size 4 (blue box in Figure 1).
+- **Max pooling layer**: Max pooling is applied to every feature map separately. Since each feature map is itself a vector, max pooling simply selects the largest element of that vector; the selected maxima are then concatenated into a new vector.
+- **Linear projection layer**: The max-pooled results of the different convolutions are concatenated into one long vector, which is then linearly projected to obtain the representation vector of the single-level sequence.
+
+The CNN is implemented as follows:
+```python
+def cnn_cov_group(group_input, hidden_size):
+    """
+    Convolution group definition.
+    :param group_input: The input of this layer.
+    :type group_input: LayerOutput
+    :param hidden_size: The size of the fully connected layer.
+    :type hidden_size: int
+    """
+    conv3 = paddle.networks.sequence_conv_pool(
+        input=group_input, context_len=3, hidden_size=hidden_size)
+    conv4 = paddle.networks.sequence_conv_pool(
+        input=group_input, context_len=4, hidden_size=hidden_size)
+
+    linear_proj = paddle.layer.fc(input=[conv3, conv4],
+                                  size=hidden_size,
+                                  param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'),
+                                  bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'),
+                                  act=paddle.activation.Linear())
+
+    return linear_proj
+```
+`paddle.networks.sequence_conv_pool` is PaddlePaddle's ready-made text sequence convolution module with built-in pooling, and can be called directly.
+
+Once the representation vector of every sentence is available, all sentence vectors are passed through an average pooling layer to obtain the vector representation of the sample, which then goes through a fully connected layer that outputs the final prediction:
+```python
+avg_pool = paddle.layer.pooling(input=nest_group, pooling_type=paddle.pooling.Avg(),
+                                agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
+prob = paddle.layer.mixed(size=class_num,
+                          input=[paddle.layer.full_matrix_projection(input=avg_pool)],
+                          act=paddle.activation.Softmax())
+```
+## Installing Dependencies
+```bash
+pip install -r requirements.txt
+```
+
+## Specifying the Training Configuration
+
+Training and model parameters are set in the `config.py` script, which explains every configurable parameter in detail. An excerpt:
+```python
+class TrainerConfig(object):
+
+    # whether to use GPU for training
+    use_gpu = False
+    # the number of threads used in one machine
+    trainer_count = 1
+
+    # train batch size
+    batch_size = 32
+
+    ...
+
+
+class ModelConfig(object):
+
+    # embedding vector dimension
+    emb_size = 28
+
+    ...
+```
+Edit `config.py` to adjust the parameters; for example, change `use_gpu` to choose whether to train on a GPU.
+
+## Running with the PaddlePaddle Built-in Dataset
+
+### Training
+Run in a terminal:
+```bash
+python train.py
+```
+This runs the example on `imdb`, the sentiment classification dataset built into PaddlePaddle.
+### Prediction
+After training, the models are stored in the specified directory (`models` by default). Run in a terminal:
+```bash
+python infer.py --model_path 'models/params_pass_00000.tar.gz'
+```
+By default, the prediction script loads the model trained for one pass and evaluates it on the `imdb` test set.
+
+## Training and Predicting with Custom Data
+
+### Training
+1. Data organization
+
+The input data format is as follows: each line is one sample, with fields separated by `\t`; the first column is the class label and the second column is the input text. Two sample lines:
+
+    ```
+    positive	This movie is very good. The actor is so handsome.
+    negative	What a terrible movie. I waste so much time.
+    ```
+
+2. Writing the data-reading interface
+
+A custom data-reading interface only requires a Python generator that implements the logic of **parsing one training sample from the raw input text**. The following code reads the raw data and returns samples of the types `paddle.data_type.integer_value_sub_sequence` and `paddle.data_type.integer_value`:
+```python
+def train_reader(data_dir, word_dict, label_dict):
+    """
+    Reader interface for training data.
+
+    :param data_dir: data directory
+    :type data_dir: str
+    :param word_dict: the word dictionary, which must
+        contain a "<unk>" entry.
+    :type word_dict: Python dict
+    :param label_dict: the label dictionary.
+    :type label_dict: Python dict
+    """
+
+    def reader():
+        UNK_ID = word_dict['<unk>']
+        word_col = 1
+        lbl_col = 0
+
+        for file_name in os.listdir(data_dir):
+            file_path = os.path.join(data_dir, file_name)
+            if not os.path.isfile(file_path):
+                continue
+            with open(file_path, "r") as f:
+                for line in f:
+                    line_split = line.strip().split("\t")
+                    doc = line_split[word_col]
+                    doc_ids = []
+                    for sent in doc.strip().split("."):
+                        sent_ids = [
+                            word_dict.get(w, UNK_ID)
+                            for w in sent.split()]
+                        if sent_ids:
+                            doc_ids.append(sent_ids)
+
+                    yield doc_ids, label_dict[line_split[lbl_col]]
+
+    return reader
+```
+Note that this example uses the English period `'.'` as the separator that splits a piece of text into sentences, and that every sentence is represented as an array of word-dictionary indices (`sent_ids`). Since the representation of one sample (`doc_ids`) contains all sentences of the text, its type is `paddle.data_type.integer_value_sub_sequence`.
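+
+As a minimal, self-contained illustration of this conversion (with a made-up toy dictionary), one sample line is parsed into `doc_ids` as follows; the ids themselves are hypothetical:
+```python
+word_dict = {'<unk>': 0, 'this': 1, 'movie': 2, 'is': 3, 'very': 4, 'good': 5}
+label_dict = {'positive': 0, 'negative': 1}
+
+line = "positive\tthis movie is very good. the actor is so handsome."
+label, doc = line.split("\t")
+
+doc_ids = []
+for sent in doc.strip().split("."):
+    # Unknown words (e.g. "actor") fall back to the <unk> id.
+    sent_ids = [word_dict.get(w, word_dict['<unk>']) for w in sent.split()]
+    if sent_ids:
+        doc_ids.append(sent_ids)
+
+print(doc_ids)            # [[1, 2, 3, 4, 5], [0, 0, 3, 0, 0]]
+print(label_dict[label])  # 0
+```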
+
+
+3. Specifying command-line arguments for training
+
+The `train.py` training script accepts the following arguments:
+```
+Options:
+  --train_data_dir TEXT   The path of training dataset (default: None). If
+                          this parameter is not set, imdb dataset will be
+                          used.
+  --test_data_dir TEXT    The path of testing dataset (default: None). If this
+                          parameter is not set, imdb dataset will be used.
+  --word_dict_path TEXT   The path of word dictionary (default: None). If this
+                          parameter is not set, imdb dataset will be used. If
+                          this parameter is set, but the file does not exist,
+                          the word dictionary will be built from the training
+                          data automatically.
+  --label_dict_path TEXT  The path of label dictionary (default: None). If
+                          this parameter is not set, imdb dataset will be
+                          used. If this parameter is set, but the file does
+                          not exist, the label dictionary will be built from
+                          the training data automatically.
+  --model_save_dir TEXT   The path to save the trained models (default:
+                          'models').
+  --help                  Show this message and exit.
+```
+
+Set these launch arguments of the `train.py` script to run the example directly. Using the sample data under the `data` directory, run in a terminal:
+```bash
+python train.py \
+    --train_data_dir 'data/train_data' \
+    --test_data_dir 'data/test_data' \
+    --word_dict_path 'word_dict.txt' \
+    --label_dict_path 'label_dict.txt'
+```
+This trains the model on the sample data.
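+
+When the dictionaries are built automatically, `word_dict.txt` and `label_dict.txt` are written by `utils.py` as plain text, one `entry\tfrequency` pair per line, sorted by frequency in descending order. Their contents would look roughly like this (the entries and counts below are illustrative, not actual output):
+
+```
+the	15
+movie	9
+...
+<unk>	0
+```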
+
+### Prediction
+
+1. Specifying command-line arguments
+
+The `infer.py` prediction script accepts the following arguments:
+
+```
+Options:
+  --data_path TEXT        The path of data for inference (default: None). If
+                          this parameter is not set, imdb test dataset will be
+                          used.
+  --model_path TEXT       The path of saved model.  [required]
+  --word_dict_path TEXT   The path of word dictionary (default: None). If this
+                          parameter is not set, imdb dataset will be used.
+  --label_dict_path TEXT  The path of label dictionary (default: None). If
+                          this parameter is not set, imdb dataset will be used.
+  --batch_size INTEGER    The number of examples in one batch (default: 32).
+  --help                  Show this message and exit.
+```
+
+2. Using the sample data under the `data` directory, run in a terminal:
+```bash
+python infer.py \
+    --data_path 'data/infer.txt' \
+    --word_dict_path 'word_dict.txt' \
+    --label_dict_path 'label_dict.txt' \
+    --model_path 'models/params_pass_00000.tar.gz'
+```
+
+This produces predictions for the sample data.
diff --git a/nested_sequence/text_classification/config.py b/nested_sequence/text_classification/config.py
new file mode 100644
index 0000000000..1a6e4681b1
--- /dev/null
+++ b/nested_sequence/text_classification/config.py
@@ -0,0 +1,46 @@
+__all__ = ["TrainerConfig", "ModelConfig"]
+
+
+class TrainerConfig(object):
+
+    # Whether to use GPU in training or not.
+    use_gpu = False
+    # The number of computing threads.
+    trainer_count = 1
+
+    # The training batch size.
+    batch_size = 32
+
+    # The number of training passes (epochs).
+    num_passes = 10
+
+    # The global learning rate.
+    learning_rate = 1e-3
+
+    # The decay rate for L2Regularization.
+    l2_learning_rate = 1e-3
+
+    # This parameter is used for the averaged SGD.
+    # Approximately average_window * (number of processed batches)
+    # parameters are used for averaging. To be accurate, between
+    # average_window * (number of processed batches) and
+    # 2 * average_window * (number of processed batches) parameters
+    # are used for averaging.
+    average_window = 0.5
+
+    # The buffer size of the data reader.
+    # buf_size samples will be shuffled in training.
+    buf_size = 1000
+
+    # This parameter controls the logging period:
+    # the training log is printed every log_period batches.
+    log_period = 100
+
+
+class ModelConfig(object):
+
+    # The dimension of the embedding vector.
+    emb_size = 28
+
+    # The hidden size of sentence vectors.
+    hidden_size = 128
diff --git a/nested_sequence/text_classification/data/infer.txt b/nested_sequence/text_classification/data/infer.txt
new file mode 100644
index 0000000000..8309d5c026
--- /dev/null
+++ b/nested_sequence/text_classification/data/infer.txt
@@ -0,0 +1,4 @@
+I was overtaken by the emotion. Unforgettable rendering of a wartime story which is unknown to most people. The performances were faultless and outstanding.
+The original Vampires (1998) is one of my favorites. I was curious to see how a sequel would work considering they used none of the original characters. I was quite surprised at how this played out.
+Without question, the worst ELVIS film ever made. The movie portrays all Indians as drunk, stupid, and lazy. Watch ELVIS's skin change color throughout the film.
+I thought this movie was hysterical. I have watched it many times and recommend it highly. Mel Brooks, was excellent. The cast was fantastic..I don't understand how this movie gets a 2 out of 5 rating. I loved it.
\ No newline at end of file
diff --git a/nested_sequence/text_classification/data/test_data/test.txt b/nested_sequence/text_classification/data/test_data/test.txt
new file mode 100644
index 0000000000..d162dbbeba
--- /dev/null
+++ b/nested_sequence/text_classification/data/test_data/test.txt
@@ -0,0 +1,4 @@
+positive I liked the film. Some of the action scenes were very interesting, tense and well done. I especially liked the opening scene which had a semi truck in it. Also the film is funny is several parts. I'd give the film an 8 out of 10.
+negative The plot for Descent, if it actually can be called a plot, has two noteworthy events. One near the beginning - one at the end. Together these events make up maybe 5% of the total movie time. Everything (and I mean _everything_) in between is basically the director's desperate effort to fill in the minutes.
+negative This film lacked something I couldn't put my finger on at first: charisma on the part of the leading actress. This inevitably translated to lack of chemistry when she shared the screen with her leading man. Even the romantic scenes came across as being merely the actors at play.
+negative I read the book a long time back and don't specifically remember the plot but do remember that I enjoyed it. Since I'm home sick on the couch it seemed like a good idea and Hey !! It is a Lifetime movie. The movie is populated with grade B actors and actresses. The female cast is right out of Desperate Housewives.
\ No newline at end of file
diff --git a/nested_sequence/text_classification/data/train_data/train.txt b/nested_sequence/text_classification/data/train_data/train.txt
new file mode 100644
index 0000000000..4f392593bf
--- /dev/null
+++ b/nested_sequence/text_classification/data/train_data/train.txt
@@ -0,0 +1,4 @@
+negative It was a Sunday night and I was waiting for the advertised movie on TV. They said it was a comedy! The movie started, 10 minutes passed, after that 30 minutes and I didn't laugh not even once. The fact is that the movie ended and I didn't get even on echance to laugh.
+negative I saw this piece of garbage on AMC last night, and wonder how it could be considered in any way an American Movie Classic. It was awful in every way. How badly did Jack Lemmon, James Stewart and the rest of the cast need cash that they would even consider doing this movie?
+positive its not as good as the first movie,but its a good solid movie its has good car chase scenes,on the remake of this movie there a story for are hero to drive fast as his trying to rush to the side of his ailing wife,the ending is great just a good fair movie to watch in my opinion.
+positive Rosalind Russell executes a power-house performance as Rosie Lord, a very wealthy woman with greedy heirs. With an Auntie Mame-type character, this actress can never go wrong. Her very-real terror at being in an insane assylum is a wonderful piece of acting. Everyone should watch this.
\ No newline at end of file
diff --git a/nested_sequence/text_classification/images/model.jpg b/nested_sequence/text_classification/images/model.jpg
new file mode 100644
index 0000000000..4f63d8b553
Binary files /dev/null and b/nested_sequence/text_classification/images/model.jpg differ
diff --git a/nested_sequence/text_classification/index.html b/nested_sequence/text_classification/index.html
new file mode 100644
index 0000000000..005de9249a
--- /dev/null
+++ b/nested_sequence/text_classification/index.html
@@ -0,0 +1,303 @@
diff --git a/nested_sequence/text_classification/infer.py b/nested_sequence/text_classification/infer.py
new file mode 100644
index 0000000000..461eba4935
--- /dev/null
+++ b/nested_sequence/text_classification/infer.py
@@ -0,0 +1,106 @@
+import sys
+import os
+import gzip
+import click
+
+import paddle.v2 as paddle
+
+import reader
+from network_conf import nested_net
+from utils import logger, load_dict, load_reverse_dict
+
+
+@click.command('infer')
+@click.option(
+    "--data_path",
+    default=None,
+    help=("The path of data for inference (default: None). "
+          "If this parameter is not set, "
+          "imdb test dataset will be used."))
+@click.option(
+    "--model_path", type=str, required=True, help="The path of saved model.")
+@click.option(
+    "--word_dict_path",
+    type=str,
+    default=None,
+    help=("The path of word dictionary (default: None). "
+          "If this parameter is not set, imdb dataset will be used."))
+@click.option(
+    "--label_dict_path",
+    type=str,
+    default=None,
+    help=("The path of label dictionary (default: None). "
+          "If this parameter is not set, imdb dataset will be used."))
+@click.option(
+    "--batch_size",
+    type=int,
+    default=32,
+    help="The number of examples in one batch (default: 32).")
+def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path):
+    def _infer_a_batch(inferer, test_batch, ids_2_word, ids_2_label):
+        probs = inferer.infer(input=test_batch, field=["value"])
+        assert len(probs) == len(test_batch)
+        for word_ids, prob in zip(test_batch, probs):
+            sent_ids = []
+            for sent in word_ids[0]:
+                sent_ids.extend(sent)
+            word_text = " ".join([ids_2_word[id] for id in sent_ids])
+            print("%s\t%s\t%s" % (ids_2_label[prob.argmax()],
+                                  " ".join(["{:0.4f}".format(p)
+                                            for p in prob]), word_text))
+
+    assert os.path.exists(model_path), "The trained model does not exist."
+    logger.info("Begin to predict...")
+    use_default_data = (data_path is None)
+
+    if use_default_data:
+        word_dict = reader.imdb_word_dict()
+        word_reverse_dict = dict((value, key)
+                                 for key, value in word_dict.iteritems())
+
+        # The reversed label dict of the imdb dataset.
+        label_reverse_dict = {0: "positive", 1: "negative"}
+        test_reader = reader.imdb_test(word_dict)
+        class_num = 2
+    else:
+        assert os.path.exists(
+            word_dict_path), "The word dictionary file does not exist."
+        assert os.path.exists(
+            label_dict_path), "The label dictionary file does not exist."
+
+        word_dict = load_dict(word_dict_path)
+        word_reverse_dict = dict((value, key)
+                                 for key, value in word_dict.iteritems())
+        label_reverse_dict = load_reverse_dict(label_dict_path)
+        class_num = len(label_reverse_dict)
+        test_reader = reader.infer_reader(data_path, word_dict)()
+
+    dict_dim = len(word_dict)
+
+    # Initialize PaddlePaddle.
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    prob_layer = nested_net(dict_dim, class_num, is_infer=True)
+
+    # Load the trained model.
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open(model_path, "r"))
+    inferer = paddle.inference.Inference(
+        output_layer=prob_layer, parameters=parameters)
+
+    test_batch = []
+    for idx, item in enumerate(test_reader):
+        test_batch.append([item[0]])
+        if len(test_batch) == batch_size:
+            _infer_a_batch(inferer, test_batch, word_reverse_dict,
+                           label_reverse_dict)
+            test_batch = []
+
+    if len(test_batch):
+        _infer_a_batch(inferer, test_batch, word_reverse_dict,
+                       label_reverse_dict)
+        test_batch = []
+
+
+if __name__ == "__main__":
+    infer()
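For reference, `_infer_a_batch` above prints one tab-separated line per sample: the predicted label, the per-class probabilities, and the text reconstructed from the word ids. A line of output would look roughly like this (the probabilities are made up for illustration):

```
negative	0.9123 0.0877	the plot for descent if it actually can be called a plot ...
```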
diff --git a/nested_sequence/text_classification/network_conf.py b/nested_sequence/text_classification/network_conf.py
new file mode 100644
index 0000000000..b4c4066909
--- /dev/null
+++ b/nested_sequence/text_classification/network_conf.py
@@ -0,0 +1,60 @@
+import paddle.v2 as paddle
+from config import ModelConfig as conf
+
+
+def cnn_cov_group(group_input, hidden_size):
+    """
+    Convolution group definition.
+    :param group_input: The input of this layer.
+    :type group_input: LayerOutput
+    :param hidden_size: The size of the fully connected layer.
+    :type hidden_size: int
+    """
+    conv3 = paddle.networks.sequence_conv_pool(
+        input=group_input, context_len=3, hidden_size=hidden_size)
+    conv4 = paddle.networks.sequence_conv_pool(
+        input=group_input, context_len=4, hidden_size=hidden_size)
+
+    linear_proj = paddle.layer.fc(
+        input=[conv3, conv4],
+        size=hidden_size,
+        param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'),
+        bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'),
+        act=paddle.activation.Linear())
+
+    return linear_proj
+
+
+def nested_net(dict_dim, class_num, is_infer=False):
+    """
+    Nested network definition.
+    :param dict_dim: The size of the word dictionary.
+    :type dict_dim: int
+    :param class_num: The number of instance classes.
+    :type class_num: int
+    :param is_infer: The boolean parameter indicating
+        inference (True) or training (False).
+    :type is_infer: bool
+    """
+    data = paddle.layer.data(
+        "word", paddle.data_type.integer_value_sub_sequence(dict_dim))
+
+    emb = paddle.layer.embedding(input=data, size=conf.emb_size)
+    nest_group = paddle.layer.recurrent_group(
+        input=[paddle.layer.SubsequenceInput(emb), conf.hidden_size],
+        step=cnn_cov_group)
+    avg_pool = paddle.layer.pooling(
+        input=nest_group,
+        pooling_type=paddle.pooling.Avg(),
+        agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
+    prob = paddle.layer.mixed(
+        size=class_num,
+        input=[paddle.layer.full_matrix_projection(input=avg_pool)],
+        act=paddle.activation.Softmax())
+    if not is_infer:
+        label = paddle.layer.data("label",
+                                  paddle.data_type.integer_value(class_num))
+        cost = paddle.layer.classification_cost(input=prob, label=label)
+        return cost, prob, label
+
+    return prob
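To make the convolution building block concrete: conceptually, `sequence_conv_pool` slides a window of `context_len` consecutive words over a sentence, applies a shared projection at every window position, and max-pools the results into one fixed-size vector. The numpy sketch below illustrates that idea only; it is not PaddlePaddle's actual implementation, and it glosses over details such as boundary handling and the choice of nonlinearity (`tanh` here is arbitrary):

```python
import numpy as np


def context_conv_max_pool(word_vecs, W, b):
    """word_vecs: (seq_len, emb_size); W: (context_len * emb_size, hidden).
    Returns a fixed-size (hidden,) vector regardless of seq_len."""
    context_len = W.shape[0] // word_vecs.shape[1]
    outputs = []
    for i in range(word_vecs.shape[0] - context_len + 1):
        window = word_vecs[i:i + context_len].reshape(-1)  # concatenate the window
        outputs.append(np.tanh(window.dot(W) + b))  # shared projection per position
    return np.max(outputs, axis=0)  # element-wise max over all positions


emb_size, hidden, context_len = 4, 6, 3
rng = np.random.RandomState(0)
sentence = rng.randn(10, emb_size)  # a 10-word sentence
W = rng.randn(context_len * emb_size, hidden)
b = rng.randn(hidden)
print(context_conv_max_pool(sentence, W, b).shape)  # (6,), independent of sentence length
```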
+""" +import collections +import tarfile +import Queue +import re +import string +import threading +import os + +import paddle.v2.dataset.common + +URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +MD5 = '7c2ac02c03563afcf9b574c7e56c153a' + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + with tarfile.open( + paddle.v2.dataset.common.download(URL, 'imdb', MD5)) as tarf: + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + docs = tarf.extractfile(tf).read().rstrip("\n\r").lower().split( + '.') + doc_list = [] + for doc in docs: + doc = doc.strip() + if doc: + doc_without_punc = doc.translate( + None, string.punctuation).strip() + if doc_without_punc: + doc_list.append( + [word for word in doc_without_punc.split()]) + yield doc_list + tf = tarf.next() + + +def imdb_build_dict(pattern, cutoff): + """ + Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + word_freq = collections.defaultdict(int) + for doc_list in tokenize(pattern): + for doc in doc_list: + for word in doc: + word_freq[word] += 1 + + word_freq[''] = cutoff + 1 + word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) + dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*dictionary)) + word_idx = dict(zip(words, xrange(len(words)))) + return word_idx + + +def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size): + UNK = word_idx[''] + + qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)] + + def load(pattern, queue): + for doc_list in tokenize(pattern): + queue.put(doc_list) + queue.put(None) + + def reader(): + # Creates two threads that loads positive and negative samples + # into qs. + t0 = threading.Thread(target=load, args=(pos_pattern, qs[0], )) + t0.daemon = True + t0.start() + + t1 = threading.Thread(target=load, args=(neg_pattern, qs[1], )) + t1.daemon = True + t1.start() + + # Read alternatively from qs[0] and qs[1]. + i = 0 + doc_list = qs[i].get() + + while doc_list != None: + ids_list = [] + for doc in doc_list: + ids_list.append([word_idx.get(w, UNK) for w in doc]) + yield ids_list, i % 2 + i += 1 + doc_list = qs[i % 2].get() + + # If any queue is empty, reads from the other queue. + i += 1 + doc_list = qs[i % 2].get() + while doc_list != None: + ids_list = [] + for doc in doc_list: + ids_list.append([word_idx.get(w, UNK) for w in doc]) + yield ids_list, i % 2 + doc_list = qs[i % 2].get() + + return reader() + + +def imdb_train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + subsequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/train/pos/.*\.txt$"), + re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000) + + +def imdb_test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + subsequence and label in [0, 1]. 
+
+
+def train_reader(data_dir, word_dict, label_dict):
+    """
+    Reader interface for training data.
+
+    :param data_dir: data directory
+    :type data_dir: str
+    :param word_dict: the word dictionary, which must
+        contain a "<unk>" entry.
+    :type word_dict: Python dict
+    :param label_dict: the label dictionary.
+    :type label_dict: Python dict
+    """
+
+    def reader():
+        UNK_ID = word_dict['<unk>']
+        word_col = 1
+        lbl_col = 0
+
+        for file_name in os.listdir(data_dir):
+            file_path = os.path.join(data_dir, file_name)
+            if not os.path.isfile(file_path):
+                continue
+            with open(file_path, "r") as f:
+                for line in f:
+                    line_split = line.strip().split("\t")
+                    doc = line_split[word_col]
+                    doc_ids = []
+                    for sent in doc.strip().split("."):
+                        sent_ids = [
+                            word_dict.get(w, UNK_ID) for w in sent.split()
+                        ]
+                        if sent_ids:
+                            doc_ids.append(sent_ids)
+
+                    yield doc_ids, label_dict[line_split[lbl_col]]
+
+    return reader
+
+
+def infer_reader(file_path, word_dict):
+    """
+    Reader interface for prediction.
+
+    :param file_path: the path of the file to be predicted
+    :type file_path: str
+    :param word_dict: the word dictionary, which must
+        contain a "<unk>" entry.
+    :type word_dict: Python dict
+    """
+
+    def reader():
+        UNK_ID = word_dict['<unk>']
+
+        with open(file_path, "r") as f:
+            for doc in f:
+                doc_ids = []
+                for sent in doc.strip().split("."):
+                    sent_ids = [word_dict.get(w, UNK_ID) for w in sent.split()]
+                    if sent_ids:
+                        doc_ids.append(sent_ids)
+
+                yield doc_ids, doc
+
+    return reader
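A minimal end-to-end use of the custom-data readers above, together with the dictionary helpers from `utils.py`, might look like the sketch below (it assumes the sample files under `data/` described in the README):

```python
import reader
from utils import build_word_dict, build_label_dict, load_dict

# Build the dictionaries from the sample training data, then load them.
build_word_dict('data/train_data', 'word_dict.txt', use_col=1, cutoff_fre=0)
build_label_dict('data/train_data', 'label_dict.txt', use_col=0)
word_dict = load_dict('word_dict.txt')
label_dict = load_dict('label_dict.txt')

# train_reader returns a generator function; call it to iterate the samples.
for doc_ids, label in reader.train_reader('data/train_data', word_dict, label_dict)():
    print("%d sentences, label %d" % (len(doc_ids), label))
    break
```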
" + "If this parameter is not set, imdb dataset will be used. " + "If this parameter is set, but the file does not exist, " + "label dictionay will be built from " + "the training data automatically.")) +@click.option( + "--model_save_dir", + type=str, + default="models", + help="The path to save the trained models (default: 'models').") +def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path, + model_save_dir): + """ + :params train_data_path: The path of training data, if this parameter + is not specified, imdb dataset will be used to run this example + :type train_data_path: str + :params test_data_path: The path of testing data, if this parameter + is not specified, imdb dataset will be used to run this example + :type test_data_path: str + :params word_dict_path: The path of word dictionary, if this parameter + is not specified, imdb dataset will be used to run this example + :type word_dict_path: str + :params label_dict_path: The path of label dictionary, if this parameter + is not specified, imdb dataset will be used to run this example + :type label_dict_path: str + :params model_save_dir: dir where models saved + :type model_save_dir: str + """ + if train_data_dir is not None: + assert word_dict_path and label_dict_path, ( + "The parameter train_data_dir, word_dict_path, label_dict_path " + "should be set at the same time.") + + if not os.path.exists(model_save_dir): + os.mkdir(model_save_dir) + + use_default_data = (train_data_dir is None) + + if use_default_data: + logger.info(("No training data are porivided, " + "use imdb to train the model.")) + logger.info("Please wait to build the word dictionary ...") + + word_dict = reader.imdb_word_dict() + train_reader = paddle.batch( + paddle.reader.shuffle( + lambda: reader.imdb_train(word_dict), buf_size=1000), + batch_size=100) + test_reader = paddle.batch( + lambda: reader.imdb_test(word_dict), batch_size=100) + class_num = 2 + else: + if word_dict_path is None or not os.path.exists(word_dict_path): + logger.info(("Word dictionary is not given, the dictionary " + "is automatically built from the training data.")) + + # build the word dictionary to map the original string-typed + # words into integer-typed index + build_word_dict( + data_dir=train_data_dir, + save_path=word_dict_path, + use_col=1, + cutoff_fre=0) + + if not os.path.exists(label_dict_path): + logger.info(("Label dictionary is not given, the dictionary " + "is automatically built from the training data.")) + # build the label dictionary to map the original string-typed + # label into integer-typed index + build_label_dict( + data_dir=train_data_dir, save_path=label_dict_path, use_col=0) + + word_dict = load_dict(word_dict_path) + label_dict = load_dict(label_dict_path) + + class_num = len(label_dict) + logger.info("Class number is : %d." % class_num) + + train_reader = paddle.batch( + paddle.reader.shuffle( + reader.train_reader(train_data_dir, word_dict, label_dict), + buf_size=conf.buf_size), + batch_size=conf.batch_size) + + if test_data_dir is not None: + # here, because training and testing data share a same format, + # we still use the reader.train_reader to read the testing data. + test_reader = paddle.batch( + paddle.reader.shuffle( + reader.train_reader(test_data_dir, word_dict, label_dict), + buf_size=conf.buf_size), + batch_size=conf.batch_size) + else: + test_reader = None + + dict_dim = len(word_dict) + + logger.info("Length of word dictionary is : %d." 
+
+    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
+
+    # Create the optimizer.
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=conf.learning_rate,
+        regularization=paddle.optimizer.L2Regularization(
+            rate=conf.l2_learning_rate),
+        model_average=paddle.optimizer.ModelAverage(
+            average_window=conf.average_window))
+
+    # Define the network topology.
+    cost, prob, label = nested_net(dict_dim, class_num, is_infer=False)
+
+    # Create all the trainable parameters.
+    parameters = paddle.parameters.create(cost)
+
+    # Create the trainer instance.
+    trainer = paddle.trainer.SGD(
+        cost=cost,
+        extra_layers=paddle.evaluator.auc(input=prob, label=label),
+        parameters=parameters,
+        update_equation=adam_optimizer)
+
+    # The feeding dictionary.
+    feeding = {"word": 0, "label": 1}
+
+    def _event_handler(event):
+        """
+        Define the end-batch and the end-pass event handlers.
+        """
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % conf.log_period == 0:
+                logger.info("Pass %d, Batch %d, Cost %f, %s\n" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics))
+
+        if isinstance(event, paddle.event.EndPass):
+            if test_reader is not None:
+                result = trainer.test(reader=test_reader, feeding=feeding)
+                logger.info("Test at Pass %d, %s \n" % (event.pass_id,
+                                                        result.metrics))
+            with gzip.open(
+                    os.path.join(model_save_dir, "params_pass_%05d.tar.gz" %
+                                 event.pass_id), "w") as f:
+                parameters.to_tar(f)
+
+    # Begin training the network.
+    trainer.train(
+        reader=train_reader,
+        event_handler=_event_handler,
+        feeding=feeding,
+        num_passes=conf.num_passes)
+
+    logger.info("Training has finished.")
+
+
+if __name__ == "__main__":
+    train()
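As the event handler above shows, a checkpoint named `params_pass_%05d.tar.gz` is written to `model_save_dir` at the end of every pass; these archives are exactly what `infer.py --model_path` expects. For example:

```bash
# Load the checkpoint written after the first pass (pass id 0).
python infer.py --model_path 'models/params_pass_00000.tar.gz'
```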
diff --git a/nested_sequence/text_classification/utils.py b/nested_sequence/text_classification/utils.py
new file mode 100644
index 0000000000..1535e31f46
--- /dev/null
+++ b/nested_sequence/text_classification/utils.py
@@ -0,0 +1,95 @@
+import os
+import logging
+from collections import defaultdict
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
+    """
+    Build the word dictionary from the training data.
+    :param data_dir: The directory of the training dataset.
+    :type data_dir: str
+    :param save_path: The path where the word dictionary will be saved.
+    :type save_path: str
+    :param use_col: The column index of the text after splitting a line.
+    :type use_col: int
+    :param cutoff_fre: A word will not be added to the dictionary if its
+        frequency is less than cutoff_fre.
+    :type cutoff_fre: int
+    """
+    values = defaultdict(int)
+
+    for file_name in os.listdir(data_dir):
+        file_path = os.path.join(data_dir, file_name)
+        if not os.path.isfile(file_path):
+            continue
+        with open(file_path, "r") as fdata:
+            for line in fdata:
+                line_splits = line.strip().split("\t")
+                if len(line_splits) <= use_col:
+                    continue
+                doc = line_splits[use_col]
+                for sent in doc.strip().split("."):
+                    for w in sent.split():
+                        values[w] += 1
+
+    values['<unk>'] = cutoff_fre
+    with open(save_path, "w") as f:
+        for v, count in sorted(
+                values.iteritems(), key=lambda x: x[1], reverse=True):
+            if count < cutoff_fre:
+                break
+            f.write("%s\t%d\n" % (v, count))
+
+
+def build_label_dict(data_dir, save_path, use_col=0):
+    """
+    Build the label dictionary from the training data.
+    :param data_dir: The directory of the training dataset.
+    :type data_dir: str
+    :param save_path: The path where the label dictionary will be saved.
+    :type save_path: str
+    :param use_col: The column index of the label after splitting a line.
+    :type use_col: int
+    """
+    values = defaultdict(int)
+
+    for file_name in os.listdir(data_dir):
+        file_path = os.path.join(data_dir, file_name)
+        if not os.path.isfile(file_path):
+            continue
+        with open(file_path, "r") as fdata:
+            for line in fdata:
+                line_splits = line.strip().split("\t")
+                if len(line_splits) <= use_col:
+                    continue
+                values[line_splits[use_col]] += 1
+
+    with open(save_path, "w") as f:
+        for v, count in sorted(
+                values.iteritems(), key=lambda x: x[1], reverse=True):
+            f.write("%s\t%d\n" % (v, count))
+
+
+def load_dict(dict_path):
+    """
+    Load a word/label dictionary from the given path.
+    :param dict_path: The path of the dictionary.
+    :type dict_path: str
+    """
+    return dict((line.strip().split("\t")[0], idx)
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
+
+
+def load_reverse_dict(dict_path):
+    """
+    Load the reversed word dictionary from the given path.
+    The index of each word is the key of the dictionary, and the
+    corresponding word is the value.
+    :param dict_path: The path of the dictionary.
+    :type dict_path: str
+    """
+    return dict((idx, line.strip().split("\t")[0])
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
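As a quick sanity check of the two loaders above (a sketch, assuming `word_dict.txt` was built as in `train.py`):

```python
from utils import load_dict, load_reverse_dict

word_dict = load_dict('word_dict.txt')             # token -> id
reverse_dict = load_reverse_dict('word_dict.txt')  # id -> token

assert all(reverse_dict[idx] == token for token, idx in word_dict.items())
```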