diff --git a/demo/recognize_digits/train_ft.py b/demo/recognize_digits/train_ft.py
new file mode 100644
index 00000000..6f89bbab
--- /dev/null
+++ b/demo/recognize_digits/train_ft.py
@@ -0,0 +1,169 @@
+from PIL import Image
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.common as common
+import os
+import sys
+import glob
+import pickle
+
+
+# NOTE: must change this to your own username on paddlecloud.
+USERNAME = "demo"
+DC = os.getenv("PADDLE_CLOUD_CURRENT_DATACENTER")
+common.DATA_HOME = "/pfs/%s/home/%s" % (DC, USERNAME)
+TRAIN_FILES_PATH = os.path.join(common.DATA_HOME, "mnist")
+TEST_FILES_PATH = os.path.join(common.DATA_HOME, "mnist")
+
+TRAINER_ID = int(os.getenv("PADDLE_INIT_TRAINER_ID", "-1"))
+TRAINER_COUNT = int(os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "-1"))
+
+def prepare_dataset():
+    # convert will also split the dataset by line-count
+    common.convert(TRAIN_FILES_PATH,
+                   paddle.dataset.mnist.train(),
+                   8192, "train")
+    common.convert(TEST_FILES_PATH,
+                   paddle.dataset.mnist.test(),
+                   512, "test")
+
+def cluster_reader_recordio(trainer_id, trainer_count, flag):
+    '''
+    Read from a cloud dataset stored in RecordIO format.
+    Each trainer reads a subset of the files of the whole dataset.
+    '''
+    import recordio
+    def reader():
+        PATTERN_STR = "%s-*" % flag
+        FILES_PATTERN = os.path.join(TRAIN_FILES_PATH, PATTERN_STR)
+        file_list = glob.glob(FILES_PATTERN)
+        file_list.sort()
+        my_file_list = []
+        # pick the files that belong to the current trainer_id (round-robin)
+        for idx, f in enumerate(file_list):
+            if idx % trainer_count == trainer_id:
+                my_file_list.append(f)
+        for f in my_file_list:
+            print "processing ", f
+            # use a distinct name so we do not shadow the enclosing reader()
+            r = recordio.reader(f)
+            record_raw = r.read()
+            while record_raw:
+                yield pickle.loads(record_raw)
+                record_raw = r.read()
+            r.close()
+    return reader
+
+
+def softmax_regression(img):
+    predict = paddle.layer.fc(
+        input=img, size=10, act=paddle.activation.Softmax())
+    return predict
+
+
+def multilayer_perceptron(img):
+    # The first fully-connected layer
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    # The second fully-connected layer and the corresponding activation function
+    hidden2 = paddle.layer.fc(
+        input=hidden1, size=64, act=paddle.activation.Relu())
+    # The third fully-connected layer; note that the hidden size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(
+        input=hidden2, size=10, act=paddle.activation.Softmax())
+    return predict
+
+
+def convolutional_neural_network(img):
+    # first conv layer
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Relu())
+    # second conv layer
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Relu())
+    # fully-connected layer
+    predict = paddle.layer.fc(
+        input=conv_pool_2, size=10, act=paddle.activation.Softmax())
+    return predict
+
+
+def main():
+    etcd_ip = os.getenv("ETCD_IP")
+    etcd_endpoint = "http://" + etcd_ip + ":2379"
+    paddle.init()
+
+    # define network topology
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(784))
+    label = paddle.layer.data(
+        name='label', type=paddle.data_type.integer_value(10))
+
+    # Here we can build the prediction network in different ways. Please
+    # choose one by uncommenting the corresponding line.
+    # predict = softmax_regression(images)
+    # predict = multilayer_perceptron(images)
+    predict = convolutional_neural_network(images)
+
+    cost = paddle.layer.classification_cost(input=predict, label=label)
+
+    parameters = paddle.parameters.create(cost)
+
+    optimizer = paddle.optimizer.Momentum(
+        learning_rate=0.1 / 128.0,
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+    trainer = paddle.trainer.SGD(
+        cost=cost,
+        parameters=parameters,
+        update_equation=optimizer,
+        is_local=False,
+        pserver_spec=etcd_endpoint,
+        use_etcd=True)
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(
+                reader=paddle.batch(
+                    cluster_reader_recordio(TRAINER_ID, TRAINER_COUNT, "test"),
+                    batch_size=2))
+            print "Test with Pass %d, Cost %f, %s\n" % (
+                event.pass_id, result.cost, result.metrics)
+
+    trainer.train(
+        reader=paddle.batch(
+            cluster_reader_recordio(TRAINER_ID, TRAINER_COUNT, "train"),
+            batch_size=128),
+        event_handler=event_handler,
+        num_passes=5)
+
+if __name__ == '__main__':
+    usage = "python train_ft.py [prepare|train]"
+    if len(sys.argv) != 2:
+        print usage
+        exit(1)
+
+    if TRAINER_ID == -1 or TRAINER_COUNT == -1:
+        print "no cloud environment found, must run on cloud"
+        exit(1)
+
+    if sys.argv[1] == "prepare":
+        prepare_dataset()
+    elif sys.argv[1] == "train":
+        main()
diff --git a/demo/understand_sentiment/train_ft.py b/demo/understand_sentiment/train_ft.py
new file mode 100644
index 00000000..337e7df8
--- /dev/null
+++ b/demo/understand_sentiment/train_ft.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import paddle.v2.dataset.common as common
+import os
+import sys
+import glob
+import pickle
+
+# NOTE: must change this to your own username on paddlecloud.
+USERNAME = "demo"
+DC = os.getenv("PADDLE_CLOUD_CURRENT_DATACENTER")
+common.DATA_HOME = "/pfs/%s/home/%s" % (DC, USERNAME)
+TRAIN_FILES_PATH = os.path.join(common.DATA_HOME, "imdb")
+TEST_FILES_PATH = os.path.join(common.DATA_HOME, "imdb")
+
+TRAINER_ID = int(os.getenv("PADDLE_INIT_TRAINER_ID", "-1"))
+TRAINER_COUNT = int(os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "-1"))
+
+def prepare_dataset():
+    word_dict = paddle.dataset.imdb.word_dict()
+    # convert will also split the dataset by line-count
+    common.convert(TRAIN_FILES_PATH,
+                   lambda: paddle.dataset.imdb.train(word_dict),
+                   1000, "train")
+    common.convert(TEST_FILES_PATH,
+                   lambda: paddle.dataset.imdb.test(word_dict),
+                   1000, "test")
+
+def cluster_reader_recordio(trainer_id, trainer_count, flag):
+    '''
+    Read from a cloud dataset stored in RecordIO format.
+    Each trainer reads a subset of the files of the whole dataset.
+    '''
+    import recordio
+    def reader():
+        PATTERN_STR = "%s-*" % flag
+        FILES_PATTERN = os.path.join(TRAIN_FILES_PATH, PATTERN_STR)
+        file_list = glob.glob(FILES_PATTERN)
+        file_list.sort()
+        my_file_list = []
+        # pick the files that belong to the current trainer_id (round-robin)
+        for idx, f in enumerate(file_list):
+            if idx % trainer_count == trainer_id:
+                my_file_list.append(f)
+        for f in my_file_list:
+            print "processing ", f
+            # use a distinct name so we do not shadow the enclosing reader()
+            r = recordio.reader(f)
+            record_raw = r.read()
+            while record_raw:
+                yield pickle.loads(record_raw)
+                record_raw = r.read()
+            r.close()
+    return reader
+
+
+def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+    conv_3 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=3, hidden_size=hid_dim)
+    conv_4 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=4, hidden_size=hid_dim)
+    output = paddle.layer.fc(
+        input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax())
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+    cost = paddle.layer.classification_cost(input=output, label=lbl)
+    return cost
+
+
+def stacked_lstm_net(input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    """
+    A wrapper for the sentiment classification task.
+    This network uses a bi-directional recurrent network built from
+    stacked LSTM layers. The configuration follows the paper at the
+    URL below, but uses fewer layers.
+    http://www.aclweb.org/anthology/P15-1109
+
+    input_dim: dimension of the word dictionary.
+    class_dim: number of categories.
+    emb_dim: dimension of the word embedding.
+    hid_dim: dimension of the hidden layer.
+    stacked_num: number of stacked lstm-hidden layers.
+    """
+    assert stacked_num % 2 == 1
+
+    layer_attr = paddle.attr.Extra(drop_rate=0.5)
+    fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
+    lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
+    para_attr = [fc_para_attr, lstm_para_attr]
+    bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.)
+    relu = paddle.activation.Relu()
+    linear = paddle.activation.Linear()
+
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+
+    fc1 = paddle.layer.fc(
+        input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
+    lstm1 = paddle.layer.lstmemory(
+        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+
+    inputs = [fc1, lstm1]
+    for i in range(2, stacked_num + 1):
+        fc = paddle.layer.fc(
+            input=inputs,
+            size=hid_dim,
+            act=linear,
+            param_attr=para_attr,
+            bias_attr=bias_attr)
+        lstm = paddle.layer.lstmemory(
+            input=fc,
+            reverse=(i % 2) == 0,
+            act=relu,
+            bias_attr=bias_attr,
+            layer_attr=layer_attr)
+        inputs = [fc, lstm]
+
+    fc_last = paddle.layer.pooling(
+        input=inputs[0], pooling_type=paddle.pooling.Max())
+    lstm_last = paddle.layer.pooling(
+        input=inputs[1], pooling_type=paddle.pooling.Max())
+    output = paddle.layer.fc(
+        input=[fc_last, lstm_last],
+        size=class_dim,
+        act=paddle.activation.Softmax(),
+        bias_attr=bias_attr,
+        param_attr=para_attr)
+
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+    cost = paddle.layer.classification_cost(input=output, label=lbl)
+    return cost
+
+
+def main():
+    # init
+    paddle.init()
+    etcd_ip = os.getenv("ETCD_IP")
+    etcd_endpoint = "http://" + etcd_ip + ":2379"
+    # data
+    print 'load dictionary...'
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cluster_reader_recordio(TRAINER_ID, TRAINER_COUNT, "train"),
+            buf_size=1000),
+        batch_size=100)
+    test_reader = paddle.batch(
+        cluster_reader_recordio(TRAINER_ID, TRAINER_COUNT, "test"),
+        batch_size=100)
+
+    feeding = {'word': 0, 'label': 1}
+
+    # network config
+    # Please choose the way to build the network
+    # by uncommenting the corresponding line.
+    cost = convolution_net(dict_dim, class_dim=class_dim)
+    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
+
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # create optimizer
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=2e-3,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
+
+    # end-of-batch and end-of-pass event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=test_reader, feeding=feeding)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    # create trainer
+    trainer = paddle.trainer.SGD(
+        cost=cost,
+        parameters=parameters,
+        update_equation=adam_optimizer,
+        is_local=False,
+        pserver_spec=etcd_endpoint,
+        use_etcd=True)
+
+    trainer.train(
+        reader=train_reader,
+        event_handler=event_handler,
+        feeding=feeding,
+        num_passes=2)
+
+if __name__ == '__main__':
+    usage = "python train_ft.py [prepare|train]"
+    if len(sys.argv) != 2:
+        print usage
+        exit(1)
+
+    if TRAINER_ID == -1 or TRAINER_COUNT == -1:
+        print "no cloud environment found, must run on cloud"
+        exit(1)
+
+    if sys.argv[1] == "prepare":
+        prepare_dataset()
+    elif sys.argv[1] == "train":
+        main()
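Note: both scripts shard the converted RecordIO files across trainers the same way. cluster_reader_recordio sorts the matching files and assigns them round-robin via idx % trainer_count == trainer_id, so each trainer reads a disjoint subset of the dataset. The standalone sketch below only illustrates that assignment; the file names are made up for the example and the actual recordio reading is omitted:

    # Minimal illustration of the round-robin file sharding used by
    # cluster_reader_recordio. File names here are hypothetical.
    file_list = sorted(["train-00000", "train-00001", "train-00002",
                        "train-00003", "train-00004"])
    trainer_count = 2
    for trainer_id in range(trainer_count):
        my_file_list = [f for idx, f in enumerate(file_list)
                        if idx % trainer_count == trainer_id]
        print "trainer %d reads: %s" % (trainer_id, my_file_list)
    # trainer 0 reads: ['train-00000', 'train-00002', 'train-00004']
    # trainer 1 reads: ['train-00001', 'train-00003']

As the usage strings indicate, each script is run once with "prepare" to convert and split the dataset into RecordIO chunks under the cloud data home, and then with "train" to start fault-tolerant training. Both modes expect the PaddleCloud environment variables (PADDLE_INIT_TRAINER_ID, PADDLE_INIT_NUM_GRADIENT_SERVERS, PADDLE_CLOUD_CURRENT_DATACENTER, ETCD_IP) to be set, and exit early when they are absent.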