25 changes: 11 additions & 14 deletions PaddleNLP/emotion_detection/inference_model.py
@@ -45,9 +45,7 @@ def do_save_inference_model(args):
     with fluid.program_guard(test_prog, startup_prog):
         with fluid.unique_name.guard():
             infer_loader, probs, feed_target_names = create_model(
-                args,
-                num_labels=args.num_labels,
-                is_prediction=True)
+                args, num_labels=args.num_labels, is_prediction=True)

     test_prog = test_prog.clone(for_test=True)
     exe = fluid.Executor(place)
@@ -82,10 +80,10 @@ def test_inference_model(args, texts):

     assert (args.inference_model_dir)
     infer_program, feed_names, fetch_targets = fluid.io.load_inference_model(
-            dirname=args.inference_model_dir,
-            executor=exe,
-            model_filename="model.pdmodel",
-            params_filename="params.pdparams")
+        dirname=args.inference_model_dir,
+        executor=exe,
+        model_filename="model.pdmodel",
+        params_filename="params.pdparams")
     data = []
     seq_lens = []
     for query in texts:
@@ -97,13 +95,13 @@ def test_inference_model(args, texts):
     seq_lens = np.array(seq_lens)

     pred = exe.run(infer_program,
-                   feed={
-                       feed_names[0]:data,
-                       feed_names[1]:seq_lens},
-                   fetch_list=fetch_targets,
-                   return_numpy=True)
+                   feed={feed_names[0]: data,
+                         feed_names[1]: seq_lens},
+                   fetch_list=fetch_targets,
+                   return_numpy=True)
     for probs in pred[0]:
-        print("%d\t%f\t%f\t%f" % (np.argmax(probs), probs[0], probs[1], probs[2]))
+        print("%d\t%f\t%f\t%f" %
+              (np.argmax(probs), probs[0], probs[1], probs[2]))


 if __name__ == "__main__":
@@ -116,4 +114,3 @@ def test_inference_model(args, texts):
     else:
         texts = [u"我 讨厌 你 , 哼哼 哼 。 。", u"我 喜欢 你 , 爱 你 哟"]
     test_inference_model(args, texts)
-
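Taken together, these hunks only reformat an unchanged inference flow: load a saved inference program, feed it word-id tensors, and print per-class probabilities. A rough, self-contained sketch of that flow follows; the model directory, input shapes, and dense-tensor feed format are assumptions, not verified against the repository:

```python
# Minimal sketch of the load-and-run pattern shown above (PaddlePaddle 1.x
# fluid API). Paths, word ids, and tensor shapes are illustrative guesses.
import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)

# Restore the program plus the feed/fetch metadata written by
# fluid.io.save_inference_model at export time.
infer_program, feed_names, fetch_targets = fluid.io.load_inference_model(
    dirname="./inference_model",  # assumed export directory
    executor=exe,
    model_filename="model.pdmodel",
    params_filename="params.pdparams")

# Two toy queries already mapped to word ids and padded to the same length.
data = np.array([[12, 7, 0], [3, 5, 9]], dtype="int64")
seq_lens = np.array([2, 3], dtype="int64")

pred = exe.run(infer_program,
               feed={feed_names[0]: data,
                     feed_names[1]: seq_lens},
               fetch_list=fetch_targets,
               return_numpy=True)

# pred[0] is a [batch, num_labels] probability matrix; print the argmax
# class followed by the per-class probabilities, as the script does.
for probs in pred[0]:
    print("%d\t%f\t%f\t%f" % (np.argmax(probs), probs[0], probs[1], probs[2]))
```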
50 changes: 24 additions & 26 deletions PaddleNLP/lexical_analysis/README.md
@@ -16,7 +16,7 @@ Lexical Analysis of Chinese (LAC) is a joint lexical analysis model

 #### 1. PaddlePaddle Installation
 
-This project requires PaddlePaddle 1.4.0 or later and PaddleHub 1.0.0 or later. To install PaddlePaddle, see the official [quick install guide](http://www.paddlepaddle.org/paddle#quick-start); for PaddleHub installation, see [PaddleHub](https://github.com/PaddlePaddle/PaddleHub).
+This project requires PaddlePaddle 1.6.0 or later and PaddleHub 1.0.0 or later. To install PaddlePaddle, see the official [quick install guide](http://www.paddlepaddle.org/paddle#quick-start); for PaddleHub installation, see [PaddleHub](https://github.com/PaddlePaddle/PaddleHub).
 
 > Warning: the GPU and CPU builds of PaddlePaddle are released as paddlepaddle-gpu and paddlepaddle respectively; take care to install the right one.

@@ -26,52 +26,48 @@ Lexical Analysis of Chinese (LAC) is a joint lexical analysis model
 git clone https://github.com/PaddlePaddle/models.git
 cd models/PaddleNLP/lexical_analysis
 ```
 
 #### 3. Environment Dependencies
 PaddlePaddle requires Python 2.7.15+ (for Python 2) or Python 3.5.1+/3.6/3.7 (for Python 3); the LAC code itself runs on both Python 2 and 3 with no further version restrictions
 
 ### Data Preparation
 
 #### 1. Quick Download
 
 The **datasets** and **pretrained models** this project uses can all be downloaded at once by running the script below; to fetch only part of the data, follow the per-item instructions in the sections that follow
 
 ```bash
-sh download.sh
+python downloads.py all
 ```
+Or, in an environment that can run shell scripts, execute:
+```bash
+sh downloads.sh
+```

 #### 2. Training Dataset
 
 Download the dataset file; extracting it produces the `./data/` directory
 ```bash
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis-dataset-2.0.0.tar.gz
-tar xvf lexical_analysis-dataset-2.0.0.tar.gz
+python downloads.py dataset
 ```

 #### 3. Pretrained Models
 
-We have open-sourced lexical analysis models trained on our in-house dataset for users to apply directly; two download methods are provided:
 
-Method 1: via the PaddleHub command-line tool; for PaddleHub installation, see [PaddleHub](https://github.com/PaddlePaddle/PaddleHub)
+We have open-sourced lexical analysis models trained on our in-house dataset for users to apply directly; they can be downloaded via the links below:
 ```bash
 # download baseline model
-hub download lexical_analysis
-tar xvf lexical_analysis-2.0.0.tar.gz
+python downloads.py lac
 
 # download ERNIE finetuned model
-hub download lexical_analysis_finetuned
-tar xvf lexical_analysis_finetuned-1.0.0.tar.gz
-```
+python downloads.py finetuned
+
-Method 2: direct download
-```bash
-# download baseline model
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis-2.0.0.tar.gz
-tar xvf lexical_analysis-2.0.0.tar.gz
-
-# download ERNIE finetuned model
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis_finetuned-1.0.0.tar.gz
-tar xvf lexical_analysis_finetuned-1.0.0.tar.gz
+# download ERNIE model for training
+python downloads.py ernie
 ```
 
-Note: to run ERNIE Finetune training, you must download the model released by [ERNIE](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) yourself from [https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) and extract it into the `./pretrained/` directory.
 
+Note: to run ERNIE Finetune training, first download the model released by [ERNIE](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz); the command `python downloads.py ernie` completes the download
 ### Model Evaluation
 
 We trained a lexical analysis model on our in-house dataset; it can be used directly to evaluate the test set `./data/test.tsv`,
@@ -85,8 +81,9 @@ sh run_ernie.sh eval

 ### Model Training
 Using the sample dataset, the commands below train a model on the training set `./data/train.tsv`; the examples include run configurations for a single machine with one GPU or multiple GPUs, as well as multi-threaded CPU
-> Warning: to run ERNIE Finetune training, you must download the model released by [ERNIE](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) yourself from [https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) and extract it into the `./pretrained/` directory.
-
+> Warning: to run ERNIE Finetune training, first download the model released by [ERNIE](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz); the command `python downloads.py ernie` completes the download
 ```bash
 # baseline model, using single GPU
 sh run.sh train_single_gpu
@@ -180,7 +177,7 @@ python inference_model.py \
 1. Extract the sentences and labels from the raw data files and build the sentence and label sequences
 2. Convert special characters in the sentence sequences
 3. Look up each word's integer index in the dictionary (see the sketch after this list)

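As a rough illustration of those three steps (not the repository's actual reader code; the column layout, separator characters, and OOV token below are assumptions):

```python
# Hypothetical sketch of the three preprocessing steps listed above.
import io

def load_vocab(path):
    """Load a word -> integer-index table, one "word\tindex" pair per line (assumed format)."""
    vocab = {}
    for line in io.open(path, 'r', encoding='utf8'):
        word, idx = line.strip().split('\t')
        vocab[word] = int(idx)
    return vocab

def line_to_ids(line, word_vocab, oov_token=u'OOV'):
    # Step 1: split a raw line into the sentence and its label sequence.
    sent, labels = line.strip().split(u'\t')
    # Step 2: convert special characters (full-width space -> ASCII space, as one example).
    sent = sent.replace(u'\u3000', u' ')
    # Step 3: map every word to its integer index, falling back to the OOV id.
    words = sent.split(u'\002')   # assumed separator between words
    word_ids = [word_vocab.get(w, word_vocab[oov_token]) for w in words]
    return word_ids, labels.split(u'\002')
```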
 ### Code Structure
 ```text
 .
@@ -189,6 +186,7 @@ python inference_model.py \
 ├── compare.py           # script comparing LAC with other open-source word segmenters
 ├── creator.py           # script that builds the network and the data readers
 ├── data/                # directory holding the datasets
+├── downloads.py         # script for downloading the data and models
 ├── downloads.sh         # script for downloading the data and models
 ├── eval.py              # lexical analysis evaluation script
 ├── inference_model.py   # script that saves an inference_model, used to prepare the deployment environment
7 changes: 4 additions & 3 deletions PaddleNLP/lexical_analysis/compare.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# -*- coding: UTF-8 -*-
 """
 evaluate wordseg for LAC and other open-source wordseg tools
 """
@@ -20,6 +20,7 @@

 import sys
 import os
+import io


 def to_unicode(string):
@@ -70,7 +71,7 @@ def load_testdata(datapath="./data/test_data/test_part"):
"""none"""
sentences = []
sent_seg_list = []
for line in open(datapath):
for line in io.open(datapath, 'r', encoding='utf8'):
sent, label = line.strip().split("\t")
sentences.append(sent)

@@ -109,7 +110,7 @@ def get_lac_result():
     `sh run.sh | tail -n 100 > result.txt`
     """
     sent_seg_list = []
-    for line in open("./result.txt"):
+    for line in io.open("./result.txt", 'r', encoding='utf8'):
         line = line.strip().split(" ")
         words = [pair.split("/")[0] for pair in line]
         labels = [pair.split("/")[1] for pair in line]
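The only functional change in this file is swapping the bare `open(...)` for `io.open(..., encoding='utf8')`: on Python 3, `io.open` is the built-in `open`, and on Python 2 it provides the same unicode-aware behavior, so together with the added `# -*- coding: utf-8 -*-` header and `import io` the script behaves identically on both interpreters. The pattern in isolation (file name taken from the diff above):

```python
import io

# io.open with an explicit encoding yields unicode strings on both
# Python 2 and Python 3, so the downstream .strip()/.split() logic
# needs no version-specific branches.
with io.open("./result.txt", "r", encoding="utf8") as fin:
    for line in fin:
        pairs = line.strip().split(" ")
        words = [pair.split("/")[0] for pair in pairs]
```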