# 实现用户输入，进行POS

## 导入相应的库

In [1]:
# -*- coding: utf-8 -*-
###
# 将训练好的词性标注模型封装为服务
###
# 加载存储的模型开始预测
from keras.preprocessing import sequence
from keras.models import load_model
import numpy as np
import jieba

## 将文本中的词语转换为索引

In [2]:
# 由于将词语转化为索引的word_index需要与词向量模型对齐，故在导入词向量模型后再将X进行处理
def tokenizer(texts, word_index, max_sequence_length=100):
    """Convert tokenized sentences into a padded matrix of word indices.

    Args:
        texts: Iterable of sentences, each an iterable of word tokens.
        word_index: Mapping from word to integer index (must provide ``get``).
        max_sequence_length: Length every row is padded or truncated to.
            Defaults to 100, the value that used to be hard-coded here.

    Returns:
        The array returned by ``sequence.pad_sequences``: one row per
        sentence, padded and truncated at the end ('post') to
        ``max_sequence_length`` entries.
    """
    # Out-of-vocabulary words map to 0. ``dict.get`` replaces the former bare
    # ``except:``, which also silenced unrelated failures (e.g. a wrong
    # ``word_index`` type or a KeyboardInterrupt) by emitting 0 for them.
    data = [
        [word_index.get(word, 0) for word in sentence]
        for sentence in texts
    ]

    # Keras' padding helper aligns the sentences and returns a numpy array,
    # so no manual conversion is needed afterwards.
    return sequence.pad_sequences(
        data, maxlen=max_sequence_length, padding='post', truncating='post'
    )

## 将输入的文本转换为索引列表

In [3]:
def transfer_input(input_text_segged):
    """Turn one segmented sentence into the index array fed to the model.

    The vocabulary is read from ``./word_index.npy`` on every call and handed
    to ``tokenizer``, which looks each word up in it (i.e. it is used as a
    word -> index mapping) and pads the sentence.

    Args:
        input_text_segged: The sentence as a sequence of word tokens.

    Returns:
        Whatever ``tokenizer`` returns for a batch containing this single
        sentence.
    """
    # NOTE(review): allow_pickle=True unpickles the file contents; only load
    # vocabulary files from a trusted source.
    vocabulary = np.load('./word_index.npy', allow_pickle=True).item()

    # tokenizer works on a batch of sentences, so wrap the single sentence.
    return tokenizer([input_text_segged], vocabulary)

## 将输出转化为标签格式

In [4]:
def transfer_output(output):
    """Map the model's per-position score vectors to label names.

    Only the first element of ``output`` (``output[0]``) is decoded. For each
    score vector in it, the position of the largest score (first one on ties)
    is looked up in the inverted label dictionary loaded from
    ``./labels_index.npy``.

    Args:
        output: Indexable whose first element iterates over vectors that
            provide ``tolist()``.

    Returns:
        A list with one label per score vector in ``output[0]``.
    """
    # The file stores {label: index}; invert it to {index: label}.
    # NOTE(review): allow_pickle=True unpickles the file contents; only load
    # label files from a trusted source.
    label_to_index = np.load('./labels_index.npy', allow_pickle=True).item()
    index_to_label = {}
    for label, index in label_to_index.items():
        index_to_label[index] = label

    # Find the best-scoring index for every position first ...
    best_indices = []
    for score_vector in output[0]:
        scores = score_vector.tolist()
        best_indices.append(scores.index(max(scores)))

    # ... then translate the indices into label names.
    return [index_to_label[best] for best in best_indices]

## 将输入的文本进行分词

In [5]:
def pos_text_seg(text_to_be_seg):
    """Segment raw text into a list of words with jieba.

    Args:
        text_to_be_seg: The raw sentence to segment.

    Returns:
        The list of tokens yielded by ``jieba.cut(..., cut_all=False)``.
        Empty input yields an empty list.
    """
    print(">>> Processing input text...")

    # Segment the function's own argument. The previous code read the global
    # ``text_to_be_tag`` and therefore only worked when a caller happened to
    # define that global with the same text.
    #
    # list() collects the tokens directly. The previous '/'.join(...).split('/')
    # round-trip replaced any '/' token in the text with empty strings and
    # turned empty input into [''].
    segged_text = list(jieba.cut(text_to_be_seg, cut_all=False))
    print('>>> Segged text：  ', segged_text)

    return segged_text

## 对输入的文本进行POS预测

In [6]:
def pos_text_predict(segged_text, model=None):
    """Predict a POS tag for every word of a segmented sentence.

    Args:
        segged_text: The sentence as a sequence of word tokens.
        model: Object with a ``predict`` method. When omitted, the
            module-level ``pos_model`` is used, as before.

    Returns:
        A list of single-entry dicts ``{word: tag}``, one per input word and
        in input order.

    Raises:
        RuntimeError: If the model yields no tags for a non-empty input.
    """
    if model is None:
        # Fall back to the model loaded by the notebook's entry-point cell.
        model = pos_model

    print(">>> Using loaded model to predict...")

    # transfer_input truncates the sentence to the model's fixed window, so a
    # sentence longer than that window used to raise IndexError when words
    # and tags were paired by index. Tag the sentence window by window
    # instead. Each later window is predicted without the words before it.
    tags = []
    remaining = list(segged_text)
    while remaining:
        # Convert the words to vocabulary indices, as in training.
        processed_input = transfer_input(remaining)
        output = model.predict(processed_input)

        # Drop the tags predicted for padding positions.
        window_tags = transfer_output(output)[:len(remaining)]
        if not window_tags:
            raise RuntimeError("Model returned no tags for a non-empty input.")

        tags.extend(window_tags)
        remaining = remaining[len(window_tags):]

    # Pair every original word with its tag.
    return [{word: tag} for word, tag in zip(segged_text, tags)]

## 获取用户输入，加载模型进行预测，输出预测结果

In [7]:
if __name__ == '__main__':
    # Load the trained POS-tagging model once, before the input loop.
    # pos_text_predict reads it through the module-level name `pos_model`.
    print("= "*20 + " Loading model... " + "= "*20)
    pos_model = load_model("./cn_pos_tag_BiGRU.h5")
    print("\nModel loaded successfully!")

    while True:
        # Read one sentence from the user.
        print('\n', '= '*21, ' POS start ', '= '*21)
        try:
            text_to_be_tag = input("\n>>> Please input your sentence(if you want to quit, just type in quit):\n").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt at the prompt ends the session
            # without a traceback.
            break
        if text_to_be_tag == 'quit':
            break
        if not text_to_be_tag:
            # Nothing to tag; blank input used to be tagged as an
            # empty-string token.
            continue

        print('\n', '-*'*15, ' Sentence processing ', '-*'*15)
        segged_text = pos_text_seg(text_to_be_tag)

        print('\n', '-*'*17, ' POS output ', '-*'*17)
        final_output = pos_text_predict(segged_text)

        print(final_output, '\n')

= = = = = = = = = = = = = = = = = = = =  Loading model... = = = = = = = = = = = = = = = = = = = = 

Model loaded successfully!

 = = = = = = = = = = = = = = = = = = = = =   POS start  = = = = = = = = = = = = = = = = = = = = = 

>>> Please input your sentence(if you want to quit, just type in quit):
随着科技、经济、社会之间关系的日益密切，以及支持个人、组织、国家竞争的信息需求之日益迫切，信息分析突破了科技信息分析的范围，形成了众多分支领域，成为现代信息咨询业的重要组成部分。


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1.WIN\AppData\Local\Temp\jieba.cache



 -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*  Sentence processing  -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
>>> Processing input text...


Loading model cost 0.886 seconds.
Prefix dict has been built successfully.


>>> Segged text：   ['随着', '科技', '、', '经济', '、', '社会', '之间', '关系', '的', '日益', '密切', '，', '以及', '支持', '个人', '、', '组织', '、', '国家', '竞争', '的', '信息', '需求', '之', '日益', '迫切', '，', '信息', '分析', '突破', '了', '科技', '信息', '分析', '的', '范围', '，', '形成', '了', '众多', '分支', '领域', '，', '成为', '现代', '信息', '咨询业', '的', '重要', '组成部分', '。']

 -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*  POS output  -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
>>> Using loaded model to predict...
[{'随着': 'P'}, {'科技': 'NN'}, {'、': 'PU'}, {'经济': 'NN'}, {'、': 'PU'}, {'社会': 'NN'}, {'之间': 'LC'}, {'关系': 'NN'}, {'的': 'DEG'}, {'日益': 'NN'}, {'密切': 'VA'}, {'，': 'PU'}, {'以及': 'CC'}, {'支持': 'VV'}, {'个人': 'NN'}, {'、': 'PU'}, {'组织': 'NN'}, {'、': 'PU'}, {'国家': 'NN'}, {'竞争': 'NN'}, {'的': 'DEC'}, {'信息': 'NN'}, {'需求': 'NN'}, {'之': 'DEG'}, {'日益': 'NN'}, {'迫切': 'NN'}, {'，': 'PU'}, {'信息': 'NN'}, {'分析': 'NN'}, {'突破': 'VV'}, {'了': 'AS'}, {'科技': 'NN'}, {'信息': 'NN'}, {'分析': 'NN'}, {'的': 'DEC'}, {'范围': 'NN'}, {'，': 'PU'}, {'形成': 'VV'}, {'了': 'AS'}, {'众多': 'CD'}, {'分支': 'NN'}, 