## 简历中关键动宾关系（动名词短语块）挖掘

In [5]:
from stanfordcorenlp import StanfordCoreNLP
from nltk import , ProbabilisticTree
from nltk.chunk.regexp import *
import nltk.tree as tree
import nltk,re

# 安装和教程详见StanfordCoreNLP官网
nlp = StanfordCoreNLP(r'D:\ProgramData\nlp_package\stanford-corenlp-full-2018-10-05', 
                      lang='zh')

In [4]:
grammer = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammer)  #生成规则

pattern = re.compile(u'[^a-zA-Z\u4E00-\u9FA5]')
pattern_del = re.compile('(\a-zA-Z0-9+)')

In [6]:
def _replace_c(text):
    """批量替换函数"""
    intab = ",?!()"       # 替换目标输入
    outtab = "，？！（）" # 替换后输出
    deltab = " \n<li>< li>+_-.><li \U0010fc01 _"   # 删除目标
    trantab = text.maketrans(intab, outtab, deltab)
    return text.translate(trantab)


def parse_sentence(text):
    """stanfordnlp解析之后，通过nltk tree返回依存树结构"""
    # 文本过滤替换处理
    text = _replace_c(text)
    try:
        if len(text.strip()) > 6:
            return Tree.fromstring(nlp.parse(text.strip()))
    except:
        pass


def pos(text):
    """stanfordnlp词性标注"""
    text = _replace_c(text)
    if len(text.strip()) > 6:
        return nlp.pos_tag(text)
    else:
        return False


def denpency_parse(text):
    return nlp.dependency_parse(text)

In [7]:
def read_data(path):
    return open(path, "r", encoding="utf8")


def get_noun_chunk(tree):
    """合并NP关系节点下的名词子节点"""
    noun_chunk = []
    if tree.label() == "NP":
        nouns_phase = ''.join(tree.leaves())
        noun_chunk.append(nouns_phase)
    return noun_chunk


def get_ip_recursion_noun(tree):
    """遍历所有IP节点下的名词"""
    np_list = []
    if len(tree) == 1:
        tr = tree[0]
        get_ip_recursion_noun(tr)
    if len(tree) == 2:
        tr = tree[0]
        get_ip_recursion_noun(tr)
        tr = tree[1]
        get_ip_recursion_noun(tr)
    if len(tree) == 3:
        tr = tree[0]
        get_ip_recursion_noun(tr)
        tr = tree[1]
        get_ip_recursion_noun(tr)
        tr = tree[2]
        get_ip_recursion_noun(tr)
    if tree.label() == 'NP':
        np_list.append(get_noun_chunk(tree))
    return np_list


def get_vv_loss_np(tree):
    """遍历传入的非NP节点下的所有名词"""
    if not isinstance(tree, nltk.tree.Tree):
        return False
    stack = []
    np = []
    stack.append(tree)
    current_tree = ''
    while stack:
        current_tree = stack.pop()
        # 过滤VP
        if isinstance(current_tree, nltk.tree.Tree) and current_tree.label() == 'VP':
            continue
        # 压入非NP节点
        elif isinstance(current_tree, nltk.tree.Tree) and current_tree.label() != 'NP':
            for i in range(len(current_tree)):
                stack.append(current_tree[i])
        # chunk NP节点
        elif isinstance(current_tree, nltk.tree.Tree) and current_tree.label() == 'NP':
            np.append(get_noun_chunk(tree))
    if np:
        return np
    else:
        return False


def search(tree_in):
    """寻找动宾关系
    tree_in：nltk Tree对象
    """
    if not isinstance(tree_in, nltk.tree.Tree):
        return False
    
    vp_pair = []  # 结果列表
    stack = []
    stack.append(tree_in)
    current_tree = ''
    
    while stack:
        tree = stack.pop()
        # 根节点，压入栈子节点
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "ROOT":
            for i in range(len(tree)):
                stack.append(tree[i])
        # 从句节点，压入栈其子节点
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "IP":
            for i in range(len(tree)):
                stack.append(tree[i])
        # 动宾关系节点，记录vp_pair
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "VP":
            # 防止重复
            duplicate = []
            if len(tree) >= 2:
                for i in range(1, len(tree)):
                    if tree[0].label() == 'VV' and tree[i].label() == "NP":
                        # 合并该节点下动词，不扰乱语义
                        verb = ''.join(tree[0].leaves())
                        # 合并相邻名词
                        noun = get_noun_chunk(tree[i])
                        if verb and noun:
                            vp_pair.append((verb, noun))
                            duplicate.append(noun)
                    elif tree[0].label() == 'VV' and tree[i].label() != "NP":
                        noun = get_vv_loss_np(tree)
                        verb = ''.join(tree[0].leaves())
                        if verb and noun and noun not in duplicate:
                            duplicate.append(noun)
                            vp_pair.append((verb, noun))
    
    if vp_pair:
        return vp_pair
    else:
        return False

In [11]:
if __name__=="__main__":
    out = open("dependency.txt",'w',encoding='utf8')
    itera = read_data('text.txt')
    
    for it in itera:
        s = parse_sentence(it)
        res = search(s)   
        print(res)
        if not isinstance(res, bool):
            out.write(str(res) + '\n')
            
    itera.close()
    out.close()

False
[('利用', ['各种手段推广公司品牌'])]
False
False
[('落实', ['凝聚力工程'])]
[('组织', ['各类公关'])]
False


## 名词短语块挖掘

In [1]:
import os,json,nltk,re
from pyhanlp import *

In [2]:
Tokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')

huan_hang = set(['。', '？', '！', '?'])
keep_pos = "q,qg,qt,qv,s,t,tg,g,gb,gbc,gc,gg,gm,gp,mg,Mg,n,an,ude1,nr,ns,nt,\
            nz,nb,nba,nbc,nbp,nf,ng,nh,nhd,o,nz,nx,ntu,nts,nto,nth,ntch,ntcf,\
            ntcb,ntc,nt,nsf,ns,nrj,nrf,nr2,nr1,nr,nnt,nnd,nn,nmc,nm,nl,nit,\
            nis,nic,ni,nhm,nhd"

keep_pos_nouns = set(keep_pos.split(","))
keep_pos_v = "v,vd,vg,vf,vl,vshi,vyou,vx,vi,vn"
keep_pos_v = set(keep_pos_v.split(","))
keep_pos_p = set(['p', 'pbei', 'pba'])
merge_pos = keep_pos_p | keep_pos_v  # union
keep_flag = set([
    '：', '，', '？', '。', '！', '；', '、', '-', '.', '!', ',', ':', ';', '?', '(',
    ')', '（', '）', '<', '>', '《', '》'
])
drop_pos_set = set([
    'xu', 'xx', 'y', 'yg', 'wh', 'wky', 'wkz', 'wp', 'ws', 'wyy', 'wyz', 'wb',
    'u', 'ud', 'ude1', 'ude2', 'ude3', 'udeng', 'udh'
])


def to_string(sentence, return_generator=False):
    """hanlp的sentence分词"""
    if return_generator:
        return (word_pos_item.toString().split('/')
                for word_pos_item in Tokenizer.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0],
                 word_pos_item.toString().split('/')[1])
                for word_pos_item in Tokenizer.segment(sentence)]


def cut_hanlp(raw_sentence, return_list=True):
    """分词结果返回list还是生成器（return_list=False）"""
    if len(raw_sentence.strip()) > 0:
        return to_string(raw_sentence) if return_list else iter(
            to_string(raw_sentence))


def getNodes(parent, model_tagged_file):
    """合并输入tree对象的目标子节点，输出名词短语块"""
    text = ''
    for node in parent:
        if type(node) is nltk.Tree:
            # 合并子节点的子节点：('文字'，'词性')
            if node.label() == 'NP':
                text += ''.join(
                    node_child[0].strip()
                    for node_child in node.leaves()) + "/NP" + "\n"
            if node.label() == 'VP':
                text += ''.join(
                    node_child[0].strip()
                    for node_child in node.leaves()) + "/VP" + "\n"
        # 叶子节点
        else:
            # 介词保留
            if node[1] in keep_pos_p:
                text += node[0].strip() + "/PP" + "\n"
            # 符号不需要
            if node[0] in huan_hang:
                text += node[0].strip() + "/O" + "\n"
            # 其他不需要
            if node[1] not in merge_pos:
                text += node[0].strip() + "/O" + "\n"
    
    print(text)
    model_tagged_file.write(text + "\n")


def grammer(sentence, model_tagged_file):
    """
    input sentences shape:[('工作', 'vn'), ('描述', 'v'), ('：', 'w'), 
                          ('我', 'rr'), ('曾', 'd'), ('在', 'p')]
    """
    # 解析的词性和格式：<一个单元>   
    #                                 |：  或者   
    #                                 *：0或多次   
    #                                +：1或者多次
    #                               ？：0或者1次  
    #                               { }：一条规则单元
    # 词性按hanlp标准
    grammar1 = r"""NP: 
        {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<a|an|ag>*<s|g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<f>?<ude1>?<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<cc>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<q|qg|qt|qv>*<f|b>*<vi|v|vn|vg|vd>+<ude1>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<vi>?}
        VP:{<v|vd|vg|vf|vl|vshi|vyou|vx|vi|vn>+}
        """
    # 解析器对象
    cp = nltk.RegexpParser(grammar1)
    try:
        # 解析为tree，通过.draw()可视化
        result = cp.parse(sentence)
    except:
        pass
    else:
        getNodes(result, model_tagged_file)


def main():
    """主函数"""
    fout = open('nvp.txt', 'w', encoding='utf8')

    for line in open('text_2.txt', 'r', encoding='utf8'):
        line = line.strip()
        grammer(cut_hanlp(line), fout)

    fout.close()

if __name__ == '__main__':
    main()

所属行业/NP
：/O
/O
快速消费品/NP
(/O
食品/NP
,/O
饮料/NP
,/O
化妆品/NP
)/O

业务部/NP
/O
采购主管/NP

本人/O
是从/VP
黄金珠宝营业员/NP
开始做起/VP
,/O
现/O
从事/VP
黄金珠宝/NP
和/O
化妆品的招商工作/NP
,/O
曾/O
参与/VP
1999/O
年/O
和/O
2004年3家商场的招商工作/NP
,/O
在/PP
公司/NP
我/O
能/VP
比较/O
完美/O
的/O
贯彻/VP
和/O
执行公司的营销策略/NP
和/O
销售计划/NP
,/O
并能提出/VP
合理化建议/NP
,/O
在工作中/O
能/VP
与/O
供应商保持良好/NP
的/O
合作/VP
关系/NP
./O

RESUMEDOCSSTARTFLAG/NP
/O
销售总监/NP
/O
半年市场部总监工作/NP
，/O
之后/O
公司/NP
开始/VP
华东地区大客户销售业务/NP
，/O
转为/VP
华东区/NP

销售总监/NP
，/O
创建/VP
易车网华东地区大客户销售团队/NP
，/O
并/O
带领/VP
团队/NP
完成/VP
公司/NP
下达/VP

的/O
销售任务/NP
。/O
。/O

主要/O
收获/NP
：/O
半年市场总监工作经验/NP
，/O
成功/O
稳固/O
公司/NP
在/PP
上海的品牌形象/NP
和/O
客户关系/NP
，/O
有效/O

利用/VP
各种手段推广公司品牌/NP
。/O
。/O

两年半网络广告销售/NP
从业/O
及/O
管理经验/NP
，/O
成功/O
组建/VP

易车网华东区域大客户销售团队/NP
，/O
以/PP
直/O
客/NP
为/PP
导向/NP
，/O
兼顾/VP
渠道/NP
。/O
。/O

有/VP
丰富/O
的/O
互联网/NP

广告知识/NP
，/O
同时/O
也/O
有/VP
华东区域/NP
的/O
所有/O
汽车客户/NP
及/O
周边产品/NP
的/O
直/O
客关系/NP
，/O
及/O
相/O

关的渠道关系/NP
。/O
。/O

RESUMEDOCSSTARTFLAG/NP
/O
市场部高级经理/NP
/O
负责东方网/NP
旗下/O
的/O
<东方社区>刊物的市场推广/NP
及/O
合作/VP

主要/O
收获/