### 调用百度接口返回分词结果，对分词结果进行处理，处理成每行一个字(单词)一个词性的格式。

In [1]:
from aip import AipNlp

APP_ID = '********'
API_KEY = '*********************'
SECRET_KEY = '*************************'

client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

In [2]:
# 改善QPS限制

import time
import sys
import re
SPLIT_LINE_MARKER = '！'
SPLIT_LINE_MARKER_SIZE = 3

# 拆分为句子
# def sentencesMaker(html):

#     import justext
#     paragraphs = justext.justext(html, [])

#     cache_sentences = ''
#     sentences = []


#     for p in paragraphs:
#         sent = p.text.strip().replace('\xa0', '').replace('\u3000', '')
#         sent = sent.encode('gb2312', 'ignore').decode('gb2312').encode('gbk', 'ignore').decode('gbk')
#         if not sent:
#             continue

#         # 可能是含有名字，需要进一步处理
#         if len(cache_sentences) < 5:
#             cache_sentences += ' ' + sent

#         else:
#             sentences.append(cache_sentences.strip())
#             cache_sentences = sent

#     if not not cache_sentences:
#         sentences.append(cache_sentences.strip())

#     return sentences

# 重新恢复句子
def restoreSentences(text, only_per=False):  # 长度不超过3700字节的句子
    restore_sentences = []
    isSucc = False
    if text is None:
        return restore_sentences, isSucc

    result = client.lexerCustom(text)

    items = result.get('items', [])
    items_size = len(items)

    tries_limit = 3
    tries_counter = 0
    
    while items_size == 0: # 分词结果为空
        if len(text) != 0: # 但句子长度不为空
            # 可能是qps限制
            time.sleep(1)
            result = client.lexerCustom(text)

            items = result.get('items', [])
            items_size = len(items)
            isSucc=True

        tries_counter += 1

        if tries_counter >= tries_limit: # 分词尝试大于等于4次之后仍失败
            print(f'error: 分词api请求失败多次！{text}')
#             print('error: 分词api请求失败多次!')
            return restore_sentences, isSucc

    restore_idx = 0

    last_restore_idx = 0
    has_per = False

    while restore_idx < items_size: # 对每个分词结果进行整理 
        # 分词不是拼接符"!!!"
        while restore_idx < items_size and items[restore_idx]['item'] != SPLIT_LINE_MARKER:
            item = items[restore_idx] # 先把第一个分词的结果(dict)赋值给item，item整理之后再直接赋给原items[]
            # TODO 剔除机构中的不合法字符
            format_pos = item['pos']
            
            # 对非ne标识的分词不做处理
            
            if item['ne'].startswith('ORG'): # 如果该分词的ne是ORG
                invalid_orgs = ['公司']
                item['item'] = item['item'].replace('&', '')
                if item['item'] in invalid_orgs:
                    format_pos = 'n' # 普通名词
                else:
                    format_pos = 'nt'  # 机构团体名

            elif item['ne'] == 'PER': # 如果该分词的ne是PER
                format_pos = 'nr' # 人名

            elif item['ne'] == 'TITLE': # 如果该分词的ne是TITLE（定制）
                format_pos = 'ti' # 职称

            elif item['ne'] == 'LOC': # 如果该分词的ne是LOC
                format_pos = 'ns'  # 地名

            elif item['ne'] == 'TIME': # 如果该分词的ne是TIME
                format_pos = 't' # 时间名词
                
                

            if format_pos == '': #如果pos为空，即是其他非上述的ne标识，将pos置为"xx"
                format_pos = 'xx'

            elif format_pos == 'nr':
                # 过滤先生或者女士之类的名称
                name = re.sub(r'((先生)|(小姐)|(阿姨)|(叔叔)|(女士)|(同志)|总)$', '', item['item'])

                if len(name) >= 2:
                    invalid_names = {
                        '区块链': 'n'
                    }

                    if name not in invalid_names:
                        has_per = True
                        item['item'] = name
                    else:
                        format_pos = invalid_names[name]

                else: # 剔除称谓之后的name长度如果小于2，就不是nr，设为n

                     format_pos = 'n'




            item['pos'] = format_pos # 处理之后的pos赋值给原分词的pos

            # 删除无用信息
            item.pop('basic_words')
            item.pop('formal')
            item.pop('byte_length')
            item.pop('byte_offset')
            item.pop('loc_details')
            item.pop('ne')
            item.pop('uri')

            items[restore_idx] = item # 处理之后的分词结果赋给原分词结果
            restore_idx += 1 # 继续下一个item

            
        # 若遇到了拼接符"!!!"或对所有非拼接符的分词结果处理完毕    
        if restore_idx + SPLIT_LINE_MARKER_SIZE - 1 < items_size: # 如果不是句末的最后一个拼接符
            needCut = True # 将修改之后的分词结果仍以一个sent为单位cut
            for i in range(SPLIT_LINE_MARKER_SIZE - 1): # 再次判断后一位是否是拼接符，如不是，不需要cut
                if items[restore_idx + i + 1]['item'] != SPLIT_LINE_MARKER:
                    needCut = False
                    break

            if needCut:
                ed = max(restore_idx, 0)

                sentence_items = items[last_restore_idx:ed] # 切句子，[0:restore_idx]
                if len(sentence_items) != 0 and (has_per or not only_per):
                    # print('per:', sentence_items)
                    restore_sentences.append(sentence_items)

                next_st = min(ed + SPLIT_LINE_MARKER_SIZE, items_size)
                last_restore_idx = next_st

                restore_idx += SPLIT_LINE_MARKER_SIZE
            else:
                restore_idx += 1

        else: # 句末最后一个拼接符
            ed = max(restore_idx, 0)
            sentence_items = items[last_restore_idx:ed]
            if len(sentence_items) != 0 and (has_per or not only_per):

                restore_sentences.append(sentence_items)

            restore_idx = items_size

        has_per = False

    # print(restore_sentences)
    return restore_sentences, isSucc


# 解析并标注HTML
def posHtml(sentences, only_per=False):
#     sentences = sentencesMaker(html) # 将带有html标签的段落整理成一个list,去除标签的文字段落。


    cut_str = '' 

    pos_sentences = []

    for sent in sentences:
        sent = sent.strip()
        if not sent:  # 若句子为空，跳出循环
            continue
        if not cut_str: # 若cut_str为空，即第一句之前，把第一句话赋值给tmp_str
            tmp_str = sent
        else: # 若cut_str里有句子，将后续句子拼接，以“!!!”作为拼接符，再赋值给tmp_str
            tmp_str = cut_str + SPLIT_LINE_MARKER * SPLIT_LINE_MARKER_SIZE + sent
        if sys.getsizeof(tmp_str) < 3700: # 拼接之后的句子小于3700字节
            cut_str = tmp_str # tmp_str赋值给cut_str，继续拼接sent
        else:
            try:
                if cut_str:
                    time.sleep(0.5) # 拼接之后的tmp_str若大于3700字节，取未拼接该sent之前的cut_str拿来分词
                    sents, issucc = restoreSentences(cut_str, only_per)
                    pos_sentences += sents
    #                 if not issucc:
    #                     print(html)

            except Exception as e:
                print(e)
                print('error: ', sent)

            cut_str = sent # 把刚才没有拼接成功的sent重新赋给cut_str


    if not not cut_str: # 最后一句sent
        time.sleep(0.5)
        try:
            if cut_str:
                time.sleep(0.5) # 拼接之后的tmp_str若大于3700字节，取未拼接该sent之前的cut_str拿来分词
                sents, issucc = restoreSentences(cut_str, only_per)
                pos_sentences += sents
#                 if not issucc:
#                     print(html)
        except Exception as e:
            print(e)
            print('error: ', sent)
    return pos_sentences


In [3]:
'''
    不产生中间结果json文件，直接将分词结果写成word pos形式，保存在.txt文件中。避免从json中读取时出错。
    
    输入： testData文件夹下，38779.txt --- 42471.txt，成段的文本
    
    输出： testData_1.txt，word pos形式，粗糙 
'''

'''
    提示： invalid literal for int() with base 10: ''
    是因为有.ipynb_checkpoints，非空文件夹。
    删除一次即可。
    每次先检查一下是否有.ipynb_checkpoints
'''


import os
# import shutil
# shutil.rmtree("./testData/.ipynb_checkpoints") 

import json
import re
total_number = 0
output = open("./testData_1.txt", "w", encoding="utf-8")
# f = open('justTest.json', 'w', encoding="utf-8")
for file in os.listdir(r'./testData'):
    number = int(file.split('.')[0])
    domain = os.path.abspath(r'./testData')
    file = os.path.join(domain, file) 
    if 38779 <= number and number <= 42471: 
#         print(file)
        data = open(file, "r", encoding="utf-8")
        title = ''
        content = ''
        para = []
        for line in data:
            line = line.strip()
            try:
                if line.startswith("title"):
                    title = line[6:]
                if line.startswith("content"):
                    content = line[8:]
            except Exception as e:
                print(e)
        text = title + "。" +content
        text = re.split("([。])", text)
        for line in text:
            para.append(line)
#         f.write(json.dumps(posHtml(para),ensure_ascii=False,indent=2))
        result = posHtml(para)
        for sent in result:
            for pair in sent:
                word = pair.get("item")
                pos = pair.get("pos")
                if word == "。":
                    output.write(word + " " + pos + "\r\n")
                    output.write("\n")
                else:
                    output.write(word + " " + pos + "\r\n")
#         total_number += 1
                
    else:
        continue
        
output.close()

# print("total_number", total_number)

error: 分词api请求失败多次！link: http://cvpr2018.thecvf.com/CVPR 2018论文列表CVPR 2018 Accepted PapersSingle-Shot Refinement Neural Network for Object DetectionVideo Captioning via Hierarchical Reinforcement LearningDensePose: Multi-Person Dense Human Pose Estimation In The WildDensePose: Multi-Person Dense Human Pose Estimation In The WildFrustum PointNets for 3D Object Detection from RGB-D DataTips and Tricks for Visual Question Answering: Learnings from the 2017 ChallengeRethinking the Faster R-CNN Architecture for Temporal Action LocalizationShape from Shading through Shape EvolutionShape from Shading through Shape EvolutionA High-Quality Denoising Dataset for Smartphone CamerasImproving Color Reproduction Accuracy in the Camera Imaging PipelineEnd-to-End Dense Video Captioning with Masked TransformerEnd-to-End Dense Video Captioning with Masked TransformerpOSE: Pseudo Object Space Error for Initialization-Free Bundle AdjustmentLearning to Segment Every ThingDensity-aware Single Image De-raini

In [4]:
'''
    直接调用百度API之后的分词结果，可能由于英文，导致分出来的词是多个单词；
    在形式上看就不是两列了，需要将除pos之外的列合并。
    
    输入： testData_1.txt, word pos形式，较粗糙，可能不止两列，
    
    输出： testDataNew_1.txt， word pos形式， 全部转成两列格式。
'''

cut_str = ""
data_new = open("./testDataNew_1.txt", "w", encoding="utf-8")
with open("./testData_1.txt", "r", encoding="utf-8") as data:
    for line in data:
        line = line.strip()
        item = line.split(" ")
        l = len(line.split(" "))
        if l == 2:
            word = item[0]
            pos = item[1]
            data_new.write(word + " " + pos + "\r\n")
        if l > 2:
            pos = item[-1]
            for i in range(l-1):
                if  not cut_str:
                    tmp_str = item[i]
                else:
                    tmp_str = cut_str + "$" + item[i]
                if i < l-1:
                    cut_str = tmp_str
            data_new.write(cut_str + " " + pos + "\r\n")
            cut_str = ""
            tmp_str = ""
data_new.close()

In [5]:
'''
    第一步：将所有拼接的word拆开，全部转成两列的word pos格式
    
    输入： testDataNew_1.txt, word（包括拼接）pos形式， 全部转成两列格式。
    
    输出： testDataNewAgain_1.txt， 
'''

data_new = open("./testDataNewAgain_1.txt", "w", encoding="utf-8")
with open("./testDataNew_1.txt", "r", encoding="utf-8") as data:
    for line in data:
        line = line.encode("utf-8").decode("utf_8-sig")
        line = line.strip()
        if len(line.split(" ")) != 2:
            print(line)
            
        if len(line.split(" ")) == 2:
            word = line.split(" ")[0]
            pos = line.split(" ")[1]
            
        if len(word.split("$")) == 1: # word不是拼接的
            data_new.write(word + " " + pos + "\r\n")
            
        if len(word.split("$")) > 1: # word是拼接的
            wordList = word.split("$")
            l = len(word.split("$"))
            for i in range(l):
                data_new.write(wordList[i] + " " + pos + "\r\n")
data_new.close()

In [6]:
'''
    第二步：如果pos不是nr nt ti，转换成o
    
    输入： testDataNewAgain_1.txt， word已经全部是拆开之后的格式
    
    输出： testDataNewAgain_2.txt 
    
'''

data_new = open("./testDataNewAgain_2.txt", "w", encoding="utf-8")
with open("./testDataNewAgain_1.txt", "r", encoding="utf-8") as data:
    for line in data:
        line = line.encode("utf-8").decode("utf_8-sig")
        line = line.strip()
        if len(line.split(" ")) != 2:
            print(line)
            
        if len(line.split(" ")) == 2:
            word = line.split(" ")[0]
            pos = line.split(" ")[1]
            
        if  pos == "nt" or pos =="nr" or pos =="ti":
            
            data_new.write(word + " " + pos + "\r\n")
        else:
            data_new.write(word + " " + "o" + "\r\n")
            
data_new.close()

n
n
nz
nt
nt
nt
nt
nt
nt
nt
nz
nt
nt
nt
nr
w
w
w
w
nt
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
nz
m
xc
xc
nz
xc
xc
w
w
nz
xc
xc
w
w
m
xc
xc
xc
xc
xc
xc
w
w
w
w
nt
nz
nr
ns
ns
m
m
nw
nw
nz
nt
nt
m
nz
nz
nz
nz
nz
nz
w
w
nr
nr
nr
nz
nz
nz
nz
nz
nz
nr
nr
nz
nr
nz
nr
nt
nr
nr
nz
nz
w
w
w
w
w
w
w
w
w
w
xc
xc
w
w
nt
nt
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
w
nz
nz
nz
nz
nz
nz
nz
nz
nz
nz
nz
nz
nr
nr
nz
nz
nz
nz
nz
nz
nz
nr
w
w
w
w
w
w
w
w
w
w
w
w
w
w
nz
nz
nz
nz
nz
nz
nz
nz
nz
nr
nr
nz
nz
nz
nz
nr
nr
nz
nz
nz
nz
nz
nr
w
w
w
w
w
w
w
w
xc
xc
nz
nz
xc
xc
nz
nz
xc
xc
xc
xc
xc
xc
xc
xc
xc
xc
nz
nz
xc
xc
w
w
nz


In [7]:
def allIsChinese(text):
    return all('\u4e00' <= char <= '\u9fff' for char in text)

def isSymbol(text):
    import string
    punc = string.punctuation
    punc = punc.replace(".", "")
    return all(char in punc for char in text)

In [8]:
'''
    第三步：将word转成单独的汉字或者单词，但是汉字/字母/数字/符号混合格式的数据暂未处理，一起保存下来
    
    输入： testDataNewAgain_2.txt， 
    
    输出： testDataNewAgain_3.txt，
    
'''

data_new = open("./testDataNewAgain_3.txt", "w", encoding="utf-8")
with open("./testDataNewAgain_2.txt", "r", encoding="utf-8") as data:
    for line in data:
        line = line.encode("utf-8").decode("utf_8-sig")
        line = line.strip()
        if len(line.split(" ")) != 2:
            print(line)
            
        if len(line.split(" ")) == 2:
            word = line.split(" ")[0]
            pos = line.split(" ")[1]
        
        if len(word) == 1:
            data_new.write(word + " " + pos + "\r\n")  
        
        if len(word) > 1:
            if allIsChinese(word): # 全是汉字
                l = len(word)
                for i in range(l):
                    data_new.write(word[i] + " " + pos + "\r\n")   
                    
            if word.encode( 'UTF-8' ).isalpha():  # 全是字母           
                data_new.write(word + " " + pos + "\r\n") 
            
            if word.encode( 'UTF-8' ).isdigit():  # 全是数字           
                data_new.write(word + " " + pos + "\r\n") 
                
            if not allIsChinese(word) and not word.encode( 'UTF-8' ).isalpha() and not word.encode( 'UTF-8' ).isdigit():
                data_new.write(word + " " + pos + "\r\n")
                
data_new.close()

In [9]:
'''
    第四步：将汉字/字母/数字/符号混合格式的数据处理，
                ① 相邻元素类型不一致就划分（但小数被拆成了两个整数）
    
    输入： testDataNewAgain_3.txt，word转成单独的汉字或者单词，但是汉字/字母/数字/符号混合格式的数据暂未处理
    
    输出： testDataExp.txt，
    
'''

data_new = open("./testDataExp.txt", "w", encoding="utf-8")
with open("./testDataNewAgain_3.txt", "r", encoding="utf-8") as data:
    tmp_eng = ""
    tmp_num = ""
    for line in data:
        line = line.encode("utf-8").decode("utf_8-sig")
        line = line.strip()
        if len(line.split(" ")) != 2:
            print(1)
            print(line)
            
        if len(line.split(" ")) == 2:
            word = line.split(" ")[0]
            pos = line.split(" ")[1]
            
            if len(word) == 1:  # 如果word长度等于1，单独的汉字，字母，数字，标点符号等
                data_new.write(word + " " + pos + "\r\n")
                
            if len(word) > 1: # 如果word长度不小于1
                
                if allIsChinese(word): # 全是汉字
                    l = len(word)
                    for i in range(l):
                        data_new.write(word[i] + " " + pos + "\r\n")  

                if word.encode( 'UTF-8' ).isalpha():  # 全是字母           
                    data_new.write(word + " " + pos + "\r\n") 
                

                if word.encode( 'UTF-8' ).isdigit():  # 全是数字           
                    data_new.write(word + " " + pos + "\r\n") 
                    
                if not allIsChinese(word) and not word.encode( 'UTF-8' ).isalpha() and not word.encode( 'UTF-8' ).isdigit():
            
                    for i in range(len(word)):

                        if word[i] in tmp_eng and word[i-1].encode('utf-8').isalpha():
                            continue

                        if word[i] in tmp_num and word[i-1].encode('utf-8').isdigit():
                            continue

                        if word[i].encode( 'UTF-8' ).isalpha():
                            eng = word[i]
                            for j in range(i+1, len(word)):
                                if j < len(word):
                                    if word[j].encode('utf-8').isalpha():
                                        eng  = eng + word[j]
                                    else: # 遇到相邻的字符，类型不一致
                                        data_new.write(eng + " " + pos + "\r\n")
                                        tmp_eng = eng
                                        eng = ""
                                        break
                            else:
                                data_new.write(eng + " " + pos + "\r\n") # 相邻的字符类型都一致，并且把字符串遍历完了。
                                tmp_eng = eng
                                eng = ""
                            continue


                        if word[i].encode( 'UTF-8' ).isdigit():
                            num = word[i]
                            for j in range(i+1, len(word)):
                                if j < len(word):
                                    if word[j].encode('utf-8').isdigit():
                                        num  = num + word[j]     
                                    else:
                                        data_new.write(num + " " + pos + "\r\n")
                                        tmp_num = num
                                        num = ""
                                        break
                            else:
                                data_new.write(num + " " + pos + "\r\n")
                                tmp_num = num
                                num = ""
                            continue


                        if allIsChinese(word[i]):
                            data_new.write(word[i] + " " + pos + "\r\n")

                        if isSymbol(word[i]):
                            data_new.write(word[i] + " " + pos + "\r\n") 
                            

data_new.close()