In [None]:
import os
from tqdm import tqdm
import numpy as np
from ckipnlp.driver.ss import CkipSentenceSegmenter as SentSeg
# The sentence segmenter above builds the sentence structure that ckiptagger uses
from ckiptagger import construct_dictionary, WS, POS, NER
import warnings
warnings.filterwarnings('ignore')

In [28]:
# Build the CKIP pipeline once; every tagger stage loads its model from ../ckip.
NER_LABLE = ['GPE', 'PERSON', 'DATE', 'EVENT']  # entity labels kept by tokenizer()

# Sentence segmenter: splits raw text on the listed delimiters.
ss = SentSeg(delims = {",", "。", ":", "?", "!", ";"})

# Constructed in order: word segmenter, POS tagger, entity recognizer.
ws, pos, ner = (
    stage("../ckip", disable_cuda=False)
    for stage in (WS, POS, NER)
)




Exception ignored in: <function POS.__del__ at 0x7f417f258cb0>
Traceback (most recent call last):
  File "/home/guest/r08944052/anaconda3/envs/pg3/lib/python3.7/site-packages/ckiptagger/api.py", line 185, in __del__
    self.model.sess.close()
AttributeError: 'POS' object has no attribute 'model'




In [34]:
# Load the train/test splits stored as train_ner.npy / test_ner.npy.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so these
# files must come from a trusted source.
data_pth = '../data'
train, test = (
    np.load(os.path.join(data_pth, fname), allow_pickle=True)
    for fname in ('train_ner.npy', 'test_ner.npy')
)

In [31]:
def get_token_words(ws_lst):
    """Flatten segmented sentences into one space-separated string.

    Args:
        ws_lst: iterable of sentences, each an iterable of word strings.

    Returns:
        All words joined by single spaces (within and between sentences),
        with leading/trailing whitespace stripped.
    """
    return ' '.join(' '.join(words) for words in ws_lst).strip()

In [41]:
def tokenizer(dataset):
    """Add word-segmented text and selected named entities to every record.

    For each record the article (`'content'`) and headline (`'title'`) are
    sentence-split and word-segmented; the article is additionally POS-tagged
    and run through NER. Four keys are written back into the record:

    * `'token_content'` / `'token_title'`: space-joined words
      (see `get_token_words`).
    * `'ner'`: de-duplicated list of the last two fields of each kept entity,
      in first-seen order. Presumably these are (label, text) pairs, per
      ckiptagger's entity tuple layout; confirm against the library.
    * `'ner_tgt'`: the second field of each kept entity, space-joined.

    Only entities whose label (field 2 of the entity tuple) is listed in
    `NER_LABLE` are kept.

    Args:
        dataset: indexable, sized collection of dict-like records with
            `'content'` and `'title'` keys. It is modified in place.

    Returns:
        The same `dataset` object, for convenience.
    """
    wanted_labels = set(NER_LABLE)

    # Wrap the dataset itself (not the enumerate object) so tqdm can read its
    # length and display a total / ETA instead of a bare iteration counter.
    for i, data in enumerate(tqdm(dataset)):
        art_seg = ss(raw = data['content'])
        art_ws_lst = ws(art_seg)

        title_seg = ss(raw = data['title'])
        title_ws_lst = ws(title_seg)

        art_pos_lst = pos(art_ws_lst)
        ner_lst = ner(art_ws_lst, art_pos_lst)

        dataset[i]['token_content'] = get_token_words(art_ws_lst)
        dataset[i]['token_title'] = get_token_words(title_ws_lst)

        # Collect entities sentence by sentence. `seen` gives O(1) duplicate
        # checks while `ner_set` preserves first-seen order.
        # Assumes ner() returns one entity collection per input sentence.
        ner_set = []
        seen = set()
        for sent_entities in ner_lst:
            # sorted() makes the order within a sentence deterministic.
            for entity in sorted(sent_entities):
                if entity[2] not in wanted_labels:
                    continue
                ner_w = entity[-2:]
                if ner_w not in seen:
                    seen.add(ner_w)
                    ner_set.append(ner_w)

        dataset[i]['ner'] = ner_set
        dataset[i]['ner_tgt'] = ' '.join(x[1] for x in ner_set)

    return dataset

In [42]:
# Tokenize the training split. tokenizer() writes its results into the records
# of `train` and returns that same container, so `train_token` and `train`
# refer to one object.
# Slow: the recorded run below was interrupted after 4581 records (~1h04m).
train_token = tokenizer(train)

4581it [1:04:10,  1.19it/s]


KeyboardInterrupt: 

In [None]:
# Persist the tokenized training split.
np.save('../data/train_token.npy', train_token)

# The test split is tokenized separately so that test_token.npy holds test
# records rather than a copy of the training data.
test_token = tokenizer(test)
np.save('../data/test_token.npy', test_token)

In [None]:
import struct
import collections
from tensorflow.core.example import example_pb2

In [None]:
VOCAB_SIZE = 50_000  # vocabulary size (most frequent words written to the vocab file)
CHUNK_SIZE = 1000    # number of examples per chunk when splitting the data into chunks

# Directory where the data files for the TF model are stored
FINISHED_FILE_DIR = '../data/finished_files'
CHUNKS_DIR = os.path.join(FINISHED_FILE_DIR, 'chunked')

In [None]:
def chunk_file(finished_files_dir, chunks_dir, name, chunk_size):
    """Split `<finished_files_dir>/<name>.bin` into fixed-size chunk files.

    The input is a sequence of records, each an 8-byte native `'q'` length
    prefix followed by that many payload bytes. Records are copied verbatim
    into `<chunks_dir>/<name>_000.bin`, `<name>_001.bin`, ... with at most
    `chunk_size` records per file. A chunk file is only created when there is
    at least one record to put in it, so no empty trailing file is produced.

    Args:
        finished_files_dir: directory containing `<name>.bin`.
        chunks_dir: existing directory that receives the chunk files.
        name: base name of the input file (without `.bin`).
        chunk_size: maximum number of records per chunk; must be >= 1.

    Raises:
        ValueError: if `chunk_size` is smaller than 1.
        struct.error: if the input ends in the middle of a record.
    """
    if chunk_size < 1:
        # A non-positive size would never consume input and loop forever.
        raise ValueError("chunk_size must be >= 1, got %r" % (chunk_size,))

    in_file = os.path.join(finished_files_dir, '%s.bin' % name)
    print(in_file)

    chunk = 0
    with open(in_file, "rb") as reader:
        # Read one header ahead so a chunk file is opened only when a record
        # is actually available.
        len_bytes = reader.read(8)
        while len_bytes:
            chunk_fname = os.path.join(chunks_dir, '%s_%03d.bin' % (name, chunk))
            with open(chunk_fname, 'wb') as writer:
                written = 0
                while len_bytes and written < chunk_size:
                    # Raises struct.error on a short (truncated) header.
                    str_len = struct.unpack('q', len_bytes)[0]
                    example_str = reader.read(str_len)
                    if len(example_str) != str_len:
                        raise struct.error(
                            "truncated record in %s: expected %d bytes, got %d"
                            % (in_file, str_len, len(example_str)))
                    # The header and payload are copied through unchanged.
                    writer.write(len_bytes)
                    writer.write(example_str)
                    written += 1
                    len_bytes = reader.read(8)
            chunk += 1


def chunk_all():
    """Split the 'train' and 'val' binary files into chunks under CHUNKS_DIR.

    Reads `<FINISHED_FILE_DIR>/<name>.bin` for each split name and writes the
    chunks via `chunk_file`, using CHUNK_SIZE records per chunk.
    """
    # Create the chunk directory (and any missing parents). exist_ok avoids the
    # check-then-create race of a separate isdir() test.
    os.makedirs(CHUNKS_DIR, exist_ok=True)
    # Split each data file into chunks
    for name in ['train', 'val']:
        print("Splitting %s data into chunks..." % name)
        chunk_file(FINISHED_FILE_DIR, CHUNKS_DIR, name, CHUNK_SIZE)
    print("Saved chunked data in %s" % CHUNKS_DIR)


def write_to_bin(input_file, out_file, makevocab=False):
    """Serialize records into the length-prefixed binary file the model reads.

    Each record becomes a `tf.Example` with two bytes features, `'article'`
    (from `data['token_content']`) and `'abstract'` (from
    `data['token_title']`), both UTF-8 encoded. The serialized example is
    written as an 8-byte native `'q'` length followed by the payload.

    Args:
        input_file: iterable of dict-like records (despite the name, not a path).
        out_file: path of the binary file to create/overwrite.
        makevocab: when true, also count the space-separated tokens of every
            article and abstract and write the VOCAB_SIZE most common ones,
            as `word count` lines, to `<FINISHED_FILE_DIR>/vocab`.
    """
    vocab_counter = collections.Counter() if makevocab else None

    with open(out_file, 'wb') as writer:
        for data in input_file:
            article = data['token_content']
            abstract = data['token_title']

            # Build the tf.Example
            tf_example = example_pb2.Example()
            features = tf_example.features.feature
            features['article'].bytes_list.value.extend([bytes(article, encoding='utf-8')])
            features['abstract'].bytes_list.value.extend([bytes(abstract, encoding='utf-8')])

            # Length-prefixed record: 8-byte size, then the serialized bytes
            payload = tf_example.SerializeToString()
            payload_len = len(payload)
            writer.write(struct.pack('q', payload_len))
            writer.write(struct.pack('%ds' % payload_len, payload))

            # Accumulate vocabulary counts when requested
            if makevocab:
                raw_tokens = article.split(' ') + abstract.split(' ')
                stripped = (tok.strip() for tok in raw_tokens)  # trim surrounding whitespace
                vocab_counter.update(tok for tok in stripped if tok != "")  # skip empties
    print("Finished writing file %s\n" % out_file)

    # Write the vocabulary to disk
    if makevocab:
        print("Writing vocab file...")
        vocab_path = os.path.join(FINISHED_FILE_DIR, "vocab")
        with open(vocab_path, 'w', encoding='utf-8') as writer:
            writer.writelines(
                word + ' ' + str(count) + '\n'
                for word, count in vocab_counter.most_common(VOCAB_SIZE)
            )
        print("Finished writing vocab file")