In [142]:
import numpy as np
import pandas as pd
import json
import pickle
import re

In [5]:
# Relative locations of the input files read by the cells of this notebook.
train_path = '../data/train_data.json'
dev_path = '../data/dev_data.json'
test_path = '../data/test1_data_postag.json'
schema_path = '../data/all_50_schemas'

In [3]:
# Load the training split: every line of the file is parsed as one JSON value.
with open(train_path, 'r', encoding='utf8') as f:
    train = [json.loads(line.strip()) for line in f]

In [4]:
# Load the dev split: every line of the file is parsed as one JSON value.
with open(dev_path, 'r', encoding='utf8') as f:
    dev = [json.loads(line.strip()) for line in f]

In [6]:
# Load the test split: every line of the file is parsed as one JSON value.
with open(test_path, 'r', encoding='utf8') as f:
    test = [json.loads(line.strip()) for line in f]

## 信息抽取数据集 EDA
- 将数据整合为 DataFrame 格式，便于后续分析优化

### 初始化DataFrame

In [16]:
def create_df(data):
    """Collect samples into a DataFrame with ``text``, ``spo_list`` and ``postag`` columns.

    ``spo_list`` and ``postag`` are stored as JSON strings with non-ASCII
    characters kept verbatim. A sample that lacks either key contributes the
    JSON text ``[]`` for it; ``text`` is mandatory.
    """
    columns = {'text': [], 'spo_list': [], 'postag': []}
    for sample in data:
        columns['text'].append(sample['text'])
        for key in ('spo_list', 'postag'):
            columns[key].append(json.dumps(sample.get(key, []), ensure_ascii=False))
    return pd.DataFrame(data=columns)

In [17]:
# Build one DataFrame per data split from the parsed samples.
train_df = create_df(train)
dev_df = create_df(dev)
test_df = create_df(test)

### 处理官方postag数据
- words：官方分词
- poses：官方pos
- has_postag: 是否提供postag

In [173]:
def get_words(postag):
    """Return the ``word`` field of every entry in a JSON-encoded postag list, newline-joined."""
    return '\n'.join(entry['word'] for entry in json.loads(postag))
def get_poses(postag):
    """Return the ``pos`` field of every entry in a JSON-encoded postag list, space-joined."""
    return ' '.join(entry['pos'] for entry in json.loads(postag))
def has_postag(postag):
    """Tell whether a JSON-encoded postag list contains at least one entry."""
    entries = json.loads(postag)
    return len(entries) != 0

In [22]:
def process_postag(df):
    """Add ``words``, ``poses`` and ``has_postag`` columns derived from ``postag``.

    Each new column is an element-wise transformation of the JSON string in
    ``df['postag']``. The frame is modified in place and also returned.

    The transformation is mapped over the single source column instead of
    applying a lambda row by row: it avoids building a Series per row and also
    works when the frame has no rows.
    """
    postag = df['postag']
    df['words'] = postag.map(get_words)
    df['poses'] = postag.map(get_poses)
    df['has_postag'] = postag.map(has_postag)
    return df

In [174]:
# Attach the postag-derived columns (words, poses, has_postag) to every split.
train_df = process_postag(train_df)
dev_df = process_postag(dev_df)
test_df = process_postag(test_df)

### 处理SPO数据
- spo: 将spo_list 转换为 [((sub_s, sub_e), (obj_s, obj_e), rel_id), ...] 格式 (sub_s 为subject在text开始位置)
- no_ref_sub: 存在subject未出现在text
- no_ref_obj: 存在object未出现在text

In [41]:
def get_schema2id():
    """Read ``schema_vocab.txt`` and map each schema name to its 0-based line number.

    The schema name is the text before the first tab on each line. The line
    terminator is stripped first, so a line that carries no tab-separated
    extra column does not end up with a trailing newline in its key.
    If a name occurs on several lines, the last line's number is kept.
    """
    with open('schema_vocab.txt', 'r', encoding='utf8') as f:
        names = [line.rstrip('\n').split('\t')[0] for line in f]
    return {name: idx for idx, name in enumerate(names)}
# Module-level predicate -> id lookup; get_tri reads this global.
schema2id = get_schema2id()
def get_tri(raw_tri, text):
    """
    Convert raw SPO dicts into a JSON string of
    [((sub_s, sub_e), (obj_s, obj_e), rel_id), ...]

    Offsets are inclusive character positions of the first case-insensitive
    occurrence of the entity in ``text``; an entity that does not occur is
    encoded as (-1, -1). ``rel_id`` is looked up in the module-level
    ``schema2id`` and raises KeyError for an unknown predicate.

    ``text`` is lower-cased here as well as the entities, so the match is
    case-insensitive whether or not the caller already lower-cased it.
    """
    haystack = text.lower()

    def locate(entity):
        # First occurrence only; the end offset is inclusive.
        start = haystack.find(entity)
        if start == -1:
            return (-1, -1)
        return (start, start + len(entity) - 1)

    pro_tri = []
    for t in raw_tri:
        sub = t['subject'].lower()
        obj = t['object'].lower()
        pid = schema2id[t['predicate']]
        pro_tri.append((locate(sub), locate(obj), pid))
    return json.dumps(pro_tri, ensure_ascii=False)
def no_ref_sub(tri):
    """Return True when any triple's subject span starts at -1 (subject not located)."""
    return any(t[0][0] == -1 for t in tri)
def no_ref_obj(tri):
    """Return True when any triple's object span starts at -1 (object not located)."""
    return any(t[1][0] == -1 for t in tri)

In [88]:
def process_tri(df):
    """Add the ``spo``, ``no_ref_sub``, ``no_ref_obj`` and ``num_spo`` columns.

    ``spo`` is the JSON string produced by ``get_tri`` from each row's
    ``spo_list`` and lower-cased ``text``; the other three columns are
    derived from it. The frame is modified in place and also returned.

    Each ``spo`` string is decoded once and shared by the three derived
    columns, and the columns are built positionally so a frame with no rows
    is handled as well.
    """
    spo_json = [
        get_tri(json.loads(spo_list), text.lower())
        for spo_list, text in zip(df['spo_list'], df['text'])
    ]
    # Decode the serialized form so the helpers see exactly what a later
    # json.loads(df['spo']) would give them.
    parsed = [json.loads(s) for s in spo_json]
    df['spo'] = spo_json
    df['no_ref_sub'] = [no_ref_sub(tri) for tri in parsed]
    df['no_ref_obj'] = [no_ref_obj(tri) for tri in parsed]
    df['num_spo'] = [len(tri) for tri in parsed]
    return df

In [160]:
# Attach the SPO offset columns (spo, no_ref_sub, no_ref_obj, num_spo) to every split.
train_df = process_tri(train_df)
dev_df = process_tri(dev_df)
test_df = process_tri(test_df)

### 修正训练集当中 spo实体未出现在text中的label

In [90]:

train_df[(train_df['no_ref_obj'] == True) | (train_df['no_ref_sub'] == True)]

Unnamed: 0,postag,spo_list,text,words,poses,has_postag,spo,no_ref_sub,no_ref_obj,num_spo


In [64]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
def analy_spo(index):
    """Print a training row's text and every raw SPO whose subject or object was not located.

    ``index`` is used positionally (``train_df.iloc``). A raw entry from
    ``spo_list`` is reported when the span at the same position in ``spo``
    has -1 as its subject start or object start.
    """
    row = train_df.iloc[index]
    text = row['text']
    print(f'index:{index}')
    print(f'text:{text}')
    raw_entries = json.loads(row['spo_list'])
    spans = json.loads(row['spo'])
    for pos, raw_entry in enumerate(raw_entries):
        span = spans[pos]
        unresolved = span[0][0] == -1 or span[1][0] == -1
        if not unresolved:
            continue
        print('\nNOREF:')
        pp.pprint(raw_entry)

In [54]:
# Index labels of the training rows flagged no_ref_obj or no_ref_sub.
indexs = list(train_df[(train_df['no_ref_obj'] == True) | (train_df['no_ref_sub'] == True)].index)

In [70]:
train_df.loc[16866,'spo_list'] = '[{"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "胡抗美", "subject": "康里巎巎杂诗(全彩色高清珍藏本)"}]'

In [72]:
#, {"predicate": "改编自", "object_type": "作品", "subject_type": "影视作品", "object": "裸婚——80后的新结婚时代", "subject": "裸婚时代"}
train_df.loc[32313,'spo_list'] = '[{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "张萌", "subject": "沙海"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "吴磊", "subject": "斗破苍穹"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "朱杰", "subject": "沙海"}, {"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "南派三叔", "subject": "沙海"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "张铭恩", "subject": "沙海"}]'

In [74]:
train_df.loc[82535,'spo_list'] = '[{"predicate": "主持人", "object_type": "人物", "subject_type": "电视综艺", "object": "王立群", "subject": "百家讲坛"}]'

In [77]:
train_df.loc[93654,'spo_list'] = '[{"predicate": "编剧", "object_type": "人物", "subject_type": "影视作品", "object": "威廉·莫纳汉", "subject": "天国王朝"}, {"predicate": "编剧", "object_type": "人物", "subject_type": "影视作品", "object": "威廉·莫纳汉", "subject": "无间行者"}]'

In [80]:
train_df.loc[99891,'text'] = '概述桂林国际会展中心位于素有百里画廊之称的漓江之滨，是桂林市标志性建筑，占地面积15万平方米，建筑总面积5'

In [82]:
train_df.loc[138087,'spo_list'] = '[{"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍", "object": "人民文学出版社", "subject": "千字文全解"}]'

In [84]:
train_df.loc[152858,'spo_list'] = '[{"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "马银琴", "subject": "周秦时代诗的传播史"}, {"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍", "object": "社会科学文献出版社", "subject": "周秦时代诗的传播史"}]'

In [86]:
train_df.loc[167561,'spo_list'] = '[{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "万茜", "subject": "裸婚时代"}, {"predicate": "出品公司", "object_type": "企业", "subject_type": "影视作品", "object": "北京光彩世纪文化艺术有限公司", "subject": "裸婚时代"}, {"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "滕华涛", "subject": "裸婚时代"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "韩童生", "subject": "裸婚时代"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "文章", "subject": "裸婚时代"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "姚笛", "subject": "裸婚时代"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "张凯丽", "subject": "裸婚时代"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "丁嘉丽", "subject": "裸婚时代"}, {"predicate": "改编自", "object_type": "作品", "subject_type": "影视作品", "object": "裸婚——80后的新结婚时代", "subject": "裸婚时代"}]'

In [65]:
for ind in indexs:
    analy_spo(ind)

index:16866
text:这本《康里巎巎杂诗(全彩色高清珍藏本)》(作者胡抗美)是其中一册

NOREF:
{   'object': '胡抗美',
    'object_type': '人物',
    'predicate': '作者',
    'subject': '康里巎巎《杂诗》',
    'subject_type': '图书作品'}
index:32313
text:吴磊《沙海》《斗破苍穹》吴磊的《沙海》，除了吴磊 ，还有秦昊 、杨蓉、张萌、张铭恩、朱杰 等人主演的现代探险题材电视剧，该剧改编自南派三叔同名小说，普通高中生黎簇被卷入一个以世界现状为目的庞大计划中，然后认识了这一切的幕后布局者、以旅行摄影作家关根身份登场的吴邪的故事

NOREF:
{   'object': '裸婚——80后的新结婚时代',
    'object_type': '作品',
    'predicate': '改编自',
    'subject': '裸婚时代',
    'subject_type': '影视作品'}
index:82535
text:2008年1月24日上午9点半，央视“百家讲坛”栏目主讲人王立群将携《王立群读史记之 项羽 》在北京长城饭店召开新书首发会，欢迎有兴趣的读者到场参加

NOREF:
{   'object': '王立群',
    'object_type': '人物',
    'predicate': '主持人',
    'subject': '王立群读《史记》',
    'subject_type': '电视综艺'}
index:93654
text:2010年9月，在《创:战纪》上映前，迪士尼曾聘请《无间行者》、《天国王朝》编剧威廉·莫纳汉为电影版改编剧本

NOREF:
{   'object': '白发魔女传',
    'object_type': '作品',
    'predicate': '改编自',
    'subject': '白发魔女传',
    'subject_type': '影视作品'}
index:99891
text:概述桂林国际会展中心位于素有百里画廊之称的漓江之滨，是桂林市标志性建筑，占地面积 15 万平方米，建筑总面积 5

NOREF:
{   'object': '1

In [217]:
# Persist each split as UTF-8 CSV without the index column.
# NOTE(review): this cell's execution count (217) is higher than that of the
# cells that follow it in the notebook — confirm where it belongs in a top-to-bottom run.
train_df.to_csv('train.csv', index=False, encoding='utf8')
dev_df.to_csv('dev.csv', index=False, encoding='utf8')
test_df.to_csv('test.csv', index=False, encoding='utf8')

### 定义四种spo实体关系：
1. NORMAL：不包含下列情况
2. OVERLAP：同一实体有多种spo关系
3. MULTI-LABEL：同一实体对有多种关系
4. NEST：实体嵌套

In [138]:
def is_normal(tri):
    """Return True when every subject and object span across all triples is distinct.

    That is, the number of distinct (start, end) spans equals twice the
    number of triples.
    """
    spans = {tuple(side) for t in tri for side in (t[0], t[1])}
    return 2 * len(tri) == len(spans)

def is_overlap(tri):
    """Return True when, after de-duplicating entity pairs, some span is still shared.

    Always False for triples that ``is_normal`` accepts.
    """
    if is_normal(tri):
        return False
    pairs = {(t[0][0], t[0][1], t[1][0], t[1][1]) for t in tri}
    spans = set()
    for sub_s, sub_e, obj_s, obj_e in pairs:
        spans.update([(sub_s, sub_e), (obj_s, obj_e)])
    return 2 * len(pairs) != len(spans)

def is_multi(tri):
    """Return True when the same (subject span, object span) pair occurs in more than one triple.

    Always False for triples that ``is_normal`` accepts.
    """
    if is_normal(tri):
        return False
    pairs = [(t[0][0], t[0][1], t[1][0], t[1][1]) for t in tri]
    distinct_pairs = set(pairs)
    return len(pairs) != len(distinct_pairs)

def is_nest(tri):
    """Return True when two distinct entity spans in ``tri`` share a character position.

    ``tri`` is a list of ``[(sub_s, sub_e), (obj_s, obj_e), rel_id]`` items
    with inclusive offsets. Spans whose start is negative are ignored: -1 is
    the marker for an entity that was not located in the text, so it has no
    position that could overlap anything (and would otherwise collide with
    the initial ``prev_end`` of -1 and flag the row as nested).
    """
    spans = set()
    for t in tri:
        for span in (tuple(t[0]), tuple(t[1])):
            if span[0] >= 0:
                spans.add(span)
    prev_end = -1
    # After sorting by start, any overlap shows up between neighbours.
    for span in sorted(spans, key=lambda s: s[0]):
        if span[0] <= prev_end:
            return True
        prev_end = span[1]
    return False

def pro_entities_relation(df):
    """Add the boolean columns ``normal``, ``overlap``, ``multi`` and ``nest``.

    Each column is the matching ``is_*`` predicate evaluated on the decoded
    ``spo`` JSON of the row. The frame is modified in place and also returned.
    """
    checks = (
        ('normal', is_normal),
        ('overlap', is_overlap),
        ('multi', is_multi),
        ('nest', is_nest),
    )
    for column, check in checks:
        # Bind ``check`` as a default so each lambda keeps its own predicate.
        df[column] = df.apply(lambda row, check=check: check(json.loads(row['spo'])), axis=1)
    return df

In [139]:
# Classify the triples of the train and dev splits (normal / overlap / multi / nest).
train_df = pro_entities_relation(train_df)
dev_df = pro_entities_relation(dev_df)

In [130]:
train_df[train_df['nest']==True].iloc[0]['text']

'《如果我爱你》是由海润影视与明道工作室联合出品，徐辅军执导，明道、李沁、胡兵、白歆惠、狄杰等人气明星联袂主演的浪漫偶像剧'

在NEST状况中可以发现，一个实体可能会多次重复出现

如:
>'《如果我爱你》是由海润影视与明道工作室联合出品，徐辅军执导，明道、李沁、胡兵、白歆惠、狄杰等人气明星联袂主演的浪漫偶像剧'

明道工作室和明道可能存在嵌套，而之后存在单独的实体明道。

解决方案：
1. copy -> generate
2. 数据处理，若出现嵌套，向后继续搜索

In [169]:
def mod_spo(df, index):
    """Move entities that start inside an earlier entity to a later occurrence in the text.

    Entities of the row's ``spo`` are visited by start offset (longest first
    on ties). An entity starting at or before the end of the last accepted
    entity is searched for again after its current start; when a later
    occurrence exists, every use of that span in the row's triples is
    replaced by the new span. The result is written back to
    ``df.loc[index, 'spo']``; rows with nothing to move are left untouched.

    Parameters
    ----------
    df : DataFrame with ``text`` and ``spo`` (JSON string) columns.
    index : index *label* of the row; the same label is used to read and to
        write the row.

    Notes
    -----
    * Spans with a negative start mark entities that were not located in the
      text. They are skipped, since slicing the text with -1 offsets yields
      an empty string whose "next occurrence" is position 0.
    * All moves are applied in one pass over the decoded triples, so a span
      moved onto offsets that another entity used before cannot be moved a
      second time.
    """
    row = df.loc[index]
    text = row['text']
    spo = json.loads(row['spo'])

    entities = set()
    for t in spo:
        entities.add(tuple(t[0]))
        entities.add(tuple(t[1]))
    located = sorted(
        (e for e in entities if e[0] >= 0),
        key=lambda e: (e[0], -(e[1] - e[0])),
    )

    end = -1
    moved = {}
    for e in located:
        if e[0] <= end:
            # Offsets are inclusive, hence the +1 on the slice end.
            new_start = text.find(text[e[0]:e[1] + 1], e[0] + 1)
            if new_start != -1:
                moved[e] = (new_start, new_start + e[1] - e[0])
        else:
            end = e[1]

    if not moved:
        return

    relocated = []
    for t in spo:
        sub = moved.get(tuple(t[0]), t[0])
        obj = moved.get(tuple(t[1]), t[1])
        relocated.append([list(sub), list(obj)] + list(t[2:]))
    df.loc[index, 'spo'] = json.dumps(relocated, ensure_ascii=False)
    
def mod_df_spo(df):
    """Run ``mod_spo`` on every row flagged ``nest`` and recompute the flag.

    Prints how many rows were flagged before and after, and the difference.
    The frame is modified in place and also returned.
    """
    dest_indexes = list(df.loc[df['nest'] == True].index)
    print(f'BEFORE MOD DEST NUMBER:{len(dest_indexes)}')
    for label in dest_indexes:
        mod_spo(df, label)
    df['nest'] = df.apply(lambda row: is_nest(json.loads(row['spo'])), axis=1)
    remaining = len(list(df.loc[df['nest'] == True].index))
    print("AFTER MOD DEST NUMBER:{}".format(remaining))
    print('REDUCE {}'.format(len(dest_indexes) - remaining))
    return df
        

In [161]:
train_df = mod_df_spo(train_df)

BEFORE MOD DEST NUMBER:6096
AFTER MOD DEST NUMBER:3676
REDUCE 2420


In [162]:
dev_df = mod_df_spo(dev_df)

BEFORE MOD DEST NUMBER:719
AFTER MOD DEST NUMBER:429
REDUCE 290


In [163]:
# LETS TRY AGAIN
train_df = mod_df_spo(train_df)

BEFORE MOD DEST NUMBER:3676
AFTER MOD DEST NUMBER:3661
REDUCE 15


In [164]:
dev_df = mod_df_spo(dev_df)

BEFORE MOD DEST NUMBER:429
AFTER MOD DEST NUMBER:423
REDUCE 6


In [165]:
# LETS TRY AGAIN AGAIN
train_df = mod_df_spo(train_df)
dev_df = mod_df_spo(dev_df)

BEFORE MOD DEST NUMBER:3661
AFTER MOD DEST NUMBER:3659
REDUCE 2
BEFORE MOD DEST NUMBER:423
AFTER MOD DEST NUMBER:423
REDUCE 0


In [170]:
# LETS TRY AGAIN AGAIN AGAIN
train_df = mod_df_spo(train_df)
dev_df = mod_df_spo(dev_df)

BEFORE MOD DEST NUMBER:3659
AFTER MOD DEST NUMBER:3659
REDUCE 0
BEFORE MOD DEST NUMBER:423
AFTER MOD DEST NUMBER:423
REDUCE 0


In [167]:
train_df.loc[42, 'spo']

'[[[1, 5], [9, 12], 17], [[1, 5], [33, 34], 0], [[1, 5], [24, 26], 4], [[1, 5], [39, 41], 0], [[1, 5], [14, 18], 17], [[1, 5], [43, 44], 0], [[1, 5], [36, 37], 0], [[1, 5], [30, 31], 0]]'

### 分词错误率
检测是否会由于分词造成实体不能正常识别

In [191]:
def check_seg(words, text, spo):
    """
    Check whether the official word segmentation cuts through any SPO entity.

    ``words`` is the newline-joined segmentation, ``spo`` the decoded triples
    with inclusive (start, end) spans. Asserts that the segmented words add
    up to the length of ``text``.

    return:
    -1 no official segmentation available
    0  some entity start or end is not the first/last character of a word
    1  every entity start and end falls on such a word edge
    """
    if len(words) == 0:
        return -1
    ws = words.split('\n')
    assert len(ws) + len(text) - 1 == len(words), f'{len(text)}\n{len(words)}\n{len(ws)}'
    spans = {tuple(side) for t in spo for side in (t[0], t[1])}
    # Collect the first and last character offset of every word.
    edges = set()
    offset = 0
    for w in ws:
        edges.update((offset, offset + len(w) - 1))
        offset += len(w)
    aligned = all(s[0] in edges and s[1] in edges for s in spans)
    return 1 if aligned else 0

In [213]:
# Flag, per row, whether the official segmentation cuts through an entity.
# The dev column is computed here as well because later cells read
# dev_df['check_seg'].
dev_df['check_seg'] = dev_df.apply(lambda x: check_seg(x['words'], x['text'], json.loads(x['spo'])), axis=1)
train_df['check_seg'] = train_df.apply(lambda x: check_seg(x['words'], x['text'], json.loads(x['spo'])), axis=1)

In [216]:
train_df[train_df['check_seg']==0].shape[0]

28530

In [212]:
train_df.loc[99891, 'postag']='[{"word": "概述", "pos": "v"}, {"word": "桂林国际会展中心", "pos": "ns"}, {"word": "位于", "pos": "v"}, {"word": "素有", "pos": "v"}, {"word": "百里", "pos": "m"}, {"word": "画廊", "pos": "n"}, {"word": "之", "pos": "u"}, {"word": "称", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "漓江", "pos": "ns"}, {"word": "之", "pos": "u"}, {"word": "滨", "pos": "n"}, {"word": "，", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "桂林市", "pos": "ns"}, {"word": "标志性", "pos": "n"}, {"word": "建筑", "pos": "n"}, {"word": "，", "pos": "w"}, {"word": "占地面积", "pos": "n"}, {"word": "15万平方米", "pos": "m"}, {"word": "，", "pos": "w"}, {"word": "建筑", "pos": "n"}, {"word": "总", "pos": "a"}, {"word": "面积", "pos": "n"},{"word": "5", "pos": "m"}]'

In [209]:
train_df.loc[99891 , 'words'] = '概述\n桂林国际会展中心\n位于\n素有\n百里\n画廊\n之\n称\n的\n漓江\n之\n滨\n，\n是\n桂林市\n标志性\n建筑\n，\n占地面积\n15万平方米\n，\n建筑\n总\n面积\n5'

In [202]:
dev_df[dev_df['check_seg']==0].loc[8, 'text']

'马志舟，1907年出生，陕西三原人，汉族，中国共产党，任红四团第一连连长，1933年逝世'

In [203]:
dev_df[dev_df['check_seg']==0].loc[8, 'words']

'马志舟\n，\n1907年\n出生\n，\n陕西\n三原\n人\n，\n汉族\n，\n中国共产党\n，\n任\n红四团第一连\n连长\n，\n1933年\n逝世'

In [204]:
dev_df[dev_df['check_seg']==0].loc[8, 'spo_list']

'[{"predicate": "国籍", "object_type": "国家", "subject_type": "人物", "object": "中国", "subject": "马志舟"}, {"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1907年", "subject": "马志舟"}, {"predicate": "民族", "object_type": "Text", "subject_type": "人物", "object": "汉族", "subject": "马志舟"}, {"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "陕西三原", "subject": "马志舟"}]'

In [196]:
s1 = '查尔斯·阿兰基斯（Charles Aránguiz），1989年4月17日出生于智利圣地亚哥，智利职业足球运动员，司职中场，效力于德国足球甲级联赛勒沃库森足球俱乐部'

In [186]:
s2 = '查尔斯\n·\n阿兰基斯\n（\nCharles Aránguiz\n）\n，\n1989年4月17日\n出生\n于\n智利圣地亚哥\n，\n智利\n职业\n足球\n运动员\n，\n司职\n中场\n，\n效力\n于\n德国\n足球\n甲级\n联赛\n勒沃库森足球俱乐部'

In [180]:
len(s1)

82

In [187]:
len(s2)

108

In [188]:
len(s2.split('\n'))

27

In [183]:
l = 0
for w in s2.split('\n'):
    l += len(w)

In [184]:
l

164

'查尔斯\n·\n阿兰基斯\n（\nCharles Aránguiz\n）\n，\n1989年4月17日\n出生\n于\n智利圣地亚哥\n，\n智利\n职业\n足球\n运动员\n，\n司职\n中场\n，\n效力\n于\n德国\n足球\n甲级\n联赛\n勒沃库森足球俱乐部查尔斯\n·\n阿兰基斯\n（\nCharles Aránguiz\n）\n，\n1989年4月17日\n出生\n于\n智利圣地亚哥\n，\n智利\n职业\n足球\n运动员\n，\n司职\n中场\n，\n效力\n于\n德国\n足球\n甲级\n联赛\n勒沃库森足球俱乐部'