# 1. 获取事件链正例

遍历事件树，构造长度为5的事件链

## 1.1 从corpus中读取事件列表

In [1]:
import os
# Folder that holds the annotated news files, one directory entry per news item.
dataset_dir = os.path.join('..', 'corpus')
# Names (not full paths) of every entry in the corpus folder; later cells iterate over this.
data_list = os.listdir(dataset_dir)

print('Total news number:', len(data_list))

Total news number: 221


## 1.2 构造事件链

In [25]:
def find_all_path(graph, start, end, path=None):
    """
    Enumerate every simple path from ``start`` to ``end`` by depth-first search.

    Edges labelled '并列关系' are never followed, and a node already on the
    current path is never revisited, so cycles cannot cause endless recursion.

    :param graph: adjacency mapping ``{node: {neighbour: relation_label}}``
    :param start: node the search starts from
    :param end: node every returned path must end at
    :param path: nodes already walked before ``start``; used by the recursion
        and never modified. ``None`` (the default) means an empty prefix.
    :return: list of paths, each a list made of the given prefix followed by
        ``start`` ... ``end``; an empty list when no path exists
    """
    # Build a new list instead of appending so sibling branches never share state.
    path = ([] if path is None else path) + [start]
    if start == end:
        return [path]
    # A node without an adjacency entry has no outgoing edges: treat it as a dead end.
    if start not in graph:
        return []
    paths = []  # all complete paths found below this node
    for node, relation in graph[start].items():
        if relation == '并列关系' or node in path:
            continue
        paths.extend(find_all_path(graph, node, end, path))
    return paths

In [26]:
import json
from itertools import combinations

# Positive samples: every entry is a list of 5 event ids prefixed with the news id.
good_event_chain = []

for data_file_name in data_list:
    try:
        # Load the JSON annotation of one news item.
        data_file_path = os.path.join(dataset_dir, data_file_name)
        with open(data_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Paths run from the smallest key of the relation graph to the last
        # event of the last sentence.
        graph_start = min(data['event_relation'])
        graph_end = data["event_element"][-1]["event_graph"][-1]["child_event_id"]

        # Events of the first sentence; only these may open a chain.
        # A set keeps the membership tests below O(1).
        first_event_keys = {x["child_event_id"] for x in data["event_element"][0]["event_graph"]}

        # All complete paths through the relation graph.
        event_chains = find_all_path(data['event_relation'], graph_start, graph_end)

        news_id = data_file_name.split('_')[0]
        seen = set()       # chains already emitted for this file (as tuples)
        file_chains = []   # de-duplicated chains of this file, in discovery order
        for event_chain in event_chains:
            # Skip paths too short to yield 5 events or not rooted in the first sentence.
            if len(event_chain) < 5 or event_chain[0] not in first_event_keys:
                continue
            for i, head in enumerate(event_chain):
                if head not in first_event_keys:
                    continue
                # A chain is a first-sentence event plus any 4 later events of
                # the same path, kept in path order (not necessarily adjacent).
                for tail in combinations(event_chain[i + 1:], 4):
                    candidate = (head,) + tail
                    if candidate in seen:
                        continue
                    seen.add(candidate)
                    file_chains.append([news_id + "_" + x for x in candidate])

        # Publish only after the whole file was processed without error.
        good_event_chain.extend(file_chains)
    except Exception as e:
        # Best effort: report the broken file and carry on with the rest.
        print('Find error in', data_file_name)
        print(e)
print('Total event chain number:', len(good_event_chain))

Total event chain number: 10445


In [27]:
# Disabled: dump the positive chains to ../dataset/tmp/good.data,
# one chain per line with the event ids separated by tabs.
# with open(os.path.join('..', 'dataset', 'tmp', 'good.data'), 'w', encoding='utf-8') as f:
#     for chain in good_event_chain:
#         f.write('\t'.join(chain) + '\n')
# Disabled: keep only the first 10 positive chains (handy for a quick dry run).
# good_event_chain = good_event_chain[:10]
print('Done!!!')


Done!!!


In [28]:
# Disabled sanity check: every positive chain should contain exactly 5 events.
# for item in good_event_chain:
#     if len(item) != 5:
#         print('error')
# Show the first positive chain as a sample.
print(good_event_chain[0])

['0835_e01', '0835_e02', '0835_e04', '0835_e05', '0835_e06']


# 2. 通过正例获取事件链负例

将正例事件链的最后一个事件替换为错误的事件（在这里调整正负比）

In [29]:
# Flatten all positive chains into a single list of event ids and report its size ...
ids = [node_id for chain in good_event_chain for node_id in chain]
print(len(ids))
# ... then drop the duplicates and report how many distinct ids remain.
ids = list(set(ids))
print(len(ids))

50
9


In [30]:
import json

# Parsed corpus files keyed by file name, so each file is read from disk only once.
_NEWS_JSON_CACHE = {}


def _load_news_json(file_name):
    """Return the parsed JSON content of one corpus file, loading it on first use."""
    if file_name not in _NEWS_JSON_CACHE:
        with open(os.path.join(dataset_dir, file_name), 'r', encoding='utf-8') as f:
            _NEWS_JSON_CACHE[file_name] = json.load(f)
    return _NEWS_JSON_CACHE[file_name]


def check_node(event_info):
    """
    Look up the annotation of one event.

    :param event_info: event identifier of the form ``<news id>_<event id>``,
        e.g. ``'12048_e01'``
    :return: a fresh dict with the fields of the matching ``event_graph``
        node, or ``None`` when no corpus file contains that event. The dict
        is a copy, so callers may add or remove keys without affecting
        later lookups.
    """
    parts = event_info.split('_')
    news_id, event_id = parts[0], parts[1]
    for file_name in data_list:
        # The news id is the part of the file name before the first underscore.
        if file_name.split('_')[0] != news_id:
            continue
        for item in _load_news_json(file_name)['event_element']:
            for node in item['event_graph']:
                if node['child_event_id'] == event_id:
                    # Copy: the cached structure must stay intact for later calls.
                    return dict(node)
    return None

print(check_node('12048_e01'))


{'child_event_id': 'e01', 'trigger_agent': '旅行车与货车', 'trigger': '发生', 'trigger_object': '追尾事故', 'agent_attri': '一辆由北京中国青年年旅行社组织/载有德籍旅客的北京牌照中型', 'object_attri': '特大', 'time': '2012年10月1日上午8时30分', 'time_align': '2015-10-01-08-30', 'location': '京津塘高速公路下行54公里处', 'organization': '', 'person': ''}


In [31]:
import random

# Negative samples, filled by the sampling loop that follows.
bad_event_chain = []


def check_same(events, node):
    """
    Tell whether ``node`` may be used as a wrong ending for ``events``.

    :param events: event ids of a chain
    :param node: event id of the candidate replacement
    :return: ``False`` as soon as one event of the chain has the same
        'trigger' as the candidate, ``True`` when none does
    """
    node_info = check_node(node)
    return not any(
        check_node(e)['trigger'] == node_info['trigger'] for e in events
    )

# For each positive chain, swap its last event for randomly drawn events
# whose trigger does not occur anywhere in that chain.
for positive_chain in good_event_chain:
    random.shuffle(ids)
    prefix = positive_chain[:-1]
    accepted = 0
    for candidate in ids:
        # Tune the ratio here: 4 negatives are produced per positive chain.
        if accepted >= 4:
            break
        if not check_same(positive_chain, candidate):
            continue
        bad_event_chain.append(prefix + [candidate])
        accepted += 1
print(len(bad_event_chain))

40


In [32]:
# Disabled: dump the negative chains to ../dataset/tmp/bad.data,
# one chain per line with the event ids separated by tabs.
# with open(os.path.join('..', 'dataset', 'tmp', 'bad.data'), 'w', encoding='utf-8') as f:
#     for chain in bad_event_chain:
#         f.write('\t'.join(chain) + '\n')

print('Done!!!')

Done!!!


In [33]:
# Disabled sanity check: every negative chain should contain exactly 5 events.
# for item in bad_event_chain:
#     if len(item )!= 5:
#         print("error")
# Show the first negative chain as a sample.
print(bad_event_chain[0])

['0835_e01', '0835_e02', '0835_e04', '0835_e05', '0835_e09']


# 3. 分词

对数据中非id字段进行分词，用空格分割存储

In [None]:
from pyltp import Segmentor
# Folder with the LTP model files, resolved relative to the working directory.
model_dir = os.path.join("ltp_data_v3.4.0")

# Create the word segmenter and load its model file (cws.model).
segmentor = Segmentor()
segmentor.load(os.path.join(model_dir, "cws.model"))


# Intent: replace every non-'id' field of every event by its segmented form,
# with the words joined by single spaces.
# NOTE(review): at this point good_event_chain / bad_event_chain hold event-id
# strings (e.g. '0835_e01'), not dicts, so `for k in event` walks characters and
# `event[k]` would raise TypeError. Presumably this was meant to run on the
# per-event dicts of good_data / bad_data built in section 4 - confirm and move
# the cell before relying on it.
for chain in good_event_chain:
    for event in chain:
        for k in event:
            if k != 'id':
                event[k] = " ".join([str(x) for x in segmentor.segment(event[k])])

# Same treatment for the negative chains.
for chain in bad_event_chain:
    for event in chain:
        for k in event:
            if k != 'id':
                event[k] = " ".join([str(x) for x in segmentor.segment(event[k])])
                
print('finish segment...')

# 4. 分割数据集

train、eval、test （6：2：2）

In [34]:
# Turn chains of event ids into chains of event records.

# Annotation fields that are dropped from every record.
pop_keys = ['child_event_id',
            'time',
            'time_align',
            'location',
            'organization',
            'person']


def _expand_chain(chain_ids):
    """Return the records for ``chain_ids``: each one tagged with its id and stripped of ``pop_keys``."""
    records = []
    for node_id in chain_ids:
        record = check_node(node_id)
        record['id'] = node_id
        for field in pop_keys:
            record.pop(field, None)
        records.append(record)
    return records


good_data = [_expand_chain(chain_ids) for chain_ids in good_event_chain]
bad_data = [_expand_chain(chain_ids) for chain_ids in bad_event_chain]

print("good: {}".format(str(len(good_data))))
print("bad: {}".format(str(len(bad_data))))

good: 10
bad: 40


In [35]:
# print(good_data[0])
# print(bad_data[0])

In [36]:
# Containers for the three splits; they are filled in the next cell.
train_data, test_data, eval_data = [], [], []

from sklearn.utils import shuffle

# Shuffle each class, then wrap every chain into a sample dict:
# label 1 marks a positive chain, label 0 a negative one.
good_data = [{'event': chain, 'label': 1} for chain in shuffle(good_data)]
bad_data = [{'event': chain, 'label': 0} for chain in shuffle(bad_data)]

In [37]:
# Cut points at 60% and 80% of each class, so both classes are split alike.
good_train_end = int(0.6 * len(good_data))
good_test_end = int(0.8 * len(good_data))
bad_train_end = int(0.6 * len(bad_data))
bad_test_end = int(0.8 * len(bad_data))

# First 60% -> train, next 20% -> test, remaining 20% -> eval.
train_data = good_data[:good_train_end] + bad_data[:bad_train_end]
test_data = good_data[good_train_end:good_test_end] + bad_data[bad_train_end:bad_test_end]
eval_data = good_data[good_test_end:] + bad_data[bad_test_end:]

for split in (train_data, test_data, eval_data):
    print(len(split))

30
10
10


In [38]:
# Mix positive and negative samples inside each split (they were concatenated class by class).
train_data = shuffle(train_data)
test_data = shuffle(test_data)
eval_data = shuffle(eval_data) 
    
# Disabled: write the unlabelled splits as JSON to ../dataset/tmp/.
# with open(os.path.join('..','dataset', 'tmp','train.data'),'w',encoding='utf-8') as f:
#     f.write(json.dumps(train_data, ensure_ascii=False))
# with open(os.path.join('..','dataset', 'tmp','test.data'),'w',encoding='utf-8') as f:
#     f.write(json.dumps(test_data, ensure_ascii=False))
# with open(os.path.join('..','dataset', 'tmp','eval.data'),'w',encoding='utf-8') as f:
#     f.write(json.dumps(eval_data, ensure_ascii=False))
print('done')



done


# 5. 划分数据类型

In [39]:
# Event type labels, keyed by type index.
event_types = {
    0: '爆炸',
    1: '火灾',
    2: '地质 灾害',
    3: '交通 事故',
    4: '人身 伤害',
}

# Numeric news ids grouped by event type index (the keys match event_types).
# The labelling cell converts the news-id prefix of an event id to int and looks it up here.
data_ids = {0: [839, 835, 828, 12, 54, 12050, 424, 12051, 4, 855, 859, 852, 840, 851, 829, 815, 19, 12031, 52, 843, 12029, 830, 834, 811, 824, 818, 850, 854, 404, 841, 454, 48, 848, 803, 455, 12037, 402, 817, 415, 437, 833, 18, 801, 45, 816, 9, 826, 853, 1, 858, 27, 856, 44, 16, 59, 431, 17, 831, 860, 846, 823], 1: [37, 12032, 12030, 447, 409, 441, 3, 814, 845, 12044, 836, 12010, 433, 12028, 452, 12023, 11, 414, 857, 820, 39, 57, 812, 822, 438, 60, 849, 427, 12039, 20, 813], 2: [842, 12046, 58, 819, 419, 12035, 449, 805, 12043, 808, 847, 432, 33, 38, 832, 425, 837, 456, 26, 46, 12041, 23, 821, 12033, 806, 35, 12018, 838, 12011, 55, 51, 844, 421, 408, 407, 12047, 12015, 827, 6, 5, 12042, 422, 420, 406], 3: [12048, 429, 15, 450, 8, 49, 24, 36, 53, 453, 12045, 460, 807, 410, 12016, 436, 47, 12038, 12013, 412, 12026, 426, 12017, 22, 804, 459, 29, 12040, 802, 43, 34, 417, 42, 21, 7, 413, 12049, 416, 12019, 411, 40, 458, 13, 405], 4: [418, 31, 440, 28, 10, 446, 435, 41, 439, 50, 442, 32, 12025, 448, 2, 56, 444, 430, 12022, 12014, 30, 401, 428, 443, 809, 810, 12012, 423, 451, 12027, 12034, 12024, 461, 14, 25, 457, 403, 12021, 434, 445, 12020]}

In [40]:
def _label_with_event_type(samples):
    """
    Add an 'event_type' entry to every sample and return the samples as a new list.

    The type is derived from the news id, i.e. the part before the first
    underscore of the id of the sample's first event, converted to int.

    :param samples: sample dicts whose 'event' entry is a list of event
        records carrying an 'id'; the dicts are updated in place
    :return: new list holding the same (now labelled) sample dicts
    :raises KeyError: when a news id is not listed in ``data_ids``
    """
    # Reverse index news id -> type index; the first listing wins, which
    # matches scanning data_ids in order and stopping at the first hit.
    type_of_news = {}
    for type_id, news_ids in data_ids.items():
        for news_id in news_ids:
            type_of_news.setdefault(news_id, type_id)

    labelled = []
    for sample in samples:
        news_id = int(sample['event'][0]['id'].split('_')[0])
        if news_id not in type_of_news:
            # Fail with a readable message instead of an opaque lookup of type -1.
            raise KeyError('news id {} is not listed in data_ids'.format(news_id))
        sample['event_type'] = event_types[type_of_news[news_id]]
        labelled.append(sample)
    return labelled


eval_data = _label_with_event_type(eval_data)
test_data = _label_with_event_type(test_data)
train_data = _label_with_event_type(train_data)

In [41]:
# Write the three labelled splits as JSON, keeping non-ASCII text unescaped.
output_dir = os.path.join('..', 'dataset', '1_4')
# Create the target folder on first use so open() cannot fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)
for split_name, split_data in (('train.data', train_data),
                               ('test.data', test_data),
                               ('eval.data', eval_data)):
    with open(os.path.join(output_dir, split_name), 'w', encoding='utf-8') as f:
        f.write(json.dumps(split_data, ensure_ascii=False))

print("done")

done
