In [1]:
import os
import json
import re

In [2]:
def merge_details(file_path):
    """Read a numbered response file and glue continuation lines onto their entry.

    A line opens a new entry when, after stripping surrounding whitespace, it
    begins with one or more digit characters immediately followed by a
    full-width colon ("："). Every other line is stripped and appended, with no
    separator, to the entry currently being built. Text that precedes the
    first entry header is discarded.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of strings, one per numbered entry, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    def opens_entry(text):
        # Measure the leading run of digits, then require "：" right after it.
        digit_count = 0
        for ch in text:
            if not ch.isdigit():
                break
            digit_count += 1
        return digit_count > 0 and text[digit_count:digit_count + 1] == "："

    entries = []
    buffer = ""
    for raw in raw_lines:
        text = raw.strip()
        if opens_entry(text):
            # Close the entry being built and start a new one from this header.
            entries.append(buffer)
            buffer = text
        else:
            buffer = buffer + text
    entries.append(buffer)

    # The first element is whatever appeared before any entry header.
    return entries[1:]


def get_extend_map(merged_list, mode):
    """Group the bracketed extension terms of each entry under its gold relation.

    Every entry is split on the literal marker "gold:". The text after the
    marker is the gold relation; every "[...]" group before the marker is
    split on commas into extension terms. Quotes and ASCII / full-width
    parentheses are removed from the terms, and each term is stripped.

    Chinese modes ("zh" or any mode ending in "_zh", e.g. "gpt4_zh") also
    remove every space from the entry and treat the full-width comma "，" as
    a separator.

    The resulting map is written to ./results/rel_extend_map_{mode}.json.

    Args:
        merged_list: Iterable of entry strings, each containing "gold:".
        mode: Run identifier; used for language detection and the file name.

    Returns:
        (rel_extend_map, lengths): dict mapping gold relation -> list of
        unique, non-empty terms in first-seen order, and the list sizes in
        dict order.

    Raises:
        ValueError: If an entry does not contain the "gold:" marker.
    """
    # Suffix test (not equality) so model-prefixed modes such as
    # "baichuan2_13B_zh" get the Chinese handling as well.
    is_zh = mode == "zh" or mode.endswith("_zh")

    # Characters deleted from every bracketed group before splitting.
    strip_table = str.maketrans('', '', '"\'（）()')

    rel_extend_map = {}
    for index, item in enumerate(merged_list):
        if is_zh:
            item = item.replace(" ", "")

        parts = item.split("gold:")
        if len(parts) < 2:
            raise ValueError(
                f"entry {index} has no 'gold:' marker: {item!r}"
            )
        extend_rel_str, gold = parts[0], parts[1]
        terms = rel_extend_map.setdefault(gold, [])

        for group in re.findall(r'\[(.*?)\]', extend_rel_str):
            group = group.translate(strip_table)
            if is_zh:
                group = group.replace("，", ",")

            for extend_rel in group.split(","):
                extend_rel = extend_rel.strip()
                # "[]" or a trailing comma yields an empty string; it is not a term.
                if extend_rel and extend_rel not in terms:
                    terms.append(extend_rel)

    lengths = [len(terms) for terms in rel_extend_map.values()]

    os.makedirs('./results', exist_ok=True)
    with open(f'./results/rel_extend_map_{mode}.json', 'w', encoding='utf-8') as f:
        json.dump(rel_extend_map, f, ensure_ascii=False, indent=4)

    return rel_extend_map, lengths


def get_filtered_extend_map(rel_extend_map, mode):
    """Drop extension terms that are themselves gold relation labels.

    The gold label set depends on the language of ``mode``:
      * "zh" or any mode ending in "_zh": one label per line read from
        ../CMeIE/processed_data/labels.txt;
      * "en" or any mode ending in "_en": the seven relation labels listed
        in the body.

    Duplicate terms are removed while keeping first-seen order, so the JSON
    written to ./results/filtered_rel_extend_map_{mode}.json is identical
    from run to run.

    Args:
        rel_extend_map: dict mapping gold relation -> list of terms.
        mode: Run identifier; used for language detection and the file name.

    Returns:
        (filtered_rel_extend_map, filtered_lengths): the filtered dict and
        the list sizes in dict order.

    Raises:
        ValueError: If ``mode`` is neither a Chinese nor an English mode.
    """
    # Accept the bare "zh" / "en" modes as well as model-prefixed ones;
    # a plain substring test for "_zh" / "_en" misses the bare forms.
    if mode == 'zh' or mode.endswith('_zh'):
        with open('../CMeIE/processed_data/labels.txt', 'r', encoding='utf-8') as f:
            gold_type = [item.strip('\n') for item in f.readlines()]
    elif mode == 'en' or mode.endswith('_en'):
        gold_type = ['HYPONYM-OF',
                     'FEATURE-OF',
                     'USED-FOR',
                     'CONJUNCTION',
                     'EVALUATE-FOR',
                     'PART-OF',
                     'COMPARE']
    else:
        raise ValueError(
            f"cannot infer language from mode {mode!r}; expected 'zh', 'en', "
            "or a name ending in '_zh' / '_en'"
        )

    gold_labels = set(gold_type)

    filtered_rel_extend_map = {}
    for key, terms in rel_extend_map.items():
        # dict.fromkeys dedupes while preserving order (set() would not).
        unique_terms = dict.fromkeys(terms)
        filtered_rel_extend_map[key] = [
            term for term in unique_terms if term not in gold_labels
        ]

    filtered_lengths = [len(terms) for terms in filtered_rel_extend_map.values()]

    os.makedirs('./results', exist_ok=True)
    with open(f'./results/filtered_rel_extend_map_{mode}.json', 'w', encoding='utf-8') as f:
        json.dump(filtered_rel_extend_map, f, ensure_ascii=False, indent=4)

    return filtered_rel_extend_map, filtered_lengths


# Handle the overlap between the extension terms of different relation types.
def human_process(filtered_event_extend_map, mode):
    """Write every term that appears under more than one key to a text file.

    All ordered key pairs (left, right) with ``left < right`` are visited in
    dict order; each term of ``left`` that also occurs under ``right`` is
    recorded together with the keys that share it. The result is written to
    ./results/交集_{mode}_raw.txt, one line per term, formatted as
    "<term> ：<key1>、<key2>…".

    Args:
        filtered_event_extend_map: dict mapping key -> list of terms.
        mode: Run identifier used in the output file name.

    Returns:
        None.
    """
    entries = list(filtered_event_extend_map.items())
    owners_by_term = {}

    for left_key, left_terms in entries:
        for right_key, right_terms in entries:
            # Visit each unordered pair once; this also skips left == right.
            if not left_key < right_key:
                continue
            for term in left_terms:
                if term not in right_terms:
                    continue
                owners = owners_by_term.setdefault(term, [])
                for owner in (left_key, right_key):
                    if owner not in owners:
                        owners.append(owner)

    out_lines = [
        '{} ：{}\n'.format(term, '、'.join(owners))
        for term, owners in owners_by_term.items()
    ]
    with open(f'./results/交集_{mode}_raw.txt', 'w', encoding='utf-8') as f:
        for line in out_lines:
            f.write(line)


CMeIE

In [None]:
# CMeIE, mode "zh": merge ./results/extend_forward_zh.txt into entries, build
# the relation-extension map, filter it, then write the terms shared by
# several relations. The notebook-level names below are reused by later cells.
merged_list = merge_details('./results/extend_forward_zh.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="zh")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="zh")
human_process(filtered_rel_extend_map, mode="zh")

SCIERC

In [None]:
# SCIERC, mode "en": merge ./results/extend_forward_en.txt into entries, build
# the relation-extension map, filter it, then write the terms shared by
# several relations. Overwrites the notebook-level names set by earlier cells.
merged_list = merge_details('./results/extend_forward_en.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="en")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="en")
human_process(filtered_rel_extend_map, mode="en")

Alpaca-CMeIE

In [None]:
# Alpaca on CMeIE, mode "alpaca_33B_zh": same four-step pipeline (merge,
# build map, filter, write shared terms) on extend_forward_alpaca_33B_zh.txt.
merged_list = merge_details('./results/extend_forward_alpaca_33B_zh.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="alpaca_33B_zh")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="alpaca_33B_zh")
human_process(filtered_rel_extend_map, mode="alpaca_33B_zh")

Alpaca-SCIERC

In [None]:
# Alpaca on SCIERC, mode "alpaca_33B_en": same four-step pipeline (merge,
# build map, filter, write shared terms) on extend_forward_alpaca_33B_en.txt.
merged_list = merge_details('./results/extend_forward_alpaca_33B_en.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="alpaca_33B_en")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="alpaca_33B_en")
human_process(filtered_rel_extend_map, mode="alpaca_33B_en")

Llama2-SCIERC

In [None]:
# Llama2 on SCIERC, mode "llama2_70B_en": same four-step pipeline (merge,
# build map, filter, write shared terms) on extend_forward_llama2_70B_en.txt.
merged_list = merge_details('./results/extend_forward_llama2_70B_en.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="llama2_70B_en")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="llama2_70B_en")
human_process(filtered_rel_extend_map, mode="llama2_70B_en")

ChatGLM-CMeIE

In [None]:
# ChatGLM on CMeIE, mode "chatglm_6B_zh": same four-step pipeline (merge,
# build map, filter, write shared terms) on extend_forward_chatglm_6B_zh.txt.
merged_list = merge_details('./results/extend_forward_chatglm_6B_zh.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="chatglm_6B_zh")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="chatglm_6B_zh")
human_process(filtered_rel_extend_map, mode="chatglm_6B_zh")

ChatGLM-SCIERC

In [None]:
# ChatGLM on SCIERC, mode "chatglm_6B_en": same four-step pipeline (merge,
# build map, filter, write shared terms) on extend_forward_chatglm_6B_en.txt.
merged_list = merge_details('./results/extend_forward_chatglm_6B_en.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="chatglm_6B_en")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="chatglm_6B_en")
human_process(filtered_rel_extend_map, mode="chatglm_6B_en")

GPT4-CMeIE

In [None]:
# GPT4 on CMeIE, mode "gpt4_zh": same four-step pipeline (merge, build map,
# filter, write shared terms) on extend_forward_gpt4_zh.txt.
merged_list = merge_details('./results/extend_forward_gpt4_zh.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="gpt4_zh")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="gpt4_zh")
human_process(filtered_rel_extend_map, mode="gpt4_zh")

GPT4-SCIERC

In [None]:
# GPT4 on SCIERC, mode "gpt4_en": same four-step pipeline (merge, build map,
# filter, write shared terms) on extend_forward_gpt4_en.txt.
merged_list = merge_details('./results/extend_forward_gpt4_en.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="gpt4_en")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="gpt4_en")
human_process(filtered_rel_extend_map, mode="gpt4_en")

Baichuan2-CMeIE

In [None]:
# Baichuan2 on CMeIE, mode "baichuan2_13B_zh": same four-step pipeline (merge,
# build map, filter, write shared terms) on extend_forward_baichuan2_13B_zh.txt.
merged_list = merge_details('./results/extend_forward_baichuan2_13B_zh.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="baichuan2_13B_zh")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="baichuan2_13B_zh")
human_process(filtered_rel_extend_map, mode="baichuan2_13B_zh")

Baichuan2-SCIERC

In [None]:
# Baichuan2 on SCIERC, mode "baichuan2_13B_en": same four-step pipeline (merge,
# build map, filter, write shared terms) on extend_forward_baichuan2_13B_en.txt.
merged_list = merge_details('./results/extend_forward_baichuan2_13B_en.txt')
rel_extend_map, lengths = get_extend_map(merged_list, mode="baichuan2_13B_en")
filtered_rel_extend_map, filtered_lengths = get_filtered_extend_map(rel_extend_map, mode="baichuan2_13B_en")
human_process(filtered_rel_extend_map, mode="baichuan2_13B_en")

In [3]:
def get_final_map(mode):
    """Merge the resolved overlaps back into the filtered map and save the result.

    Reads three files under ./results:
      * filtered_rel_extend_map_{mode}.json: relation -> list of terms;
      * 交集_{mode}_raw.txt: lines "term ：rel1、rel2…" naming every relation
        that shares the term;
      * 交集_{mode}.txt: lines "term ：relation" naming the relation the term
        is kept for. This file is not written by any code in this notebook;
        presumably it is a hand-edited copy of the raw file (TODO confirm).

    For every relation that occurs in the raw overlap file, the shared terms
    are removed from its list and the terms assigned to it in the resolved
    file are appended. A term assigned to a relation that does not occur in
    the raw file is dropped. Relations without overlaps are left untouched.
    The result is written to ./results/final_rel_extend_map_{mode}.json.

    Blank lines in either overlap file are ignored.

    Args:
        mode: Run identifier used in all four file names.

    Returns:
        None.

    Raises:
        ValueError: If a non-blank overlap line lacks the " ：" separator.
        KeyError: If the raw overlap file names a relation that is missing
            from the filtered map.
    """
    def _read_pairs(path):
        # Parse "left ：right" lines into (left, right) tuples.
        pairs = []
        with open(path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    # A stray or trailing blank line must not abort the run.
                    continue
                fields = line.split(' ：')
                if len(fields) < 2:
                    raise ValueError(
                        f"{path}:{line_no}: expected 'term ：relation', got {line!r}"
                    )
                pairs.append((fields[0].strip(), fields[1].strip()))
        return pairs

    with open(f'./results/filtered_rel_extend_map_{mode}.json', 'r', encoding='utf-8') as f:
        final_map = json.load(f)

    # term -> relations sharing it (a later duplicate line overrides an earlier one).
    term_to_owners = {
        term: owners.split('、')
        for term, owners in _read_pairs(f'./results/交集_{mode}_raw.txt')
    }

    # relation -> set of terms it shares with at least one other relation.
    shared_terms = {}
    for term, owners in term_to_owners.items():
        for relation in owners:
            shared_terms.setdefault(relation, set()).add(term)

    # relation -> terms assigned to it in the resolved file.
    term_to_choice = dict(_read_pairs(f'./results/交集_{mode}.txt'))
    kept_terms = {relation: [] for relation in shared_terms}
    for term, relation in term_to_choice.items():
        kept_terms.setdefault(relation, []).append(term)

    for relation, overlapping in shared_terms.items():
        unaffected = [term for term in final_map[relation] if term not in overlapping]
        final_map[relation] = unaffected + kept_terms[relation]

    with open(f'./results/final_rel_extend_map_{mode}.json', 'w', encoding='utf-8') as f:
        json.dump(final_map, f, ensure_ascii=False, indent=4)



In [4]:
# Build the final maps. Only the two Baichuan2 modes are active; the calls
# for the other modes are kept commented out below.
# get_final_map('zh')
# get_final_map('en')
# get_final_map('alpaca_33B_zh')
# get_final_map('alpaca_33B_en')
# get_final_map('llama2_70B_en')
# get_final_map('chatglm_6B_zh')
# get_final_map('chatglm_6B_en')
get_final_map('baichuan2_13B_zh')
get_final_map('baichuan2_13B_en')