In [1]:
import json
import re

with open('character_table.json', 'r', encoding='utf-8') as file:
    character_table = json.load(file)


import stanza
pipe = stanza.Pipeline("en", processors="tokenize,coref,ner")

  from .autonotebook import tqdm as notebook_tqdm
2024-06-16 06:25:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 16.4MB/s]                    
2024-06-16 06:25:37 INFO: Downloaded file to /home/hansirui/stanza_resources/resources.json
2024-06-16 06:25:38 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| coref     | ontonotes_electra-large   |
| ner       | ontonotes-ww-multi_charlm |

2024-06-16 06:25:42 INFO: Using device: cuda
2024-06-16 06:25:42 INFO: Loading: tokenize
2024-06-16 06:25:51 INFO: Loading: mwt
2024-06-16 06:25:51 INFO: Loading: coref
2

In [2]:

def get_Cooccurrence_count(text,Amcs,Bmcs,Cmcs):
    output = pipe(text)
    output_json = json.loads(str(output))
    count = 0
    for sentence in output_json:
        for mc in Amcs:
            for token in sentence:
                try:
                    mention_inA = mc in token["text"] or (token["coref_chains"] and mc in token["coref_chains"][0]["representative_text"])
                except:
                    continue
                if mention_inA:
                    break

        for mc in Bmcs:
            for token in sentence:
                try:
                    mention_inB = mc in token["text"] or (token["coref_chains"] and mc in token["coref_chains"][0]["representative_text"])
                except:
                    continue
                if mention_inB:
                    break
        if Cmcs:
            for mc in Cmcs:
                for token in sentence:
                    try:
                        mention_inC = mc in token["text"] or (token["coref_chains"] and mc in token["coref_chains"][0]["representative_text"])
                    except:
                        continue
                    if mention_inC:
                        break
        else:
            mention_inC = False
                  
        if (mention_inA and mention_inB) or (mention_inA and mention_inC) or (mention_inB and mention_inC):
            count += 1
    return count

In [3]:
from tqdm import trange
def get_Sec_metric(results):
    count_list = []
    for i in trange(len(results)):
        aid = results[i]['aid']
        bid = results[i]['bid']
        cid = results[i]['cid']
        outline = results[i]['outline']
        response = results[i]['response']

        Amcs = character_table[aid]
        Bmcs = character_table[bid]

        if cid:
            Cmcs = character_table[cid]
        else:
            Cmcs = []
                    # if aid == "9b4b890c-2b0c-42e2-a291-3a6d36eacae6" and  bid == "70cb1895-c5d3-4c6a-902a-c37493f83cff":
        #     count = 0
        # else:
        try:
            count = get_Cooccurrence_count(response,Amcs,Bmcs,Cmcs)
        except:
            print(outline)
            print(response)
            raise ValueError
        count_list.append(count)
    return count_list

In [4]:
def tokens_to_sentences(tokens_list):
    """
    将Stanza tokenize后的tokens列表转换回句子列表。
    :param tokens_list: Stanza tokenize处理后的输出列表
    :return: 由句子组成的列表，每个句子为一个字符串
    """
    sentences = []  # 存储还原的句子
    current_sentence = ""  # 当前正在构建的句子

    for token in tokens_list:
        # 添加当前token的文本到当前句子，但先不加空格
        if token["text"].strip():  # 避免添加空字符串
            if current_sentence:  # 如果当前句子非空
                # 检查上一个token是否以标点结束，如果是，则不添加额外空格
                if not re.match(r'[\.\?!]+$', token["text"]) and \
                   not re.match(r'^[\[\(\{\<\]\)\}\>]', token["text"]):
                    current_sentence += " "  # 否则，在新token前添加空格
            current_sentence += token["text"]

    # 如果current_sentence非空，说明最后一个句子未被加入到sentences中，应将其添加
    if current_sentence:
        sentences.append(current_sentence)

    return sentences

In [5]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_mentioned_flag(targeted_sentences,reference_sentence):
    targeted_sentences.append(reference_sentence)
    embeddings = model.encode(targeted_sentences)
    # print(targeted_sentences[-1])
    for i in range(len(targeted_sentences)):
        similarity_score = 1 - cosine(embeddings[i], embeddings[-1])
        # print(targeted_sentences[i])
        if similarity_score > 0.5:
            return True
    return False

# types =["a0 a1 B3 a3 a4", "a0 B0 a2 B4 a4","a0 B0 C0 a2 B4 C4 a4","a0 B0 a1 B1 a2 B2 a3 B3 a4 B4"]
def get_plot_mentioned(outline,response,type):
    output = pipe(response)
    output_json = json.loads(str(output))
    targeted_sentences =[]
    for sentence in output_json:
        bfsent = tokens_to_sentences(sentence)
        targeted_sentences.extend(bfsent)

    flag = True
    for outli in outline:
        flag = get_mentioned_flag(targeted_sentences,outli) and flag
        if not flag:
            break
    # if type == "a0 a1 B3 a3 a4":
    #     flag = get_mentioned_flag(targeted_sentences,outline[2])
    # elif type == "a0 B0 a2 B4 a4":
    #     flag1 = get_mentioned_flag(targeted_sentences,outline[1])
    #     flag2 = get_mentioned_flag(targeted_sentences,outline[3])
    #     flag = flag1 and flag2
    # elif type == "a0 B0 C0 a2 B4 C4 a4":
    #     flag1 = get_mentioned_flag(targeted_sentences,outline[1])
    #     flag2 = get_mentioned_flag(targeted_sentences,outline[2])
    #     flag3 = get_mentioned_flag(targeted_sentences,outline[4])
    #     flag4 = get_mentioned_flag(targeted_sentences,outline[5])
    #     flag = flag1 and flag2 and flag3 and flag4
    # elif type == "a0 B0 a1 B1 a2 B2 a3 B3 a4 B4":
    #     flag1 = get_mentioned_flag(targeted_sentences,outline[1])
    #     flag2 = get_mentioned_flag(targeted_sentences,outline[3])
    #     flag3 = get_mentioned_flag(targeted_sentences,outline[5])
    #     flag4 = get_mentioned_flag(targeted_sentences,outline[7])
    #     flag5 = get_mentioned_flag(targeted_sentences,outline[9])

    #     flag = flag1 and flag2 and flag3 and flag4 and flag5
    return flag


def get_first_metric(results,type):
    mentioned_flag_list = []
    for i in trange(len(results)):
        outline = results[i]['outline']
        response = results[i]['response']
        flag = get_plot_mentioned(outline,response,type)
        mentioned_flag_list.append(flag)
    return mentioned_flag_list



In [7]:
import os


type_match = {
    "主线情节点": "a0 a1 B3 a3 a4",
    "主线支线": "a0 B0 a2 B4 a4",
    "主线双支线": "a0 B0 C0 a2 B4 C4 a4",
    "双主线": "a0 B0 a1 B1 a2 B2 a3 B3 a4 B4"
}
task_result = "GPT4/IO/temp_0_7"
for filename in os.listdir(f"{task_result}_results"):
    file_path = os.path.join(f"{task_result}_results", filename)
    type_file = file_path.split('/')[-1].split('.')[0]
    type = type_match[type_file]
    print(type_file)
    if type_file == "主线双支线":
        continue
    elif type_file == "主线支线":
        continue
    
    with open(file_path, 'r', encoding='utf-8') as file:
        results = []
        for line in file:
            res = {}
            data = json.loads(line.strip())
            
            aid = data.get('Aid_list')
            bid = data.get('Bid_list')
            cid = data.get('Cid_list')
            outline = data.get('outline')
            response = data.get('responses')

            res['aid'] = aid
            res['bid'] = bid
            res['cid'] = cid
            res['outline'] = outline
            res['response'] = response
            results.append(res)

        # mentioned_flag_list = get_first_metric(results,type)
        count_list = get_Sec_metric(results)
        
        # output the true ratio in first metric
        # print(mentioned_flag_list.count(True)/len(mentioned_flag_list))
        # output the average count in second metric
        print(sum(count_list)/len(count_list))

主线双支线
主线支线
双主线


100%|██████████| 130/130 [00:12<00:00, 10.58it/s]


2.9846153846153847
主线情节点


100%|██████████| 130/130 [00:13<00:00,  9.88it/s]

3.3615384615384616





In [None]:
for i in range(len(count_list)):
    if count_list[i]==0:
        print(results[i]['outline'])
        print(results[i]['response'])
        break


['Emma fell in love with Paul.', "Paul didn't feel the same for Emma and rejected Emma.", "Colleen is helping calm Dina's fears by being supportive.", "Emma felt even worse when the authorities wouldn't listen.", "Emma walked under a mattress for a year to show everyone Emma's feelings."]
Emma had always been smitten with Paul, and she finally mustered the courage to confess her feelings to him. But to her dismay, Paul didn't feel the same way and rejected her, leaving Emma heartbroken.

As she struggled to cope with the rejection, Emma turned to her friends for support. Dina, who was already anxious about her own problems, began to fear that she too would experience a similar heartbreak. Colleen, sensing Dina's distress, took it upon herself to calm her friend's fears, offering words of encouragement and reassurance.

Meanwhile, Emma's pain and frustration only intensified when she tried to report Paul's rejection to the authorities, only to be met with indifference. They didn't take 