In [8]:
import json
import re

with open('character_table.json', 'r', encoding='utf-8') as file:
    character_table = json.load(file)


import stanza
pipe = stanza.Pipeline("en", processors="tokenize,coref,ner")

2024-06-16 13:37:55 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 11.9MB/s]                    
2024-06-16 13:37:55 INFO: Downloaded file to /home/hansirui/stanza_resources/resources.json
2024-06-16 13:37:56 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| coref     | ontonotes_electra-large   |
| ner       | ontonotes-ww-multi_charlm |

2024-06-16 13:37:56 INFO: Using device: cuda
2024-06-16 13:37:56 INFO: Loading: tokenize
2024-06-16 13:37:56 INFO: Loading: mwt
2024-06-16 13:37:56 INFO: Loading: coref
2024-06-16 13:37:59 INFO: Loading: ner
2024-06-16 13:37:59 INFO: Done loading processors!


In [9]:

def get_Cooccurrence_count(text,Amcs,Bmcs,Cmcs):
    output = pipe(text)
    output_json = json.loads(str(output))
    count = 0
    for sentence in output_json:
        for mc in Amcs:
            for token in sentence:
                try:
                    mention_inA = mc in token["text"] or (token["coref_chains"] and mc in token["coref_chains"][0]["representative_text"])
                except:
                    continue
                if mention_inA:
                    break

        for mc in Bmcs:
            for token in sentence:
                try:
                    mention_inB = mc in token["text"] or (token["coref_chains"] and mc in token["coref_chains"][0]["representative_text"])
                except:
                    continue
                if mention_inB:
                    break
        if Cmcs:
            for mc in Cmcs:
                for token in sentence:
                    try:
                        mention_inC = mc in token["text"] or (token["coref_chains"] and mc in token["coref_chains"][0]["representative_text"])
                    except:
                        continue
                    if mention_inC:
                        break
        else:
            mention_inC = False
                  
        if (mention_inA and mention_inB) or (mention_inA and mention_inC) or (mention_inB and mention_inC):
            count += 1
    return count

In [10]:
from tqdm import trange
def get_Sec_metric(results):
    count_list = []
    for i in trange(len(results)):
        aid = results[i]['aid']
        bid = results[i]['bid']
        cid = results[i]['cid']
        outline = results[i]['outline']
        response = results[i]['response']

        Amcs = character_table[aid]
        Bmcs = character_table[bid]

        if cid:
            Cmcs = character_table[cid]
        else:
            Cmcs = []
        if aid == "018cc08a-3902-4ebe-9da9-fac862431f96" and  bid == "bb669de4-490b-48d8-9811-f81ce446f67a":
            count = 0
        else:
            try:
                count = get_Cooccurrence_count(response,Amcs,Bmcs,Cmcs)
            except:
                print(outline)
                print(response)
                raise ValueError
            count_list.append(count)
    return count_list

In [11]:
def tokens_to_sentences(tokens_list):
    """
    将Stanza tokenize后的tokens列表转换回句子列表。
    :param tokens_list: Stanza tokenize处理后的输出列表
    :return: 由句子组成的列表，每个句子为一个字符串
    """
    sentences = []  # 存储还原的句子
    current_sentence = ""  # 当前正在构建的句子

    for token in tokens_list:
        # 添加当前token的文本到当前句子，但先不加空格
        if token["text"].strip():  # 避免添加空字符串
            if current_sentence:  # 如果当前句子非空
                # 检查上一个token是否以标点结束，如果是，则不添加额外空格
                if not re.match(r'[\.\?!]+$', token["text"]) and \
                   not re.match(r'^[\[\(\{\<\]\)\}\>]', token["text"]):
                    current_sentence += " "  # 否则，在新token前添加空格
            current_sentence += token["text"]

    # 如果current_sentence非空，说明最后一个句子未被加入到sentences中，应将其添加
    if current_sentence:
        sentences.append(current_sentence)

    return sentences

In [12]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_mentioned_flag(targeted_sentences,reference_sentence):
    targeted_sentences.append(reference_sentence)
    embeddings = model.encode(targeted_sentences)
    # print(targeted_sentences[-1])
    for i in range(len(targeted_sentences)):
        similarity_score = 1 - cosine(embeddings[i], embeddings[-1])
        # print(targeted_sentences[i])
        if similarity_score > 0.5:
            return True
    return False

# types =["a0 a1 B3 a3 a4", "a0 B0 a2 B4 a4","a0 B0 C0 a2 B4 C4 a4","a0 B0 a1 B1 a2 B2 a3 B3 a4 B4"]
def get_plot_mentioned(outline,response,type):
    try:
        output = pipe(response)
    except:
        print(outline)
        print(response)
        return False
        # raise ValueError("error")
    output_json = json.loads(str(output))
    targeted_sentences =[]
    for sentence in output_json:
        bfsent = tokens_to_sentences(sentence)
        targeted_sentences.extend(bfsent)

    flag = True
    for outli in outline:
        flag = get_mentioned_flag(targeted_sentences,outli) and flag
        if not flag:
            break
    # if type == "a0 a1 B3 a3 a4":
    #     flag = get_mentioned_flag(targeted_sentences,outline[2])
    # elif type == "a0 B0 a2 B4 a4":
    #     flag1 = get_mentioned_flag(targeted_sentences,outline[1])
    #     flag2 = get_mentioned_flag(targeted_sentences,outline[3])
    #     flag = flag1 and flag2
    # elif type == "a0 B0 C0 a2 B4 C4 a4":
    #     flag1 = get_mentioned_flag(targeted_sentences,outline[1])
    #     flag2 = get_mentioned_flag(targeted_sentences,outline[2])
    #     flag3 = get_mentioned_flag(targeted_sentences,outline[4])
    #     flag4 = get_mentioned_flag(targeted_sentences,outline[5])
    #     flag = flag1 and flag2 and flag3 and flag4
    # elif type == "a0 B0 a1 B1 a2 B2 a3 B3 a4 B4":
    #     flag1 = get_mentioned_flag(targeted_sentences,outline[1])
    #     flag2 = get_mentioned_flag(targeted_sentences,outline[3])
    #     flag3 = get_mentioned_flag(targeted_sentences,outline[5])
    #     flag4 = get_mentioned_flag(targeted_sentences,outline[7])
    #     flag5 = get_mentioned_flag(targeted_sentences,outline[9])

    #     flag = flag1 and flag2 and flag3 and flag4 and flag5
    return flag


def get_first_metric(results,type):
    mentioned_flag_list = []
    for i in trange(len(results)):
        outline = results[i]['outline']
        response = results[i]['response']
        flag = get_plot_mentioned(outline,response,type)
        mentioned_flag_list.append(flag)
    return mentioned_flag_list

In [13]:
# # last_story_part = text.rsplit("\n\nStory:", 1)[-1]
# def unwarp_response(response):
#     if "\n\nStory:" in response:
#         return response.rsplit("\n\nStory:",1)[1].strip(), True
#     elif "\n\nstory:" in response:
#         return response.split("\n\nstory:")[1].strip(), True
#     elif "\n\n**Story:" in response:
#         return response.split("\n\n**Story:")[1].strip(), True
#     elif "\n\n**story:" in response:
#         return response.split("\n\n**story:")[1].strip(), True
#     else:
#         return response, False

# last_story_part = text.rsplit("\n\nStory:", 1)[-1]
def unwarp_response(response):
    if "\n\nStory:" in response:
        return response.rsplit("\n\nStory:",1)[-1].strip(), True
    elif "\n\nstory:" in response:
        return response.rsplit("\n\nstory:",1)[-1].strip(), True
    elif "\n\n**Story:" in response:
        return response.rsplit("\n\n**Story:",1)[-1].strip(), True
    elif "\n\n**story:" in response:
        return response.rsplit("\n\n**story:",1)[-1].strip(), True
    else:
        return response, False

In [14]:
import os


type_match = {
    "主线情节点": "a0 a1 B3 a3 a4",
    "主线支线": "a0 B0 a2 B4 a4",
    "主线双支线": "a0 B0 C0 a2 B4 C4 a4",
    "双主线": "a0 B0 a1 B1 a2 B2 a3 B3 a4 B4"
}
task_result = "GPT4/COT/temp_0_7"
for filename in os.listdir(f"{task_result}_results"):
    file_path = os.path.join(f"{task_result}_results", filename)
    print(filename)
    # if filename != "主线情节点.jsonl":
    #     continue
    with open(file_path, 'r', encoding='utf-8') as file:
        results = []
        for line in file:
            res = {}
            data = json.loads(line.strip())
            
            aid = data.get('Aid_list')
            bid = data.get('Bid_list')
            cid = data.get('Cid_list')
            outline = data.get('outline')
            raw_response = data.get('responses')
            if raw_response == "Error":
                continue
            try:
                response,flag = unwarp_response(raw_response)
            except:
                print(outline)
                print(raw_response)

            if not flag:
                print(raw_response)
                print("response not found")
                # raise ValueError("response not found")
                response = ""
                # break
            # print(response)
            # print("---------------------")

            res['aid'] = aid
            res['bid'] = bid
            res['cid'] = cid
            res['outline'] = outline
            res['response'] = response
            results.append(res)
        type_file = file_path.split('/')[-1].split('.')[0]
        type = type_match[type_file]
        # mentioned_flag_list = get_first_metric(results,type)
        count_list = get_Sec_metric(results)
        print(type_file)
        # output the true ratio in first metric
        # print(mentioned_flag_list.count(True)/len(mentioned_flag_list))
        # output the average count in second metric
        print(sum(count_list)/len(count_list))

主线双支线.jsonl


100%|██████████| 127/127 [00:12<00:00, 10.30it/s]


主线双支线
4.420634920634921
主线支线.jsonl


100%|██████████| 128/128 [00:12<00:00, 10.27it/s]


主线支线
3.7559055118110236
双主线.jsonl


100%|██████████| 129/129 [00:12<00:00, 10.52it/s]


双主线
2.3203125
主线情节点.jsonl


100%|██████████| 129/129 [00:12<00:00, 10.31it/s]

主线情节点
2.7734375





In [15]:
for i in range(len(count_list)):
    if count_list[i]==0:
        print(results[i]['outline'])
        print(results[i]['response'])
        break


['Sam saw his brother Barry eating a peanut butter and banana sandwich.', 'Sam thought it was the grossest thing Sam ever saw.', "During Holly's senior year Holly was able to apply to graduate schools.", 'After many weeks, Sam reluctantly agreed to take a bite.', 'Sam now eats peanut butter and banana sandwiches every week.']
Sam had always considered himself a man of simple tastes, with food preferences that leaned towards the traditional. But one afternoon, as he walked into the kitchen, he saw his brother Barry munching on something that threatened to topple his world of culinary certainty. Barry was eating a peanut butter and banana sandwich.

The sight of the gooey peanut butter mixing with the squishy banana, the way Barry savored each bite, sent a shudder down Sam's spine. He stared, wide-eyed and aghast, as Barry finished the sandwich with a sat


In [16]:
text = """ Plan:\n\nI. Introduction\n- Introduce Joe and his coworkers going to a new restaurant for lunch\n- Establish Joe's hopes for a new lunch spot\n\nII. Interlude - Frank's Story\n- Introduce Frank, a pitcher for a MLB team, in a separate storyline\n- Show Frank's career progression, highlighting his return to the MLB team\n\nIII. Interlude - Ruth's Story\n- Introduce Ruth and her religious neighbor\n- Depict Ruth and her neighbor's interaction, ending with Ruth going home\n\nIV. Back to Joe's Story\n- Joe's coworkers suggest cheap lunch options, contrasting with the new restaurant\n- Joe's desire for a new lunch spot is reaffirmed\n\nV. Conclusion\n- Joe and his coworkers arrive at the new restaurant, and Joe looks forward to trying it out\n\nStory:\n\nStory:\n\nJoe walked out of the office with his coworkers, excited to try the new restaurant that had just opened up down the street. He hoped it would be his new go-to lunch spot, a place to escape the monotony of the usual sandwich shop.\n\nMeanwhile, in a different part of town, Frank was preparing for another game as a pitcher for a MLB team. It was a familiar routine for him, one he had perfected over the years. In fact, this was his second stint with the team, having made a successful comeback after a brief hiatus.\n\nAcross town, Ruth was chatting with her neighbor, a devoutly religious woman who always had a kind word and a warm smile. They talked about their weekends, sharing stories and laughter, before eventually parting ways. When they finished, Ruth headed back home, feeling uplifted by their conversation.\n\nBack at the office, Joe's coworkers were discussing lunch options. "Hey, we could just grab some burgers at the food truck," one of them suggested. "Or hit up the deli for some quick sandwiches," another chimed in. Joe politely declined, his eyes fixed on the new restaurant. He was determined to try something new, something that would impress.\n\nFinally, the group arrived at the restaurant, and Joe's anticipation was palpable. As they were led to their table, Joe couldn't help but feel a sense of excitement. This could be it, his new lunch spot, his haven from the ordinary. He took a deep breath, ready to indulge in a culinary adventure. """
last_story_part = text.rsplit("\n\nStory:", 1)[-1]
last_story_part
text.rsplit("\n\nStory:",1)[-1].strip()


'Joe walked out of the office with his coworkers, excited to try the new restaurant that had just opened up down the street. He hoped it would be his new go-to lunch spot, a place to escape the monotony of the usual sandwich shop.\n\nMeanwhile, in a different part of town, Frank was preparing for another game as a pitcher for a MLB team. It was a familiar routine for him, one he had perfected over the years. In fact, this was his second stint with the team, having made a successful comeback after a brief hiatus.\n\nAcross town, Ruth was chatting with her neighbor, a devoutly religious woman who always had a kind word and a warm smile. They talked about their weekends, sharing stories and laughter, before eventually parting ways. When they finished, Ruth headed back home, feeling uplifted by their conversation.\n\nBack at the office, Joe\'s coworkers were discussing lunch options. "Hey, we could just grab some burgers at the food truck," one of them suggested. "Or hit up the deli for so