In [1]:
import json
import re

import stanza

# Short passage used as the demo input for the pipeline.
text = "Alice went to the store. She bought a cake. John was there too. He bought a pie."

# English pipeline restricted to the tokenize and coref processors,
# applied to the passage in one call.
pipe = stanza.Pipeline("en", processors="tokenize,coref")
output = pipe(text)

# The string form of the pipeline result is parsed as JSON; the code below
# iterates it as a list of sentences, each a list of token dicts.
output_json = json.loads(str(output))
def tokens_to_sentences(tokens_list):
    """
    Rebuild readable text from a list of token dicts.

    All tokens in ``tokens_list`` are joined into one string, so the caller
    is expected to pass the tokens of a single sentence.

    Spacing rules:
      * tokens whose text is empty or whitespace-only are skipped;
      * no space is inserted before a token made up solely of ``. , ; : ? !``;
      * no space is inserted before a token that starts with a closing
        bracket (``] ) } >``);
      * no space is inserted after a token that is exactly an opening
        bracket (``[ ( { <``);
      * every other token is separated from the previous one by one space.

    :param tokens_list: iterable of dicts, each with a ``"text"`` key
    :return: a list holding the rebuilt string, or an empty list when no
             token contributed any text
    """
    closing_brackets = ("]", ")", "}", ">")
    opening_brackets = ("[", "(", "{", "<")

    pieces = []
    after_opening_bracket = False  # True when the previous token was "(" etc.

    for token in tokens_list:
        token_text = token["text"]
        if not token_text.strip():
            # Nothing printable in this token.
            continue

        attaches_to_previous = (
            re.fullmatch(r"[.,;:?!]+", token_text) is not None
            or token_text.startswith(closing_brackets)
        )
        if pieces and not after_opening_bracket and not attaches_to_previous:
            pieces.append(" ")

        pieces.append(token_text)
        after_opening_bracket = token_text in opening_brackets

    rebuilt = "".join(pieces)
    return [rebuilt] if rebuilt else []
# Keep only the sentences that never refer to John, either literally or
# through a token attached to a coreference chain represented by "John".
filtered_sentences = []

for sentence in output_json:
    mentions_john = any(
        token["text"] == "John"
        or any(
            chain["representative_text"] == "John"
            # .get(...) or (): tolerate token dicts that have no
            # "coref_chains" key or an empty list, and look at every chain
            # the token belongs to rather than only the first one.
            for chain in token.get("coref_chains") or ()
        )
        for token in sentence
    )

    # Sentences without any reference to John are kept.
    if not mentions_john:
        filtered_sentences.append(sentence)

# Turn each kept sentence's tokens back into text and join them with spaces.
beautiful_sentences = []
for sentence in filtered_sentences:
    beautiful_sentences.extend(tokens_to_sentences(sentence))
fragmented_story = " ".join(beautiful_sentences)
fragmented_story

  from .autonotebook import tqdm as notebook_tqdm
2024-06-13 16:18:26 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 11.4MB/s]                    
2024-06-13 16:18:27 INFO: Downloaded file to /home/hansirui/stanza_resources/resources.json
2024-06-13 16:18:27 INFO: Loading these models for language: en (English):
| Processor | Package                 |
---------------------------------------
| tokenize  | combined                |
| mwt       | combined                |
| coref     | ontonotes_electra-large |

2024-06-13 16:18:27 INFO: Using device: cuda
2024-06-13 16:18:27 INFO: Loading: tokenize
2024-06-13 16:18:41 INFO: Loading: mwt
2024-06-13 16:18:41 INFO: Loading: coref
2024-06-13 16:19:01 INFO: Done loading processors!


'Alice went to the store. She bought a cake.'