In [1]:
# ## coref-env
from spacy.tokens import Doc
import spacy
import json
import copy

###add to utilities
def read_json_file(file_path):
    """
    Load a JSON document from disk, reporting failures instead of raising.

    Errors are never propagated to the caller: a short description of the
    problem is printed and ``None`` is returned.

    :param file_path: Location of the JSON file to load.
    :type file_path: str
    :return: The decoded JSON content, or ``None`` when the file is missing,
        does not contain valid JSON, or cannot be read for any other reason.
    :rtype: dict | None
    """
    # Holds the message for whichever failure occurred; printed once below.
    problem = None
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            # Successful path: hand the parsed content straight back.
            return json.load(handle)
    except FileNotFoundError:
        problem = f"File not found: {file_path}"
    except json.JSONDecodeError as e:
        problem = f"Error decoding JSON in file {file_path}: {e}"
    except Exception as e:
        # Catch-all (permissions, directories, encoding errors, ...).
        problem = f"An error occurred while reading the file {file_path}: {e}"

    # Only reached after a failure.
    print(problem)
    return None

#export a json file
def save_json_file(output_path,json_data):
    """
    Write ``json_data`` to ``output_path`` as JSON indented by two spaces.

    An existing file at ``output_path`` is overwritten. Errors from opening
    the file or from serialisation are not caught and propagate to the caller.

    :param output_path: Path of the file to create or overwrite.
    :type output_path: str
    :param json_data: Any object that ``json.dump`` can serialise.
    :return: None
    """
    # Explicit UTF-8 keeps the writer consistent with read_json_file and
    # independent of the platform's default encoding. json.dump escapes
    # non-ASCII characters by default, so the bytes written are unchanged.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=2)
    # Report success only after the file has been flushed and closed.
    print("file was saved succesfully")



class CoreferencesResolver:
    """Resolve coreferences in the text field of every entry of a JSON corpus.

    Every later mention of a coreference cluster is replaced by the text of
    the cluster's first mention, and the rewritten corpus is saved to a new
    JSON file.
    """

    # Name of the text field that is rewritten for each supported corpus.
    _TEXT_FIELD_BY_CORPUS = {
        "Computer_science": "abstract",
        "Music": "paragraph",
    }

    def __init__(self, data_path, output_path, corpus):
        """Store the run configuration; nothing is read or loaded here.

        data_path (str): Path of the input JSON file.
        output_path (str): Path the resolved corpus is written to.
        corpus (str): Corpus name, "Computer_science" or "Music".
        """
        self.data_path = data_path
        self.output_path = output_path
        self.data = {}
        self.result_data = {}
        self.corpus = corpus


    def replace_references(self, doc: "Doc") -> str:
        """Function for resolving references with the coref output.

        Clusters are read from the entries of ``doc.spans`` whose key starts
        with "coref_cluster". Within each cluster, every mention after the
        first is replaced by the first mention's text. A mention that starts
        before the end of the first mention (i.e. overlaps it) is left as is.
        When mentions of different clusters share a token, the cluster
        processed last wins.

        doc (Doc): The Doc object processed by the coref pipeline
        RETURNS (str): The Doc string with resolved references
        """
        # Maps the character offset of a token to the text emitted in its place.
        token_mention_mapper = {}
        clusters = [
            val for key, val in doc.spans.items() if key.startswith("coref_cluster")
        ]

        # Iterate through every found cluster
        for cluster in clusters:
            first_mention = cluster[0]
            # Character offset just past the end of the first mention.
            first_mention_end = first_mention[0].idx + len(first_mention.text)

            # Iterate through every other span in the cluster
            for mention_span in list(cluster)[1:]:
                # Skip mentions overlapping the first mention: replacing them
                # would rewrite part of the antecedent itself.
                if mention_span[0].idx < first_mention_end:
                    continue

                # The first token of the mention carries the whole replacement.
                # The trailing whitespace must come from the LAST token of the
                # mention, because that is the token whose separator survives
                # once the remaining tokens are blanked below.
                token_mention_mapper[mention_span[0].idx] = (
                    first_mention.text + mention_span[-1].whitespace_
                )

                # Set empty string for all the other tokens in mention_span
                for token in mention_span[1:]:
                    token_mention_mapper[token.idx] = ""

        # Rebuild the text: mapped tokens use their replacement, all others
        # keep their original text and trailing whitespace.
        pieces = [
            token_mention_mapper.get(token.idx, token.text + token.whitespace_)
            for token in doc
        ]
        return "".join(pieces)

    def coreference_resolver(self,text,coref_spacy):
        """Run the coreference pipeline on ``text`` and resolve its references.

        text (str): Raw text to process.
        coref_spacy: Callable pipeline returning a Doc for a text.
        RETURNS (str): The text with later mentions replaced.
        """
        doc = coref_spacy(text)
        return self.replace_references(doc)


    def corpus_coreferences_resolution(self):
        """Resolve coreferences for the whole corpus and save the result.

        Reads the JSON file at ``self.data_path``, rewrites the corpus text
        field ("abstract" for "Computer_science", "paragraph" for "Music") of
        every entry and writes the result to ``self.output_path``. The loaded
        data is kept in ``self.data`` and the rewritten copy in
        ``self.result_data``.

        Prints a message and returns without saving when the corpus name is
        unknown or when no data could be loaded.
        """
        # Validate the cheap things first so an invalid configuration does not
        # pay for loading the transformer pipeline.
        text_field = self._TEXT_FIELD_BY_CORPUS.get(self.corpus)
        if text_field is None:
            print("corpus name invalid !")
            return

        self.data = read_json_file(self.data_path)
        if self.data is None:
            # read_json_file returns None on failure; iterating it would crash.
            print(f"no data loaded from {self.data_path}; nothing was saved")
            return

        coref_spacy = spacy.load("en_coreference_web_trf")
        # Work on a deep copy so self.data keeps the unresolved text.
        temp_data = copy.deepcopy(self.data)
        for element in self.data:
            temp_data[element][text_field] = self.coreference_resolver(
                self.data[element][text_field], coref_spacy
            )

        self.result_data = temp_data
        save_json_file(self.output_path, self.result_data)

In [4]:
# Run configuration: exactly one of the two corpus blocks below should be active.
#
# Alternative configuration (Computer_science corpus). To use it, uncomment the
# three lines below and comment out the Music block.
# corpus = "Computer_science"
# data_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/data/ziwei_cs_data/CS_bench.json"
# output_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/data/ziwei_cs_data/CS_bench_coref.json"

# Active configuration: Music corpus (the "paragraph" field of each entry is rewritten).
# NOTE(review): these are absolute, machine-specific paths; the cell only runs
# unchanged on a machine with this directory layout.
data_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/data/Music_Bench/Bench_music.json"
output_path = "C:/Users/admin-user/Desktop/my_phd/implementations_KG/data/Music_Bench/Bench_music_coref.json"
corpus = "Music"
# Build the resolver and process the whole corpus. This loads the
# "en_coreference_web_trf" spaCy pipeline and writes the result to output_path.
cr = CoreferencesResolver(data_path,output_path,corpus)
cr.corpus_coreferences_resolution()

file was saved succesfully
