In [1]:
import pandas as pd
import json
import re
from ast import literal_eval
import spacy
import neuralcoref

# Calculating Unresolved References using NeuralCoref

In [12]:
# Read the conversations table. The `prompts` and `answers` columns are run
# through `literal_eval` on load so they arrive as Python objects, not strings.
literal_columns = {column: literal_eval for column in ("prompts", "answers")}
df_conversations = pd.read_csv(
    '../data/conversations_heuristics.csv',
    converters=literal_columns,
)
df_conversations.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,conversation_id,annotated_gaps,prompts,answers,processed_style,status,answers_fulltext,...,subordinate_clauses_list,num_entities,first_prompt_length,num_repeated_3_grams,num_repeated_2_grams,entailment,neutral,contradiction,num_unresolved_ref,prompts_fulltext_clean
0,0,0,0,3,[{'No gap'}],[You are an expert search query generator.\n\n...,"[Certainly, here are 10 high-quality search qu...",persona,closed,"Certainly, here are 10 high-quality search que...",...,0,13,216,8,12,0.064329,0.734124,0.201547,6,You are an expert search query generator.\n\nI...
1,1,1,1,4,[{'No gap'}],[Please analyse the text below to find the roo...,[Here is the JSON output analyzing the root ca...,chain-of-thoughts(cot),closed,Here is the JSON output analyzing the root cau...,...,6,47,1727,60,153,0.272672,0.711683,0.015645,15,Please analyse the text below to find the root...
2,2,2,2,5,"[{'missing specification', 'missing context'},...","[https://github.com/arakoodev/EdgeChains, can ...",[Finished browsingShow workHide workEdgeChains...,directive prompting,open,"Finished browsingShow workHide workEdgeChains,...",...,0,0,1,0,0,0.054083,0.517871,0.428047,1,[URL] can this be used from python:\n[URL] giv...
3,3,3,3,6,"[{'unclear instruction', 'missing context'}, {...",[DeviceData.jsJavaScriptThe attached Next.js p...,[The error messages you provided give insight ...,chain-of-thought (cot),open,The error messages you provided give insight i...,...,12,41,270,11,46,0.201644,0.774854,0.023501,54,DeviceData.jsJavaScriptThe attached Next.js pa...
4,4,4,4,7,"[{'No gap'}, {'multiple context', 'unclear ins...","[how to make pdf downloader through HTML , CSS...","[Creating a PDF downloader through HTML, CSS, ...",directive prompting,open,"Creating a PDF downloader through HTML, CSS, J...",...,0,3,13,1,4,0.021345,0.264188,0.714467,3,"how to make pdf downloader through HTML , CSS ..."


In [13]:
# Load spaCy's medium English model (`en_core_web_md`). The resulting `nlp`
# pipeline is what the later cells call to parse text.
nlp = spacy.load("en_core_web_md")

In [14]:
# Register NeuralCoref on the spaCy pipeline exactly once. Adding a component
# under a name that is already taken raises, so the guard keeps this cell safe
# to re-run in a live kernel; on a re-run the existing component is reused.
if nlp.has_pipe("neuralcoref"):
    coref = nlp.get_pipe("neuralcoref")
else:
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name="neuralcoref")

In [15]:
# Sanity check: run one short passage through the pipeline and print the
# coreference clusters NeuralCoref attaches to the parsed document.
# (`neuralcoref.add_to_pipe(nlp)` is the one-call way to register the
# component; it is not needed here because `nlp` already carries it.)
text = (
    "The software was updated yesterday. "
    "It should fix the issue, but we are still unclear."
)
doc = nlp(text)
print(doc._.coref_clusters)

[The software: [The software, It]]


In [16]:
def count_unresolved_references(text):
    """Count pronouns in ``text`` that fall outside every coreference cluster.

    The text is parsed with the module-level ``nlp`` pipeline (spaCy with the
    NeuralCoref component). Each token tagged ``PRON`` that is not part of any
    mention of any coreference cluster counts as one unresolved reference.

    Args:
        text: The text to analyse. Missing values (``None``, ``NaN`` or any
            other non-string) and blank strings are treated as containing no
            references.

    Returns:
        int: Number of pronoun tokens not covered by any cluster mention.
    """
    # A cell read back from CSV can be NaN rather than a string; with no text
    # there is nothing to resolve, so skip the parse entirely.
    if not isinstance(text, str) or not text.strip():
        return 0

    doc = nlp(text)

    # Gather the token positions covered by any mention once, so the check for
    # each pronoun is a set lookup instead of a rescan of every cluster.
    resolved_positions = set()
    if doc._.has_coref:
        for cluster in doc._.coref_clusters:
            for mention in cluster.mentions:  # each mention is a Span
                resolved_positions.update(range(mention.start, mention.end))

    return sum(
        1
        for token in doc
        if token.pos_ == "PRON" and token.i not in resolved_positions
    )

# Score the cleaned prompt text of every conversation, in row order, and store
# the counts as the `num_unresolved_ref` column.
corref_list = [
    count_unresolved_references(full_clean_text)
    for full_clean_text in df_conversations['prompts_fulltext_clean']
]

df_conversations['num_unresolved_ref'] = corref_list

In [9]:
# Preview the first 20 rows to eyeball the new `num_unresolved_ref` values.
df_conversations.head(20)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,conversation_id,annotated_gaps,prompts,answers,processed_style,status,answers_fulltext,prompts_fulltext,...,modifiers_list,subordinate_clauses_list,num_entities,first_prompt_length,num_repeated_3_grams,num_repeated_2_grams,entailment,neutral,contradiction,num_unresolved_ref
0,0,0,3,[{'No gap'}],[You are an expert search query generator.\n\n...,"[Certainly, here are 10 high-quality search qu...",persona,closed,"Certainly, here are 10 high-quality search que...",You are an expert search query generator.\n\nI...,...,23,0,12,216,8,12,0.064329,0.734124,0.201547,6
1,1,1,4,[{'No gap'}],[Please analyse the text below to find the roo...,[Here is the JSON output analyzing the root ca...,chain-of-thoughts(cot),closed,Here is the JSON output analyzing the root cau...,Please analyse the text below to find the root...,...,293,6,50,1727,60,153,0.272672,0.711683,0.015645,15
2,2,2,5,"[{'missing specification', 'missing context'},...","[https://github.com/arakoodev/EdgeChains, can ...",[Finished browsingShow workHide workEdgeChains...,directive prompting,open,"Finished browsingShow workHide workEdgeChains,...",https://github.com/arakoodev/EdgeChains can th...,...,2,0,0,1,0,0,0.054083,0.517871,0.428047,1
3,3,3,6,"[{'unclear instruction', 'missing context'}, {...",[DeviceData.jsJavaScriptThe attached Next.js p...,[The error messages you provided give insight ...,chain-of-thought (cot),open,The error messages you provided give insight i...,DeviceData.jsJavaScriptThe attached Next.js pa...,...,101,12,36,270,23,60,0.201644,0.774854,0.023501,54
4,4,4,7,"[{'No gap'}, {'multiple context', 'unclear ins...","[how to make pdf downloader through HTML , CSS...","[Creating a PDF downloader through HTML, CSS, ...",directive prompting,open,"Creating a PDF downloader through HTML, CSS, J...","how to make pdf downloader through HTML , CSS ...",...,1,0,4,13,1,4,0.021345,0.264188,0.714467,3
5,5,5,8,"[{'missing context'}, {'No gap'}]",[How to run a java class inside of a container...,[To run a Java class inside a container using ...,directive prompting,closed,To run a Java class inside a container using T...,How to run a java class inside of a container ...,...,2,0,0,12,1,2,0.01806,0.93933,0.04261,1
6,6,6,9,"[{'No gap'}, {'No gap'}, {'No gap'}]",[How to add a java class in a generic containe...,[To add a Java class in a generic container fr...,reflection,closed,To add a Java class in a generic container fro...,How to add a java class in a generic container...,...,6,0,0,17,1,4,0.385202,0.430229,0.184569,2
7,7,7,12,[{'No gap'}],[I have a django and rasa application (rasa is...,[If you want to set a URL for the Rasa applica...,directive prompting,open,If you want to set a URL for the Rasa applicat...,I have a django and rasa application (rasa is ...,...,4,0,1,40,0,1,0.003798,0.983779,0.012423,4
8,8,8,14,"[{'missing specification', 'missing context'},...",[Can I use local storage in the browser to sto...,"[Yes, you can use the browser's local storage ...",chain-of-thought (cot),open,"Yes, you can use the browser's local storage t...",Can I use local storage in the browser to stor...,...,8,1,0,17,2,6,0.026539,0.952437,0.021025,6
9,9,9,15,[{'missing context'}],[Execution failed for task ':app:mergeSsoDebug...,[The error message you're encountering is rela...,directive prompting,closed,The error message you're encountering is relat...,Execution failed for task ':app:mergeSsoDebugJ...,...,2,0,0,540,0,0,0.247526,0.26167,0.490804,0


In [17]:
# Persist the table, including the `num_unresolved_ref` column set above.
# This writes to the same path the notebook loaded from, so the input file is
# overwritten. `index=False` keeps the DataFrame index out of the file.
# NOTE(review): the existing `Unnamed: 0*` columns look like indexes kept by
# earlier saves — confirm whether they can be dropped.
df_conversations.to_csv('../data/conversations_heuristics.csv', index=False)

In [11]:
# Number of conversations (rows) in the table.
df_conversations.shape[0]

433