<a href="https://colab.research.google.com/github/RajanMehta/mini-projects/blob/master/coref_resolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install neuralcoref
!pip install spacy==2.1.0
!python3 -m spacy download en

In [3]:
import re
import spacy
import neuralcoref
from nltk.tokenize import sent_tokenize

nlp = spacy.load('en')

# Add neural coref to SpaCy's pipe
neuralcoref.add_to_pipe(nlp,greedyness=0.5,max_dist=50,blacklist=False)

text = "Scientists know many things about the Sun. They know how old it is. \
The Sun is more than 4½ billion years old. It is also a star that is the centre of our solar system. \
They also know the Sun’s size."

doc = nlp(text)
clusters = doc._.coref_clusters
print("clusters: \n",clusters)
print ("\n")

resolved_coref = doc._.coref_resolved
print ("Resolved by NeuralCoref: \n", resolved_coref) # the pronoun resolved neural co-reference text is very noisy.

clusters: 
 [Scientists: [Scientists, They, They], the Sun: [the Sun, The Sun, Sun, It]]


Resolved by NeuralCoref: 
 Scientists know many things about the Sun. Scientists know how old it is. the Sun the Sun is more than 4½ billion years old. the Sun is also a star that is the centre of our solar system. Scientists also know the Sun’s size.


In [0]:
# custom pronoun resolution 

pronoun_list=['he','she','it','they','them','him','her','his','hers','its','we','us']

def get_sentence_for_index(index, resolved_list):
    """
    Example Input:
      index: 8
      resolved_list = ['Scientists ', 'know ', 'many ', 'things ', 'about ', 'the ', 'Sun', '. ', 
        'They ', 'know ', 'how ', 'old ', 'it ', 'is', '. ', 
        'The ', 'Sun ', 'is ', 'more ', 'than ', '4½ ', 'billion ', 'years ', 'old', '. ', 
        'It ', 'is ', 'also ', 'a ', 'star', '. ', 
        'They ', 'also ', 'know ', 'the ', 'Sun', '’s ', 'size', '.']
    Expected Output: ['They ', 'know ', 'how ', 'old ', 'it ', 'is', '. ']
    """
    end_of_sentence_punctuation = ['.','!','?']
    beginning_index = index
    ending_index = index
    while (beginning_index>=0):
        val = resolved_list[beginning_index].strip()
        if (val in end_of_sentence_punctuation):
            break
        else:
            beginning_index = beginning_index -1
    while (ending_index<=(len(resolved_list)-1)):
        val = resolved_list[ending_index].strip()
        if (val in end_of_sentence_punctuation):
            break
        else:
            ending_index = ending_index +1
    return resolved_list[beginning_index+1:ending_index+1],beginning_index+1


def get_resolved(doc, clusters):
    """
    Return a list of utterrances text where the coref are resolved to the most representative mention
    """
    resolved = list(tok.text_with_ws for tok in doc)
    questions =[]
    for cluster in clusters:
        for coref in cluster:
            coref_text = coref.text.lower()
            cluster_main_text = cluster.main.text.lower()
            if len(coref_text.split())==1 and coref_text != cluster_main_text and coref_text in pronoun_list:
                get_sentence,start_index = get_sentence_for_index(coref.start,resolved)
                get_sentence_string = ''.join(get_sentence).lower()
                cluster_main_string = ' '.join(cluster_main_text.strip().split())
                if (cluster_main_string not in get_sentence_string):
                    resolved[coref.start] = cluster.main.text + doc[coref.end-1].whitespace_
                    if start_index == coref.start:
                        resolved[coref.start] = resolved[coref.start].capitalize() 
                    final_sentence = ''.join(get_sentence)
                    # Leave out very short sentences to frame questions.
                    if len(final_sentence)>20:
                        questions.append([final_sentence,coref.text,resolved[coref.start]])

    return ''.join(resolved),questions

In [13]:
resolved, questions = get_resolved (doc, clusters)
print ("Custom resolved: \n")
print (resolved)

Custom resolved: 

Scientists know many things about the Sun. Scientists know how old it is. The Sun is more than 4½ billion years old. The sun is also a star that is the centre of our solar system. Scientists also know the Sun’s size.


In [7]:
print ("Questions generated :")
print ("[Note: There might be a few answer errors because of the errors in the coreference algorithm itself] \n")

for index,question in enumerate(questions):
    print ('%d) What does \"%s\" refer to in the sentence - \"%s\"?'%(index+1,question[1],question[0].strip()))
    print ("Ans : %s\n"%(question[2]))

Questions generated :
[Note: There might be a few answer errors because of the errors in the coreference algorithm itself] 

1) What does "They" refer to in the sentence - "They know how old it is."?
Ans : Scientists 

2) What does "They" refer to in the sentence - "They also know the Sun’s size."?
Ans : Scientists 

3) What does "It" refer to in the sentence - "It is also a star that is the centre of our solar system."?
Ans : The sun 

