In [146]:
import string
import spacy
import neuralcoref
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [0]:
ENTITY = 0
POS_TAG = 1

PERSONAL_PRONOUN = 'PRP'

def dumb_coref(ne_tree):
  tokens = []
  last_ne = None
  
  for node in ne_tree:
    if type(node) is nltk.Tree:
      tokens.append(' ')
      last_ne = ' '.join([part[ENTITY] for part in node])
      tokens.append(last_ne)
    else:
        
      if node[POS_TAG] not in string.punctuation:
        tokens.append(' ')

      if node[POS_TAG] == PERSONAL_PRONOUN and last_ne:
        tokens.append(last_ne)
      else:
        tokens.append(node[0])

  res_text = ''.join(tokens).strip()
  return res_text

def dumb_coref_v2(ne_tree):
  tokens = []
  last_ne = None
  
  for i, node in enumerate(ne_tree):
    if type(node) is nltk.Tree:
      tokens.append(' ')
      last_ne = ' '.join([part[ENTITY] for part in node])
      tokens.append(last_ne)
      if ne_tree[i-1][ENTITY] == 'and' and type(ne_tree[i-2]) is nltk.Tree:
          last_ne = ' '.join([part[ENTITY] for part in ne_tree[i-2]]) + ' ' + ne_tree[i-1][ENTITY] + ' ' + last_ne
    else:
      if node[POS_TAG] not in string.punctuation:
        tokens.append(' ')

      if node[POS_TAG] == PERSONAL_PRONOUN and last_ne:
        tokens.append(last_ne)
      else:
        tokens.append(node[0])

  res_text = ''.join(tokens).strip()
  return res_text

In [0]:
#@title 1. Have some short input document (a couple of sentences, which contain Corefs.)

INPUT_SENTENCES = [
  'I had a friend Jim. Me and him had good times.',
  'Monkey King is an important character of Eastern culture. Many people in Asia know stories about his adventures.',
  'Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about his adventures.',
  'Romeo and Juliet is one of the most famous love stories in the world. They won over many passionate readers across the whole globe.'
]
EXPECTED_RESULTS = [
  'I had a friend Jim. Me and Jim had good times.',
  'Monkey King is an important character of Eastern culture. Many people in Asia know stories about Monkey King adventures.',
  'Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about Sun Wukong adventures.',
  'Romeo and Juliet is one of the most famous love stories in the world. Romeo and Juliet won over many passionate readers across the whole globe.'
]


In [0]:
#@title 2. Apply POS-tagging to the document (with NLTK or spaCy)

pos_tags = [None] * len(INPUT_SENTENCES)
for i, sentence in enumerate(INPUT_SENTENCES):
  pos_tags[i] = nltk.pos_tag(nltk.tokenize.word_tokenize(sentence))
  

In [0]:
#@title 3. Apply NE-Recognition to the document (with spaCy or NLTK)

ne_trees = [None] * len(INPUT_SENTENCES)
for i, sentence in enumerate(INPUT_SENTENCES):
  ne_trees[i] = nltk.ne_chunk(pos_tags[i])
  

In [151]:
#@markdown ###4. For any pronoun – replace it with previous Named Entity
#@markdown ###5. Print to result text
#@markdown ###6. Evaluate your system with a couple of input sentences – you are the domain expert!
#@markdown ###7. Present results

for i, sentence in enumerate(INPUT_SENTENCES):
  dumb_coref_result = dumb_coref(ne_trees[i])
  bleu_score = nltk.translate.bleu_score.sentence_bleu([EXPECTED_RESULT[i].split()], dumb_coref_result.split())
  
  print('INPUT')
  print(sentence)
  print('EXPECTED')
  print(EXPECTED_RESULT[i])
  print('DUMB COREF')
  print(dumb_coref_result)
  print(f'BLEU Score: {bleu_score}')
  print()
  

INPUT
I had a friend Jim. Me and him had good times.
EXPECTED
I had a friend Jim. Me and Jim had good times.
DUMB COREF
I had a friend Jim. Me and Jim had good times.
BLEU Score: 1.0

INPUT
Monkey King is an important character of Eastern culture. Many people in Asia know stories about his adventures.
EXPECTED
Monkey King is an important character of Eastern culture. Many people in Asia know stories about Monkey King adventures.
DUMB COREF
Monkey King is an important character of Eastern culture. Many people in Asia know stories about his adventures.
BLEU Score: 0.8434168123760957

INPUT
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about his adventures.
EXPECTED
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about Sun Wukong adventures.
DUMB COREF
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about his adventures.
BLEU Score: 0.8434168123760957

INPUT
Rome

In [152]:
#@markdown ###8. Run spaCy coref on the same text
#@markdown ###9. Compare evaluation results of spaCy and your system

# you may need to downgrade spacy to 2.1.0
# https://github.com/huggingface/neuralcoref/issues/158
nlp = spacy.load('en_core_web_md')
neuralcoref.add_to_pipe(nlp)

for i, sentence in enumerate(INPUT_SENTENCES):
  nlp_sentence = nlp(sentence)
  spacy_coref_result = nlp_sentence._.coref_resolved
  bleu_score = nltk.translate.bleu_score.sentence_bleu([EXPECTED_RESULT[i].split()], spacy_coref_result.split())
  
  print('INPUT')
  print(sentence)
  print('EXPECTED')
  print(EXPECTED_RESULT[i])
  print('SPACY COREF')
  print(spacy_coref_result)
  print(f'BLEU Score: {bleu_score}')
  print()

INPUT
I had a friend Jim. Me and him had good times.
EXPECTED
I had a friend Jim. Me and Jim had good times.
SPACY COREF
I had a friend Jim. Me and a friend Jim had good times.
BLEU Score: 0.6703420896351792

INPUT
Monkey King is an important character of Eastern culture. Many people in Asia know stories about his adventures.
EXPECTED
Monkey King is an important character of Eastern culture. Many people in Asia know stories about Monkey King adventures.
SPACY COREF
Monkey King is an important character of Eastern culture. Many people in Asia know stories about Monkey King adventures.
BLEU Score: 1.0

INPUT
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about his adventures.
EXPECTED
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about Sun Wukong adventures.
SPACY COREF
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about Sun Wukong adventures.
BLEU Score: 1.0

In [153]:
#@title Bonus: Make your coref detection a bit more smart - if you want

for i, sentence in enumerate(INPUT_SENTENCES):
  dumb_coref_result = dumb_coref_v2(ne_trees[i])
  bleu_score = nltk.translate.bleu_score.sentence_bleu([EXPECTED_RESULT[i].split()], dumb_coref_result.split())
  
  print('INPUT')
  print(sentence)
  print('EXPECTED')
  print(EXPECTED_RESULT[i])
  print('DUMB COREF')
  print(dumb_coref_result)
  print(f'BLEU Score: {bleu_score}')
  print()

INPUT
I had a friend Jim. Me and him had good times.
EXPECTED
I had a friend Jim. Me and Jim had good times.
DUMB COREF
I had a friend Jim. Me and Jim had good times.
BLEU Score: 1.0

INPUT
Monkey King is an important character of Eastern culture. Many people in Asia know stories about his adventures.
EXPECTED
Monkey King is an important character of Eastern culture. Many people in Asia know stories about Monkey King adventures.
DUMB COREF
Monkey King is an important character of Eastern culture. Many people in Asia know stories about his adventures.
BLEU Score: 0.8434168123760957

INPUT
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about his adventures.
EXPECTED
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about Sun Wukong adventures.
DUMB COREF
Sun Wukong is an important character of Eastern culture. Many people in Asia know stories about his adventures.
BLEU Score: 0.8434168123760957

INPUT
Rome