In [0]:
!apt-get install libmysqlclient-dev
!pip install pattern

import time
import urllib
import urllib.request
from collections import Counter, defaultdict
from typing import List, Tuple, Dict

import spacy
from pattern.en import conjugate, PRESENT, PROGRESSIVE
from tqdm import tqdm


In [0]:
def load_wiki(url: str = "http://nlp.biu.ac.il/~ravfogs/neural_decomposition/wiki.raw.250k") -> List[str]:
  """
  Download a raw text file and split it into sentences, one per line.
  --------------------
  Arguments:
    url: location of a UTF-8 encoded text file with one sentence per line.
  --------------------
  Return:
    sents: the list of lines of the file. The text is split on "\n" only, so a
           file ending with a newline yields a final empty string.
  """

  # The context manager closes the connection even if reading or decoding fails.
  with urllib.request.urlopen(url) as response:
    raw = response.read().decode('utf8')

  return raw.split("\n")


def create_docs(sentences: List[str]) -> List[spacy.tokens.Doc]:
    """
    Parse and tokenize sentences with spaCy, and print the elapsed time in seconds.

    NOTE: takes about 10 minutes on the full 250k-sentence Wikipedia sample.
    --------------------
    Arguments:
        sentences: a list of raw sentences.
    --------------------
    Return:
        docs: a list with one parsed spaCy Doc per input sentence, in input order.
    """

    nlp = spacy.load('en_core_web_sm')
    # The functions in this notebook only read dependency labels, heads and
    # lemmas, so the named-entity recognizer is dropped from the pipeline.
    nlp.remove_pipe("ner")

    start = time.time()

    # nlp.pipe yields lazily; list() forces the whole corpus to be parsed here
    # so that the timing below covers the actual work.
    docs = list(nlp.pipe(sentences, batch_size = 100))

    print(time.time() - start)

    return docs
  

def count_obj_verb_cooccurrences(docs: List[spacy.tokens.Doc]) -> Dict[str, Counter]:
  """
  Count how often each direct object occurs with each governing word.
  -------------------------------
  Arguments:
    docs: a list of docs, each representing one sentence.
  -------------------------------
  Return:
    a mapping from the lemma of a direct object to a Counter over the lemmas of
    its syntactic heads. E.g., result["book"]["read"] is the number of times
    'book' was the direct object of 'read', as in the sentence 'I read a book.'
    Both the object and its head are stored in lemma form.
  """
  cooccurrences = defaultdict(Counter)
  progress = tqdm(docs, total = len(docs))

  for parsed_sentence in progress:
    # Only tokens labelled as direct objects contribute to the counts.
    direct_objects = (token for token in parsed_sentence if token.dep_ == "dobj")
    for obj_token in direct_objects:
      cooccurrences[obj_token.lemma_][obj_token.head.lemma_] += 1

  return cooccurrences


_PARSER_CACHE = {}


def _get_parser(model_name = 'en_core_web_sm'):
  """Return the spaCy pipeline called `model_name`, loading it only on first use."""
  if model_name not in _PARSER_CACHE:
    _PARSER_CACHE[model_name] = spacy.load(model_name)
  return _PARSER_CACHE[model_name]


def process_one_sentence(sentence: str, counter: defaultdict, nlp = None) -> Tuple[List[str], bool, int]:

  """
  Analyze one sentence, and try to add a verb if possible.

  The first direct object whose lemma has recorded verbs in `counter` triggers
  the edit: the verb most often seen with that object is put in its present
  progressive form and inserted right after the object's syntactic head,
  e.g. "I just begun the book." -> "I just begun writing the book."
  --------------------
  Arguments:
    sentence: a string. The sentence is assumed not to be of the easy class (i.e., no sentences like "begin reading the book")
    counter: the Counter for (dobj, verb co-occurrences), keyed by the lemma of the direct object
             (as built by count_obj_verb_cooccurrences). It is not modified.
    nlp: optional callable that parses a string into a sequence of spaCy-like tokens.
         When omitted, the 'en_core_web_sm' pipeline is loaded once and reused across calls.
  --------------------
  Return:

    a tuple (new_sentence: List[str], modified: bool, index: int)
            new_sentence: the sentence as a list of tokens, possibly after the addition of a verb.
            modified: a flag that is true if the sentence was modified, false otherwise.
            index: the index of the added word. default: -1
  """

  if nlp is None:
    nlp = _get_parser()

  doc = nlp(sentence)
  as_list = [tok.text for tok in doc]

  for tok in doc:

    if tok.dep_ != "dobj":
      continue

    # Look the object up by lemma, which is how `counter` is keyed. Using .get()
    # instead of indexing keeps a defaultdict from growing an empty entry.
    verb_counts = counter.get(tok.lemma_)
    if not verb_counts:
      # Unknown object: nothing sensible to insert, try the next direct object.
      continue

    most_probable_verb = verb_counts.most_common(1)[0][0]
    verb_progressive = conjugate(most_probable_verb, tense = PRESENT, aspect = PROGRESSIVE)

    # The new verb goes immediately after the head of the direct object.
    insert_at = tok.head.i + 1
    as_list.insert(insert_at, verb_progressive)
    return (as_list, True, insert_at)

  return (as_list, False, -1)


In [0]:
# Download the raw sentences, then parse them with spaCy.
# create_docs prints the elapsed parsing time in seconds.
sents = load_wiki()
docs = create_docs(sents)

560.0931024551392


In [108]:
obj2verb = count_obj_verb_cooccurrences(docs)

# Sanity check: show the five verbs seen most often with a few common objects.
for word in ("book", "door", "building", "food", "initiative"):
  print("Common verbs for the word '{}': {}".format(word, obj2verb[word].most_common(5)))




100%|██████████| 250001/250001 [00:03<00:00, 73169.07it/s]

Common verbs for the word 'book': [('write', 65), ('publish', 53), ('read', 15), ('author', 13), ('have', 9)]
Common verbs for the word 'door': [('open', 31), ('close', 17), ('show', 4), ('have', 3), ('leave', 2)]
Common verbs for the word 'building': [('design', 10), ('include', 9), ('have', 8), ('construct', 8), ('purchase', 6)]
Common verbs for the word 'food': [('eat', 7), ('prepare', 6), ('provide', 6), ('buy', 4), ('grow', 4)]
Common verbs for the word 'initiative': [('take', 7), ('support', 5), ('promote', 4), ('announce', 2), ('have', 1)]





In [116]:
# Example sentences for the verb-insertion step.
aspectual = "I just begun the book."
aspectual2 = "I continue the movie"
non_aspectual = "I throw the book."
# grammatical but awkward -> probably higher perplexity
non_aspectual2 = "I started an initiative."
# this is grammatical, but doesn't sound so good. Maybe higher perplexity?
non_aspectual3 = "They continued their journey to Paris."

demo_sentences = [aspectual, aspectual2, non_aspectual, non_aspectual2, non_aspectual3]
for demo_sentence in demo_sentences:
  print(process_one_sentence(demo_sentence, obj2verb))



(['I', 'just', 'begun', 'writing', 'the', 'book', '.'], True, 3)
(['I', 'continue', 'making', 'the', 'movie'], True, 2)
(['I', 'throw', 'writing', 'the', 'book', '.'], True, 2)
(['I', 'started', 'taking', 'an', 'initiative', '.'], True, 2)
(['They', 'continued', 'making', 'their', 'journey', 'to', 'Paris', '.'], True, 2)
