I would like to reproduce models used in the paper “How to avoid sentences spelling boring? Towards a neural approach to unsupervised metaphor generation” by Zhiwei Yu and Xiaojun Wan (NAACL-HLT 2019):

https://www.semanticscholar.org/paper/How-to-Avoid-Sentences-Spelling-Boring-Towards-a-to-Yu-Wan/74f08a35123065389a2805e2500141da01c9f6ef

# 1. Automatic extraction of metaphor pairs

## Get the dataset

In [8]:
import nltk

In [42]:
nltk.corpus.gutenberg.fileids()
sents = nltk.corpus.gutenberg.sents('carroll-alice.txt')

# Alternative:
# corpus = nltk.corpus.brown.sents()
sents

[['[', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865', ']'], ['CHAPTER', 'I', '.'], ...]

In [43]:
words = [word.lower() for word in nltk.corpus.gutenberg.words('carroll-alice.txt')]
words

['[',
 'alice',
 "'",
 's',
 'adventures',
 'in',
 'wonderland',
 'by',
 'lewis',
 'carroll',
 '1865',
 ']',
 'chapter',
 'i',
 '.',
 'down',
 'the',
 'rabbit',
 '-',
 'hole',
 'alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister',
 'on',
 'the',
 'bank',
 ',',
 'and',
 'of',
 'having',
 'nothing',
 'to',
 'do',
 ':',
 'once',
 'or',
 'twice',
 'she',
 'had',
 'peeped',
 'into',
 'the',
 'book',
 'her',
 'sister',
 'was',
 'reading',
 ',',
 'but',
 'it',
 'had',
 'no',
 'pictures',
 'or',
 'conversations',
 'in',
 'it',
 ',',
 "'",
 'and',
 'what',
 'is',
 'the',
 'use',
 'of',
 'a',
 'book',
 ",'",
 'thought',
 'alice',
 "'",
 'without',
 'pictures',
 'or',
 'conversation',
 "?'",
 'so',
 'she',
 'was',
 'considering',
 'in',
 'her',
 'own',
 'mind',
 '(',
 'as',
 'well',
 'as',
 'she',
 'could',
 ',',
 'for',
 'the',
 'hot',
 'day',
 'made',
 'her',
 'feel',
 'very',
 'sleepy',
 'and',
 'stupid',
 '),',
 'whether',
 'the',
 'plea

In [38]:
text = nltk.Text(words)
text.similar('strange') # The text.similar() method takes a word w, finds all contexts w1w w2, then finds all words w' that appear in the same context, i.e. w1w'w2.

down then long two smaller growing gloves yet sad unhappy quietly
decidedly sneezing butter bread music tillie memory queens gravy


### <font color='red'>! next step with wiki-dataset: https://www.kdnuggets.com/2017/11/building-wikipedia-text-corpus-nlp.html</font>

## Get the CBOW model

In [21]:
from gensim.models import Word2Vec

In [44]:
e_corpus = Word2Vec(sents) # creating the embeddings for the words in the corpus
e_corpus.most_similar('strange', topn=5)

  


[('your', 0.998446524143219),
 ('enough', 0.9983960390090942),
 ('off', 0.9983851909637451),
 ('are', 0.9983848929405212),
 ('now', 0.9983736276626587)]

## Create a list of target verbs

#### a) Just find the most frequent verbs using nltk

In [47]:
tagged_corpus = nltk.pos_tag(words)
tagged_corpus

[('[', 'JJ'),
 ('alice', 'NN'),
 ("'", 'POS'),
 ('s', 'NN'),
 ('adventures', 'NNS'),
 ('in', 'IN'),
 ('wonderland', 'NN'),
 ('by', 'IN'),
 ('lewis', 'NN'),
 ('carroll', 'NN'),
 ('1865', 'CD'),
 (']', 'NNP'),
 ('chapter', 'NN'),
 ('i', 'NN'),
 ('.', '.'),
 ('down', 'IN'),
 ('the', 'DT'),
 ('rabbit', 'NN'),
 ('-', ':'),
 ('hole', 'NN'),
 ('alice', 'NN'),
 ('was', 'VBD'),
 ('beginning', 'VBG'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('very', 'RB'),
 ('tired', 'JJ'),
 ('of', 'IN'),
 ('sitting', 'VBG'),
 ('by', 'IN'),
 ('her', 'PRP$'),
 ('sister', 'NN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('bank', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('of', 'IN'),
 ('having', 'VBG'),
 ('nothing', 'NN'),
 ('to', 'TO'),
 ('do', 'VB'),
 (':', ':'),
 ('once', 'RB'),
 ('or', 'CC'),
 ('twice', 'VB'),
 ('she', 'PRP'),
 ('had', 'VBD'),
 ('peeped', 'VBN'),
 ('into', 'IN'),
 ('the', 'DT'),
 ('book', 'NN'),
 ('her', 'PRP$'),
 ('sister', 'NN'),
 ('was', 'VBD'),
 ('reading', 'VBG'),
 (',', ','),
 ('but', 'CC'),
 ('it', 'PRP'),
 (

In [49]:
word_tag_fd = nltk.FreqDist(tagged_corpus)
verbs = [wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']] # https://medium.com/@gianpaul.r/tokenization-and-parts-of-speech-pos-tagging-in-pythons-nltk-library-2d30f70af13b
verbs

['said',
 'was',
 'had',
 'be',
 'is',
 'were',
 'went',
 'know',
 'did',
 'began',
 'thought',
 'are',
 'i',
 'do',
 'looked',
 'see',
 'have',
 'came',
 'have',
 'herself',
 'been',
 'get',
 'say',
 'got',
 'do',
 'go',
 'looking',
 'don',
 'think',
 'found',
 'going',
 's',
 'replied',
 'seemed',
 'i',
 ",'",
 'alice',
 'think',
 'took',
 'won',
 'see',
 'added',
 'getting',
 'make',
 'felt',
 'made',
 'tell',
 ",'",
 "!'",
 't',
 'tried',
 'cried',
 'being',
 'find',
 'spoke',
 'talking',
 'put',
 'ran',
 'say',
 'come',
 'am',
 'sat',
 'heard',
 'seen',
 'take',
 'saying',
 'gave',
 'go',
 'done',
 's',
 'asked',
 'saw',
 'come',
 'let',
 'turned',
 'trying',
 'know',
 ",'",
 'wonder',
 'remember',
 'knew',
 ",'",
 'gone',
 'hear',
 ".'",
 'turning',
 'beginning',
 'look',
 'waited',
 "!'",
 'makes',
 'speak',
 'kept',
 'sitting',
 'having',
 'wish',
 'eat',
 'walked',
 'begin',
 'got',
 'thinking',
 'didn',
 'please',
 'hurried',
 'coming',
 'put',
 'wouldn',
 'ask',
 "'",
 'made

In [50]:
# Let's find the most frequent verbs of each verb part-of-speech type.
def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].most_common(5)) for tag in cfd.conditions())

In [51]:
tagdict = findtags('V', tagged_corpus)
for tag in sorted(tagdict):
    print(tag, tagdict[tag])

VB [('be', 148), ('i', 47), ('see', 44), ('have', 40), ('herself', 39)]
VBD [('said', 462), ('was', 357), ('had', 176), ('were', 85), ('went', 83)]
VBG [('looking', 32), ('going', 27), ('getting', 22), ('being', 19), ('talking', 17)]
VBN [('been', 38), ('seen', 15), ('done', 15), ('come', 14), ('gone', 13)]
VBP [('know', 72), ('are', 54), ('do', 47), ('have', 40), ('don', 31)]
VBZ [('is', 108), ('s', 27), (",'", 21), ("!'", 11), ('makes', 11)]


### <font color='red'>! not sure how to pre-process (basic form?) them further => talk to Semih </font>

#### b) Find the main verbs using Stanford’s CoreNLP

https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024

In [53]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

In [85]:
check = ' '.join(sents[7])
check

"I shall be late !'"

In [86]:
result = nlp.annotate(check,
                   properties={
                       'annotators': 'tokenize,ssplit,pos,depparse,parse',
                       'outputFormat': 'json',
                       'timeout': 50000,
                   })
result

{'sentences': [{'index': 0,
   'parse': "(ROOT\r\n  (S\r\n    (NP (PRP I))\r\n    (VP (MD shall)\r\n      (VP (VB be)\r\n        (ADJP (JJ late))))\r\n    (. !) ('' ')))",
   'basicDependencies': [{'dep': 'ROOT',
     'governor': 0,
     'governorGloss': 'ROOT',
     'dependent': 4,
     'dependentGloss': 'late'},
    {'dep': 'nsubj',
     'governor': 4,
     'governorGloss': 'late',
     'dependent': 1,
     'dependentGloss': 'I'},
    {'dep': 'aux',
     'governor': 4,
     'governorGloss': 'late',
     'dependent': 2,
     'dependentGloss': 'shall'},
    {'dep': 'cop',
     'governor': 4,
     'governorGloss': 'late',
     'dependent': 3,
     'dependentGloss': 'be'},
    {'dep': 'punct',
     'governor': 4,
     'governorGloss': 'late',
     'dependent': 5,
     'dependentGloss': '!'},
    {'dep': 'punct',
     'governor': 4,
     'governorGloss': 'late',
     'dependent': 6,
     'dependentGloss': "'"}],
   'enhancedDependencies': [{'dep': 'ROOT',
     'governor': 0,
     'governo

In [82]:
result['sentences'][0].keys()

dict_keys(['index', 'parse', 'basicDependencies', 'enhancedDependencies', 'enhancedPlusPlusDependencies', 'tokens'])

In [83]:
word['dep'] for word in result['sentences'][0]['basicDependencies'] if word['']

SyntaxError: invalid syntax (<ipython-input-83-d278247aca89>, line 1)

In [87]:
dependencies = [s['parse'] for s in result['sentences']]
dependencies

["(ROOT\r\n  (S\r\n    (NP (PRP I))\r\n    (VP (MD shall)\r\n      (VP (VB be)\r\n        (ADJP (JJ late))))\r\n    (. !) ('' ')))"]

In [88]:
from nltk.tree import *
Tree.fromstring(dependencies[0]).pretty_print()

          ROOT                 
           |                    
           S                   
  _________|_________________   
 |         VP            |   | 
 |     ____|____         |   |  
 |    |         VP       |   | 
 |    |     ____|___     |   |  
 NP   |    |       ADJP  |   | 
 |    |    |        |    |   |  
PRP   MD   VB       JJ   .   ''
 |    |    |        |    |   |  
 I  shall  be      late  !   ' 



In [89]:
dependencies[0]

"(ROOT\r\n  (S\r\n    (NP (PRP I))\r\n    (VP (MD shall)\r\n      (VP (VB be)\r\n        (ADJP (JJ late))))\r\n    (. !) ('' ')))"

### <font color='red'>! did not yet found our how to get the main verb from the tree => reading </font>

### c) Pre-process the list of target verbs

In [91]:
from nltk.stem.wordnet import WordNetLemmatizer

Get the base form of the found verbs:

In [99]:
base_verbs = list(set([WordNetLemmatizer().lemmatize(verb,'v') for verb in verbs]))
base_verbs

['bark',
 'proceed',
 'star',
 '--"',
 'saw',
 'skurried',
 'face',
 'turtle',
 'walk',
 'prevent',
 'provoke',
 'watch',
 'frown',
 'follow',
 'produce',
 'favour',
 'coward',
 'escape',
 'graze',
 'unroll',
 '*',
 'wear',
 'accustom',
 'needn',
 'quiver',
 'laugh',
 'd',
 'legged',
 'spirit',
 'join',
 'seat',
 'invite',
 'dormouse',
 'kneel',
 'draw',
 'coax',
 'pinch',
 'trim',
 'treat',
 'reel',
 'meet',
 'be',
 ';--',
 'ask',
 'cheat',
 'don',
 'learn',
 'fade',
 'insult',
 'white',
 'whistle',
 'dear',
 'alarm',
 'dress',
 'send',
 'slip',
 'reduce',
 'hold',
 'yell',
 'talk',
 'savage',
 'inquire',
 'scramble',
 'rave',
 'lessen',
 'advance',
 'doubt',
 'belong',
 'william',
 'dare',
 'brave',
 'print',
 'signify',
 'whisper',
 'teach',
 'stamp',
 'count',
 'stop',
 "'",
 'sneeze',
 'puzzle',
 'judge',
 'marmalade',
 'introduce',
 'eye',
 'rub',
 'feel',
 'nod',
 'express',
 'sugar',
 'gather',
 'wave',
 'uglify',
 'fix',
 'point',
 'grin',
 'pity',
 'push',
 'sob',
 'wink',
 '

## Generate metaphorical verb pairs

Set-up the threshold that determines metaphoricity:

In [100]:
e = 0.3

In [102]:
verb = base_verbs[0]

<font color='red'>! Get all inflections of the verb: </font>

In [104]:
def inflect(verb):
    # Do the inflection here: https://stackoverflow.com/questions/3753021/using-nltk-and-wordnet-how-do-i-convert-simple-tense-verb-into-its-present-pas
    return verb

## Save & reuse the model

In [52]:
from pickle import dump
output = open('model.pkl', 'wb')
dump(model, output, -1)
output.close()

In [None]:
from pickle import load
input = open('model.pkl', 'rb')
model = load(input)
input.close()

# 2. Generate a sentence containing the assigned verb

# 3. Metaphor inference / generation