# Nonsense Sentence Generation

Generate nonsense sentences similar to those in Johnson and Goldberg (2013).

In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import torch
from collections import defaultdict
import random
import math
import pickle
import nltk
from nltk.corpus import treebank
import scipy

import src.sent_encoder

%matplotlib inline
%load_ext autoreload
%autoreload 2
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

## Obtain list of singular nouns, adjectives, past-tense verbs

In [2]:
# Corpus reader over the bracketed parse files under ../data/PTB3.
# The file pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot; in a plain string literal `\.` is an invalid escape sequence
# that newer Python versions warn about.
penn = nltk.corpus.reader.bracket_parse.BracketParseCorpusReader("../data/PTB3", r".*\.tree")

In [3]:
# Sanity check: number of parsed sentences exposed by the corpus reader.
len(penn.parsed_sents())

49207

In [4]:
# Lower-cased word -> occurrence count, one counter per part-of-speech tag.
singular_nouns = defaultdict(int)
adjectives = defaultdict(int)
past_verbs = defaultdict(int)

# Tag label -> the counter that collects words carrying that tag.
counters_by_tag = {
  "NN": singular_nouns,
  "JJ": adjectives,
  "VBD": past_verbs,
}

for parsed_sentence in penn.parsed_sents():
  # Subtrees of height 2 are the (tag word) nodes directly above a token.
  for preterminal in parsed_sentence.subtrees(lambda node: node.height() == 2):
    counter = counters_by_tag.get(preterminal.label())
    if counter is not None:
      counter[preterminal[0].lower()] += 1

In [5]:
# Drop rare words: keep only those counted at least 10 times under their tag.
singular_nouns = {word: count for word, count in singular_nouns.items() if count >= 10}
adjectives = {word: count for word, count in adjectives.items() if count >= 10}
past_verbs = {word: count for word, count in past_verbs.items() if count >= 10}

In [6]:
# Replace each count dict by the alphabetically sorted list of its words.
# A fixed ordering matters because the lists are later sampled with a seeded RNG.
singular_nouns = sorted(singular_nouns)
adjectives = sorted(adjectives)
past_verbs = sorted(past_verbs)

# Vocabulary sizes: nouns, adjectives, past-tense verbs (in that order).
for word_list in (singular_nouns, adjectives, past_verbs):
  print(len(word_list))

1996
843
350


## Get contextual vecs for gave/made/put/took

In [7]:
# NOTE(review): pickle.load can execute arbitrary code from the file, so
# ../data/bnc.pkl must come from a trusted source.
with open("../data/bnc.pkl", "rb") as bnc_file:
  bnc_data = pickle.load(bnc_file)

In [8]:
# Index applied to the encoder outputs in later cells
# (`avg_contextual_word_vec(...)[LAYER]` and `sentence_vecs(...)[:, LAYER]`).
# NOTE(review): presumably a 0-based model layer index — confirm against src.sent_encoder.
LAYER = 11
# Project-local sentence encoder, constructed with its default arguments.
enc = src.sent_encoder.SentEncoder()

In [9]:
# One prototype vector per verb: the LAYER entry of what
# enc.avg_contextual_word_vec returns for that verb over bnc_data.
# Not currently included: 'handed', 'turned', 'placed', 'removed'.
prototype_vecs = {
  proto_verb: enc.avg_contextual_word_vec(bnc_data, proto_verb)[LAYER]
  for proto_verb in ('gave', 'made', 'put', 'took')
}

## Generate sentences of each type

In [10]:
# Fixed seed so the sampled sentences are reproducible.
random.seed(12345)
NUM_SENTENCES_PER_CXN = 5000
# Construction name -> list of (sentence, verb used in that sentence) tuples.
templated_sentences = defaultdict(list)


def _sample_ditransitive():
  """Ditransitive: S/he nonseV-ed him/her the nonseN."""
  subject = random.choice(["He", "She"])
  recipient = random.choice(["him", "her"])
  verb = random.choice(past_verbs)
  noun = random.choice(singular_nouns)
  return (f"{subject} {verb} {recipient} the {noun}.", verb)


def _sample_resultative():
  """Resultative: S/he nonseV-ed it nonseAdj."""
  subject = random.choice(["He", "She"])
  verb = random.choice(past_verbs)
  adjective = random.choice(adjectives)
  return (f"{subject} {verb} it {adjective}.", verb)


def _sample_caused_motion():
  """Caused-motion: S/he nonseV-ed it on the nonseN."""
  subject = random.choice(["He", "She"])
  verb = random.choice(past_verbs)
  noun = random.choice(singular_nouns)
  return (f"{subject} {verb} it on the {noun}.", verb)


def _sample_removal():
  """Removal: S/he nonseV-ed it from him/her."""
  subject = random.choice(["He", "She"])
  source = random.choice(["him", "her"])
  verb = random.choice(past_verbs)
  return (f"{subject} {verb} it from {source}.", verb)


# The order of the constructions here, and of the random.choice calls inside
# each sampler, fixes which sentences are drawn under the seed above.
for cxn_name, sample_sentence in (
  ('ditransitive', _sample_ditransitive),
  ('resultative', _sample_resultative),
  ('caused-motion', _sample_caused_motion),
  ('removal', _sample_removal),
):
  for _ in range(NUM_SENTENCES_PER_CXN):
    templated_sentences[cxn_name].append(sample_sentence())

## Get distances from cxn-verbs to proto-verbs

In [11]:
def is_congruent(cxn, verb):
  """Return True if `verb` is one of the verbs paired with construction `cxn`.

  Known constructions are 'ditransitive', 'resultative', 'caused-motion' and
  'removal'; any other construction name yields False.
  """
  congruent_verbs = {
    'ditransitive': ('gave', 'handed'),
    'resultative': ('made', 'turned'),
    'caused-motion': ('put', 'placed'),
    'removal': ('took', 'removed'),
  }
  return verb in congruent_verbs.get(cxn, ())

In [12]:
# One record per (templated sentence, prototype verb) pair, holding the
# Euclidean distance between the prototype vector and the vector of the verb
# in the templated sentence.
# Records are plain dicts: building a pd.Series per row is far slower and
# gives the same DataFrame. The list has its own name so that
# `verb_dist_results` is only ever a DataFrame.
verb_dist_records = []

for cxn_type, cxn_sentences_and_verbs in templated_sentences.items():
  cxn_sentences = [pair[0] for pair in cxn_sentences_and_verbs]
  cxn_verbs = [pair[1] for pair in cxn_sentences_and_verbs]
  # NOTE(review): assumes sentence_vecs returns one row per input sentence,
  # with the second axis selectable by LAYER — confirm in src.sent_encoder.
  cxn_verb_vecs = enc.sentence_vecs(cxn_sentences, cxn_verbs)[:, LAYER]

  for proto_verb, proto_verb_vec in prototype_vecs.items():
    # Depends only on (construction, prototype verb), so compute it once.
    congruent = is_congruent(cxn_type, proto_verb)
    for i, cxn_verb_vec in enumerate(cxn_verb_vecs):
      verb_dist_records.append({
        'cxn_sentence': cxn_sentences[i],
        'cxn': cxn_type,
        'verb': proto_verb,
        'congruent': congruent,
        'dist': np.linalg.norm(proto_verb_vec - cxn_verb_vec),
      })

verb_dist_results = pd.DataFrame(verb_dist_records)

## Summarize results

In [13]:
# Mean and standard deviation of the distance for every
# (prototype verb, construction) combination, printed one per line.
for proto_verb in prototype_vecs:
  for cxn_name in templated_sentences:
    in_cell = (verb_dist_results.cxn == cxn_name) & (verb_dist_results.verb == proto_verb)
    cell_dists = verb_dist_results[in_cell].dist
    print(cxn_name, proto_verb, float(cell_dists.mean()), float(cell_dists.std()))

ditransitive gave 11.89890238647461 1.1858078835315617
resultative gave 11.92425343208313 1.2382925883849938
caused-motion gave 11.691192469120026 1.216427248591975
removal gave 11.74043079328537 1.4384559320850094
ditransitive made 12.295198278522491 1.2375450478885313
resultative made 11.700742177677155 1.170060562745414
caused-motion made 11.593213554191589 1.1080229723329489
removal made 11.954456674575805 1.375468496063373
ditransitive put 12.56705058412552 1.2184826145659435
resultative put 11.867872425460815 1.3231187354543499
caused-motion put 11.394599109172821 1.0637222877342791
removal put 11.936269455337525 1.318441316306424
ditransitive took 12.32781259689331 1.2393412236182806
resultative took 11.863502470588685 1.2378532009659635
caused-motion took 11.599411346721649 1.1931807748120369
removal took 11.516971401119232 1.200000629549786


In [14]:
# Split the distances by whether the prototype verb matches the construction,
# then report mean and standard deviation of each group.
congruent_dists = verb_dist_results[verb_dist_results.congruent].dist
incongruent_dists = verb_dist_results[~verb_dist_results.congruent].dist

print('Mean congruent:', congruent_dists.mean())
print('Std congruent:', congruent_dists.std())
print('Mean incongruent:', incongruent_dists.mean())
print('Std incongruent:', incongruent_dists.std())

Mean congruent: 11.627803768610955
Std congruent: 1.1716843222011528
Mean incongruent: 11.94672200674216
Std incongruent: 1.2978878691076732
