# Explore Warstadt's vocabulary file

In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import torch
from collections import defaultdict, Counter
import random
import math
import pickle

%matplotlib inline
%load_ext autoreload
%autoreload 2
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [2]:
data = pd.read_csv("../data/vocabulary.csv")

In [36]:
data.head(5)

Unnamed: 0,expression,category,category_2,verb,noun,non_v_pred,frequent,sg,pl,mass,animate,person,properNoun,finite,bare,pres,past,ing,en,3sg,arg_1,arg_2,arg_3,root,wh_np_verb,responsive,passive,strict_intrans,strict_trans,causative,spray_load,inchoative,agentive,event,adjs,restrictor_DE,scope_DE,NPI,agent,occupation,clothing,appearance,physical,conceptual,artifact,start_with_vowel,frontable,gender,irrpl,special_en_form,irr_verb,irr_past,document,negated,locale,institution,arg_clause,homophonous,pluralform,singularform,sgequalspl,topic,image,v_embed_sc,change_of_state,initial_state,change_arg,vehicle,vegetable,food,light,liquid,animal,openable,climbable,cleanable,buildable,drinkable,singable,boat,money,interior,quantifier,antonym,locative_prepositions,synonym_hypernym_hyponym,OOV_inductive_biases,past_or_participle
0,boy,N,N,,1.0,,1.0,1.0,0.0,0.0,1.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,0.0,,m,,,,,,,,,,,boys,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
1,girl,N,N,,1.0,,1.0,1.0,0.0,0.0,1.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,0.0,,f,,,,,,,,,,,girls,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2,man,N,N,,1.0,,1.0,1.0,0.0,0.0,1.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,0.0,,m,1.0,,,,,,,,,,men,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
3,woman,N,N,,1.0,,1.0,1.0,0.0,0.0,1.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,0.0,,f,1.0,,,,,,,,,,women,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
4,lady,N,N,,1.0,,1.0,1.0,0.0,0.0,1.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,0.0,,f,,,,,,,,,,,ladies,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0


## Examples of each CCG tag

In [29]:
for pos, count in data.category.value_counts().items():
  if count >= 10:
    print(count, pos, data[data.category == pos].sample(3).expression.tolist())

1093 S\NP ['escapes', 'departed', 'screamed']
893 (S\NP)/NP ['hiring', 'insult', 'loses']
720 N ['Bethany', 'song', 'Vincent']
246 (S\NP)/S ['learned', 'discovering', 'loves']
180 ((S\NP)/(S[to]\NP))/NP ['motivate', 'advised', 'requiring']
109 N/N ['worst', 'bigger', 'employed']
108 (S\NP)/(S[to]\NP) ['propose', 'intended', 'tried']
97 N\NP[poss] ['podiatrists', 'partner', 'hairdresser']
93 (S[Pred]/NP[it])/S ['noteworthy', 'lucky', 'fearful']
86 S[pred]\NP ['editor', 'ordinary', 'damp']
83 (S\NP)/Q ['learning', 'forget', 'wonder']
78 (S\NP)/S[to] ['need', 'preferring', 'judge']
74 N\N ['lengthy', 'damp', 'cool']
64 N/NP ['plays about', 'friend of', 'daughter of']
48 S/S[to] ['turn out', 'look', 'fails']
32 (S\NP)/(S[bare]\NP) ['did', "aren't", 'do']
24 NP[pron] ['ours', 'it', 'his']
21 (S/(S\NP))/N ['the', 'more than three', 'most']
15 (S[A]\NP)/((S[to]\NP)/NP) ['easy', 'enjoyable', 'annoying']
14 (S/S)/S ['keeping in mind the fact that', 'taking into account the fact that', 'keeping 

## Extract singular and plural nouns

In [52]:
df_nouns = data[data.category == 'N']
singular_nouns = []
plural_nouns = []

for _, row in data[(data.category == 'N') & (data.properNoun != 1)].iterrows():
  if row.pl == 1:
    singular_nouns.append(row.singularform)
    plural_nouns.append(row.expression)
  else:
    singular_nouns.append(row.expression)
    plural_nouns.append(row.pluralform)

singular_nouns = list(set(singular_nouns) - {np.nan})
plural_nouns = list(set(plural_nouns) - {np.nan})

In [55]:
print(len(singular_nouns))
print(singular_nouns[:5])

233
['dress', 'bike', 'glove', 'synopsis', 'money']


In [56]:
print(len(plural_nouns))
print(plural_nouns[:5])

215
['hairdressers', 'forks', 'shoes', 'essays', 'skirts']
