# Union-find Lemma Merging Experiments

Play with the following idea: take all the set of all words that have lemma A, and the set of all words that have lemma B, and if there is any overlap between the two sets, then we merge A and B into the same lemma.

In [1]:
import sys
sys.path.append('../')

import glob
from disjoint_set import DisjointSet
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

import src.corpus

%load_ext autoreload
%autoreload 2

In [2]:
FRENCH_UD_FILES = [f for f in glob.glob('../data/ud_all/ud-treebanks-v2.5/**/*.conllu') if 'French' in f]

In [3]:
FRENCH_UD_FILES[:3]

['../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-test.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu']

## Construct sets that share lemmas

In [4]:
corpus = src.corpus.POSCorpus.create_from_ud(data_file_list=FRENCH_UD_FILES)

In [5]:
# Helper iterate to return N/V words and lemmas in corpus, lowercased
def iterate_words(corpus):
  for sentence in corpus.sentences:
    for token in sentence:
      if token['pos'] in ['NOUN', 'VERB']:
        yield token['word'].lower(), token['lemma'].lower()

In [6]:
ds = DisjointSet()
for word, lemma in iterate_words(corpus):
  ds.union(word, lemma)

In [7]:
print(ds.find('voyage'))
print(ds.find('voyages'))
print(ds.find('voyager'))
print(ds.find('voyagent'))

voyager
voyager
voyager
voyager


In [8]:
print(ds.find('chant'))
print(ds.find('chants'))
print(ds.find('chanter'))
print(ds.find('chante'))
print(ds.find('chantant'))

chant
chant
chanter
chanter
chanter


## Group words that share the same lemma

In [9]:
lemma_counter = Counter()
for _, lemma in iterate_words(corpus):
  lemma_counter[lemma] += 1

In [10]:
lemma_groups = defaultdict(set)
for word, lemma in iterate_words(corpus):
  lemma_groups[ds.find(word)].add(word)

In [33]:
lemma_groups[ds.find('mourir')]

{'meurent',
 'meurt',
 'mort',
 'morte',
 'mortes',
 'morts',
 'mourant',
 'mourir',
 'mourront',
 'moururent',
 'mourut'}

In [12]:
# Name of the group is the most frequent lemma in the group
def get_name_for_group(word):
  maxn, maxw = 0, None
  for w in lemma_groups[ds.find(word)]:
    if lemma_counter[w] > maxn:
      maxn = lemma_counter[w]
      maxw = w
  return maxw

In [13]:
print(get_name_for_group('parle'))
print(get_name_for_group('font'))

parler
faire


## NV flexibility stats

Modified from `corpus.py`

In [14]:
flexibility_threshold = 0.05
lemma_forms = defaultdict(list)
for sentence in corpus.sentences:
  for token in sentence:
    lemma = token['lemma'].lower()
    word = token['word'].lower()
    pos = token['pos']
    lemma_forms[ds.find(lemma)].append((pos, word))

lemma_count_df = []
for lemma, lemma_occurrences in lemma_forms.items():
  noun_count = len([word for (pos, word) in lemma_occurrences if pos == 'NOUN'])
  verb_count = len([word for (pos, word) in lemma_occurrences if pos == 'VERB'])
  lemma_count_df.append({'lemma': get_name_for_group(lemma), 'noun_count': noun_count, 'verb_count': verb_count})
lemma_count_df = pd.DataFrame(lemma_count_df)

lemma_count_df = lemma_count_df[lemma_count_df['noun_count'] + lemma_count_df['verb_count'] > 0]
lemma_count_df['majority_tag'] = np.where(lemma_count_df['noun_count'] >= lemma_count_df['verb_count'], 'NOUN', 'VERB')
lemma_count_df['total_count'] = lemma_count_df[['noun_count', 'verb_count']].sum(axis=1)
lemma_count_df['minority_count'] = lemma_count_df[['noun_count', 'verb_count']].min(axis=1)
lemma_count_df['minority_ratio'] = lemma_count_df['minority_count'] / lemma_count_df['total_count']
lemma_count_df['is_flexible'] = lemma_count_df['minority_ratio'] > flexibility_threshold
lemma_count_df = lemma_count_df[lemma_count_df['total_count'] >= 10].sort_values('total_count', ascending=False)

In [15]:
lemma_count_df.head(20)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
20,avoir,97,1811,VERB,1908,97,0.050839,True
99,faire,259,1100,VERB,1359,259,0.190581,True
67,être,356,888,VERB,1244,356,0.286174,True
159,pouvoir,114,1046,VERB,1160,114,0.098276,True
101,partie,681,255,NOUN,936,255,0.272436,True
483,voir,123,615,VERB,738,123,0.166667,True
218,devoir,17,693,VERB,710,17,0.023944,False
97,monsieur,647,0,NOUN,647,0,0.0,False
885,an,628,0,NOUN,628,0,0.0,False
455,année,610,0,NOUN,610,0,0.0,False


## Syntax flexibility metrics

In [16]:
noun_lemmas = len(lemma_count_df[lemma_count_df['majority_tag'] == 'NOUN'])
verb_lemmas = len(lemma_count_df[lemma_count_df['majority_tag'] == 'VERB'])
noun_flexibility = len(lemma_count_df[(lemma_count_df['majority_tag'] == 'NOUN') & (lemma_count_df['is_flexible'])]) / noun_lemmas
verb_flexibility = len(lemma_count_df[(lemma_count_df['majority_tag'] == 'VERB') & (lemma_count_df['is_flexible'])]) / verb_lemmas

In [17]:
print('Noun Flexibility = P(flexible | noun):', noun_flexibility)

Noun Flexibility = P(flexible | noun): 0.06182212581344902


In [18]:
print('Verb Flexibility = P(flexible | verb):', verb_flexibility)

Verb Flexibility = P(flexible | verb): 0.2573189522342065


In [19]:
# Compute ratio of flexible words that are nouns, to compare with Balteiro (2007)
num_flexible = len(lemma_count_df[lemma_count_df['is_flexible']])
num_flexible_nouns = len(lemma_count_df[(lemma_count_df['majority_tag'] == 'NOUN') & lemma_count_df['is_flexible']])
print("Flexibility Asymmetry = P(noun | flexible):", num_flexible_nouns / num_flexible)

Flexibility Asymmetry = P(noun | flexible): 0.40569395017793597


## Show Examples

In [20]:
# Top flexible nouns
lemma_count_df[(lemma_count_df['majority_tag'] == 'NOUN') & (lemma_count_df['is_flexible'])].head(10)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
101,partie,681,255,NOUN,936,255,0.272436,True
328,place,290,92,NOUN,382,92,0.240838,True
763,étude,234,61,NOUN,295,61,0.20678,True
433,forme,150,117,NOUN,267,117,0.438202,True
235,base,116,79,NOUN,195,79,0.405128,True
21,mesure,151,43,NOUN,194,43,0.221649,True
518,marché,166,26,NOUN,192,26,0.135417,True
875,entreprise,157,31,NOUN,188,31,0.164894,True
189,aide,119,66,NOUN,185,66,0.356757,True
1640,livre,142,33,NOUN,175,33,0.188571,True


In [21]:
# Examples of inflexible nouns
lemma_count_df[(lemma_count_df['majority_tag'] == 'NOUN') & (~lemma_count_df['is_flexible'])].head(10)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
97,monsieur,647,0,NOUN,647,0,0.0,False
885,an,628,0,NOUN,628,0,0.0,False
455,année,610,0,NOUN,610,0,0.0,False
293,nom,481,0,NOUN,481,0,0.0,False
829,ville,418,0,NOUN,418,0,0.0,False
619,pays,412,0,NOUN,412,0,0.0,False
447,état,410,0,NOUN,410,0,0.0,False
309,droit,375,0,NOUN,375,0,0.0,False
750,jour,370,0,NOUN,370,0,0.0,False
292,président,367,0,NOUN,367,0,0.0,False


In [22]:
# Examples of flexible verbs
lemma_count_df[(lemma_count_df['majority_tag'] == 'VERB') & (lemma_count_df['is_flexible'])].head(10)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
20,avoir,97,1811,VERB,1908,97,0.050839,True
99,faire,259,1100,VERB,1359,259,0.190581,True
67,être,356,888,VERB,1244,356,0.286174,True
159,pouvoir,114,1046,VERB,1160,114,0.098276,True
483,voir,123,615,VERB,738,123,0.166667,True
377,prendre,52,487,VERB,539,52,0.096475,True
583,mettre,125,392,VERB,517,125,0.241779,True
241,donner,76,348,VERB,424,76,0.179245,True
689,passer,43,338,VERB,381,43,0.112861,True
409,mourir,139,192,VERB,331,139,0.41994,True


In [23]:
# Examples of inflexible verbs
lemma_count_df[(lemma_count_df['majority_tag'] == 'VERB') & (~lemma_count_df['is_flexible'])].head(10)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
218,devoir,17,693,VERB,710,17,0.023944,False
534,aller,6,529,VERB,535,6,0.011215,False
487,dire,3,504,VERB,507,3,0.005917,False
2700,situer,0,479,VERB,479,0,0.0,False
83,naître,1,457,VERB,458,1,0.002183,False
1032,trouver,0,426,VERB,426,0,0.0,False
260,utiliser,0,364,VERB,364,0,0.0,False
1278,connaître,0,350,VERB,350,0,0.0,False
637,devenir,1,344,VERB,345,1,0.002899,False
77,permettre,5,335,VERB,340,5,0.014706,False
