# Multilingual UD

Compute frequency-based noun/verb statistics for every language in the Universal Dependencies (UD) treebanks.

In [1]:
import sys
sys.path.append('../')

from collections import defaultdict
import pandas as pd
import multiprocessing as mp

import src.corpus

%load_ext autoreload
%autoreload 2

In [2]:
# Root directory of the UD v2.5 treebank release, relative to this notebook.
UD_PATH = '../data/ud_all/ud-treebanks-v2.5/'

# Treebank files grouped per language; indexed by language name below.
ud_files = src.corpus.group_treebanks_by_language(UD_PATH)

# Sanity check: preview the first few files found for French.
french_files = ud_files['French']
french_files[:5]

['../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-test.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-test.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu']

## All UD files in one language

In [3]:
# Build a single corpus from all of the French treebank files.
corpus = src.corpus.POSCorpus.create_from_ud(
  data_file_list=ud_files['French'],
)

In [4]:
# Per-lemma noun/verb usage counts for the corpus.
lemma_count_df = corpus.get_lemma_stats_merge_method()

# Preview the ten most frequent lemmas.
lemma_count_df.sort_values(by='total_count', ascending=False).iloc[:10]

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
20,avoir,79,1811,VERB,1890,79,0.041799,False
100,faire,259,1100,VERB,1359,259,0.190581,True
68,être,354,888,VERB,1242,354,0.285024,True
161,pouvoir,114,1046,VERB,1160,114,0.098276,True
102,partie,681,255,NOUN,936,255,0.272436,True
487,voir,123,615,VERB,738,123,0.166667,True
220,devoir,17,693,VERB,710,17,0.023944,False
889,an,628,0,NOUN,628,0,0.0,False
98,monsieur,627,0,NOUN,627,0,0.0,False
459,année,610,0,NOUN,610,0,0.0,False


In [5]:
# Corpus size: sentence lengths summed over every sentence.
total_tokens = sum(map(len, corpus.sentences))
print('Total tokens:', total_tokens)

Total tokens: 578670


In [6]:
# Keep only lemmas used at least 10 times, most frequent first.
lemma_count_df = lemma_count_df.loc[lemma_count_df['total_count'] >= 10]
lemma_count_df = lemma_count_df.sort_values(by='total_count', ascending=False)

# Row masks over the filtered frame.
is_noun = lemma_count_df['majority_tag'] == 'NOUN'
is_verb = lemma_count_df['majority_tag'] == 'VERB'
is_flexible = lemma_count_df['is_flexible']

# Number of lemmas whose majority tag is NOUN / VERB.
noun_lemmas = len(lemma_count_df[is_noun])
verb_lemmas = len(lemma_count_df[is_verb])

# Flexibility = share of the class's lemmas that are marked flexible.
noun_flexibility = len(lemma_count_df[is_noun & is_flexible]) / noun_lemmas
verb_flexibility = len(lemma_count_df[is_verb & is_flexible]) / verb_lemmas

In [7]:
# Report how many noun- and verb-majority lemmas survived the frequency filter.
for label, lemma_total in (
  ('Noun lemmas with >= 10 usages:', noun_lemmas),
  ('Verb lemmas with >= 10 usages:', verb_lemmas),
):
  print(label, lemma_total)

Noun lemmas with >= 10 usages: 1847
Verb lemmas with >= 10 usages: 648


In [8]:
# Report the share of flexible lemmas within each majority class.
for label, flexibility in (
  ('Noun Flexibility = P(flexible | noun):', noun_flexibility),
  ('Verb Flexibility = P(flexible | verb):', verb_flexibility),
):
  print(label, flexibility)

Noun Flexibility = P(flexible | noun): 0.06172171088251218
Verb Flexibility = P(flexible | verb): 0.25617283950617287


## Loop over all languages

In [None]:
def process_ud_language(args):
  """Compute noun/verb flexibility statistics for a single UD language.

  Args:
    args: A `(language_name, language_ud_list)` pair, e.g. one item of
      `ud_files.items()`. It is packed into a single argument so the function
      can be handed directly to `Pool.map`. `language_ud_list` is passed to
      `POSCorpus.create_from_ud` as `data_file_list`.

  Returns:
    A `pd.Series` with the keys `language`, `tokens`, `noun_lemmas`,
    `verb_lemmas`, `noun_flexibility` and `verb_flexibility`, or `None` when
    the corpus has no sentences or when no noun-majority or no verb-majority
    lemma reaches 10 usages.
  """
  language_name, language_ud_list = args
  print('Processing:', language_name)

  # Build the corpus from the file list that was passed in, not from the
  # notebook-global `ud_files`, so the function depends only on its argument.
  corpus = src.corpus.POSCorpus.create_from_ud(data_file_list=language_ud_list)
  if len(corpus.sentences) == 0:
    return None
  total_tokens = sum(len(sentence) for sentence in corpus.sentences)

  # Only consider lemmas with at least 10 usages. No sorting is needed here:
  # only row counts are taken from the filtered frame.
  lemma_count_df = corpus.get_lemma_stats_merge_method()
  lemma_count_df = lemma_count_df[lemma_count_df['total_count'] >= 10]

  is_noun = lemma_count_df['majority_tag'] == 'NOUN'
  is_verb = lemma_count_df['majority_tag'] == 'VERB'
  noun_lemmas = len(lemma_count_df[is_noun])
  verb_lemmas = len(lemma_count_df[is_verb])
  # Both ratios below divide by these counts, so skip languages lacking either class.
  if noun_lemmas == 0 or verb_lemmas == 0:
    return None

  is_flexible = lemma_count_df['is_flexible']
  noun_flexibility = len(lemma_count_df[is_noun & is_flexible]) / noun_lemmas
  verb_flexibility = len(lemma_count_df[is_verb & is_flexible]) / verb_lemmas

  return pd.Series({
    'language': language_name,
    'tokens': total_tokens,
    'noun_lemmas': noun_lemmas,
    'verb_lemmas': verb_lemmas,
    'noun_flexibility': noun_flexibility,
    'verb_flexibility': verb_flexibility,
  })

# Run the per-language computation across a pool of worker processes.
# Using the pool as a context manager shuts the workers down once the
# (blocking) `map` call has returned, so re-running this cell does not
# leave stray worker processes behind.
with mp.Pool() as pool:
  results = pool.map(process_ud_language, ud_files.items())

# Drop the languages that process_ud_language skipped (returned None).
results = [r for r in results if r is not None]

# One row per language, one column per statistic.
all_language_stats = pd.DataFrame(results)

In [None]:
# Order the languages by token count, largest first, then display the table.
all_language_stats = all_language_stats.sort_values(by='tokens', ascending=False)
all_language_stats

In [None]:
# Languages where both noun and verb flexibility exceed 0.05.
noun_above_threshold = all_language_stats['noun_flexibility'] > 0.05
verb_above_threshold = all_language_stats['verb_flexibility'] > 0.05
all_language_stats[noun_above_threshold & verb_above_threshold]

Processing: Greek
Processing: Akkadian
Processing: Erzya
