# Multilingual UD

Compute noun/verb frequency-based statistics for all languages in UD

In [71]:
import sys
sys.path.append('../')

import glob
import os
from collections import defaultdict
import pandas as pd

import src.corpus

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Group all treebanks by language
UD_PATH = '../data/ud_all/ud-treebanks-v2.5/'
ud_files = defaultdict(list)

for ud_corpus_name in os.listdir(UD_PATH):
  language_name = ud_corpus_name[3:].split('-')[0].replace('_', ' ')
  for conllu_file in glob.glob(UD_PATH + ud_corpus_name + '/*.conllu'):
    #conllu_file_name = os.path.basename(conllu_file)
    #language_code = conllu_file_name.split('_')[0]
    ud_files[language_name].append(conllu_file)

In [114]:
ud_files['French'][:5]

['../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-test.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-test.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu']

## All UD files in one language

In [94]:
corpus = src.corpus.POSCorpus.create_from_ud(data_file_list=ud_files['English'])

In [109]:
lemma_count_df = corpus.get_per_lemma_stats()
lemma_count_df.sort_values('total_count', ascending=False).head(10)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
177,doen,1,138,VERB,139,1,0.007194,False
58,jaar,139,0,NOUN,139,0,0.0,False
148,maak,0,116,VERB,116,0,0.0,False
81,land,115,0,NOUN,115,0,0.0,False
789,aansoek,109,0,NOUN,109,0,0.0,False
230,gebruik,31,78,VERB,109,31,0.284404,True
1931,leerder,103,0,NOUN,103,0,0.0,False
41,regering,102,0,NOUN,102,0,0.0,False
75,het,0,101,VERB,101,0,0.0,False
42,mens,96,0,NOUN,96,0,0.0,False


In [110]:
# Only consider lemmas with at least 10 usages
lemma_count_df = lemma_count_df[lemma_count_df['total_count'] >= 10].sort_values('total_count', ascending=False)
noun_lemmas = len(lemma_count_df[lemma_count_df['majority_tag'] == 'NOUN'])
verb_lemmas = len(lemma_count_df[lemma_count_df['majority_tag'] == 'VERB'])
noun_flexibility = len(lemma_count_df[(lemma_count_df['majority_tag'] == 'NOUN') & (lemma_count_df['is_flexible'])]) / noun_lemmas
verb_flexibility = len(lemma_count_df[(lemma_count_df['majority_tag'] == 'VERB') & (lemma_count_df['is_flexible'])]) / verb_lemmas

In [111]:
print('Total tokens:', lemma_count_df.total_count.sum())

Total tokens: 8151


In [112]:
print('Noun lemmas with >= 10 usages:', noun_lemmas)
print('Verb lemmas with >= 10 usages:', verb_lemmas)

Noun lemmas with >= 10 usages: 245
Verb lemmas with >= 10 usages: 117


In [113]:
print('Noun Flexibility = P(flexible | noun):', noun_flexibility)
print('Verb Flexibility = P(flexible | verb):', verb_flexibility)

Noun Flexibility = P(flexible | noun): 0.08571428571428572
Verb Flexibility = P(flexible | verb): 0.20512820512820512


## Loop over all languages

In [None]:
all_language_stats = pd.DataFrame([], columns=['language', 'tokens', 'noun_lemmas', 'verb_lemmas', 'noun_flexibility', 'verb_flexibility'])
for language_name, language_ud_list in ud_files.items():
  print(language_name)
  
  corpus = src.corpus.POSCorpus.create_from_ud(data_file_list=ud_files[language_name])
  if len(corpus.sentences) == 0: continue
  lemma_count_df = corpus.get_per_lemma_stats()
  lemma_count_df = lemma_count_df[lemma_count_df['total_count'] >= 10].sort_values('total_count', ascending=False)
  noun_lemmas = len(lemma_count_df[lemma_count_df['majority_tag'] == 'NOUN'])
  verb_lemmas = len(lemma_count_df[lemma_count_df['majority_tag'] == 'VERB'])
  if noun_lemmas == 0 or verb_lemmas == 0: continue
  noun_flexibility = len(lemma_count_df[(lemma_count_df['majority_tag'] == 'NOUN') & (lemma_count_df['is_flexible'])]) / noun_lemmas
  verb_flexibility = len(lemma_count_df[(lemma_count_df['majority_tag'] == 'VERB') & (lemma_count_df['is_flexible'])]) / verb_lemmas
  
  all_language_stats.loc[len(all_language_stats)] = {
    'language': language_name,
    'tokens': lemma_count_df.total_count.sum(),
    'noun_lemmas': noun_lemmas,
    'verb_lemmas': verb_lemmas,
    'noun_flexibility': noun_flexibility,
    'verb_flexibility': verb_flexibility,
  }

In [106]:
all_language_stats = all_language_stats.sort_values('tokens', ascending=False)
all_language_stats

Unnamed: 0,language,tokens,noun_lemmas,verb_lemmas,noun_flexibility,verb_flexibility
9,German,1022611,4916,2006,0.001017,0.008973
18,Czech,699595,5592,2092,0.000536,0.001912
8,Russian,396493,3981,1904,0.000754,0.002101
13,Spanish,226809,2744,977,0.024052,0.051177
5,Latin,203259,1181,955,0.002540,0.009424
...,...,...,...,...,...,...
12,Bhojpuri,184,3,8,0.000000,0.000000
7,Komi Zyrian,155,5,7,0.200000,0.000000
20,Mbya Guarani,152,4,7,0.250000,0.285714
22,Sanskrit,145,4,5,0.000000,0.000000


In [107]:
all_language_stats.to_csv('multi-language-ud.csv', index=False)

In [108]:
all_language_stats[(all_language_stats.noun_flexibility > 0.05) & (all_language_stats.verb_flexibility > 0.05)]

Unnamed: 0,language,tokens,noun_lemmas,verb_lemmas,noun_flexibility,verb_flexibility
1,English,127095,1734,622,0.234717,0.405145
38,Chinese,68638,1326,633,0.125189,0.391785
31,Japanese,47095,1114,419,0.160682,0.343675
36,Hebrew,39708,881,338,0.059024,0.147929
51,Classical Chinese,38575,271,400,0.409594,0.2775
74,Indonesian,23148,572,243,0.052448,0.127572
25,Danish,15050,329,215,0.051672,0.088372
62,Vietnamese,14031,296,162,0.101351,0.111111
53,Wolof,9426,145,170,0.17931,0.4
55,Scottish Gaelic,8745,191,32,0.17801,0.8125
