# Multilingual UD

Compute noun/verb frequency-based statistics for all languages in Universal Dependencies (UD).

In [1]:
import sys
sys.path.append('../')

import glob
import os
from collections import defaultdict
import pandas as pd
import multiprocessing as mp

import src.corpus

%load_ext autoreload
%autoreload 2

In [2]:
# Map each language name to the .conllu files of all of its treebanks.
UD_PATH = '../data/ud_all/ud-treebanks-v2.5/'
ud_files = defaultdict(list)

for ud_corpus_name in os.listdir(UD_PATH):
  # Directory names look like 'UD_French-Sequoia': skip the first three
  # characters, keep the part before the first '-', and turn underscores
  # into spaces to get the language name.
  language_name = ud_corpus_name[3:].partition('-')[0].replace('_', ' ')
  conllu_pattern = f'{UD_PATH}{ud_corpus_name}/*.conllu'
  for conllu_file in glob.glob(conllu_pattern):
    ud_files[language_name].append(conllu_file)

In [3]:
# Sanity check: show the first five files collected for French.
ud_files['French'][:5]

['../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-test.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-dev.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-Sequoia/fr_sequoia-ud-train.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-test.conllu',
 '../data/ud_all/ud-treebanks-v2.5/UD_French-FTB/fr_ftb-ud-train.conllu']

## All UD files in one language

In [4]:
# Create the corpus object from the full list of English treebank files.
corpus = src.corpus.POSCorpus.create_from_ud(data_file_list=ud_files['English'])

In [5]:
# Per-lemma statistics table; display the 10 rows with the highest total_count.
lemma_count_df = corpus.get_per_lemma_stats()
lemma_count_df.sort_values('total_count', ascending=False).head(10)

Unnamed: 0,lemma,noun_count,verb_count,majority_tag,total_count,minority_count,minority_ratio,is_flexible
64,have,0,2504,VERB,2504,0,0.0,False
16,be,2,1950,VERB,1952,2,0.001025,False
948,say,3,1224,VERB,1227,3,0.002445,False
645,go,3,1165,VERB,1168,3,0.002568,False
193,make,1,1080,VERB,1081,1,0.000925,False
1494,get,0,1078,VERB,1078,0,0.0,False
292,do,3,1054,VERB,1057,3,0.002838,False
208,time,969,2,NOUN,971,2,0.00206,False
766,know,1,961,VERB,962,1,0.00104,False
109,take,11,921,VERB,932,11,0.011803,False


In [6]:
# Corpus size: the summed length of every sentence.
total_tokens = sum(map(len, corpus.sentences))
print('Total tokens:', total_tokens)

Total tokens: 522856


In [7]:
# Keep only lemmas seen at least 10 times, most frequent first.
frequent_enough = lemma_count_df['total_count'] >= 10
lemma_count_df = lemma_count_df[frequent_enough].sort_values('total_count', ascending=False)

# Row masks over the filtered table.
is_noun = lemma_count_df['majority_tag'] == 'NOUN'
is_verb = lemma_count_df['majority_tag'] == 'VERB'
is_flexible = lemma_count_df['is_flexible']

noun_lemmas = len(lemma_count_df[is_noun])
verb_lemmas = len(lemma_count_df[is_verb])

# Flexibility = fraction of a class's lemmas that are flagged is_flexible.
noun_flexibility = len(lemma_count_df[is_noun & is_flexible]) / noun_lemmas
verb_flexibility = len(lemma_count_df[is_verb & is_flexible]) / verb_lemmas

In [8]:
# Report how many lemmas fall in each majority-tag class.
print('Noun lemmas with >= 10 usages:', noun_lemmas)
print('Verb lemmas with >= 10 usages:', verb_lemmas)

Noun lemmas with >= 10 usages: 1734
Verb lemmas with >= 10 usages: 622


In [9]:
# Report the fraction of noun-/verb-majority lemmas that are flagged flexible.
print('Noun Flexibility = P(flexible | noun):', noun_flexibility)
print('Verb Flexibility = P(flexible | verb):', verb_flexibility)

Noun Flexibility = P(flexible | noun): 0.23471741637831603
Verb Flexibility = P(flexible | verb): 0.40514469453376206


## Loop over all languages

In [None]:
def process_ud_language(args):
  """Compute noun/verb flexibility statistics for a single UD language.

  Takes one tuple argument so that it can be handed directly to `Pool.map`
  together with `dict.items()`.

  Args:
    args: A `(language_name, language_ud_list)` pair, where
      `language_ud_list` is the list of .conllu file paths for the language.

  Returns:
    A `pd.Series` with the keys `language`, `tokens`, `noun_lemmas`,
    `verb_lemmas`, `noun_flexibility` and `verb_flexibility`, or `None` when
    the corpus has no sentences or when either the noun or the verb class has
    no lemma with at least 10 usages.
  """
  language_name, language_ud_list = args
  print('Processing:', language_name)

  # Use the file list that was passed in rather than looking the language up
  # in module-level state: the worker then only depends on its arguments,
  # which also holds when the worker process does not inherit the parent's
  # globals.
  corpus = src.corpus.POSCorpus.create_from_ud(data_file_list=language_ud_list)
  if len(corpus.sentences) == 0:
    return None
  total_tokens = sum(len(sentence) for sentence in corpus.sentences)

  # Only consider lemmas with at least 10 usages, most frequent first.
  lemma_count_df = corpus.get_per_lemma_stats()
  lemma_count_df = lemma_count_df[lemma_count_df['total_count'] >= 10].sort_values('total_count', ascending=False)

  is_noun = lemma_count_df['majority_tag'] == 'NOUN'
  is_verb = lemma_count_df['majority_tag'] == 'VERB'
  is_flexible = lemma_count_df['is_flexible']

  noun_lemmas = len(lemma_count_df[is_noun])
  verb_lemmas = len(lemma_count_df[is_verb])
  # Both ratios below divide by these counts, so bail out if either is zero.
  if noun_lemmas == 0 or verb_lemmas == 0:
    return None

  # Flexibility = fraction of a class's lemmas that are flagged is_flexible.
  noun_flexibility = len(lemma_count_df[is_noun & is_flexible]) / noun_lemmas
  verb_flexibility = len(lemma_count_df[is_verb & is_flexible]) / verb_lemmas

  return pd.Series({
    'language': language_name,
    'tokens': total_tokens,
    'noun_lemmas': noun_lemmas,
    'verb_lemmas': verb_lemmas,
    'noun_flexibility': noun_flexibility,
    'verb_flexibility': verb_flexibility,
  })

# Process every language in parallel, one task per (language, file list) pair.
# The `with` block shuts the worker processes down as soon as map() has
# returned or raised, instead of leaving an unclosed pool behind.
with mp.Pool() as pool:
  results = pool.map(process_ud_language, ud_files.items())

# Languages without usable data come back as None; drop them before tabulating.
results = [r for r in results if r is not None]
all_language_stats = pd.DataFrame(results)

In [27]:
# Order languages by token count, largest first, and display the table.
all_language_stats = all_language_stats.sort_values('tokens', ascending=False)
all_language_stats

Unnamed: 0,language,tokens,noun_lemmas,verb_lemmas,noun_flexibility,verb_flexibility
9,German,3753945,4916,2006,0.001017,0.008973
18,Czech,2222156,5592,2092,0.000536,0.001912
8,Russian,1262205,3981,1904,0.000754,0.002101
13,Spanish,985174,2744,977,0.024052,0.051177
2,Italian,811510,2405,889,0.006237,0.017998
...,...,...,...,...,...,...
41,Yoruba,2664,7,10,0.000000,0.000000
22,Sanskrit,1842,4,5,0.000000,0.000000
58,Akkadian,1814,11,1,0.000000,0.000000
69,Livvi,1632,2,2,0.000000,0.000000


In [28]:
# Save the per-language table; index=False keeps the row index out of the CSV.
all_language_stats.to_csv('multi-language-ud.csv', index=False)

In [29]:
# Languages whose noun flexibility and verb flexibility both exceed 0.05.
all_language_stats[(all_language_stats.noun_flexibility > 0.05) & (all_language_stats.verb_flexibility > 0.05)]

Unnamed: 0,language,tokens,noun_lemmas,verb_lemmas,noun_flexibility,verb_flexibility
1,English,522856,1734,622,0.234717,0.405145
38,Chinese,253838,1326,633,0.125189,0.391785
31,Japanese,225248,1114,419,0.160682,0.343675
36,Hebrew,159789,881,338,0.059024,0.147929
74,Indonesian,121721,572,243,0.052448,0.127572
25,Danish,100733,329,215,0.051672,0.088372
51,Classical Chinese,74770,271,400,0.409594,0.2775
76,Afrikaans,49276,245,117,0.085714,0.205128
53,Wolof,44258,145,170,0.17931,0.4
62,Vietnamese,43754,296,162,0.101351,0.111111
