In [1]:
import argparse
import string
import sys
import os
import json
import time
import pandas as pd
import requests
from tqdm import tqdm
import zipfile
from IPython.display import display, HTML
from panlex import load_panlex_resources, extract_monolingual_lexicon, extract_bilingual_lexicon

In [2]:
%%time
# Smoke-test the three PanLex helpers end to end, timing each stage separately.
# The language codes are named once so that the progress messages always
# describe the extraction that is actually performed (the messages used to say
# "ind-eng" / "ind-end" while the call extracted the ind-bug lexicon).
panlex_dir = 'panlex-20230501-csv'
mono_lang = 'ind'
src_lang, tgt_lang = 'ind', 'bug'

# Stage 1: load the PanLex tables from panlex_dir.
print('loading panlex resources...')
stime = time.time()

langvar_df, expr_df, deno_df = load_panlex_resources(panlex_dir)

print(f'finished loading panlex resources in {time.time() - stime:.2f}s')

# Stage 2: monolingual lexicon for a single language.
print(f'extracting monolingual lexicon ({mono_lang})...')
stime = time.time()

monolingual_lexicon = extract_monolingual_lexicon(mono_lang, langvar_df, expr_df)
display(monolingual_lexicon.head(10))

print(f'finished extracting monolingual lexicon ({mono_lang}) in {time.time() - stime:.2f}s')

# Stage 3: bilingual lexicon for the (src_lang, tgt_lang) pair.
print(f'extracting bilingual lexicon ({src_lang}-{tgt_lang})...')
stime = time.time()

bilingual_lexicon = extract_bilingual_lexicon(src_lang, tgt_lang, langvar_df, expr_df, deno_df)
display(bilingual_lexicon.head())

print(f'finished extracting bilingual lexicon ({src_lang}-{tgt_lang}) in {time.time() - stime:.2f}s')

loading panlex resources...
finished loading panlex resources in 50.70s
extracting monolingual lexicon (ind)...


Unnamed: 0,ind
0,menangkalkan
1,menyetrum
2,radiofoto
3,krakatoa
4,jiwei
5,kanker
6,neraca
7,dijamin
8,pantalon
9,focal


finished extracting monolingual lexicon (ind) in 18.61s
extracting bilingual lexicon (ind-eng)...


Unnamed: 0,ind,bug
0,Bahasa Bugis,ᨅᨔ ᨕᨘᨁᨗ
1,bahasa bugis,ᨅᨔ ᨕᨘᨁᨗ
2,mengkudu,ᨅᨍ
3,Jepang,ᨍᨛᨄ
4,Korea,ᨀᨚᨑᨗᨕ


finished extracting bilingual lexicon (ind-end) in 31.07s
CPU times: user 1min 34s, sys: 6.06 s, total: 1min 40s
Wall time: 1min 40s


In [30]:
# Rebind `bilingual_lexicon` to the eng-cni pair and preview its first rows.
bilingual_lexicon = extract_bilingual_lexicon(
    'eng', 'cni',
    langvar_df, expr_df, deno_df,
)
display(bilingual_lexicon.head(5))

Unnamed: 0,eng,cni
0,I,nainti
1,I,naro
2,I,narori
3,you,awiro
4,one,aparo


In [31]:
# Example sentences kept for manual lexicon-coverage checks; the cni sentence
# (with punctuation removed) is the input of the token lookup cell further down.
# NOTE(review): '||' presumably separates the two sentences of a pair — confirm.
# aym: Jupax sanwa: Mamita, utankastwa || Utar purinxtwa sasaw mamaparux sanxa
# cni: Iriori ikantiro: Ina, nosaiki pankotsiki. || Ikantiro iriniro yaretaja pankotsiki.

In [32]:
# Render whatever `bilingual_lexicon` currently holds (the eng-cni frame if the
# cells are run in file order) as the cell's rich output.
bilingual_lexicon

Unnamed: 0,eng,cni
0,I,nainti
1,I,naro
2,I,narori
3,you,awiro
4,one,aparo
5,two,apite
6,person,amatsenka
7,fish,Sima
8,fish,parenti
9,dog,otsitsi


In [34]:
# Rows whose cni entry is exactly 'ina' (case-sensitive comparison).
is_ina = bilingual_lexicon['cni'] == 'ina'
bilingual_lexicon.loc[is_ina, :]

Unnamed: 0,eng,cni


In [36]:
# Look up every token of the sample cni sentence in the eng-cni lexicon.
# The sentence is lowercased, so the lexicon side must be lowercased as well:
# entries stored with capitals (e.g. 'Sima') could otherwise never match.
x = 'Iriori ikantiro Ina nosaiki pankotsiki Ikantiro iriniro yaretaja pankotsiki'.lower()

# Loop-invariant: compute the case-folded lexicon column once.
# Missing entries stay NaN and simply never compare equal to a token.
cni_lower = bilingual_lexicon['cni'].str.lower()

# split() without an argument also copes with repeated or leading/trailing spaces.
for token in x.split():
    print(token, bilingual_lexicon.loc[cni_lower == token, :].values)

iriori []
ikantiro []
ina []
nosaiki []
pankotsiki []
ikantiro []
iriniro []
yaretaja []
pankotsiki []


In [16]:
# Build the spa-aym lexicon explicitly instead of relying on whatever
# `bilingual_lexicon` happens to hold: no other cell in this notebook creates a
# frame with a 'spa' column, so on a fresh kernel the lookup raised KeyError.
# A dedicated name also keeps the eng-cni frame used by other cells intact.
spa_aym_lexicon = extract_bilingual_lexicon('spa', 'aym', langvar_df, expr_df, deno_df)

# Preview the first 50 rows as {column: {spa_word: value}}; Spanish words that
# occur more than once collapse to a single key in the resulting dict.
spa_aym_lexicon.set_index('spa').head(50).to_dict()

{'aym': {'seis': 'suxta',
  'agua': 'uma',
  'tierra': 'uraḳi',
  'suelo': 'uraḳi',
  'polvo': 'laḳˀa',
  'lodo': 'ɲiḳˀi',
  'arena': 'čˀalʸa',
  'loma': 'ḳulʸu',
  'montaña': 'ḳulʸu',
  'campo': 'pampa',
  'llanura': 'pampa',
  'pampa': 'pampa',
  'valle': 'ḳʰirwa',
  'isla': 'isla',
  'costa': 'ḳuta laka',
  'playa': 'ḳuta laka',
  'ribera': 'ḳuta laka',
  'cueva': 'tixi',
  'mar': 'lamara',
  'espuma': 'xupʰuḳu',
  'océano': 'mama-ḳuta',
  'lago': 'ḳuta',
  'laguna': 'ḳuta-ɲa',
  'ola': 'uxi',
  'arroyo': 'xawira',
  'corriente': 'xawira',
  'río': 'xawira',
  'manantial': 'pʰučˀu',
  'pozo': 'pʰučˀu',
  'pantano': 'ḳʰuči',
  'cascada': 'pʰaxča',
  'selva': 'čˀumi',
  'árbol': 'ali',
  'madera': 'kˀulʸu',
  'piedra': 'ḳarḳa',
  'roca': 'ḳarḳa',
  'cielo': 'laḳampu',
  'sol': 'wilʸka',
  'luna': 'pʰaxsi',
  'estrella': 'wara-wara',
  'relámpago': 'ḳˀixu-ḳˀixu',
  'rayo': 'lʸixu-lʸixu',
  'tempestad': 'wayra'}}

In [10]:
# Language-variety rows registered under the code 'cni'.
cni_rows = langvar_df['lang_code'] == 'cni'
langvar_df[cni_rows]

Unnamed: 0,id,lang_code,var_code,mutable,name_expr,script_expr,meaning,region_expr,uid_expr,grp
4788,4547,cni,0,t,18586488,18147719,35680514,26528845,18586487,4547


### In-Context Alignment

In [2]:
# Load the three PanLex tables used by every extraction cell in this section.
panlex_dir = 'panlex-20230501-csv'
panlex_tables = load_panlex_resources(panlex_dir)
langvar_df, expr_df, deno_df = panlex_tables

In [36]:
# Target language codes, one string per group. The groups mirror the
# af_langs / sa_langs / id_langs / tsm_langs lists defined further down.
languages = (
    'amh hau ibo lug pcm sna swc swh xho yor '
    'aym bzd cni grn hch nah oto quy shp tar '
    'bbc sun jav mad mak min '
    'ara fra deu hin ita por spa'
).split()

In [37]:
# Best-effort extraction of one eng-<lang> lexicon per language. Results are
# cached as CSV so re-running the cell skips languages that are already done.
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists; otherwise every language would fail.
os.makedirs('lexicons', exist_ok=True)

for lang in languages:
    print(lang)
    lexicon_path = f'lexicons/eng-{lang}_lexicon.csv'
    if os.path.exists(lexicon_path):
        continue
    try:
        bilexicon_df = extract_bilingual_lexicon('eng', lang, langvar_df, expr_df, deno_df)
        bilexicon_df.to_csv(lexicon_path, index=False)
    except Exception as err:
        # Keep going with the remaining languages, but report which one failed
        # and why. `Exception` (unlike a bare `except:`) still lets
        # KeyboardInterrupt stop a long-running extraction.
        print(f'ERROR: {lang}: {err!r}')

amh
hau
ibo
lug
pcm
sna
swc
swh
xho
yor
aym
bzd
cni
grn
hch
nah
oto
quy
shp
tar
bbc
sun
jav
mad
mak
min
ara
fra
deu
hin
ita
por
spa


In [38]:
# Load the cached eng-<lang> lexicons into a dict keyed by language code.
# Languages whose CSV cannot be read are skipped, so they will be absent from
# `bdfs`; the error message names the file so the gap is easy to trace.
bdfs = {}
for lang in languages:
    lexicon_path = f'lexicons/eng-{lang}_lexicon.csv'
    try:
        bdfs[lang] = pd.read_csv(lexicon_path)
    except Exception as err:
        # Best-effort: continue with the other languages, but say what failed.
        # `Exception` (unlike a bare `except:`) does not swallow KeyboardInterrupt.
        print(f'ERROR: could not load {lexicon_path}: {err!r}')

In [39]:
# Label names (in English) and language codes for each of the four task groups.
# Each *_labels list is looked up in the eng-<lang> lexicons of the matching
# *_langs list by the cells that follow.

# Group "af": topic labels.
af_labels = [
    "business", "entertainment", "health", "politics",
    "religion", "sports", "technology",
]
af_langs = [
    "amh", "hau", "ibo", "lug", "pcm",
    "sna", "swc", "swh", "xho", "yor",
]

# Group "sa": inference labels.
sa_labels = ["entailment", "neutral", "contradiction"]
sa_langs = [
    "aym", "bzd", "cni", "grn", "hch",
    "nah", "oto", "quy", "shp", "tar",
]

# Group "id": polarity labels.
id_labels = ["negative", "neutral", "positive"]
id_langs = ["bbc", "sun", "jav", "mad", "mak", "min"]

# Group "tsm": polarity labels (same label names as "id", separate list object).
tsm_labels = ["negative", "neutral", "positive"]
tsm_langs = ["ara", "fra", "deu", "hin", "ita", "por", "spa"]

In [33]:
# One output line per "af" language: the lexicon entries for each topic label,
# tab-separated in af_labels order.
for lang in af_langs:
    bdf = bdfs[lang]
    for label in af_labels:
        label_rows = bdf['eng'] == label
        translations = bdf.loc[label_rows, lang].values
        print(translations, end='\t')
    print()

[]	[]	['ትምህርተ፡ጤና']	['ፖለቲካ']	['ሃይማኖት' 'ሃይማኖት']	[]	['ቴክኖዎሎጂ']	
['hidimā' 'sabga' 'shagali' 'shaʼani' 'tagala']	[]	['lafiya']	['siyasa']	['àddīnī̀' 'àdíinìi' 'addini' 'addini']	[]	[]	
[]	[]	['ezi ndụ̀' 'àrụ ikē' 'ezi ndụ̀']	[]	[]	[]	['nka na uzu']	
['kusubula' 'okusubula' 'okusubula']	[]	['bulamu' 'kuwangula' 'obulamu' 'okuwangula']	['byʼobufuzi' 'ebyʼobufuzi']	[]	[]	['tekinologia' 'tekinologia' 'tekinologia']	
[]	[]	[]	[]	[]	[]	[]	
[]	['mutambo']	['hutano']	[]	[]	[]	[]	
[]	[]	[]	[]	[]	[]	[]	
['biashara' 'jambo' 'kitu' 'amali' 'bia' 'biashara' 'duka' 'hoja' 'kazi'
 'kisa' 'maishilio' 'shughuli' 'tarafa' 'tarafu' 'tatizo' 'tume'
 'ubiashara' 'amali' 'bia' 'biashara' 'duka' 'hoja' 'kazi' 'kisa'
 'maishilio' 'tatizo' 'shughuli' 'tarafa' 'tarafu' 'tume' 'ubiashara'
 'jambo' 'amali' 'amali' 'bia' 'biashara' 'biashara' 'hoja' 'hoja' 'kazi'
 'maishilio' 'shughuli' 'shughuli' 'tarafa' 'tarafa' 'tarafu' 'tarafu'
 'tume' 'ubiashara' 'jambo' 'amara' 'shughuli' 'amali' 'biashara'
 'biashara' 'biashar

In [34]:
# For every "sa" language, print the lexicon entries found for each label.
for lang in sa_langs:
    bdf = bdfs[lang]
    for label in sa_labels:
        label_rows = bdf['eng'] == label
        print(lang, label, bdf.loc[label_rows, lang].values)

aym entailment []
aym neutral []
aym contradiction []
bzd entailment []
bzd neutral []
bzd contradiction []
cni entailment []
cni neutral []
cni contradiction []
grn entailment []
grn neutral []
grn contradiction []
hch entailment []
hch neutral []
hch contradiction []
nah entailment []
nah neutral []
nah contradiction []
oto entailment []
oto neutral []
oto contradiction []
quy entailment []
quy neutral []
quy contradiction ['ayñi' 'hayu']
shp entailment []
shp neutral []
shp contradiction []
tar entailment []
tar neutral []
tar contradiction []


In [35]:
# For every "id" language, print the lexicon entries found for each label.
for lang in id_langs:
    bdf = bdfs[lang]
    for label in id_labels:
        label_rows = bdf['eng'] == label
        print(lang, label, bdf.loc[label_rows, lang].values)

bbc negative []
bbc neutral []
bbc positive []
sun negative []
sun neutral []
sun positive []
jav negative []
jav neutral []
jav positive []
mad negative []
mad neutral []
mad positive []
mak negative []
mak neutral []
mak positive []
min negative []
min neutral []
min positive []


In [40]:
# For every "tsm" language, print the lexicon entries found for each label.
for lang in tsm_langs:
    bdf = bdfs[lang]
    for label in tsm_labels:
        label_rows = bdf['eng'] == label
        print(lang, label, bdf.loc[label_rows, lang].values)

ara negative []
ara neutral []
ara positive []
fra negative ['négatif' 'négatif' 'cliché' 'épreuve négative' 'négatif' 'négatif'
 'négatif' 'négatif' 'négatif' 'négation' 'démenti' 'négatif' 'négatif'
 'négatif' 'négatif' 'négation' 'nier' 'réfuter' 'rejeter' 'repousser'
 'négatif' 'négatif' 'négatif' 'négatif' 'négatif' 'négatif' 'négatif'
 'négatif' 'négatif' 'négatif' 'négatif' 'négatif' 'négatif' 'négatif'
 'négatif' 'négatif' 'non' 'négatif' 'électronégatif' 'négatif' 'négatif'
 'négatif' 'apophatique' 'négatif' 'négatif' 'strictement négatif'
 'négatif' 'strictement négatif' 'négatif' 'strictement négatif' 'négatif'
 'strictement négatif' 'négatif' 'strictement négatif' 'négatif'
 'strictement négatif' 'négatif' 'strictement négatif' 'négatif'
 'strictement négatif' 'négatif' 'strictement négatif' 'négatif'
 'strictement négatif' 'strictement négatif' 'négatif' 'négatif' 'négatif'
 'film négatif' 'négatif' 'négatif' 'négatif' 'négatif'
 'strictement négatif' 'négatif' 'négatif' '