In [17]:
import pandas as pd
import os
from tqdm import tqdm

In [2]:
# Load the English train/dev/test TSV splits into a single lookup table `df`.
# `df` is indexed by the first TSV field (kept as a string) and has the columns
# 'classification' and 'text'.
indices = []
data = {
    'classification': [],
    'text': [],
}
for file in ['eng_data/eng_train.tsv', 'eng_data/eng_dev.tsv', 'eng_data/eng_test.tsv']:
    # Explicit encoding: without it open() uses the platform locale, so the same
    # notebook could decode the text differently on another machine.
    # NOTE(review): assumes the TSVs are UTF-8 - confirm against the data source.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Every row must have exactly three tab-separated fields; any other
            # field count raises ValueError on the unpacking below.
            idx, classification, text = line.strip().split('\t')
            indices.append(idx)
            data['classification'].append(classification)
            data['text'].append(text)
df = pd.DataFrame(data, index=indices)

In [18]:
# Project the English labels onto every file of the corpus directory.
#
# For each corpus file, the lines whose id is also an index label of `df` are
# written, together with the English label, to processed_data/<file>.csv.
# `coverage` records, per file, how many parsable lines were labelled
# ('annotated'), how many parsable lines there were ('total') and their ratio.
# NOTE(review): `dir` shadows the builtin dir(); the name is kept because a
# later cell assigns it as well.
dir = 'Taxi1500-c_v3.0'

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs('processed_data', exist_ok=True)

# id -> label lookup built once, instead of a chained df.loc[idx][...] per line.
labels = df['classification'].to_dict()

coverage = {
    'file': [],
    'annotated': [],
    'total': [],
    'proportion': [],
}
# os.listdir() returns entries in arbitrary order; sorting makes coverage.csv
# and any later order-dependent tie-breaking reproducible across machines.
for file in tqdm(sorted(os.listdir(dir))):
    indices = []
    data = {
        'classification': [],
        'text': [],
    }
    annotated = 0
    total = 0
    # Explicit encoding: the locale default is platform dependent.
    # NOTE(review): assumes every corpus file is UTF-8 - confirm.
    with open(os.path.join(dir, file), 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('#'):
                continue
            try:
                idx, text = line.strip().split('\t')
            except ValueError:
                # Lines that do not split into exactly two fields are skipped
                # and are not counted in `total`.
                continue
            total += 1
            if idx not in labels:
                continue
            indices.append(idx)
            data['classification'].append(labels[idx])
            data['text'].append(text)
            annotated += 1
    coverage['file'].append(file)
    coverage['annotated'].append(annotated)
    coverage['total'].append(total)
    coverage['proportion'].append(annotated / total if total > 0 else 0)
    lang_df = pd.DataFrame(data, index=indices)
    lang_df.to_csv(f'processed_data/{file}.csv', index_label='id', encoding='utf-8-sig')

coverage_df = pd.DataFrame(coverage)
coverage_df.to_csv('coverage.csv', index=False, encoding='utf-8-sig')
# Index by file name for the per-file lookups done further down the notebook.
coverage_df = coverage_df.set_index('file')

100%|██████████| 1384/1384 [00:38<00:00, 36.34it/s]


In [19]:
# Three-letter language codes, grouped into a high-resource and a
# medium-resource tier. A later cell looks these codes up among the language
# prefixes of the corpus file names.
# NOTE(review): several codes (e.g. "ara", "lav", "ltn") never match a corpus
# prefix in the recorded output - verify them against the corpus naming scheme.
high_resource_languages = [
    "eng", "spa", "deu", "jpn", "fra", "ara", "cmn",
]

medium_resource_languages = [
    "ukr", "ceb", "arz", "lav", "ind", "afr", "bos", "glg", "ell", "heb",
    "bel", "zlm", "tha", "kat", "dan", "tgl", "slv", "tam", "kaz", "ron",
    "bul", "uzb", "lit", "est", "slk", "ben", "urd", "ltn", "swe", "tur",
    "kor", "hin", "fas", "por", "ces", "rus", "nld", "pol", "hrv", "ita",
    "vie", "eus", "hun", "fin", "srp", "cat",
]

In [20]:
# Group the corpus files by language prefix, keeping only files with enough
# labelled lines. Result: `files` maps prefix -> list of file names, in the
# order the files appear in `coverage`.
dir = 'processed_data'
min_annotated = 900  # files with fewer labelled lines are left out
files = {}
for file, n_annotated in zip(coverage['file'], coverage['annotated']):
    underscore_parts = file.split('_')
    hyphen_parts = file.split('-')
    # The prefix is whatever precedes the first '_'; names without any '_'
    # fall back to the text before the first '-'. Names with neither are skipped.
    if len(underscore_parts) > 1:
        lang = underscore_parts[0]
    elif len(hyphen_parts) > 1:
        lang = hyphen_parts[0]
    else:
        continue
    if n_annotated >= min_annotated:
        files.setdefault(lang, []).append(file)

Not enough annotated data for bjp in file bjp_bjp.png.txt
Not enough annotated data for ulk in file ulk_ulk.ebible.txt
Not enough annotated data for dwu in file dwu_dwuliv.ebible.txt
Not enough annotated data for eng in file eng_engwyc2018.ebible.txt
Not enough annotated data for hvn in file hvn_hvn.ebible.txt
Not enough annotated data for adz in file adz_adz.ebible.txt
Not enough annotated data for aui in file aui-x-bible.txt
Not enough annotated data for mgw in file mgw_mgw.ebible.txt
Not enough annotated data for als in file als-x-bible.txt
Not enough annotated data for uri in file uri_uri.ebible.txt
Not enough annotated data for san in file san_sanori.ebible.txt
Not enough annotated data for cha in file cha-x-bible-1908.txt
Not enough annotated data for shj in file shj_shj.ebible.txt
Not enough annotated data for mgh in file mgh_mgh.ebible.txt
Not enough annotated data for eng in file eng_engoke.ebible.txt
Not enough annotated data for eng in file eng_eng-Brenton.ebible.txt
Not eno

In [23]:
# Keep the high- and medium-resource codes that have at least one usable corpus
# file in `files`, and report the codes that have none.
candidate_langs = high_resource_languages + medium_resource_languages
for lang in candidate_langs:
    if lang not in files:
        print(f'No files for language: {lang}')
transfer_langs = [code for code in candidate_langs if code in files]
print(transfer_langs)

No files for language: ara
No files for language: lav
No files for language: afr
No files for language: bos
No files for language: glg
No files for language: ell
No files for language: bel
No files for language: kat
No files for language: slv
No files for language: kaz
No files for language: bul
No files for language: uzb
No files for language: lit
No files for language: est
No files for language: slk
No files for language: ltn
No files for language: tur
No files for language: kor
No files for language: fas
No files for language: cat
['eng', 'spa', 'deu', 'jpn', 'fra', 'cmn', 'ukr', 'ceb', 'arz', 'ind', 'heb', 'zlm', 'tha', 'dan', 'tgl', 'tam', 'ron', 'ben', 'urd', 'swe', 'hin', 'por', 'ces', 'rus', 'nld', 'pol', 'hrv', 'ita', 'vie', 'eus', 'hun', 'fin', 'srp']


In [34]:
def _read_split_ids(path):
    """Return the ids of one English split file as a list of ints, in file order.

    Each line must consist of exactly three tab-separated fields; the first one
    is converted with int(). A line with a different field count, or a
    non-integer first field, raises ValueError.
    """
    ids = []
    # Explicit encoding: the locale default used by open() is platform dependent.
    # NOTE(review): assumes the TSVs are UTF-8 - confirm against the data source.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            idx, classification, text = line.strip().split('\t')
            ids.append(int(idx))
    return ids


# Integer ids of each English split; they decide which split a translated
# line is written to.
train = _read_split_ids('eng_data/eng_train.tsv')
dev = _read_split_ids('eng_data/eng_dev.tsv')
test = _read_split_ids('eng_data/eng_test.tsv')

In [35]:
# For every language prefix in `files`, pick the file with the most labelled
# lines, then split its processed CSV into train/dev/test according to the
# English split ids and write final_data/<lang>_{train,dev,test}.csv.

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs('final_data', exist_ok=True)

# Sets give constant-time membership tests; scanning the id lists for every row
# of every language is quadratic. `train`/`dev`/`test` themselves are untouched.
split_ids = (('train', set(train)), ('dev', set(dev)), ('test', set(test)))

for lang in tqdm(files):
    # max() returns the first maximal element, i.e. the earliest file in
    # files[lang] wins ties - the same choice a strict ">" scan makes.
    best_filename = max(files[lang], key=lambda name: coverage_df.at[name, 'annotated'])
    lang_df = pd.read_csv(os.path.join('processed_data', best_filename + '.csv'), index_col='id')

    # Hand rows out in train -> dev -> test order and drop them once assigned,
    # so an id listed in more than one split goes to the earliest split only.
    remaining = lang_df
    for split_name, ids in split_ids:
        in_split = remaining.index.isin(ids)
        split_df = remaining[in_split].reset_index()[['id', 'classification', 'text']]
        split_df.to_csv(f'final_data/{lang}_{split_name}.csv', index=False, encoding='utf-8-sig')
        remaining = remaining[~in_split]

100%|██████████| 804/804 [00:23<00:00, 33.79it/s]
