In [2]:
import os, torch, sys, scipy
import numpy as np
import torch.nn.functional as F
from utils.text_loader import get_word2id, load_vocabs, write_vocabs
from utils.helper import dict2clsattr
from translation import align_words, load_test_dict, train_supervision, cal_similarity, robust_procrustes, get_dico_dict
from models.embedding import ClipEmbedding
from evals.word_translation import get_csls_word_translation, get_topk_translation_accuracy, get_topk_accuracy, get_candidates, read_txt_embeddings
import argparse, json
import configs
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# Presumably this disables parallelism in the HuggingFace tokenizers library used by
# the CLIP embedding model -- TODO confirm.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

In [71]:
# Build the run configuration from the JSON analysis config.
with open("configs/analysis.json") as config_file:
    model_config = json.loads(config_file.read())

# Expose the config entries as attributes on an args object.
args = dict2clsattr({}, model_config)

# Language code for each side of the pair, keyed by the project's side constants.
args.langs = dict(zip((configs.SRC, configs.TGT), (args.src_lang, args.tgt_lang)))

# Pinned to CPU; CUDA auto-detection is not used in this notebook.
args.device = torch.device('cpu')

In [72]:
# Read the pretrained word vectors for each side of the language pair.
# Each container is keyed by the same side key as args.langs.
id2words, word2ids, embs = {}, {}, {}
for side, lang in args.langs.items():
    # howto100m skip-gram vectors; the MUSE wiki vectors
    # (../../clip-uwt/muse/data/wiki.<lang>.vec) are the alternative source.
    emb_pth = f'../../clip-uwt/dataset/howtotext/howto100m_{lang}_sentences_train_text_txt_processed.txt.vectors.1th.win5.dim300.skipgram.vec'
    loaded = read_txt_embeddings(emb_pth)
    # read_txt_embeddings yields (id -> word, word -> id, embedding matrix) in that order.
    id2words[side], word2ids[side], embs[side] = loaded

Load vocabs from en-fr

In [73]:
# For each side, load its vocabulary and keep only the embedding rows of those
# words, in vocabulary order.
vocabs, new_embs = {}, {}
for key, lang in args.langs.items():
    side_vocab = load_vocabs(lang, args.langs, args.word_data, args.data_mode)
    vocabs[key] = side_vocab
    # Every vocabulary word must exist in the embedding file; a missing word
    # raises KeyError here.
    word_to_row = word2ids[key]
    row_ids = np.asarray([word_to_row[w] for w in side_vocab])
    new_embs[key] = embs[key][row_ids]
    

In [78]:
# Write the vocabulary-restricted embedding matrices to disk, one file per language.
# NOTE(review): the destination is named fasttext/wiki, while the vectors loaded
# earlier in this notebook come from the howto100m skip-gram file -- confirm the
# destination is intended.
root = f"../../clip-uwt/dicts/embeddings/fasttext/wiki/fasttext_wiki_{args.langs['src']}_{args.langs['tgt']}"
for side in ('src', 'tgt'):
    # np.save appends the ".npy" extension to the given path.
    np.save(root + f"_{args.langs[side]}_test", new_embs[side])

(2879, 300)

In [60]:
# Load the test dictionary. Judging by how later cells index it, column 0 holds
# source word ids and column 1 holds target word ids (one row per translation pair).
test_dico = load_test_dict(args, word2ids)
# Display its shape (number of pairs, number of columns).
test_dico.shape

torch.Size([2879, 2])

In [61]:
# Unique source word ids, in order of first appearance in the test dictionary.
# dict.fromkeys de-duplicates in a single pass while preserving insertion order,
# replacing the quadratic "scan the list for every element" membership test.
# The elements remain NumPy integer scalars, so code that calls `.item()` on
# them (as the vocabulary-building cell does) keeps working.
src_ids = list(dict.fromkeys(test_dico[:, 0].cpu().numpy()))

In [62]:
# Number of distinct source words in the test dictionary.
len(src_ids)

1483

In [63]:
# Source embeddings: one row per *unique* source word id (src_ids is de-duplicated).
src_embs = embs['src'][src_ids]
# Target embeddings: one row per dictionary pair (column 1 of test_dico, duplicates kept).
# Consequently src_embs and tgt_embs are not row-aligned and generally differ in length.
tgt_embs = embs['tgt'][test_dico[:, 1]]

Use the dictionary dataset

In [64]:
# Display (number of unique source words, embedding dimension).
src_embs.shape

(1483, 300)

In [65]:
# Save the dictionary-derived embedding matrices for both the source and the
# target language, one file per language.
# NOTE(review): the directory is spelled "howtowtext" here, whereas the embedding
# files are read from a "howtotext" directory -- confirm the spelling is intended.
root = f"../../clip-uwt/dicts/embeddings/howtowtext/wiki/howtowtext_wiki_{args.langs['src']}_{args.langs['tgt']}"
for side, matrix in (('src', src_embs), ('tgt', tgt_embs)):
    # np.save appends the ".npy" extension to the given path.
    np.save(root + f"_{args.langs[side]}_test", matrix)

In [68]:
# Rebuild the vocabularies from the test dictionary by mapping word ids back to
# words: unique source words on one side, one target word per dictionary pair
# on the other. This replaces any previously loaded `vocabs`.
vocabs = {
    'src': [id2words['src'][idx.item()] for idx in src_ids],
    'tgt': [id2words['tgt'][idx.item()] for idx in test_dico[:, 1]],
}

In [69]:
# Persist the word list of each side, reporting the language once it is written.
for key, lang in args.langs.items():
    # `lang` is the same value as args.langs[key], so it is passed directly.
    write_vocabs(vocabs[key], lang, args.langs, args.word_data, args.data_mode)
    print('Done', lang)

Done en
Done fr
