In [None]:
import torch
from torch import Tensor
from transformers import *

# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Tokenizer and model are both loaded from the 'bert-base-uncased' checkpoint;
# the model is then moved onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device=device)


In [None]:
from typing import List
from conllu import parse_incr, TokenList
from polyglot.mapping import Embedding
from polyglot.downloader import downloader
from nltk import word_tokenize

# Lookup table filled from the dictionary file below: first field of each line
# -> second field (presumably Dutch -> English, going by the variable names).
nl_en = {}

# Word embeddings loaded from a local polyglot archive.
embeddings = Embedding.load("embeddings2/nl/embeddings_pkl.tar.bz2")

with open('data/filtered_en_nl_dict.txt', 'r') as bidict:
    for trans in bidict:
        # Each line must split into exactly two whitespace-separated fields;
        # any other field count makes the unpacking raise ValueError.
        nl, en = trans.split()
        # A key that appears on several lines keeps the value of its last line.
        nl_en[nl] = en

In [None]:
def preprocess_data(path):
    """Read a text file and tokenize it line by line.

    Every line of the file is treated as one sentence: it is lower-cased and
    passed to ``word_tokenize``.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one entry per line of the file, each entry being whatever
        ``word_tokenize`` returned for that lower-cased line.
    """
    with open(path, 'r') as handle:
        return [word_tokenize(line.lower()) for line in handle]

In [None]:
# Tokenize the three data files; each result is a list of token lists,
# one per line of the corresponding file.
train_data = preprocess_data('data/train.txt')
valid_data = preprocess_data('data/valid.txt')
test_data = preprocess_data('data/test.txt')
# Show the first five tokenized training sentences as a quick sanity check.
print(train_data[0:5])

In [None]:
from tqdm import tqdm
import string

def fetch_sen_reps(data, model, tokenizer, embeddings, nl_en, model_type, device, concat=False):
    """Build aligned (target, source) token representations for each sentence.

    A word is kept only if it is in ``embeddings.vocabulary`` AND can be mapped
    to a token id, either through its ``nl_en`` translation or, failing that,
    because it is found in ``string.punctuation``. For every kept word the
    static vector from ``embeddings`` (source) is paired with the vector that
    ``model`` outputs at the matching position (target). All other words are
    dropped. A sentence in which no word is kept is skipped entirely.

    Args:
        data: Iterable of sentences, each an iterable of word strings.
        model: Callable applied to a ``(1, n_tokens)`` long tensor of ids;
            ``model(ids)[0][0]`` is taken as the per-position output.
        tokenizer: Object whose ``encode(word)`` returns a sequence of ids.
            Only the element at index 1 is used (presumably the first word
            piece after the leading special token -- TODO confirm).
        embeddings: Object exposing ``vocabulary`` (membership test) and
            ``get(word)`` (vector for a word).
        nl_en: Mapping from a word in ``data`` to the word given to ``tokenizer``.
        model_type: Only ``'TF'`` is handled; with any other value every
            sentence is ignored.
        device: Torch device on which the model input and the source vectors
            are placed before the forward pass.
        concat: If True, return flat tensors with one row per kept token
            instead of per-sentence lists.

    Returns:
        If ``concat`` is False: ``(targets, sources, lengths)`` where
        ``targets`` and ``sources`` are lists holding one CPU tensor per kept
        sentence and ``lengths`` is a ``Tensor`` with the number of kept tokens
        in each of those sentences.
        If ``concat`` is True: ``(targets, sources)`` as two stacked tensors
        covering the kept tokens of all sentences.

    Raises:
        RuntimeError: If ``concat`` is True and no token was kept at all,
            because ``torch.stack`` rejects an empty list.
    """
    # Hard-coded sequence delimiters wrapped around every input; presumably the
    # [CLS]/[SEP] ids of the BERT vocabulary in use -- TODO confirm if the
    # tokenizer ever changes.
    cls_id, sep_id = 101, 102

    sen_reps_source = []
    sen_reps_target = []
    sen_len = []

    for sentence in tqdm(data):
        if model_type != 'TF':
            continue

        output_source = []
        total_tokens = [cls_id]

        for word in sentence:
            if word not in embeddings.vocabulary:
                continue

            if word in nl_en:
                input_ids = tokenizer.encode(nl_en[word])[1]
            elif word in string.punctuation:
                # Note: this is a substring test against the punctuation string.
                input_ids = tokenizer.encode(word)[1]
            else:
                continue

            # Look the vector up only for words that are actually kept.
            output_source.append(Tensor(embeddings.get(word)).to(device))
            total_tokens.append(input_ids)

        total_tokens.append(sep_id)

        # Nothing usable in this sentence: torch.stack would raise on an empty
        # list and abort the whole run, so skip the sentence instead.
        if not output_source:
            continue

        output_source = torch.stack(output_source)
        input_sen = torch.tensor(total_tokens, dtype=torch.long, device=device).unsqueeze(0)

        # Pure feature extraction: no autograd graph is needed.
        with torch.no_grad():
            # Drop the first and last positions, i.e. the two delimiter tokens.
            output_sen = model(input_sen)[0][0][1:-1]

        if concat:
            # Iterating a 2-D tensor yields its rows, so this adds one entry per token.
            sen_reps_target.extend(output_sen.cpu())
            sen_reps_source.extend(output_source.cpu())
        else:
            sen_reps_target.append(output_sen.cpu())
            sen_reps_source.append(output_source.cpu())

        sen_len.append(output_sen.size(0))

    if concat:
        return torch.stack(sen_reps_target), torch.stack(sen_reps_source)

    return sen_reps_target, sen_reps_source, Tensor(sen_len)

In [None]:
# Extract representations for the first 2000 training sentences. `concat` is
# left at its default (False), so three values are returned: the per-sentence
# target tensors, the per-sentence source tensors, and the sentence lengths.
sen_reps_target, sen_reps_source, sen_len = fetch_sen_reps(train_data[0:2000], model, tokenizer, embeddings, nl_en, 'TF', device)
