In [None]:
!pip install datasets

In [3]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import string
import nltk
import re
import numpy as np
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import get_linear_schedule_with_warmup
import torch
from torch import nn, optim
import pickle
from torch.utils.data import Dataset, DataLoader

In [4]:
# Lookup table mapping each ASCII digit character to its English word.
numbers = dict(zip(
    "0123456789",
    ("zero", "one", "two", "three", "four",
     "five", "six", "seven", "eight", "nine"),
))

def remove_puntuations(txt):
    """Strip ASCII punctuation from ``txt``.

    The separators ``. ! ? : ; -`` are turned into a single space each so the
    words around them stay apart; every other character listed in
    ``string.punctuation`` is deleted outright. Characters outside
    ``string.punctuation`` are left untouched.

    Args:
        txt: Input string.

    Returns:
        The string with punctuation replaced or removed as described above.
    """
    # Start by deleting all punctuation, then override the word separators
    # so they become spaces instead of vanishing.
    table = {ord(mark): None for mark in string.punctuation}
    for separator in ".!?:;-":
        table[ord(separator)] = " "
    return txt.translate(table)

def number_to_words(txt):
    """Spell out every digit found in ``txt``.

    Each digit character that is a key of the module-level ``numbers`` table
    is replaced by its word followed by one space (so ``"12"`` becomes
    ``"one two "``).

    Args:
        txt: Input string.

    Returns:
        The string with digits replaced by their spelled-out words.
    """
    # The keys of `numbers` are single characters, so one translation table
    # performs all substitutions in a single pass.
    digit_table = {ord(digit): word + " " for digit, word in numbers.items()}
    return txt.translate(digit_table)

def preprocess_text(text, ascii_only=True):
    """Normalize a raw document string for downstream tokenization.

    Steps, in order: lowercase; turn underscores into spaces; spell out
    digits via ``number_to_words``; strip punctuation via
    ``remove_puntuations``; optionally drop non-ASCII characters; collapse
    runs of whitespace into single spaces and trim the ends.

    Args:
        text: Raw input string.
        ascii_only: When True (the default, matching the previous behavior),
            every character with a code point >= 128 is removed. This strips
            accented letters and removes Cyrillic text entirely, so pass
            False for non-English input that must keep its characters.

    Returns:
        The normalized string.
    """
    # Underscores are converted to spaces *before* punctuation removal,
    # which would otherwise delete them and glue the neighbouring words.
    text = text.lower().replace('_', ' ')
    text = number_to_words(text)
    text = remove_puntuations(text)
    if ascii_only:
        text = ''.join(ch for ch in text if ord(ch) < 128)
    return ' '.join(text.split())

In [5]:
# 'multi_eurlex' is loaded through a builder script; the loader warns that
# `trust_remote_code=True` will become mandatory for it, so opt in explicitly.
dataset = load_dataset('multi_eurlex', 'all_languages', trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/138k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/47.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/55000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [24]:
# train_sentences_en = []
# train_ds_sample = dataset['train'].select(range(10000))
# for i in tqdm(train_ds_sample):
#     train_sentences_en.append(preprocess_text(i['text']['en']))

test_langs = ['de', 'hu', 'es', 'bg', 'lv']
test_sentences = {}

# Preprocess the first 500 test documents, collecting one list of cleaned
# texts per language (all lists share the same document order).
for sample in tqdm(dataset['test'].select(range(500))):
    texts_by_lang = sample['text']
    for lang in test_langs:
        cleaned = preprocess_text(texts_by_lang[lang])
        test_sentences.setdefault(lang, []).append(cleaned)

100%|██████████| 500/500 [00:17<00:00, 27.98it/s]


In [None]:
!pip install easynmt

In [10]:
# Machine-translation wrapper used to translate the test sentences into English.
from easynmt import EasyNMT
# 'opus-mt' selects the model family EasyNMT should load; the object is
# reused by later cells through the global name `translation_model`.
translation_model = EasyNMT('opus-mt')

11.9kB [00:00, 5.87MB/s]                   


In [None]:
# Translate the preprocessed Hungarian test sentences into English, keyed by
# their position in test_sentences['hu'].
# The source language is passed explicitly: the text has been lowercased and
# stripped of punctuation and non-ASCII characters, so automatic language
# detection would be working on degraded input.
hu_translated_sentences = {}
for idx, sentence in enumerate(tqdm(test_sentences['hu'])):
  hu_translated_sentences[idx] = translation_model.translate(
      sentence, source_lang='hu', target_lang='en'
  )

 89%|████████▊ | 443/500 [23:34<03:35,  3.78s/it]

In [26]:
import pickle

# Persist the Hungarian->English translations. The context manager guarantees
# the file is flushed and closed even if pickling raises.
with open('/content/drive/MyDrive/NLP/Project/Test_translations/hu_to_en.pkl', 'wb') as out_file:
    pickle.dump(hu_translated_sentences, out_file)

In [None]:
class MultiLabelDataset(Dataset):
    """Dataset that translates each text to English and tokenizes the result.

    Args:
        texts: Mapping from language code to a list of text strings.
        lang: Language code selecting which list of ``texts`` to serve; it is
            also passed to the translator as the source language.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` tensors (called with
            ``return_tensors='pt'``).
        max_len: Sequence length used for padding and truncation.
        translator: Optional object exposing
            ``translate(text, source_lang=..., target_lang=...)``. When
            omitted, the module-level ``translation_model`` is used.
    """

    def __init__(self, texts, lang, tokenizer, max_len=128, translator=None):
        self.tokenizer = tokenizer
        self.texts = texts[lang]
        self.lang = lang
        self.max_len = max_len
        self.translator = translator

    def __len__(self):
        """Return the number of texts available for the selected language."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Translate item ``idx`` to English and return its token tensors.

        Returns:
            Dict with 1-D ``input_ids`` and ``attention_mask`` tensors.
        """
        text = self.texts[idx]
        # Resolve the global lazily so the class can be defined before the
        # translation model exists.
        translator = self.translator if self.translator is not None else translation_model
        translated_text = translator.translate(
            text, source_lang=self.lang, target_lang='en'
        )
        # Encode the *translated* text. Previously the untranslated `text`
        # was tokenized, so the translation was computed and then discarded.
        encoding = self.tokenizer(
            translated_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # The tokenizer output carries a leading batch axis of size 1;
            # flatten it so the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [None]:
class XLMRForMultiLabelClassification(nn.Module):
    """XLM-RoBERTa encoder bundled with a linear layer sized for ``num_labels``.

    The linear layer is constructed but is not applied in ``forward``; the
    forward pass hands back the encoder output directly.
    """

    def __init__(self, num_labels):
        super().__init__()
        encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.xlmr = encoder
        # Registered as a submodule even though forward() never calls it
        # (presumably so checkpoints that include these weights still load).
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return element 0 of its output.

        For Hugging Face encoder models element 0 is the last hidden state,
        i.e. one vector per input token. No pooling, classifier or sigmoid is
        applied.
        """
        encoded = self.xlmr(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return encoded[0]

In [None]:
# Location of the saved weights for the model instantiated below.
checkpoint_path = '/content/drive/MyDrive/NLP/Project/MT/xlm-roberta-base-model_state_dict_2_epochs.pt'

# Build the model on the target device, then restore its weights; the
# checkpoint tensors are mapped onto the same device while loading.
nmt_model = XLMRForMultiLabelClassification(num_labels=21).to(device)
nmt_model.load_state_dict(
    torch.load(checkpoint_path, map_location=torch.device(device))
)

<All keys matched successfully>

In [None]:
# Tokenizer for the same 'xlm-roberta-base' checkpoint name the model's encoder is built from.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

In [None]:
# test_langs = ['de', 'hu', 'es', 'bg', 'lv']

In [None]:
# Dataset over the preprocessed Spanish ('es') sentences, padded/truncated to 128 tokens.
# NOTE(review): despite the `train_` prefix, `test_sentences` is built from dataset['test'].
train_ds_es = MultiLabelDataset(test_sentences, 'es', tokenizer, max_len=128)

In [None]:
# Sanity check: fetch the first item to confirm the dataset yields
# 'input_ids' and 'attention_mask' tensors.
train_ds_es[0]

In [None]:
# Batches of 64 served in dataset order (shuffle=False); the embedding loop
# numbers its outputs sequentially, so that numbering follows sentence order.
train_dataloader = DataLoader(train_ds_es, batch_size=64, shuffle=False)

In [None]:
# Maps a running sample index (0, 1, 2, ...) to that sample's model output,
# moved to the CPU.
nmt_embeddings = {}

nmt_model.eval()
with torch.no_grad():
  for batch in tqdm(train_dataloader):
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)
    batch_outputs = nmt_model(ids, mask)

    # Iterating over the batch axis yields one tensor per sample; the next
    # free key always equals the number of entries stored so far.
    for sample_output in batch_outputs:
      nmt_embeddings[len(nmt_embeddings)] = sample_output.cpu()


In [None]:
# Inspect the shape of the first stored output tensor.
nmt_embeddings[0].shape

torch.Size([128, 768])

In [None]:
# Number of stored outputs; should equal the number of sentences in the dataset.
# NOTE(review): the saved output (10000) does not match the 500 test sentences
# selected earlier in this notebook - it looks stale; re-run to confirm.
len(nmt_embeddings)

10000

In [None]:
# Persist all stored outputs. The context manager guarantees the file is
# flushed and closed even if pickling raises.
# NOTE(review): the target folder is named Train_embeddings, while the
# sentences fed through the model come from dataset['test'] - confirm the path.
with open('/content/drive/MyDrive/NLP/Project/Train_embeddings/nmt.pkl', 'wb') as out_file:
    pickle.dump(nmt_embeddings, out_file)

In [None]:
# Save the slice of outputs with indices 9000..9999 as a separate chunk file.
# NOTE(review): this range assumes nmt_embeddings holds at least 10000
# entries; with fewer entries the lookup raises KeyError - confirm the range.
embeddings_to_be_saved = {idx: nmt_embeddings[idx] for idx in range(9000, 10000)}
# The context manager guarantees the file is flushed and closed.
with open('/content/drive/MyDrive/NLP/Project/Train_embeddings/nmt_10.pkl', 'wb') as out_file:
    pickle.dump(embeddings_to_be_saved, out_file)

In [None]:
# Reload the chunk written above to verify it round-trips.
# pickle can execute arbitrary code while loading - only open files you created yourself.
with open('/content/drive/MyDrive/NLP/Project/Train_embeddings/nmt_10.pkl', 'rb') as in_file:
    x = pickle.load(in_file)
# Show the entry count rather than echoing every tensor into the cell output.
len(x)