In [None]:
!pip install datasets

In [2]:
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import XLMRobertaModel, XLMRobertaConfig, XLMRobertaTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
import pickle
from tqdm import tqdm
# Default to the CPU and upgrade to the GPU only when CUDA is actually usable.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [3]:
# Load MultiEURLEX with every language attached to each document; later cells
# index an example's text by language code (e.g. ['text']['en']).
# The dataset is built by a loader script hosted on the Hub, and `datasets`
# warns that explicit consent (trust_remote_code=True) will become mandatory
# for such datasets, so pass it now instead of relying on the implicit default.
dataset = load_dataset('multi_eurlex', language='all_languages', trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/138k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/47.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [4]:
# Tokenizer that matches the 'xlm-roberta-base' checkpoint used by both models.
tokenizer = XLMRobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='xlm-roberta-base',
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [5]:
class XLMRForMultiLabelClassification(nn.Module):
    """Pretrained XLM-RoBERTa encoder with one linear multi-label head.

    ``forward`` returns probabilities (the sigmoid is already applied), so
    callers must not apply another sigmoid. ``get_embeddings`` returns the
    encoder's first output without recording gradients.
    """

    def __init__(self, num_labels):
        """Load 'xlm-roberta-base' and attach a ``hidden_size -> num_labels`` head.

        Args:
            num_labels: Number of independent labels scored by the head.
        """
        super().__init__()
        encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.xlmr = encoder
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-label probabilities for a batch of token ids.

        Args:
            input_ids: Token-id tensor passed straight to the encoder.
            attention_mask: Mask tensor passed straight to the encoder.

        Returns:
            ``sigmoid(classifier(encoder_output[1]))``.
        """
        encoder_out = self.xlmr(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the pooler output, i.e. the first
        # token's final hidden state after the encoder's pooling layer - not
        # the raw hidden state itself.
        summary = encoder_out[1]
        return torch.sigmoid(self.classifier(summary))

    def get_embeddings(self, input_ids, attention_mask):
        """Return element 0 of the encoder output, computed under ``no_grad``.

        The classification head is not involved.
        """
        with torch.no_grad():
            return self.xlmr(input_ids=input_ids, attention_mask=attention_mask)[0]



In [6]:
# Rebuild the 21-label classifier and restore its fine-tuned weights from Drive.
nmt_model = XLMRForMultiLabelClassification(num_labels=21).to(device)
# This model is only used for inference below, so leave training mode
# explicitly (a freshly constructed nn.Module starts in training mode).
nmt_model.eval()
# Kept as the last expression so the cell still shows the key-matching report.
nmt_model.load_state_dict(torch.load('/content/drive/MyDrive/NLP/Project/MT/xlm-roberta-base-model_state_dict_2_epochs.pt', map_location=torch.device(device)))

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

<All keys matched successfully>

In [7]:
class TextDataset(Dataset):
    """Map-style dataset of tokenised English documents with multi-hot labels.

    Each item is ``(input_ids, attention_mask, targets)``. The first two are
    1-D tensors of length ``max_len``; ``targets`` is a float vector of length
    ``num_labels``. Everything is returned on the CPU: the consumer moves whole
    batches to the accelerator, which also keeps the dataset usable from
    DataLoader worker processes.
    """

    def __init__(self, dataframe, tokenizer, max_len=128, num_labels=21):
        """
        Args:
            dataframe: Indexable collection whose rows expose
                ``row['text']['en']`` and ``row['labels']``.
            tokenizer: Callable tokenizer invoked with ``max_length``,
                ``padding``, ``truncation`` and ``return_tensors``.
            max_len: Fixed sequence length every text is padded/truncated to.
            num_labels: Size of the multi-hot target vector.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_labels = num_labels

    def label_transform(self, label):
        """Turn an iterable of label ids into a float multi-hot vector."""
        present = set(label)
        return torch.tensor([1 if i in present else 0 for i in range(self.num_labels)], dtype=torch.float)

    def __getitem__(self, index):
        # Fetch the row once; indexing the underlying dataset is not free.
        row = self.data[index]
        inputs = self.tokenizer(row['text']['en'], max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        # squeeze(0) removes only the leading batch axis added by
        # return_tensors="pt"; a bare squeeze() would also collapse max_len == 1.
        ids = inputs['input_ids'].squeeze(0)
        mask = inputs['attention_mask'].squeeze(0)
        targets = self.label_transform(row['labels'])
        return ids, mask, targets

    def __len__(self):
        return self.len

In [8]:
class AdapterLayer(nn.Module):
    """Bottleneck adapter: project down, apply a non-linearity, project back up.

    No residual connection is applied here; the caller adds the input back.
    """

    def __init__(self, input_size, bottleneck_size, non_linearity="gelu"):
        """
        Args:
            input_size: Feature size of the incoming (and outgoing) tensor.
            bottleneck_size: Feature size between the two projections.
            non_linearity: "gelu" selects GELU; any other value selects ReLU.
        """
        super().__init__()
        self.down_project = nn.Linear(input_size, bottleneck_size)
        self.up_project = nn.Linear(bottleneck_size, input_size)
        if non_linearity == "gelu":
            self.non_linearity = nn.GELU()
        else:
            self.non_linearity = nn.ReLU()

    def forward(self, x):
        """Return ``up_project(non_linearity(down_project(x)))``."""
        squeezed = self.non_linearity(self.down_project(x))
        return self.up_project(squeezed)

class XLMRobertaWithAdapters(XLMRobertaModel):
    """XLM-RoBERTa encoder followed by a chain of residual adapter layers.

    One ``AdapterLayer`` is created per hidden layer in the config, but all of
    them are applied one after another to the encoder's final hidden states
    (they are not interleaved with the transformer layers). ``forward`` returns
    the resulting tensor directly rather than a model-output object.
    """

    def __init__(self, config, add_adapters=True, bottleneck_size=64):
        """
        Args:
            config: Model configuration forwarded to ``XLMRobertaModel``.
            add_adapters: When false, no adapters are created and ``forward``
                returns the encoder's last hidden state unchanged.
            bottleneck_size: Inner width of every adapter.
        """
        super().__init__(config)
        self.add_adapters = add_adapters
        if not add_adapters:
            return
        adapter_stack = [
            AdapterLayer(config.hidden_size, bottleneck_size)
            for _ in range(config.num_hidden_layers)
        ]
        self.adapters = nn.ModuleList(adapter_stack)

    def forward(self, input_ids, attention_mask=None):
        """Encode the batch and pass the last hidden state through the adapters."""
        hidden = super().forward(input_ids, attention_mask=attention_mask).last_hidden_state
        if not self.add_adapters:
            return hidden
        for adapter in self.adapters:
            # Residual connection around each adapter.
            hidden = adapter(hidden) + hidden
        return hidden

class Classifier(nn.Module):
    """Mean-pooled multi-label head on top of ``XLMRobertaWithAdapters``.

    Only the configuration is fetched with ``from_pretrained``; the encoder is
    then constructed from that config object.
    NOTE(review): that appears to leave the encoder without pretrained
    weights until a checkpoint is loaded - confirm.
    """

    def __init__(self, model_name_or_path, num_labels, bottleneck_size=64):
        """
        Args:
            model_name_or_path: Identifier passed to
                ``XLMRobertaConfig.from_pretrained``.
            num_labels: Output size of the linear head.
            bottleneck_size: Inner width of every adapter in the encoder.
        """
        super().__init__()
        cfg = XLMRobertaConfig.from_pretrained(model_name_or_path)
        self.xlm_roberta = XLMRobertaWithAdapters(cfg, add_adapters=True, bottleneck_size=bottleneck_size)
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(cfg.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return raw logits (no sigmoid) for each label.

        The encoder output is averaged over dim 1 without consulting
        ``attention_mask``, so every position contributes equally.
        """
        token_states = self.xlm_roberta(input_ids, attention_mask=attention_mask)
        pooled = self.dropout(token_states.mean(dim=1))
        return self.classifier(pooled)

    def get_embeddings(self, input_ids, attention_mask=None):
        """Return the adapter-augmented encoder output, before pooling.

        Unlike ``forward``, no dropout or classification head is applied, and
        gradients are not disabled here.
        """
        return self.xlm_roberta(input_ids, attention_mask=attention_mask)



In [9]:
# Build the 21-label adapter classifier, then place it on the active device.
adapter_model = Classifier('xlm-roberta-base', num_labels=21)
adapter_model = adapter_model.to(device)

In [10]:
# NOTE(review): `model_path` names model.pt, but the checkpoint actually loaded
# below is model_2.pt - confirm which file is intended.
model_path = '/content/drive/MyDrive/NLP/Project/Adapter/Pytorch Model/model.pt'
# The checkpoint is a whole pickled module (the result is used as a model
# afterwards), so Classifier / XLMRobertaWithAdapters / AdapterLayer must
# already be defined when this runs. A pickled module keeps whatever
# train/eval flag it was saved with; .eval() guarantees dropout is off for the
# embedding extraction that follows (eval() returns the module itself).
# NOTE(review): recent PyTorch releases may refuse to unpickle full modules
# unless weights_only=False is passed to torch.load - confirm for your version.
adapter_model = torch.load('/content/drive/MyDrive/NLP/Project/Adapter/Pytorch Model/model_2.pt', map_location = torch.device(device)).eval()

In [11]:
# Work on the first 5000 training documents only.
train_ds = dataset['train'].select(range(5000))
train_dataset = TextDataset(
    dataframe=train_ds,
    tokenizer=XLMRobertaTokenizer.from_pretrained('xlm-roberta-base'),
)
# shuffle=False keeps batches in dataset order, so saved embeddings and labels
# line up by position.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=False)

In [39]:
# Per-batch outputs, collected in DataLoader order.
adapter_embeddings = []
nmt_embeddings = []
final_labels = []

# Embedding extraction is pure inference: make sure dropout is disabled.
adapter_model.eval()
nmt_model.eval()

with torch.no_grad():
    for data in tqdm(train_data_loader):
        ids, mask, targets = data
        ids, mask = ids.to(device), mask.to(device)

        # Both models see the same English token ids. These calls were
        # commented out, which left the two embedding lists empty for the cells
        # below that copy, count and pickle them.
        generated_adapter_embeddings = adapter_model.get_embeddings(ids, mask)
        # No .squeeze(0): it is a no-op for batches larger than one and would
        # silently drop the batch axis for a batch of exactly one sample.
        generated_nmt_embeddings = nmt_model.get_embeddings(ids, mask)

        # Move results to host memory immediately so thousands of
        # (batch, seq, hidden) tensors do not pile up on the accelerator.
        adapter_embeddings.append(generated_adapter_embeddings.cpu())
        nmt_embeddings.append(generated_nmt_embeddings.cpu())
        final_labels.append(targets.cpu())

100%|██████████| 313/313 [01:22<00:00,  3.81it/s]


In [13]:
# Host-memory copies of the NMT embeddings, one tensor per batch.
cpu_nmt_embeddings = [batch.cpu() for batch in nmt_embeddings]

In [14]:
# Host-memory copies of the adapter embeddings, one tensor per batch.
cpu_adapter_embeddings = [batch.cpu() for batch in adapter_embeddings]

In [40]:
# Host-memory copies of the label tensors, one tensor per batch.
cpu_labels = [batch.cpu() for batch in final_labels]

In [18]:
import pickle
# The NMT dump is currently disabled; re-enable it to regenerate that file.
# pickle.dump(cpu_nmt_embeddings[:156], open('/content/drive/MyDrive/NLP/Project/New_train_embeddings/nmt_embeddings.pkl', 'wb'))
# Only the first 156 batches are written.
# NOTE(review): with 5000 samples and batch size 32 that drops the final,
# shorter batch, while the label dump in a later cell keeps every batch -
# confirm the consumer slices the labels the same way.
# The `with` block closes (and flushes) the file even if pickling fails.
with open('/content/drive/MyDrive/NLP/Project/New_train_embeddings/adapter_embeddings.pkl', 'wb') as adapter_file:
    pickle.dump(cpu_adapter_embeddings[:156], adapter_file)

In [41]:
# Persist the CPU label batches; the `with` block closes (and flushes) the
# file even if pickling fails.
with open('/content/drive/MyDrive/NLP/Project/New_train_embeddings/train_labels.pkl', 'wb') as labels_file:
    pickle.dump(cpu_labels, labels_file)

In [15]:
# Sanity check: number of entries in the CPU copy of the NMT embeddings list.
len(cpu_nmt_embeddings)

313

## Test Embeddings

In [42]:
class TextDataset(Dataset):
    """Test-set dataset pairing each document with its English translation.

    NOTE: this redefines the ``TextDataset`` name used for the training data
    earlier in the notebook; once this cell has run, the earlier
    ``(dataframe, tokenizer, max_len)`` form is no longer available.

    Each item is a 7-tuple
    ``(adapter_inputs, adapter_ids, adapter_mask, nmt_inputs, nmt_ids, nmt_mask, targets)``:
    the ``adapter_*`` entries come from the document text in ``lang``, the
    ``nmt_*`` entries from the pre-computed translation at the same index.
    The two ``*_inputs`` entries are the raw tokenizer outputs and are kept
    only so the tuple layout stays unchanged. Everything is returned on the
    CPU; the consumer moves batches to the accelerator.
    """

    def __init__(self, dataframe, tokenizer, lang, max_len=128, num_labels=21):
        """
        Args:
            dataframe: Indexable collection whose rows expose
                ``row['text'][lang]`` and ``row['labels']``.
            tokenizer: Callable tokenizer invoked with ``max_length``,
                ``padding``, ``truncation`` and ``return_tensors``.
            lang: Language code; selects both the text field and the
                ``{lang}_to_en.pkl`` translations file.
            max_len: Fixed sequence length every text is padded/truncated to.
            num_labels: Size of the multi-hot target vector.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.lang = lang
        self.max_len = max_len
        self.num_labels = num_labels
        # Translations are looked up by the same index as `dataframe` rows.
        # pickle.load can execute arbitrary code: only read files you produced.
        with open(f'/content/drive/MyDrive/NLP/Project/Test_translations/{lang}_to_en.pkl', 'rb') as translations_file:
            self.lang_text = pickle.load(translations_file)

    def label_transform(self, label):
        """Turn an iterable of label ids into a float multi-hot vector."""
        present = set(label)
        return torch.tensor([1 if i in present else 0 for i in range(self.num_labels)], dtype=torch.float)

    def _encode(self, text):
        """Tokenise one string, padded/truncated to ``max_len``."""
        return self.tokenizer(text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")

    def __getitem__(self, index):
        # Fetch the row once; indexing the underlying dataset is not free.
        row = self.data[index]

        adapter_inputs = self._encode(row['text'][self.lang])
        # squeeze(0) removes only the leading batch axis added by
        # return_tensors="pt"; a bare squeeze() would also collapse max_len == 1.
        adapter_ids = adapter_inputs['input_ids'].squeeze(0)
        adapter_mask = adapter_inputs['attention_mask'].squeeze(0)

        nmt_inputs = self._encode(self.lang_text[index])
        nmt_ids = nmt_inputs['input_ids'].squeeze(0)
        nmt_mask = nmt_inputs['attention_mask'].squeeze(0)

        targets = self.label_transform(row['labels'])
        return adapter_inputs, adapter_ids, adapter_mask, nmt_inputs, nmt_ids, nmt_mask, targets

    def __len__(self):
        return self.len

In [65]:
# Language whose test documents (and matching translations file) are embedded.
lang = 'de'
# Work on the first 500 test documents only.
test_ds = dataset['test'].select(range(500))
test_dataset = TextDataset(
    dataframe=test_ds,
    tokenizer=XLMRobertaTokenizer.from_pretrained('xlm-roberta-base'),
    lang=lang,
)
# shuffle=False keeps batches in dataset order, so saved embeddings and labels
# line up by position.
test_data_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [66]:
# Sanity check: number of test examples wrapped by the dataset.
len(test_dataset)

500

In [67]:
# Per-batch outputs, collected in DataLoader order.
adapter_embeddings = []
nmt_embeddings = []
final_labels = []

# Embedding extraction is pure inference: make sure dropout is disabled.
adapter_model.eval()
nmt_model.eval()

with torch.no_grad():
    for data in tqdm(test_data_loader):
        # Tuple layout: (adapter_inputs, adapter_ids, adapter_mask,
        #                nmt_inputs, nmt_ids, nmt_mask, targets).
        # The raw tokenizer outputs at positions 0 and 3 are not needed here.
        adapter_ids, adapter_mask = data[1].to(device), data[2].to(device)
        nmt_ids, nmt_mask = data[4].to(device), data[5].to(device)
        targets = data[6]

        # Each model gets its own inputs: the adapter model sees the
        # original-language text and the NMT model sees the English translation.
        # The earlier version passed the stale `ids`/`mask` left over from the
        # training loop, so every test batch embedded the same training batch.
        generated_adapter_embeddings = adapter_model.get_embeddings(adapter_ids, adapter_mask)
        # No .squeeze(0): it is a no-op for batches larger than one and would
        # silently drop the batch axis for a batch of exactly one sample.
        generated_nmt_embeddings = nmt_model.get_embeddings(nmt_ids, nmt_mask)

        # Move results to host memory immediately so they do not pile up on
        # the accelerator.
        adapter_embeddings.append(generated_adapter_embeddings.cpu())
        nmt_embeddings.append(generated_nmt_embeddings.cpu())
        final_labels.append(targets.cpu())

100%|██████████| 16/16 [00:13<00:00,  1.19it/s]


In [68]:
# Host-memory copies of every per-batch tensor, ready to be pickled.
cpu_nmt_embeddings = [batch.cpu() for batch in nmt_embeddings]

cpu_adapter_embeddings = [batch.cpu() for batch in adapter_embeddings]

cpu_labels = [batch.cpu() for batch in final_labels]

In [69]:
# Echo the language code that the pickle file names in the next cells are built from.
lang

'hu'

In [62]:
# Persist the NMT embeddings for this language; the `with` block closes (and
# flushes) the file even if pickling fails.
with open(f'/content/drive/MyDrive/NLP/Project/New_test_embeddings/nmt_embeddings_{lang}.pkl', 'wb') as nmt_file:
    pickle.dump(cpu_nmt_embeddings, nmt_file)

In [63]:
# Persist the adapter embeddings for this language; the `with` block closes
# (and flushes) the file even if pickling fails.
with open(f'/content/drive/MyDrive/NLP/Project/New_test_embeddings/adapter_embeddings_{lang}.pkl', 'wb') as adapter_file:
    pickle.dump(cpu_adapter_embeddings, adapter_file)

In [64]:
# Pickle CPU copies of the labels: `final_labels` may hold tensors that live on
# the accelerator, and a pickle of those can only be loaded where that device
# exists. The `with` block closes (and flushes) the file even if pickling fails.
with open(f'/content/drive/MyDrive/NLP/Project/New_test_embeddings/labels_{lang}.pkl', 'wb') as labels_file:
    pickle.dump([batch.cpu() for batch in final_labels], labels_file)