In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 5.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 39.8MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 49.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=bdad9

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
import json
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [3]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-multilingual-cased')

In [4]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights, return_dict=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




In [5]:
def sent_to_word_embed(sentence, tokenizer, model):
  input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
  outputs = model(input_ids)[0][0]
  tokens = []
  for idx in input_ids[0]:
    for k,v in tokenizer.vocab.items():
      if v == idx:
        tokens.append(k)
  
  is_subword = []
  for token in tokens:
    if len(token) >= 2 and token[:2] == '##':
      is_subword.append(1)
    else:
      is_subword.append(0)
  
  words = []
  embeddings = []
  pos = 0
  while pos < len(is_subword)-1:
    if is_subword[pos] == 0 and is_subword[pos+1] == 0:
      words.append(tokens[pos])
      embeddings.append(outputs[pos])
    elif is_subword[pos] == 0 and is_subword[pos+1] == 1:
      count = 0
      i = pos+1
      while is_subword[i] == 1:
        count += 1
        i += 1
      combined_word = tokens[pos]
      combined_embedding = outputs[pos]
      for j in range(count):
        combined_word += tokens[pos+j+1][2:]
        combined_embedding += outputs[pos+j+1]
      combined_embedding /= (count+1)
      words.append(combined_word)
      embeddings.append(combined_embedding)
    pos += 1
  words = words[1:]
  embeddings = embeddings[1:]
  return words, embeddings

In [6]:
def list_lower_case(ls):
  if len(ls) > 0:
    for e in ls:
      e.lower()

In [7]:
def sent_to_cat_embed(sentence, target, head, dependent, tokenizer, model):
  words, embeddings = sent_to_word_embed(sentence, tokenizer, model)
  
  list_lower_case(target)
  list_lower_case(head)
  list_lower_case(dependent)

  target_embedding = torch.zeros(torch.Size([768]))
  head_embedding = torch.zeros(torch.Size([768]))
  dependent_embedding = torch.zeros(torch.Size([768]))

  for i in range(len(words)):
    if words[i] in target:
        target_embedding += embeddings[i]
    elif words[i] in head:
        head_embedding += embeddings[i]
    elif words[i] in dependent:
        dependent_embedding += embeddings [i]

  cat_embedding = torch.cat((target_embedding, head_embedding, dependent_embedding), 0)
  return cat_embedding

In [8]:
def info_to_cat(filename, tokenizer, model):
  file = open(filename, "r")
  cats = []
  for line in tqdm(file.readlines()):
    info = eval(line)

    pair_id = info[0]

    sentence_1 = info[1]
    target_1 = info[2]
    head_1 = info[3]
    dependent_1 = info[4]

    sentence_2 = info[5]
    target_2 = info[6]
    head_2 = info[7]
    dependent_2 = info[8]

    label = info[9]

    cat_1 = sent_to_cat_embed(sentence_1, target_1, head_1, dependent_1, tokenizer, model)
    cat_2 = sent_to_cat_embed(sentence_2, target_2, head_2, dependent_2, tokenizer, model)
    cats.append([pair_id, cat_1, cat_2, label])
  
  return cats

In [9]:
def save_cats(cats, prefix):
  my_cats = open(prefix+'.cat', "a")
  for cat in cats:
    cat[1] = cat[1].tolist()
    cat[2] = cat[2].tolist()
    my_cats.write(json.dumps(cat))
    my_cats.write('\n')

In [10]:
%cd /content/drive/MyDrive/Colab Notebooks/R&D/Evaluation/xlwic_wn

/content/drive/MyDrive/Colab Notebooks/R&D/Evaluation/xlwic_wn


In [11]:
#save_cats(info_to_cat('dev.de-de_1.info', tokenizer, model), 'dev.de-de.mbert')

In [12]:
save_cats(info_to_cat('dev.da-da.info', tokenizer, model), 'dev.da-da.mbert')

100%|██████████| 500/500 [10:04:14<00:00, 72.51s/it]


In [13]:
#save_cats(info_to_cat('dev.en-en.info', tokenizer, model), 'dev.en-en')

In [14]:
#save_cats(info_to_cat('dev.fr-fr.info', tokenizer, model), 'dev.fr-fr')

In [15]:
#save_cats(info_to_cat('dev.zh-zh.info', tokenizer, model), 'dev.zh-zh')