In [1]:
import torch
from transformers import BertTokenizer
import csv
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
import time
import datetime
import random
import numpy as np
from transformers import get_linear_schedule_with_warmup
import sys
import argparse
import os
from collections import defaultdict
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader
from typing import List, Dict
from torch.utils.data import Dataset
from collections import Counter, defaultdict
import numpy as np

I0319 18:11:50.885721 140125443987264 file_utils.py:35] PyTorch version 1.3.1 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Load embedding P

In [2]:
P = np.load("P.embeddings.npy")

### Load data

In [3]:
lang2data = defaultdict(list)
langs = ["en", "fr", "ru", "de", "he"]
for lang in langs:
    
   dir_path = "data/{}_replication_evalset".format(lang)
   for filename in os.listdir(dir_path):
        
        with open(dir_path+"/"+filename, "r", encoding = "utf-8") as f:
            
            lines = f.readlines()
        
        for line in lines:
            
            label, sent = line.strip().split("\t")
            label = 1 if label == "True" else 0
            sent += "."
            filename_prefix = filename[:-4]
            sent_dict = {"text": sent, "label": label, "type": filename_prefix}
            lang2data[lang].append(sent_dict)
            
for lang in langs:
    
    random.shuffle(lang2data[lang])

In [4]:
print(lang2data["en"][1])

{'text': 'the show the assistants like is new.', 'label': 1, 'type': 'obj_rel_no_comp_within_inanim'}


#### Train-dev split for English, all other languages are evaluation

In [5]:
n = 7000
data = lang2data["en"][:n]
l = int(0.8 * len(data))
train, dev = data[:l], data[l:]

In [6]:
train[0]

{'text': 'the painting the executive like is unpopular.',
 'label': 0,
 'type': 'obj_rel_no_comp_within_inanim'}

### pytorch datasets

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Simple torch dataset class"""

    def __init__(self, data: List[Dict], device = "cpu"):

        self.data = data
        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        with torch.no_grad():
            
            x,y = self.data[index]["text"], self.data[index]["label"]

            return x,y

In [18]:
train_dataset, dev_dataset = Dataset(dev[:50], "cpu"), Dataset(dev[:50])

## Perform INLP on embedding matrices

## Load model

In [19]:
class BertModel(pl.LightningModule):

    def __init__(self, train_dataset: Dataset, dev_dataset: Dataset, batch_size, device: str, mode: str = "eval"):
        
        super().__init__()
        
        self.device = device
        config = BertConfig.from_pretrained("bert-base-multilingual-uncased", output_hidden_states=True, num_labels = 2)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
        self.model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', config = config)
        self.pad_token = self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0]
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        
        if mode == "eval":
            
            self.model.eval()
        else:
            self.model.train()
            
        self.train_gen = torch.utils.data.DataLoader(self.train_dataset, batch_size=batch_size, drop_last=False, shuffle=True)
        self.dev_gen = torch.utils.data.DataLoader(self.dev_dataset, batch_size=batch_size, drop_last=False, shuffle=True)
        self.acc = None
        
        
        
    def tokenize_and_pad(self, texts: List[str]):
        
        indexed_texts = [self.tokenizer.encode(text, add_special_tokens=True) for text in texts] #
        max_len = min(500, max(len(text) for text in indexed_texts))
        indexed_texts = [text + [self.pad_token] * (max_len - len(text)) for text in indexed_texts]
        return torch.LongTensor(indexed_texts).to(self.device)
    
    def forward(self, x):
        
        outputs = self.model(x)
        logits = outputs[0], outputs[1]
        return logits
    
    def forward_with_loss_calculation(self, x, y):
        
        outputs = self.model(x, labels = y)
        loss, logits = outputs[0], outputs[1]
        return loss, logits
    
    def training_step(self, batch, batch_nb):
        
        sents, y = batch
        x = self.tokenize_and_pad(sents)        
        loss, logits = self.forward_with_loss_calculation(x, y)

        correct = logits.argmax(axis=1).int() == y.int()
        acc = torch.sum(correct).float() / len(y)

        return {'loss': loss, 'val_acc': acc}
    
    def validation_step(self, batch, batch_nb):
        sents, y = batch
        x = self.tokenize_and_pad(sents)
        loss, logits = self.forward_with_loss_calculation(x,y)

        correct = logits.argmax(axis=1).int() == y.int()
        acc = torch.sum(correct).float() / len(y)

        return {'val_loss': loss, 'val_acc': acc}
    
    def validation_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
        print("Loss is {}".format(avg_loss))
        print("Accuracy is {}".format(avg_acc))
        self.acc = avg_acc
        return {'avg_val_loss': avg_loss}
    
    def configure_optimizers(self):
        # return torch.optim.SGD(self.parameters(), lr=0.005, momentum=0.9)
        return torch.optim.Adam(self.parameters(), weight_decay=1e-4)
    
    @pl.data_loader
    def train_dataloader(self):
        return self.train_gen

    @pl.data_loader
    def val_dataloader(self):
        # OPTIONAL
        # can also return a list of val dataloaders
        return self.dev_gen

In [20]:
bert = BertModel(dev_dataset, dev_dataset, batch_size = 4, device = "cuda", mode = "train")

I0319 18:15:48.059725 140125443987264 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json from cache at /home/nlp/ravfogs/.cache/torch/transformers/33b56ce0f312e47e4d77a57791a4fc6233ae4a560dd2bdd186107058294e58ab.c7892120c5a9b21e515abc904e398dbabddf9510b122f659063cbf361fe16868
I0319 18:15:48.062198 140125443987264 configuration_utils.py:199] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labe

In [21]:
#list(bert.parameters())

## Perform INLP on embeddings

In [12]:
import urllib
wv = bert.model.get_input_embeddings().weight.detach().cpu().numpy()
#import requests
vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt"

page = urllib.request.urlopen(vocab_url)
vocab = page.read().decode('utf8')
vocab = vocab.split("\n")[:-1]
print(wv.shape, len(vocab))
w2i, i2w = {w:i for i,w in enumerate(vocab)}, {i:w for i,w in enumerate(vocab)}
wv_cleaned = P.dot(wv.T).T

(105879, 768) 105879


### Collect dataset

In [13]:
c =  defaultdict(Counter)

for lang, lang_data in lang2data.items():
    for data_dict in lang_data:
        sent = data_dict["text"]
        c[lang].update(sent.split(" "))

In [14]:
len(c["en"])

175

## Finetune

In [None]:
trainer = Trainer(max_nb_epochs=10,min_nb_epochs=8, gpus = 1)
trainer.fit(bert)

gpu available: True, used: True
VISIBLE GPUS: 0


  0%|          | 0/26 [00:00<00:00, 46.63it/s]

                                          Name                           Type  \
0                                        model  BertForSequenceClassification   
1                                   model.bert                      BertModel   
2                        model.bert.embeddings                 BertEmbeddings   
3        model.bert.embeddings.word_embeddings                      Embedding   
4    model.bert.embeddings.position_embeddings                      Embedding   
..                                         ...                            ...   
214                          model.bert.pooler                     BertPooler   
215                    model.bert.pooler.dense                         Linear   
216               model.bert.pooler.activation                           Tanh   
217                              model.dropout                        Dropout   
218                           model.classifier                         Linear   

    Params  
0    167 M  
1

100%|██████████| 26/26 [00:02<00:00,  9.61it/s, batch_nb=12, epoch=0, gpu=0, loss=0.805, v_nb=21]

Loss is 0.6905643939971924
Accuracy is 0.5384615659713745


100%|██████████| 26/26 [00:02<00:00, 11.67it/s, batch_nb=12, epoch=1, gpu=0, loss=0.793, v_nb=21]

Loss is 0.7584704756736755
Accuracy is 0.5384615659713745


100%|██████████| 26/26 [00:02<00:00, 12.42it/s, batch_nb=12, epoch=2, gpu=0, loss=0.808, v_nb=21]

Loss is 0.847201406955719
Accuracy is 0.5384615659713745


100%|██████████| 26/26 [00:02<00:00, 12.49it/s, batch_nb=12, epoch=3, gpu=0, loss=0.804, v_nb=21]

Loss is 0.7126203179359436
Accuracy is 0.46153849363327026


100%|██████████| 26/26 [00:02<00:00, 12.72it/s, batch_nb=12, epoch=4, gpu=0, loss=0.779, v_nb=21]

Loss is 0.7199629545211792
Accuracy is 0.5384615659713745


In [16]:
train[:5]

[{'text': 'the painting the executive like is unpopular.',
  'label': 0,
  'type': 'obj_rel_no_comp_within_inanim'},
 {'text': 'the show the assistants like is new.',
  'label': 1,
  'type': 'obj_rel_no_comp_within_inanim'},
 {'text': 'the teacher that likes the guard are young.',
  'label': 0,
  'type': 'subj_rel'},
 {'text': 'few senators that the architects like will ever be popular.',
  'label': 1,
  'type': 'npi_across_anim'},
 {'text': 'the authors that the chefs love swim.',
  'label': 1,
  'type': 'obj_rel_across_anim'}]