In [1]:
import torch
import torch.nn as nn

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import numpy as np

import random
import math
import time
import pickle
import re
from collections import Counter
import scipy

In [2]:
with open('data/base_text_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('data/Multi-30k/train.en') as f:
    data = [x.strip() for x in f.readlines()]
dev_data = data[-8000:]
print(len(train_data), len(dev_data))

60265 8000


In [3]:
punctuations = [r'\.', r'\.{2,}',
                             r'\!+', r'\:+', r'\;+', r'\"+', r"\'+", r'\?+', r'\,+', r'\(|\)|\[|\]|\{|\}|\<|\>']
def clean(line):
    for pattern in punctuations:
        line = re.sub(pattern, '', line)
    line = re.sub(r'[^a-z]', ' ', line.lower())
    return line

for i, dat in enumerate(train_data):
    train_data[i] = clean(dat)
    
for i, dat in enumerate(dev_data):
    dev_data[i] = clean(dat)

In [4]:
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [5]:
def build_vocab(data, tokenizer):
    counter = Counter()
    for sent in data:
        counter.update(tokenizer(sent))

    vocab_obj = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    vocab_obj.set_default_index(vocab_obj['<unk>'])

    return vocab_obj

In [6]:
vocabulary = build_vocab(train_data, tokenizer)

In [7]:
def data_process(data):
    _data = []
    for raw in data:
        tokens = tokenizer(raw)
        inp_tensor = torch.tensor([vocabulary[token] for token in tokens[:-1]], dtype=torch.long)
        trg_tensor = torch.tensor([vocabulary[token] for token in tokens[1:]], dtype=torch.long)
        _data.append((inp_tensor, trg_tensor))

    return _data

In [8]:
tok_train_data = data_process(train_data)
tok_dev_data = data_process(dev_data)

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
BATCH_SIZE = 32
PAD_IDX = vocabulary['<pad>']
BOS_IDX = vocabulary['<bos>']
EOS_IDX = vocabulary['<eos>']

In [11]:
def generate_batch(data_batch):
    src_batch, trg_batch = [], []
    for (src_item, trg_item) in data_batch:
        src_batch.append(torch.cat([torch.tensor([BOS_IDX]), src_item, torch.tensor([EOS_IDX])], dim=0))
        trg_batch.append(torch.cat([torch.tensor([BOS_IDX]), trg_item, torch.tensor([EOS_IDX])], dim=0))
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    trg_batch = pad_sequence(trg_batch, padding_value=PAD_IDX)

    return src_batch, trg_batch

In [12]:
train_dataloader = DataLoader(tok_train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch, drop_last=True)
dev_dataloader = DataLoader(tok_dev_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch, drop_last=True)

In [32]:
class EncoderModel(nn.Module):
    def __init__(self, vocab_len, device, dropout_prob=0.2, bidirectional=True):
        super(EncoderModel, self).__init__()
        self.rnn_size = 512
        self.embedding_dim = 128
        self.num_layers = 2
        self.num_directions = 2 if bidirectional else 1
        self.device = device

        self.embedding = nn.Embedding(
            num_embeddings = vocab_len,
            embedding_dim = self.embedding_dim,
        )
        self.rnn = nn.GRU(
            input_size = self.embedding_dim,
            hidden_size = self.rnn_size,
            num_layers = self.num_layers,
            bidirectional=bidirectional,
            dropout = dropout_prob
        )
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(self.num_directions * self.rnn_size, vocab_len)

    def forward(self, x):
        batch_size = x.shape[1]
        hidden_state = self.init_state(batch_size)
        
        embed = self.embedding(x)
        output, state = self.rnn(embed, hidden_state)
        output = self.dropout(output)
        logits = self.fc(output)
        return logits, state

    def init_state(self, batch_size):
        return torch.zeros(self.num_directions * self.num_layers, batch_size, self.rnn_size).to(self.device)

In [11]:
model = EncoderModel(len(vocabulary), device=device)
model = model.to(device)

In [33]:
with open('biGRU_5ep.pkl', 'rb') as f:
    model = pickle.load(f)
print(model)
model = model.to(device)

EncoderModel(
  (embedding): Embedding(27494, 128)
  (rnn): GRU(128, 512, num_layers=2, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=1024, out_features=27494, bias=True)
)


In [34]:
def train_evaluate(model, num_epochs=3):
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        # Training
        model.train()
        print(f"Epoch: {epoch}")
        loss_val = 0 
        for x, y in train_dataloader:
            x = x.to(device)
            y = y.to(device)
            
            optimizer.zero_grad()

            pred, state_h = model(x)
            loss = loss_fn(pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()
            
            loss_val += loss.item()
        print(f"Train Loss: {loss_val/len(train_dataloader)}, Train PPL: {2**(loss_val/len(train_dataloader))}")

        # Validation
        model.train()
        with torch.no_grad():
            loss_val = 0 
            for x, y in dev_dataloader:
                x = x.to(device)
                y = y.to(device)
                optimizer.zero_grad()

                pred, state_h = model(x)
                loss = loss_fn(pred.transpose(1, 2), y)
                
                loss_val += loss.item()
            print(f"Dev Loss: {loss_val/len(dev_dataloader)}, Dev PPL: {2**(loss_val/len(dev_dataloader))}")

In [46]:
train_evaluate(model, num_epochs=1)

Epoch: 0
Train Loss: 0.4423317674149779, Train PPL: 1.3587987229344551
Dev Loss: 0.6440360325574875, Dev PPL: 1.562694789512324


- Take a training dataset (SICK 2014 training) and finetune the MLP
- Or do something with the CNN output, if you dont want to mean over the layers (use MSE as final loss prolly)

In [35]:
class ScoringHead(nn.Module):
    def __init__(self, encoder_model, dropout_prob=0.2,):
        super(ScoringHead, self).__init__()
        self.encoder_model = encoder_model
        
        self.input_dim = 2 * self.encoder_model.rnn_size   # [2 * rnn_size = 1024]
        self.hidden_dim_1 = self.input_dim // 2        # [512]
        self.hidden_dim_2 = self.hidden_dim_1 // 2     # [256]
        self.hidden_dim_3 = self.hidden_dim_2 // 4     # [64]
        self.hidden_dim_4 = self.hidden_dim_3 // 4     # [16]
        self.hidden_dim_5 = 1

        self.linear_relu_stack = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim_1),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_1, self.hidden_dim_2),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_2, self.hidden_dim_3),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_3, self.hidden_dim_4),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_4, self.hidden_dim_5)
        )

    def forward(self, x1, x2):
        _, h1 = model(x1)
        _, h2 = model(x2)
        x1 = torch.mean(h1, dim=0)
        x2 = torch.mean(h2, dim=0)
        x = torch.cat((x1, x2), dim=1)
        out = self.linear_relu_stack(x)
        return out # shape -> [batch_size, 1]

In [36]:
with open('data/train_data.pkl', 'rb') as f:
    scoring_training_data = pickle.load(f)

In [37]:
def data_process_ft(data, label):
    _data = []
    for raw, lbl in zip(data, label):
        tokens_1 = tokenizer(raw[0])
        tokens_2 = tokenizer(raw[1])
        tensor_1 = torch.tensor([vocabulary[token] for token in tokens_1], dtype=torch.long)
        tensor_2 = torch.tensor([vocabulary[token] for token in tokens_2], dtype=torch.long)
        lbl_tensor = torch.tensor(lbl, dtype=torch.float)
        _data.append((tensor_1, tensor_2, lbl_tensor))

    return _data

In [38]:
def generate_batch_ft(data_batch):
    l_batch, r_batch, y = [], [], []
    for (item_1, item_2, label) in data_batch:
        l_batch.append(torch.cat([torch.tensor([BOS_IDX]), item_1, torch.tensor([EOS_IDX])], dim=0))
        r_batch.append(torch.cat([torch.tensor([BOS_IDX]), item_2, torch.tensor([EOS_IDX])], dim=0))
        y.append(label)
    l_batch = pad_sequence(l_batch, padding_value=PAD_IDX)
    r_batch = pad_sequence(r_batch, padding_value=PAD_IDX)
    y = torch.tensor(y, dtype=torch.float)

    return l_batch, r_batch, y

In [39]:
ft_x = scoring_training_data['x']
ft_y = scoring_training_data['y']
ft_x[0]

['A woman who is taking off a pink boa is stopping her bicycle on a bridge built for pedestrians',
 'A woman who is wearing a pink boa is riding a bicycle down a bridge built for pedestrians']

In [40]:
tok_ft_data = data_process_ft(ft_x, ft_y)
ft_dataloader = DataLoader(tok_ft_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch_ft, drop_last=True)

In [41]:
scoring_model = ScoringHead(model)
print(scoring_model)
scoring_model.to(device)

ScoringHead(
  (encoder_model): EncoderModel(
    (embedding): Embedding(27494, 128)
    (rnn): GRU(128, 512, num_layers=2, dropout=0.2, bidirectional=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (fc): Linear(in_features=1024, out_features=27494, bias=True)
  )
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=1, bias=True)
  )
)


ScoringHead(
  (encoder_model): EncoderModel(
    (embedding): Embedding(27494, 128)
    (rnn): GRU(128, 512, num_layers=2, dropout=0.2, bidirectional=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (fc): Linear(in_features=1024, out_features=27494, bias=True)
  )
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [42]:
def finetune(model, num_epochs=3):
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        scoring_model.train()
        print(f"Epoch: {epoch}")
        loss_val = 0 
        for x1, x2, y in ft_dataloader:
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            
            optimizer.zero_grad()

            pred = model(x1, x2)
            pred = torch.flatten(pred)
            loss = loss_fn(pred, y)

            loss.backward()
            optimizer.step()
            loss_val += loss.item()
        print(f"Train Loss: {loss_val/len(ft_dataloader)}")

In [51]:
finetune(scoring_model, num_epochs=1)

Epoch: 0
Train Loss: 0.25254207066878553


In [24]:
with open('data/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)
x_test = test_data['x']
y_test = test_data['y']

In [48]:
def output_preds(model, x_test):
    tok_test_data = data_process_ft(x_test, y_test)
    test_dataloader = DataLoader(tok_test_data, batch_size=BATCH_SIZE, collate_fn=generate_batch_ft, drop_last=True)
    preds = np.array([], dtype=np.float16)
    for x1, x2, y in test_dataloader:
        x1 = x1.to(device)
        x2 = x2.to(device)

        pred = model(x1, x2)
        pred = torch.flatten(pred).cpu().detach().numpy()
        
        preds = np.concatenate((preds, pred))
    
    return preds

In [53]:
preds = output_preds(scoring_model, x_test)
preds = list(preds)

In [54]:
from scipy.stats import pearsonr
pearson_score, _ = pearsonr(preds[:10], y_test[:10])
pearson_score

0.31905771530624794