In [36]:
import pandas as pd
import torch
from torch import nn, Tensor
import math
import time
import copy
import io
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

In [2]:
def load_pre_trained_embeddings(path: str):
    """Read a plain-text word-vector file into a ``word -> vector`` mapping.

    Every data line is expected to look like ``word v1 v2 ... vd`` with single
    spaces between the fields.  An optional first line of the form ``n d``
    (vocabulary size and dimension, two integers) is treated as a header and
    skipped instead of being stored as a word.

    Args:
        path: Location of the UTF-8 encoded vectors file.  Undecodable bytes
            are ignored.

    Returns:
        dict mapping each word to a ``list`` of floats, in file order.  The
        vectors are materialised lists, so they can be read any number of
        times (a lazy ``map`` object would be empty after its first use).
        Lines whose values cannot be parsed as floats are skipped.
    """
    data = {}
    # The context manager guarantees the handle is closed, even on errors.
    with io.open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line_no, line in enumerate(fin):
            tokens = line.rstrip().split(' ')
            # Header "n d": exactly two integer fields on the very first line.
            if line_no == 0 and len(tokens) == 2 and all(t.isdigit() for t in tokens):
                continue
            try:
                data[tokens[0]] = [float(value) for value in tokens[1:]]
            except ValueError:
                # Malformed numeric field: drop the line rather than storing
                # a vector that would only fail later, far from the cause.
                continue
    return data

In [3]:
def load_words(file_name):
    """Load the question pairs and their labels from a CSV file.

    Args:
        file_name: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            containing the columns ``question1``, ``question2`` and
            ``is_duplicate``.

    Returns:
        A ``(text, targets)`` tuple: ``text`` is a DataFrame with the two
        question columns, ``targets`` is the ``is_duplicate`` Series.
    """
    frame = pd.read_csv(file_name)
    question_columns = ['question1', 'question2']
    return frame[question_columns], frame['is_duplicate']

In [19]:
def get_embedding(vocab, emb_dim=300):
    """Build the initial embedding weight matrix for a vocabulary.

    Row ``i`` of the result belongs to the ``i``-th key of ``vocab`` (in
    iteration order).

    Args:
        vocab: Mapping ``word -> iterable of floats``.
        emb_dim: Expected vector length; also the number of matrix columns.

    Returns:
        A ``torch`` tensor of shape ``(len(vocab), emb_dim)`` and dtype
        float64.  Words whose stored vector cannot be converted, or whose
        length differs from ``emb_dim``, get a random row drawn from
        ``N(0, 0.6**2)``.
    """
    matrix_len = len(vocab)
    weights_matrix = np.zeros((matrix_len, emb_dim))
    for i, word in enumerate(vocab):
        try:
            vector = np.asarray(list(vocab[word]), dtype=np.float64)
        except (KeyError, ValueError):
            vector = None
        # The explicit shape check matters: assigning a length-1 vector to a
        # row would otherwise broadcast silently and fill the row with one
        # repeated value instead of being treated as "no usable vector".
        if vector is None or vector.shape != (emb_dim,):
            vector = np.random.normal(scale=0.6, size=emb_dim)
        weights_matrix[i] = vector
    return torch.from_numpy(weights_matrix)

In [5]:
def create_emb_layer(weights_matrix, non_trainable=False):
    """Create an ``nn.Embedding`` initialised from a weight matrix.

    Args:
        weights_matrix: 2-D tensor of shape ``(num_embeddings, embedding_dim)``
            whose values are copied into the layer.
        non_trainable: When true, gradient tracking is switched off for the
            embedding weights.

    Returns:
        ``(layer, num_embeddings, embedding_dim)``.
    """
    vocab_size, vector_size = weights_matrix.shape
    layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vector_size)
    # load_state_dict copies the values into the layer's own parameter.
    layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        layer.weight.requires_grad_(False)
    return layer, vocab_size, vector_size

In [6]:
# Location of the pre-trained word-vector file.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to point it at their own copy of the file.
EMBEDDINGS_PATH = '/Users/stavbracha/PycharmProjects/crawl-300d-2M-subword/crawl-300d-2M-subword.vec'
# Parsed once at module level; the Dataset class reads this mapping as its vocabulary.
tokens_dict = load_pre_trained_embeddings(path=EMBEDDINGS_PATH)

In [7]:
class Dataset(Dataset):
    """Question-pair dataset yielding ``(question1_ids, question2_ids, label)``.

    The class keeps its original name, which shadows the imported
    ``torch.utils.data.Dataset`` base class once this definition has run.

    Attributes set by ``__init__``:
        words: DataFrame with the ``question1`` / ``question2`` columns
            (plus the ``*_list`` token columns added by ``word2token``).
        targets: Series of ``is_duplicate`` labels, row-aligned with ``words``.
        uniq_words: list of distinct lower-cased words in both questions.
        token_dict: the module-level ``tokens_dict`` vocabulary mapping.
        tokenized_words: tuple ``(question1_ids, question2_ids)`` of int64
            tensors, each padded to the longest question of its own column.
    """

    # Index substituted for padding positions and for words that are missing
    # from the vocabulary.  The vocabulary has no dedicated pad/unknown entry,
    # so these positions alias whatever word sits at this index.
    fallback_index = 0

    def __init__(self, file_name):
        self.words, self.targets = load_words(file_name)
        self._drop_incomplete_pairs()
        self.uniq_words = self.get_uniq_words()
        self.token_dict = tokens_dict
        self.tokenized_words = self.word2token()

    def _drop_incomplete_pairs(self):
        """Remove rows with a missing question from ``words`` AND ``targets``."""
        complete = self.words[['question1', 'question2']].notna().all(axis=1)
        # Filtering both with the same mask and renumbering keeps the label at
        # position i paired with the token rows at position i.
        self.words = self.words.loc[complete].reset_index(drop=True)
        self.targets = self.targets.loc[complete].reset_index(drop=True)

    def get_uniq_words(self):
        """Return the distinct lower-cased, whitespace-split words of both questions."""
        result = set()
        for column in ('question1', 'question2'):
            for tokens in self.words[column].dropna().str.lower().str.split():
                # Non-string cells come back as NaN from the .str accessor.
                if isinstance(tokens, list):
                    result.update(tokens)
        return list(result)

    def word2token(self):
        """Convert both question columns to padded vocabulary-index tensors.

        Side effect: adds ``question1_list`` / ``question2_list`` columns with
        the token lists to ``self.words``.

        Returns:
            Tuple ``(question1_ids, question2_ids)`` of int64 tensors shaped
            ``(num_rows, longest_question_in_that_column)``.
        """
        self.words['question1_list'] = self.words['question1'].str.lower().str.split()
        self.words['question2_list'] = self.words['question2'].str.lower().str.split()
        word_to_index = {word: index for index, word in enumerate(self.token_dict)}
        return (
            self._encode(self.words['question1_list'], word_to_index),
            self._encode(self.words['question2_list'], word_to_index),
        )

    def _encode(self, token_lists, word_to_index):
        """Map token lists to a right-padded int64 index tensor."""
        fallback = self.fallback_index
        rows = [
            [word_to_index.get(token, fallback) for token in tokens]
            if isinstance(tokens, list) else []
            for tokens in token_lists
        ]
        width = max((len(row) for row in rows), default=0)
        # Integer indices are required by nn.Embedding; a float tensor with
        # NaN for unknown/padding positions cannot be used for lookups.
        ids = np.full((len(rows), width), fallback, dtype=np.int64)
        for row_number, row in enumerate(rows):
            if row:
                ids[row_number, :len(row)] = row
        return torch.from_numpy(ids)

    def __len__(self):
        return len(self.words)

    def __getitem__(self, index):
        return (
            self.tokenized_words[0][index, :], self.tokenized_words[1][index, :],
            # Positional lookup: the label index may not be 0..n-1.
            torch.tensor(self.targets.iloc[index]),
        )

In [8]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [9]:
# Build one dataset per split from its CSV file.
# NOTE: the name `train` is rebound further down by `def train(...)`, so cells
# that need the training dataset must run before that definition.
train_csv = 'train_data.csv'
test_csv = 'test_data.csv'
train = Dataset(file_name=train_csv)
test = Dataset(file_name=test_csv)

In [10]:
batch_size = 30
eval_batch_size = 15
# Wrap the Dataset objects themselves, not `tokenized_words`.  That attribute
# is a plain tuple of two tensors, which DataLoader would treat as a two-item
# dataset without labels; the Dataset's __getitem__ is what yields the
# (question1_ids, question2_ids, label) triples the training loop unpacks.
train_data = DataLoader(train, batch_size=batch_size)
test_data = DataLoader(test, batch_size=eval_batch_size)

In [12]:
class MultiHeadCoAttention(nn.Module):
    """Multi-head attention over two streams that exchange their values.

    Each stream has a single linear projection (``quest_1`` for stream 1,
    ``quest_2`` for stream 2) that is applied to its queries, keys and values
    alike.  Attention weights are computed inside each stream and then applied
    to the projected values of the *other* stream.
    """

    def __init__(self, d_model, n_head):
        """
        Args:
            d_model: Feature size of the inputs and outputs.  ``split`` needs
                it to be divisible by ``n_head``.
            n_head: Number of attention heads.
        """
        super(MultiHeadCoAttention, self).__init__()
        self.n_head = n_head
        self.softmax = nn.Softmax(dim=-1)
        self.quest_1 = nn.Linear(d_model, d_model)
        self.quest_2 = nn.Linear(d_model, d_model)

    def split(self, tensor):
        """Reshape ``(batch, length, d_model)`` to ``(batch, n_head, length, d_model // n_head)``."""
        batch_size, length, d_model = tensor.size()
        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        return tensor

    def concat(self, tensor):
        """Inverse of ``split``: merge the heads back into ``(batch, length, d_model)``."""
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor

    def forward(self, q1, k1, v1, q2, k2, v2, mask=None):
        """Compute the two co-attended outputs.

        Args:
            q1, k1, v1: Query/key/value tensors of stream 1,
                ``(batch, length, d_model)``.
            q2, k2, v2: Same for stream 2.  Both streams must have the same
                sequence length, because each stream's ``(length, length)``
                score matrix is multiplied with the other stream's values.
            mask: Accepted for interface compatibility; not applied.

        Returns:
            ``(out1, out2)``, each ``(batch, length, d_model)``: ``out1`` is
            stream 1's attention applied to stream 2's values and vice versa.
        """
        q1, k1, v1 = self.quest_1(q1), self.quest_1(k1), self.quest_1(v1)
        q1, k1, v1 = self.split(q1), self.split(k1), self.split(v1)
        q2, k2, v2 = self.quest_2(q2), self.quest_2(k2), self.quest_2(v2)
        q2, k2, v2 = self.split(q2), self.split(k2), self.split(v2)

        # Per-head feature size used for the usual 1/sqrt(d) score scaling.
        d_tensor = k1.size(-1)
        score1 = self.softmax(torch.matmul(q1, k1.transpose(2, 3)) / math.sqrt(d_tensor))
        score2 = self.softmax(torch.matmul(q2, k2.transpose(2, 3)) / math.sqrt(d_tensor))

        # Both products read the *projected* values; writing the first result
        # back into v1 would feed an already-attended tensor into the second.
        attended1 = torch.matmul(score1, v2)
        attended2 = torch.matmul(score2, v1)
        return self.concat(attended1), self.concat(attended2)

In [32]:
class EncoderLayer(nn.Module):
    """One encoder block applied to two streams in parallel.

    Co-attention and a position-wise feed-forward network, each followed by
    dropout, a residual connection and layer normalisation.  Both streams
    share the same normalisation, dropout and feed-forward modules.
    """

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        """
        Args:
            d_model: Feature size of both streams.
            ffn_hidden: Hidden width of the feed-forward sub-layer.
            n_head: Number of attention heads.
            drop_prob: Dropout probability used throughout the layer.
        """
        super().__init__()
        # Sub-modules are created in a fixed order so parameter
        # initialisation consumes the random stream identically.
        self.attention = MultiHeadCoAttention(d_model=d_model, n_head=n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x1, x2, src_mask):
        """Return the transformed ``(x1, x2)`` pair.

        ``src_mask`` is forwarded to the attention module as ``mask``.
        """
        # Attention sub-layer: dropout -> residual add -> layer norm.
        attended1, attended2 = self.attention(
            q1=x1, k1=x1, v1=x1, q2=x2, k2=x2, v2=x2, mask=src_mask
        )
        attended1, attended2 = self.dropout1(attended1), self.dropout1(attended2)
        hidden1, hidden2 = self.norm1(attended1 + x1), self.norm1(attended2 + x2)

        # Feed-forward sub-layer with the same dropout/residual/norm pattern.
        projected1, projected2 = self.ffn(hidden1), self.ffn(hidden2)
        projected1, projected2 = self.dropout2(projected1), self.dropout2(projected2)
        return self.norm2(projected1 + hidden1), self.norm2(projected2 + hidden2)

In [33]:
class Encoder(nn.Module):
    """Embeds two token-index sequences and runs them through co-attention layers.

    ``d_model`` has to equal the embedding width of ``weights_matrix``: the
    embedded tokens are passed straight into encoder layers whose linear and
    normalisation modules are sized with ``d_model``.
    """

    def __init__(self, d_model, ffn_hidden, n_head, n_layers, drop_prob, weights_matrix):
        """
        Args:
            d_model: Feature size used by every encoder layer.
            ffn_hidden: Hidden width of each feed-forward sub-layer.
            n_head: Number of attention heads per layer.
            n_layers: Number of stacked encoder layers.
            drop_prob: Dropout probability inside the layers.
            weights_matrix: 2-D tensor used to initialise the embedding table.
        """
        super().__init__()

        # Only the layer is needed here; the returned sizes are ignored.
        self.emb, _, _ = create_emb_layer(weights_matrix)
        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])
        self.linear2output = nn.Linear(d_model, 1)

    def forward(self, x1, x2):
        """Encode both sequences.

        Args:
            x1, x2: Integer index tensors for the embedding lookup
                (assumed ``(batch, length)`` -- confirm against the caller).

        Returns:
            ``(out1, out2)``: each stream after the final ``d_model -> 1``
            linear projection.
        """
        x1 = self.emb(x1)
        x2 = self.emb(x2)

        for layer in self.layers:
            # Each layer co-attends over BOTH streams at once and returns the
            # updated pair; no source mask is used.
            x1, x2 = layer(x1, x2, None)

        x1 = self.linear2output(x1)
        x2 = self.linear2output(x2)
        return x1, x2

In [34]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        Args:
            d_model: Size of the input and output features.
            hidden: Width of the intermediate layer.
            drop_prob: Dropout probability applied after the activation.
        """
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=hidden)
        self.linear2 = nn.Linear(in_features=hidden, out_features=d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        expanded = self.relu(self.linear1(x))
        return self.linear2(self.dropout(expanded))

In [37]:
class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss over pairs of embeddings.

    Label convention (as implemented by the formula below): ``label == 0``
    penalises the squared distance, pulling the pair together, while
    ``label == 1`` penalises pairs closer than ``margin``, pushing them apart.
    """

    def __init__(self, margin=2.0):
        """
        Args:
            margin: Distance beyond which ``label == 1`` pairs add no loss.
        """
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the mean contrastive loss as a scalar tensor.

        Args:
            output1, output2: Embeddings of the two pair members, batch first.
            label: One value per pair; shaped ``(batch,)`` or already shaped
                like the distance tensor.
        """
        euclidean_distance = F.pairwise_distance(output1, output2, keepdim = True)

        label = torch.as_tensor(label, dtype=euclidean_distance.dtype,
                                device=euclidean_distance.device)
        # A (batch,) label multiplied with a (batch, 1) distance would
        # broadcast to (batch, batch) and mix labels across pairs.  Appending
        # singleton dimensions keeps each label attached to its own pair.
        missing_dims = euclidean_distance.dim() - label.dim()
        if label.dim() > 0 and missing_dims > 0:
            label = label.reshape(tuple(label.shape) + (1,) * missing_dims)

        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))

        return loss_contrastive

In [35]:
# Model hyper-parameters.
d_hid = 200    # hidden width of the feed-forward sub-layers
nlayers = 2    # number of stacked encoder layers
nhead = 3      # attention heads; must divide dim_model
dropout = 0.2  # dropout probability
weights = get_embedding(train.token_dict)
# The encoder layers act on the embedded tokens, so their feature size has to
# be the embedding width (the column count of `weights`), not the padded
# sequence length of the token tensors.
dim_model = weights.size(1)
model = Encoder(d_model=dim_model, n_head=nhead, ffn_hidden=d_hid, n_layers=nlayers, drop_prob=dropout, weights_matrix=weights).to(device)

In [ ]:
# Loss, optimiser and learning-rate schedule shared by train() and evaluate().
# NOTE(review): 0.5 is far above Adam's default learning rate of 1e-3 --
# confirm this value is intentional.
lr = 0.5
criterion = ContrastiveLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
# Each scheduler.step() call multiplies the learning rate by 0.95.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

def _model_device(model: nn.Module):
    """Return the device holding the model's parameters (CPU if it has none)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device('cpu')


def train(model: nn.Module, training_data: DataLoader):
    """Run one optimisation pass over ``training_data``.

    Relies on the module-level ``optimizer`` and ``criterion``.

    Args:
        model: Network called as ``model(input1, input2)`` and returning the
            two outputs handed to ``criterion``.
        training_data: Iterable of ``(input1, input2, label)`` batches where
            ``label`` is 1 for duplicate questions.

    Returns:
        Sum of the per-batch losses over the whole epoch (``0.`` when there
        are no batches).
    """
    model.train()
    target_device = _model_device(model)
    epoch_loss = 0.

    for input1, input2, label in training_data:
        # Batches come from the loader on the CPU; the model may be on a GPU.
        input1 = input1.to(target_device)
        input2 = input2.to(target_device)
        label = label.to(target_device)

        optimizer.zero_grad()
        output1, output2 = model(input1, input2)

        # The contrastive loss pushes label-1 pairs apart and pulls label-0
        # pairs together, whereas the data marks duplicates with 1.  Flip the
        # label so duplicates are the pairs that get pulled together.
        loss = criterion(output1, output2, 1 - label)
        loss.backward()
        optimizer.step()

        # Accumulate every batch so no part of the epoch is left out.
        epoch_loss += loss.item()
    return epoch_loss


def evaluate(model: nn.Module, eval_data: DataLoader) -> float:
    """Return the mean per-batch loss of ``model`` on ``eval_data``.

    Uses the module-level ``criterion`` with the same label handling as
    ``train``.  Gradients are disabled.  Returns ``nan`` when ``eval_data``
    yields no batches.
    """
    model.eval()
    target_device = _model_device(model)
    total_loss = 0.
    batch_count = 0
    with torch.no_grad():
        for input1, input2, label in eval_data:
            input1 = input1.to(target_device)
            input2 = input2.to(target_device)
            label = label.to(target_device)
            output1, output2 = model(input1, input2)
            loss = criterion(output1, output2, 1 - label)
            total_loss += loss.item()
            batch_count += 1
    if batch_count == 0:
        return float('nan')
    # Average over the batches actually seen; dividing by len - 1 inflates
    # the result and divides by zero for a single batch.
    return total_loss / batch_count

In [ ]:
best_val_loss = float('inf')
epochs = 7
best_model = None
train_loss = []
val_loss = []
decrease_learning_rate = []

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train_loss.append(train(model, train_data))
    # The held-out loader defined above is `test_data`.
    val_loss.append(evaluate(model, test_data))
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'train loss {train_loss[-1]:5.2f} | valid loss {val_loss[-1]:5.2f}')
    print('-' * 89)

    # Keep a snapshot of the weights with the lowest validation loss so far.
    if val_loss[-1] < best_val_loss:
        best_val_loss = val_loss[-1]
        best_model = copy.deepcopy(model)

    # Advance the schedule once per epoch; without this call the learning
    # rate never decays.
    scheduler.step()

save_path = 'trained_NLP_model.pt'
# Persist the best snapshot; fall back to the final weights if no epoch ever
# improved on the initial best (e.g. the validation loss was always nan).
model_to_save = best_model if best_model is not None else model
torch.save(model_to_save.state_dict(), save_path)