# TODO

* Mask all padding positions so they are ignored in every calculation
* Add special start and end tokens (like CLS and SEP) so we know where a generated word ends
* Try removing the decoder again
* Mask out padding in the loss function
   


In [None]:
import time
import pandas as pd
import math
from itertools import combinations
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

## Data Exploration

In [None]:
# Load the semicolon-separated word list; column names are supplied explicitly via `names`.
df = pd.read_csv('SHsnid.csv', names=["Word", "ID", "Gender", "???", "Beyging", "Fall"], sep=';')

In [None]:
# Peek at the first rows of the raw table.
df.head()
# Show every row whose Word column is exactly 'góður'.
df.loc[df.Word == 'góður']


In [None]:
# Row/column count before filtering.
df.shape
# Keep only rows whose Gender is 'hk', 'kvk' or 'kk'
# (presumably the three noun genders -- TODO confirm against the data set docs).
df = df.loc[df.Gender.isin(['hk', 'kvk', 'kk'])]

In [None]:

# Keep only the columns used downstream: the headword (Word), the inflected
# form (Beyging) and the case/number tag (Fall).
train_df = df[["Word", "Beyging", "Fall"]]

In [None]:
# Preview the reduced table.
train_df.head()

In [None]:
# List every distinct tag that occurs in the Fall column.
train_df.Fall.unique()

In [None]:
# Inspect the rows tagged 'ÞFET'.
train_df.loc[train_df.Fall == 'ÞFET']

In [None]:
# Case/number tags without the article suffix; the suffixed variants carry a trailing 'gr'.
bare_cases = ['ÞFET', 'NFET', 'EFET', 'ÞGFET', 'ÞFFT', 'ÞGFFT', 'NFFT', 'EFFT']
article_cases = [tag + 'gr' for tag in bare_cases]

# Split the table into forms tagged without and with the 'gr' suffix.
no_articles = train_df.loc[train_df.Fall.isin(bare_cases)]
articles = train_df.loc[train_df.Fall.isin(article_cases)]

In [None]:
# Strip the 'gr' suffix from the tags so both frames share the same tag set.
# Only the Fall column is rewritten, and the result is re-bound rather than
# modified in place: `articles` is a selection of `train_df`, and in-place
# edits of such selections can trigger pandas' SettingWithCopyWarning.
article_tag_map = {
    'ÞFETgr': 'ÞFET',
    'NFETgr': 'NFET',
    'EFETgr': 'EFET',
    'ÞGFETgr': 'ÞGFET',
    'ÞFFTgr': 'ÞFFT',
    'ÞGFFTgr': 'ÞGFFT',
    'NFFTgr': 'NFFT',
    'EFFTgr': 'EFFT',
}
articles = articles.assign(Fall=articles['Fall'].replace(article_tag_map))

# Sanity check on one word; each frame is filtered with a mask built from its
# own Word column instead of relying on index alignment with `train_df`.
print(no_articles.loc[no_articles.Word == 'hestur'])
print(articles.loc[articles.Word == 'hestur'])

In [None]:
# Maps a lower-cased headword to {lower-cased tag: lower-cased inflected form}.
# Entries coming from `articles` are stored under the headword plus a 'gr'
# suffix so they do not collide with the entries from `no_articles`.
declension_mapping = {}

# Iterate the three columns directly rather than through DataFrame.iterrows(),
# which builds a Series for every row and is very slow on large tables.
for frame, key_suffix in ((no_articles, ''), (articles, 'gr')):
    for word, tag, form in zip(frame['Word'], frame['Fall'], frame['Beyging']):
        forms = declension_mapping.setdefault(word.lower() + key_suffix, {})
        forms[tag.lower()] = form.lower()

In [None]:
# Parallel lists: entry i of each list describes one training pair
# (input form, output form, tag of the input, tag of the output).
input_words_data = []
output_words_data = []
source_declension = []
target_declension = []

# Counter left over from an optional debugging cap on the number of words; currently unused.
idx = 0

for forms in declension_mapping.values():
    # Every unordered pair of forms of one word, in the dict's insertion order.
    for (src_tag, src_form), (tgt_tag, tgt_form) in combinations(forms.items(), 2):
        input_words_data.append(src_form)
        output_words_data.append(tgt_form)
        source_declension.append(src_tag)
        target_declension.append(tgt_tag)


## Create the vocabulary

In [None]:
# Collect every character that occurs in the training pairs.
# Output words are scanned as well as input words: both are later encoded
# through the same character table, so a character that only ever appears in
# an output form would otherwise raise a KeyError during encoding.
vocab = set()

for words in (input_words_data, output_words_data):
    for w in words:
        vocab.update(w)

In [None]:
# Special tokens take the first indices; '<pad>' sits at index 0, the value
# used for padding and for the padding masks elsewhere in this notebook.
vocab = ['<pad>', '<BEG>', '<END>'] + sorted(vocab)
declensions = ['nfet', 'þfet', 'þgfet', 'efet', 'nfft', 'þfft', 'þgfft',  'efft']

# Lookup tables in both directions for characters and for declension tags.
idx2ch = dict(enumerate(vocab))
ch2idx = {ch: idx for idx, ch in idx2ch.items()}
idx2decl = dict(enumerate(declensions))
decl2idx = {d: idx for idx, d in idx2decl.items()}

## Save the character and declension mappings

In [None]:
import json

# Persist the character -> index table; ensure_ascii=False writes non-ASCII
# characters as-is instead of \u escapes.
with open('./ch2idx.json', 'w', encoding='utf-8') as fp:
    json.dump(ch2idx, fp, ensure_ascii=False)
    
# Persist the declension tag -> index table the same way.
with open('./decl2idx.json', 'w', encoding='utf-8') as fp:
    json.dump(decl2idx, fp, ensure_ascii=False)

## Change from characters to indices and pad all sequences

In [None]:
maxlen = 32


def _encode_and_pad(words):
    """Wrap each word in <BEG>/<END>, map characters to indices and pad to `maxlen`.

    Padding is appended after the word with index 0.
    NOTE(review): sequences longer than `maxlen` are shortened by
    pad_sequences' default truncation setting -- confirm that is acceptable.
    """
    encoded = [
        [ch2idx[token] for token in ['<BEG>', *w, '<END>']]
        for w in words
    ]
    return pad_sequences(encoded, maxlen=maxlen, padding='post', value=0)


input_words_data = _encode_and_pad(input_words_data)
output_words_data = _encode_and_pad(output_words_data)

## Create the datasets and dataloaders

In [None]:
class DeclensionDataset(Dataset):
    """Dataset of (input word, output word, source declension, target declension).

    `input_words` / `output_words` hold index-encoded words, `src_decl` /
    `tgt_decl` hold the declension tags for the same positions, and `decl2idx`
    maps a tag to its integer id. `ch2idx` is stored but not used by the
    dataset itself.
    """

    def __init__(self, input_words, output_words, src_decl, tgt_decl, ch2idx, decl2idx):
        super().__init__()

        self.input, self.output = input_words, output_words
        self.src_decls, self.tgt_decls = src_decl, tgt_decl
        self.vocab = ch2idx
        self.decl_vocab = decl2idx

    def __len__(self):
        """Number of examples."""
        return len(self.input)

    def __getitem__(self, index):
        """Return (input LongTensor, output LongTensor, src id, tgt id) for `index`."""
        src_id = self.decl_vocab[self.src_decls[index]]
        tgt_id = self.decl_vocab[self.tgt_decls[index]]

        return (
            torch.LongTensor(self.input[index]),
            torch.LongTensor(self.output[index]),
            src_id,
            tgt_id,
        )
 


        

In [None]:
from random import sample

# Shuffle one shared index list so the four parallel lists stay aligned,
# then use the first 90% for training and the rest for validation.
indices = list(range(len(input_words_data)))
indices = sample(indices, len(indices))
split = int(len(indices) * 0.9)

train_idx = indices[:split]
val_idx = indices[split:]

train_input = [input_words_data[i] for i in train_idx]
train_output = [output_words_data[i] for i in train_idx]
train_src = [source_declension[i] for i in train_idx]
train_tgt = [target_declension[i] for i in train_idx]

val_input = [input_words_data[i] for i in val_idx]
val_output = [output_words_data[i] for i in val_idx]
val_src = [source_declension[i] for i in val_idx]
val_tgt = [target_declension[i] for i in val_idx]

train_dataset = DeclensionDataset(train_input, train_output, train_src, train_tgt, ch2idx, decl2idx)
val_dataset = DeclensionDataset(val_input, val_output, val_src, val_tgt, ch2idx, decl2idx)

# Only the training loader reshuffles its examples.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)

## Create the models

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine position signals to a sequence-first input, then apply dropout.

    The table is registered as the buffer `pe` with shape (max_len, 1, d_model):
    even feature columns hold sines, odd columns hold cosines.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Insert a singleton axis at dim 1 so the table broadcasts over it.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        """Return dropout(x + pe[:x.size(0)]); dim 0 of `x` indexes positions."""
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len, :])

In [None]:
class TargetNetwork(nn.Module):
    """Embedding lookup for declension ids.

    `ninput` is the number of distinct ids and `nembed` the embedding width.
    `nhidden`, `noutput` and `maxlen` are accepted but not used.
    """

    def __init__(self, ninput, nhidden=512, noutput=512, maxlen=32, nembed=128):
        super().__init__()

        self.tgt_dec_emb = nn.Embedding(num_embeddings=ninput, embedding_dim=nembed)

    def forward(self, tgt_dec):
        """Return the embedding vectors for the ids in `tgt_dec`."""
        return self.tgt_dec_emb(tgt_dec)

    
class TransformerNetwork(nn.Module):
    """Character-level Transformer encoder.

    Args:
        char_vocab: number of distinct character indices (index 0 is padding).
        embedding_len: width of the character embeddings and of the encoder.
        nheads: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        activation: activation name passed to each encoder layer.
        dropout: dropout probability, used by the positional encoding and by
            every encoder layer.
    """

    def __init__(self, char_vocab, embedding_len=128, nheads=12, num_layers=12, activation='gelu', dropout=0.1):
        super().__init__()

        self.char_emb = nn.Embedding(char_vocab, embedding_len)

        self.pos_encoder = PositionalEncoding(embedding_len, dropout)

        # `dropout` is forwarded to the encoder layers as well; previously it
        # only reached the positional encoding and the layers silently used
        # the library default.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_len,
            nhead=nheads,
            dropout=dropout,
            activation=activation,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, words):
        """Encode `words`, a (seq_len, batch) tensor of character indices.

        Returns the encoder output, one `embedding_len`-wide vector per
        input position.
        """
        # Padding positions (index 0) are hidden from attention; the mask is
        # transposed because the encoder expects it as (batch, seq_len).
        padding_mask = (words == 0).transpose(0, 1)

        word_embed = self.pos_encoder(self.char_emb(words))

        return self.transformer_encoder(word_embed, src_key_padding_mask=padding_mask)
        

class DeclensionTransformer(nn.Module):
    """Predict the characters of a word in a target declension.

    The input word is encoded by a `TransformerNetwork`; the source and target
    declension ids are embedded by two `TargetNetwork`s. The three results are
    concatenated per position and passed through a two-layer feed-forward head
    that outputs one score per vocabulary character.

    Args:
        char_vocab: number of character indices (also the output width).
        num_declensions: number of distinct declension ids.
        max_len: accepted for compatibility; not used.
        nheads, num_layers, embedding_len: encoder configuration.
        dim_feedforward: hidden width of the output head.
        dropout, activation: forwarded to the encoder.
    """

    def __init__(self, char_vocab, num_declensions, max_len=32, nheads=12, num_layers=6, embedding_len=516, dim_feedforward=2048, dropout=0.1, activation='gelu'):
        super().__init__()
        self.transformer_network = TransformerNetwork(
            char_vocab, embedding_len, nheads, num_layers,
            activation=activation, dropout=dropout,
        )

        self.target_network = TargetNetwork(num_declensions, noutput=embedding_len)
        self.source_network = TargetNetwork(num_declensions, noutput=embedding_len)

        # Width of the concatenated features fed to the head. Derived from the
        # actual sub-module sizes instead of the former literal 772
        # (= 516 + 128 + 128), so other `embedding_len` values work too.
        concat_dim = (
            embedding_len
            + self.target_network.tgt_dec_emb.embedding_dim
            + self.source_network.tgt_dec_emb.embedding_dim
        )

        self.linear1 = nn.Linear(concat_dim, dim_feedforward)
        self.dropout1 = nn.Dropout(0.25)
        # dropout2 and linear3 are not used in forward(); they are kept so the
        # module's attributes and state_dict keys stay unchanged.
        self.dropout2 = nn.Dropout(0.5)
        self.linear2 = nn.Linear(dim_feedforward, char_vocab)
        self.linear3 = nn.Linear(embedding_len, char_vocab)

        self.relu = nn.ReLU()

    def _generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, word_input, src_dec, tgt_dec):
        """Return per-position character scores.

        `word_input`, `src_dec` and `tgt_dec` must share their first two
        dimensions, because the three encodings are concatenated along the
        feature axis (dim 2). The result has `char_vocab` scores per position.
        """
        words_encoded = self.transformer_network(word_input)
        tgt_encoded = self.target_network(tgt_dec)
        src_encoded = self.source_network(src_dec)

        out = torch.cat((words_encoded, tgt_encoded, src_encoded), 2)

        out = self.linear2(self.dropout1(self.relu(self.linear1(out))))

        return out

In [None]:
test_input = 'hestur'
print(len(vocab))

model = DeclensionTransformer(len(vocab), len(declensions))
# Use the first GPU when one is available and fall back to the CPU otherwise,
# instead of failing in `.to()` on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)

print('yes')

## Create a loss function with masking

In [None]:
class NestedCrossEntropyLoss(nn.Module):
    """Cross-entropy averaged over non-padding target positions only.

    Args:
        pad_idx: target index that marks padding (default 0). Positions with
            this target contribute nothing to the loss.
    """

    def __init__(self, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx

    def forward(self, preds, target):
        """Return the mean cross-entropy over non-padding positions.

        Args:
            preds: (N, num_classes) unnormalised scores.
            target: (N,) class indices.

        Returns:
            Scalar tensor; 0 when every target position is padding.
        """
        mask = (target != self.pad_idx).to(preds.dtype)

        per_token = F.cross_entropy(preds, target, reduction='none')

        # Divide by the number of real tokens, not by the number of non-zero
        # loss values: a perfectly predicted token has zero loss but must
        # still count. The clamp avoids 0/0 on an all-padding batch.
        return (per_token * mask).sum() / mask.sum().clamp(min=1)

In [None]:
# Loss defined in the cell above.
criterion = NestedCrossEntropyLoss()
# Adam with a small weight decay.
optimizer = optim.Adam(model.parameters(), lr=3e-5,weight_decay=1e-5)
# One-cycle learning-rate schedule sized for 15 epochs of len(train_loader) steps each.
# NOTE(review): the training cell below calls train(5, ...), so only part of
# this schedule would be traversed -- confirm that is intended.
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-4, pct_start=0.3, steps_per_epoch=len(train_loader), epochs=15)

## Train the model

In [None]:


def num_correct_preds(preds, target):
    """Return the fraction of non-padding positions that were predicted correctly.

    Args:
        preds: (N, num_classes) scores; the prediction is the arg-max per row.
        target: (N,) expected class indices, where 0 marks padding.

    Returns:
        A float in [0, 1]; 0.0 when every position is padding.

    `target` is left untouched (it used to be overwritten in place).
    """
    predicted = preds.argmax(dim=1)
    not_padding = target != 0

    n_tokens = not_padding.sum().item()
    if n_tokens == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    n_correct = ((predicted == target) & not_padding).sum().item()
    return n_correct / n_tokens

def _prepare_batch(batch):
    """Unpack one DataLoader batch and move it to `device`.

    Assumes the loader yields words as (batch, seq_len) and declension ids as
    one integer per example. Returns (word_input, src_dec, tgt_dec, target):
    the first three are (seq_len, batch) LongTensors, `target` is the
    flattened transposed output.
    """
    word_input, output, src_dec, tgt_dec = batch

    word_input = torch.as_tensor(word_input, dtype=torch.long).t()
    seq_len = word_input.shape[0]

    # Repeat each example's declension id along the sequence axis so it lines
    # up with the per-character encodings inside the model.
    src_dec = torch.as_tensor(src_dec, dtype=torch.long).unsqueeze(0).repeat(seq_len, 1)
    tgt_dec = torch.as_tensor(tgt_dec, dtype=torch.long).unsqueeze(0).repeat(seq_len, 1)

    target = torch.as_tensor(output, dtype=torch.long).t().contiguous().view(-1)

    return word_input.to(device), src_dec.to(device), tgt_dec.to(device), target.to(device)


def train(epochs, scheduler, optimizer, model):
    """Train `model` for `epochs` epochs and print metrics after each one.

    Uses the module-level `train_loader`, `val_loader`, `criterion`, `device`
    and `num_correct_preds`. `scheduler.step()` is called once per training
    batch. Reported accuracies and losses are means over batches.
    """
    for epoch in range(epochs):
        torch.cuda.empty_cache()

        running_loss = 0
        n_correct = 0

        start = time.time()

        # Enable dropout for the training pass.
        model.train()
        for batch in train_loader:
            word_input, src_dec, tgt_dec, target = _prepare_batch(batch)

            # Clear gradients before the backward pass so nothing left over
            # from earlier computations is accumulated into this step.
            optimizer.zero_grad()

            outputs = model(word_input, src_dec, tgt_dec)
            logits = outputs.view(-1, outputs.size(-1))

            loss = criterion(logits, target)
            loss.backward()

            optimizer.step()
            scheduler.step()

            n_correct += num_correct_preds(logits, target)
            running_loss += loss.item()

        n_train_batches = max(len(train_loader), 1)
        train_acc = 100. * n_correct / n_train_batches
        train_loss = running_loss / n_train_batches

        n_val_correct = 0
        running_val_loss = 0

        # Disable dropout and gradient tracking for validation.
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                word_input, src_dec, tgt_dec, target = _prepare_batch(batch)

                outputs = model(word_input, src_dec, tgt_dec)
                logits = outputs.view(-1, outputs.size(-1))

                # Accumulate over all batches; previously each batch
                # overwrote the value, so only the last one was reported.
                running_val_loss += criterion(logits, target).item()

                n_val_correct += num_correct_preds(logits, target)

        n_val_batches = max(len(val_loader), 1)
        val_acc = 100. * n_val_correct / n_val_batches
        val_loss = running_val_loss / n_val_batches

        print('Epoch %s: Train Accuracy: %.2f percent, Validation Accuracy: %.2f percent, Train Loss:  %.5f, Validation Loss:  %.5f - %s seconds' 
              % (epoch, train_acc, val_acc, train_loss, val_loss, time.time() - start))
        
# Run five training epochs with the scheduler, optimizer and model created above.
train(5, scheduler, optimizer, model)

In [None]:
# Save only the model's parameters (its state_dict), not the whole module object.
torch.save(
    model.state_dict(),
    './models/icelandic_declension_only_weights.pt'
)

## Try predicting

In [None]:
import json

# Load the character and declension tables stored alongside the checkpoint.
with open('./ch2idx_final_99.8%.json', encoding='utf-8') as f:
    ch2idx = json.load(f)

with open('./decl2idx_final_99.8%.json', encoding='utf-8') as f:
    decl2idx = json.load(f)

# Reverse tables for decoding model output back to text.
idx2ch = {v: k for k, v in ch2idx.items()}
idx2decl = {v: k for k, v in decl2idx.items()}

# Use the GPU when available; otherwise run on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = DeclensionTransformer(len(ch2idx), len(decl2idx))
model.to(device)
# map_location lets a checkpoint saved from a GPU be loaded on a machine without one.
# The checkpoint is read as a dict whose 'model_state_dict' entry holds the weights.
checkpoint = torch.load('./models/icelandic_declension_final-99.79%.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to inference mode (disables dropout).
model.eval()






In [None]:
maxlen = 32

def predict(word, src, targets):
    """Print `word` inflected into each requested target declension.

    Args:
        word: the input word as a string.
        src: declension id of `word`.
        targets: one target declension id, or a list/tuple of them.

    Raises:
        ValueError: if `word` plus the <BEG>/<END> tokens exceeds `maxlen`.
        KeyError: if `word` contains a character missing from `ch2idx`.

    Uses the module-level `model`, `device`, `ch2idx`, `idx2ch` and `idx2decl`.
    """
    print(word)

    chars = ['<BEG>'] + list(word) + ['<END>']
    if len(chars) > maxlen:
        # The declension ids below are always expanded to `maxlen` positions,
        # so a longer word would only fail later with an obscure shape error.
        raise ValueError('word is too long: at most %d characters are supported' % (maxlen - 2))
    chars = chars + ['<pad>'] * (maxlen - len(chars))

    # Shape (maxlen, 1): sequence first, batch of one.
    word = torch.LongTensor([ch2idx[c] for c in chars])
    word = word.to(device)[None].t()

    if not isinstance(targets, (list, tuple)):
        targets = [targets]

    src_padded = torch.LongTensor([src] * maxlen).to(device)[None].t()

    # Inference only: no gradients needed.
    with torch.no_grad():
        for tgt in targets:
            tgt_padded = torch.LongTensor([tgt] * maxlen).to(device)[None].t()

            pred = model(word, src_padded, tgt_padded)
            output = [idx2ch[idx[0]] for idx in pred.argmax(2).tolist()]

            # Cut at the first <END>; if the model never emits one, keep the
            # whole sequence instead of raising.
            end = output.index('<END>') if '<END>' in output else len(output)
            # Position 0 is skipped; it corresponds to the <BEG> token of the input.
            output_word = ''.join(output[1:end])
            print(idx2decl[tgt], ':', output_word)
    

In [None]:
#word, _, src, tgt = train_dataset[0]

#print(src, tgt)

# Example words to run through the model.
words = ['herra', 'hnetusmjör']

# Source declension id 1, target declension id 0; which tags these ids stand
# for depends on the decl2idx mapping currently loaded.
for word in words:
    predict(word, 1, 0)