In [2]:
import numpy as np
from gensim.models import FastText,fasttext
import pandas as pd
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import math
import re
from string import digits
from nltk.translate.bleu_score import sentence_bleu

In [4]:
qc_embeding_path = './embeddings/lematized_embedding_v1.model'
qc_embeddings = FastText.load(qc_embeding_path)

In [3]:
es_embeding_path = './embeddings/cc.es.300.bin'
es_embeddings = fasttext.load_facebook_vectors(es_embeding_path)

In [5]:
dataset = pd.DataFrame(columns=['quechua', 'spanish'])
dataset['quechua'] = pd.read_csv('./train_data/QuechuaCollaoCorpus.csv')['Quechua']
dataset['spanish'] = pd.read_csv('./train_data/QuechuaCollaoCorpus.csv')['Traducción']
dataset

Unnamed: 0,quechua,spanish
0,Ch’arwi,Desorden
1,Runa,Persona
2,Chanin,Precio
3,Puka,Rojo
4,Hallp’a,Tierra
...,...,...
2066,Runakuna manan mallkikunata tarpunkuchu chaymi...,La personas no plantan árboles es uno de los f...
2067,Erqe mamanpa wañusqanmanta waqan.,El niño llora porque su madre murió.
2068,Huk wayna chakamanta wukchuyukusqa hinaspa wañ...,Un joven se lanzó del puente y murió.
2069,Mama churin mana wasinman kutimusqanmanta waqan.,La madre llora porque su hijo no vuelve a la c...


In [6]:
dataset.dropna(inplace=True)
dataset.reset_index(drop=True, inplace=True)

# Preprocesamiento


In [7]:

def preprocess_sentence(sentence):
    num_digits = str.maketrans('', '', digits)

    sentence = sentence.lower()
    sentence = re.sub(" +", " ", sentence)
    sentence = re.sub(" . ", "", sentence)
    sentence = sentence.translate(num_digits)
    sentence = sentence.strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = sentence.rstrip().strip()
    
    # Remove the final dot if it exists
    if sentence.endswith('.'):
        sentence = sentence[:-1].strip()
    sentence = sentence.replace('’',"'")
    sentence = 'start_ ' + sentence + ' _end'  # importante

    return sentence

def clean_df(df):
    df['quechua'] = df['quechua'].apply(lambda x: preprocess_sentence(x))
    df['spanish'] = df['spanish'].apply(lambda x: preprocess_sentence(x))

In [8]:
clean_df(dataset)

In [9]:
dataset

Unnamed: 0,quechua,spanish
0,start_ ch'arwi _end,start_ desorden _end
1,start_ runa _end,start_ persona _end
2,start_ chanin _end,start_ precio _end
3,start_ puka _end,start_ rojo _end
4,start_ hallp'a _end,start_ tierra _end
...,...,...
2064,start_ runakuna manan mallkikunata tarpunkuchu...,start_ la personas no plantan árboles es uno d...
2065,start_ erqe mamanpa wañusqanmanta waqan _end,start_ el niño llora porque su madre murió _end
2066,start_ huk wayna chakamanta wukchuyukusqa hina...,start_ un joven se lanzó del puentemurió _end
2067,start_ mama churin mana wasinman kutimusqanman...,start_ la madre llora porque su hijo no vuelve...


In [10]:
qc_embeddings.build_vocab(dataset['quechua'].apply(lambda x: x.split()).tolist(), update=True)
qc_embeddings.train(dataset['quechua'], total_examples=qc_embeddings.corpus_count, epochs=10)

(1006, 969980)

# Tokenización


In [11]:
qc_word_dict = {
    '_pad_': np.zeros(50, dtype=np.float32),
}
for words in dataset['quechua']:
    words = words.split()
    for word in words:
        if not word in qc_word_dict:
            qc_word_dict[word] = qc_embeddings.wv[word]

sp_word_dict = {
    '_pad_': np.zeros(300, dtype=np.float32),
}

for words in dataset['spanish']:
    words = words.split()
    for word in words:
        if not word in sp_word_dict:
            sp_word_dict[word] = es_embeddings[word]

qc_word_index = {word: i for i, word in enumerate(qc_word_dict.keys())}
sp_word_index = {word: i for i, word in enumerate(sp_word_dict.keys())}

# Modelo


In [12]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [13]:
qc_max_len = dataset['quechua'].apply(len).max()
sp_max_len = dataset['spanish'].apply(len).max()
sp_max_len

161

In [79]:
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [80]:
import math
import torch
import torch.nn as nn

class TranslationTransformerModel(nn.Module):
    def __init__(self, d_model, nhead, num_encoder_layers, num_decoder_layers, 
                 dim_feedforward, max_seq_length, dropout, qc_word_dict, sp_word_dict):
        super(TranslationTransformerModel, self).__init__()
        
        self.encoder = nn.Sequential(
                                    nn.Embedding.from_pretrained(torch.tensor(list(sp_word_dict.values()))),#300 dim
                                    nn.Linear(300, d_model))#50 dim
        self.decoder = nn.Embedding.from_pretrained(torch.tensor(list(qc_word_dict.values()))) #50 dim
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_seq_length)
        
        self.transformer = nn.Transformer(
            d_model=d_model, 
            nhead=nhead, 
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        
        self.fc = nn.Linear(d_model, len(qc_word_dict))
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None,
            src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        src = self.encoder(src) * math.sqrt(self.d_model)
        tgt = self.decoder(tgt) * math.sqrt(self.d_model)
    
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        if tgt_mask is None:
            tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))
        
        if src_mask is None:
            src_mask = torch.zeros(src.size(1), src.size(1),device=device).type(torch.bool) 
        
        
        output = self.transformer(src, tgt, src_mask, tgt_mask, memory_mask,
                              src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask)
        output = self.fc(output)
        return output
    
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [81]:
class TranslationDataset(Dataset):
    def __init__(self, dataset, sp_max_len, qc_max_len):
        self.dataset = dataset
        self.sp_max_len = sp_max_len
        self.qc_max_len = qc_max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sp = [sp_word_index[word] for word in self.dataset['spanish'][idx].split()]
        qc = [qc_word_index[word] for word in self.dataset['quechua'][idx].split()]
        
        sp = sp + [0] * (self.sp_max_len - len(sp))
        qc = qc + [0] * (self.qc_max_len - len(qc))
        
        return torch.tensor(sp, dtype=torch.long), torch.tensor(qc, dtype=torch.long)

In [82]:
train_ds = TranslationDataset(dataset, sp_max_len, qc_max_len)

In [83]:
train_dl = DataLoader(train_ds, batch_size=10, shuffle=True)

In [84]:
model = TranslationTransformerModel(d_model=50, nhead=5, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, max_seq_length=sp_max_len, dropout=0.1, qc_word_dict=qc_word_dict, sp_word_dict=sp_word_dict).to(device)



In [85]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [87]:
def create_padding_mask(seq, pad_idx=0):
    return (seq == pad_idx)


epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for i, (src, tgt) in enumerate(train_dl):
        src = src.to(device)
        tgt = tgt.to(device)
        
        src_padding_mask = create_padding_mask(src)
        tgt_padding_mask = create_padding_mask(tgt)
        memory_padding_mask = src_padding_mask.clone()
        
        optimizer.zero_grad()
        output = model(src,tgt,src_key_padding_mask=src_padding_mask, tgt_key_padding_mask=tgt_padding_mask)
        logits = output.view(-1, len(qc_word_index))
        loss = criterion(logits, tgt.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch}: {total_loss / len(src)}')


Epoch 0: 141.60622559653387
Epoch 1: 139.32030396991306
Epoch 2: 133.27734602822198
Epoch 3: 131.90463903215198
Epoch 4: 131.18657933341132
Epoch 5: 130.29598167207507
Epoch 6: 129.57277022467719
Epoch 7: 129.58455827501086
Epoch 8: 129.3541211022271
Epoch 9: 129.09915187623767


In [106]:
#Testing

def translate_sentence(sentence, model, qc_word_dict, sp_word_dict, qc_word_index, sp_word_index, qc_max_len, sp_max_len):
    model.eval()
    words = sentence.split()
    sp = [sp_word_index[word] for word in words]
    sp = sp + [0] * (sp_max_len - len(sp))
    sp = torch.tensor(sp, dtype=torch.long).unsqueeze(0).to(device)
    
    qc = torch.tensor([1], dtype=torch.long).unsqueeze(0).to(device)
    qc_padded = torch.tensor([0] * qc_max_len, dtype=torch.long).unsqueeze(0).to(device)
    for i in range(qc_max_len):
        output = model(sp, qc_padded)
        output = output[-1,:].unsqueeze(0)
        predicted_index = output.argmax(2)
        qc = torch.cat((qc, predicted_index), dim=1)
        if output[0, -1] == 2:
            break
    
    qc = qc.squeeze().detach().cpu().numpy()
    qc = [list(qc_word_index.keys())[list(qc_word_index.values()).index(i)] for i in qc]
    qc = ' '.join(qc)
    return qc

In [89]:
dataset['spanish'][1253]

'start_ estaba en mi casa con mi familia _end'

In [90]:
dataset['quechua'][1253]

'start_ wasiypin aylluywan kushka karani _end'

In [110]:
translate_sentence(dataset['spanish'][1253],model,qc_word_dict,sp_word_dict,qc_word_index,sp_word_index,qc_max_len,sp_max_len)

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [None]:
#Bleu score

def calculate_bleu_score(model, dataset, qc_word_dict, sp_word_dict, qc_word_index, sp_word_index, qc_max_len, sp_max_len):
    bleu_score = 0
    for i in range(len(dataset)):
        sp = dataset['spanish'][i]
        qc = dataset['quechua'][i]
        translated_qc = translate_sentence(sp, model, qc_word_dict, sp_word_dict, qc_word_index, sp_word_index, qc_max_len, sp_max_len)
        bleu_score += sentence_bleu([qc.split()], translated_qc.split())
    return bleu_score / len(dataset) 

In [None]:

score = calculate_bleu_score(model, dataset, qc_word_dict, sp_word_dict, qc_word_index, sp_word_index, qc_max_len, sp_max_len)
score