In [5]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.optim import Adam
import re
import spacy
import fr_core_news_sm

import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(level=logging.DEBUG)

from tqdm import tqdm
import os


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("running on gpu!!!")
else:
    print("cpu :(")

if 'COLAB_GPU' in os.environ:
        print("running on colab")
        from google.colab import drive
        drive.mount('/content/drive')
        import sys
        sys.path.append("/content/drive/MyDrive/Colab Notebooks/ag_news")

from utils import ModelJob, Attention, Vocab

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
cpu :(


In [14]:
class TranslationDataset(Dataset):
    def __init__(self, df, src_lang, tgt_lang, 
                 src_vocab_size, tgt_vocab_size, 
                 min_frequency, 
                 mode="train", src_vocab=None, tgt_vocab=None):
        super(TranslationDataset, self).__init__()
        self.df = df
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.src_nlp = spacy.load(name=src_lang+"_core_web_sm")
        self.tgt_nlp = fr_core_news_sm.load() #spacy.load(name=tgt_lang+"_core_web_sm")
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.min_frequency = min_frequency
        self.mode = mode
        self.df[self.src_lang+"_tokens"] = self.df[self.src_lang].apply(lambda x: self.preprocess(x, flag="src"))
        self.df[self.src_lang+"_tokens"] = self.df[self.src_lang+"_tokens"].apply(lambda x: ["<SOS>"] + x + ["<EOS>"])
        self.df[self.tgt_lang+"_tokens"] = self.df[self.tgt_lang].apply(lambda x: self.preprocess(x, flag="tgt"))
        self.df[self.tgt_lang+"_tokens"] = self.df[self.tgt_lang+"_tokens"].apply(lambda x: ["<SOS>"] + x + ["<EOS>"])
        
        if self.mode == "train":
            logging.info("Creating vocabulary")
            logging.info("Creating source vocabulary")
            self.src_vocab = Vocab(df=self.df, 
                                   min_frequency=self.min_frequency, 
                                   vocab_size=self.src_vocab_size, 
                                   field=self.src_lang+"_tokens",
                                   include_unk=False)
            logging.info("Creating target vocabulary")
            self.tgt_vocab = Vocab(df=self.df, 
                                   min_frequency=self.min_frequency, 
                                   vocab_size=self.tgt_vocab_size, 
                                   field=self.tgt_lang+"_tokens",
                                   include_unk=False)
        else:
            self.src_vocab = src_vocab
            self.tgt_vocab = tgt_vocab
        self.df["enc_input"] = self.df[self.src_lang+"_tokens"].apply(lambda x: self.create_index(tokens=x, flag="src"))
        self.df["enc_input_len"] = self.df["enc_input"].apply(len)
        self.df = self.df.loc[self.df["enc_input_len"]>0]
        self.df["decoder_seq"] = self.df[self.tgt_lang+"_tokens"].apply(lambda x: self.create_index(tokens=x, flag="tgt"))
        self.df["dec_input"] = self.df["decoder_seq"].apply(lambda x: x[:-1])
        self.df["dec_input_len"] = self.df["dec_input"].apply(len)

        if mode !="predict":
            self.df["dec_output"] = self.df["decoder_seq"].apply(lambda x: x[1:])
            self.df["dec_output_len"] = self.df["dec_output"].apply(len)
        
    def create_index(self, tokens, flag):
        if flag == "src":
            return [self.src_vocab.vocab[t] for t in tokens if t in self.src_vocab.vocab.keys()]
        else:
            return [self.tgt_vocab.vocab[t] for t in tokens if t in self.tgt_vocab.vocab.keys()]
        
    def preprocess(self, text, flag):
        text = re.sub("[^\w+|\s]", " ", text)
        if flag == "src":
            doc = self.src_nlp(text, disable=["ner", "tagger"])
        else:
            doc = self.tgt_nlp(text, disable=["ner", "tagger"])
        lemmas = [token.lemma_ for token in doc]
        return lemmas
            
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        df_batch = self.df.iloc[idx]
        if self.mode != "predict":
            return {"enc_input": df_batch["enc_input"], 
                    "enc_input_len": df_batch["enc_input_len"],
                    "dec_input": df_batch["dec_input"],
                    "dec_input_len": df_batch["dec_input_len"],
                    "dec_output_len": df_batch["dec_output_len"],
                    "dec_output": df_batch["dec_output"]}
        else:
            return {"enc_input": df_batch["enc_input"], 
                    "enc_input_len": df_batch["enc_input_len"],
                    "dec_input": df_batch["dec_input"],
                    "dec_input_len": df_batch["dec_input_len"]}
        
def collate_fn(batch, mode="train"):
    enc_input = [torch.tensor(row["enc_input"]) for row in batch]
    enc_input_len = [row["enc_input_len"] for row in batch]
    dec_input = [torch.tensor(row["dec_input"]) for row in batch]
    dec_input_len = [row["dec_input_len"] for row in batch]
    enc_input = pad_sequence(enc_input, batch_first=True)
    dec_input = pad_sequence(dec_input, batch_first=True)
    
    
    if mode == "predict":
        return (enc_input.to(device), enc_input_len, dec_input.to(device), dec_input_len)
    else:
        dec_output_len = [row["dec_output_len"] for row in batch]
        dec_output = [torch.tensor(row["dec_output"]) for row in batch]
        dec_output = pad_sequence(dec_output, batch_first=True)
    
    return (enc_input.to(device), enc_input_len, dec_input.to(device), dec_input_len), dec_output.reshape(-1).to(device)





In [15]:
class Encoder(nn.Module):
    def __init__(self, src_vocab_size, src_embedding_dim, enc_hidden_size):
        super(Encoder, self).__init__()
        self.enc_embedding = nn.Embedding(num_embeddings=src_vocab_size, 
                                      embedding_dim=src_embedding_dim,
                                      padding_idx=0 )
        self.enc_rnn = nn.GRU(input_size=src_embedding_dim,
                         hidden_size=enc_hidden_size,
                         batch_first=True)
        
    def forward(self, x):
        encoder_input, encoder_input_len = x
        enc_emb = self.enc_embedding(encoder_input)
        enc_emb_packed = pack_padded_sequence(enc_emb, encoder_input_len, 
                                              enforce_sorted=False,
                                             batch_first=True)
        enc_output, h_t = self.enc_rnn(enc_emb_packed)
        enc_output_padded, enc_output_lens = pad_packed_sequence(enc_output, batch_first=True)
        mask = encoder_input!=0
        return enc_output_padded, h_t[0], mask
    
class Attention(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        self.w_query = nn.Linear(in_features=input_dim, out_features=input_dim)
        self.w_key = nn.Linear(in_features=input_dim, out_features=input_dim)
        self.v = nn.Linear(in_features=input_dim, out_features=1)
        self.mask_impute = torch.tensor(-9999.0).to(device)

    def forward(self, query, key, mask):
        # query : pre-attention-decoder last state [bs, hs]
        # key: all encoder_states - [bs, ts, hs]
        # [bs*ts, hs]
        key_reshaped = key.contiguous().view(key.shape[0]*key.shape[1], -1) 
        # [bs*ts, hs]
        key_transformed = self.w_key(key_reshaped)
        # [bs, ts, hs]
        key_transform_reshape = key_transformed.reshape(key.shape[0], key.shape[1], -1)
        # [bs, ts, hs]
        transformed_inp = self.w_query(query).unsqueeze(dim=1) + key_transform_reshape
        # [bs, ts]
        context_vector = self.v(F.tanh(transformed_inp))
        context_vector = torch.where(mask.unsqueeze(2)==0, self.mask_impute, context_vector)
        # [bs, ts]
        alpha = F.softmax(context_vector, dim=1)
        # [bs, ts, hs]
        alpha_repeated = alpha.repeat(1, 1, self.input_dim)
        effective_context = key * alpha_repeated
        
        # [bs, hs]
        return torch.sum(effective_context, dim=1), alpha
    
class Decoder(nn.Module):
    def __init__(self, tgt_vocab_size, dec_embedding_dim, dec_hidden_size, dec_fc_units):
        super(Decoder, self).__init__()
        self.dec_embedding = nn.Embedding(num_embeddings=tgt_vocab_size, 
                                          embedding_dim=dec_embedding_dim, 
                                          padding_idx=0)
        self.dec_rnn = nn.GRU(input_size=dec_embedding_dim,
                             hidden_size=dec_hidden_size,
                             batch_first=True)
        self.dec_dropout = nn.Dropout(p=0.2)
        self.dec_fc = nn.Linear(in_features=dec_hidden_size,
                               out_features=dec_fc_units)

    def forward(self, decoder_input, state=None):
        dec_embedding = self.dec_embedding(decoder_input.unsqueeze(dim=1))
        if state is not None:
            output, state = self.dec_rnn(dec_embedding)
        else:
            output, state = self.dec_rnn(dec_embedding,state)
        dec_fc_in = self.dec_dropout(state[0])
        fc_out = F.relu(self.dec_fc(dec_fc_in))
        return  fc_out

class Seq2Seq(nn.Module):
    def __init__(self,  src_vocab_size, src_embedding_dim, 
                        tgt_vocab_size, tgt_embedding_dim, 
                        enc_hidden_size, dec_hidden_size, 
                        dec_fc_units, fc_out):
        super(Seq2Seq, self).__init__()
        self.src_vocab_size = src_vocab_size
        self.encoder = Encoder(src_vocab_size, src_embedding_dim, enc_hidden_size)
        self.decoder = Decoder(tgt_vocab_size, tgt_embedding_dim, dec_hidden_size, dec_fc_units)
        self.attention = Attention(input_dim=dec_hidden_size)
        self.fc_dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(in_features=enc_hidden_size+dec_hidden_size, out_features=tgt_vocab_size)
        
    def forward(self, x, mode="train"):
        logits = []
        enc_input, enc_input_len, dec_input, dec_input_len = x
        all_encoder_states, encoder_state, encoder_mask = self.encoder((enc_input, enc_input_len))
        for i in range(dec_input.shape[1]):
            decoder_state = self.decoder(dec_input[:, i])
            effective_state, attention_weights = self.attention(query=decoder_state, 
                                                                key=all_encoder_states, 
                                                                mask=encoder_mask)
            fc_in = torch.cat((effective_state, decoder_state), dim=-1)
            fc_in = self.fc_dropout(fc_in)
            fc_out = self.fc(fc_in)
            logits.append(fc_out)
        logits = torch.stack(logits, dim=1)
        if mode == "train":
            return logits.contiguous().view(logits.shape[0]*logits.shape[1], -1), decoder_state[0]
        elif mode == "interpret":
            return logits, decoder_state[0], attention_weights
        else:
            return logits, decoder_state[0]
        
        

In [18]:
if __name__ == "__main__":
    if 'COLAB_GPU' in os.environ:
        data_path = "/content/drive/MyDrive/Colab Notebooks/nmt"
        model_save_path = "/content/drive/MyDrive/Colab Notebooks/nmt"
        print("running on colab")
    else:
        data_path = "data/"
        model_save_path = "models/"
        print("running on local")
    df = pd.read_table(os.path.join(data_path,"fra.txt"),header=None)
    #df = df.sample(n=500, random_state=9)
    df = df.iloc[:, :2]
    df.columns = ["en", "fr"]
    df["en"] = df["en"].str.strip()
    df["fr"] = df["fr"].str.strip()

    logging.info(f"Dataset shape: {df.shape[0]}")
    subset = ["i am ", "i m ",  
              "he is", "he s ", 
              "she is", "she s ",  
              "you are", "you re ", 
              "we are", "we re "
              "they are", "they re "]
              #,  "i don't", "do you", 
            # "i want"]
            
    def _filter(text):
        text = text.strip()
        text = re.sub(r"([.!?])", r" \1", text)
        text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
        for sub in subset:
            if text.lower().startswith(sub):
                if len(text.split(" ")) < 10:
                    return 1
        return 0
    
    #df["flag"] = df["en"].apply(_filter)
    #df = df.loc[df["flag"]==1]
    logging.info(f"Dataset shape after filtering: {df.shape[0]}")

    from sklearn.model_selection import train_test_split
    df_train, df_test = train_test_split(df, random_state=9)
    logging.info("Train Test Split")
    
    
    logging.info("Creating Datasets")
    train_ds = TranslationDataset(df=df_train,
                            src_lang="en", 
                            tgt_lang="fr", 
                            src_vocab_size=500,
                            tgt_vocab_size = 500,
                            min_frequency=5, 
                            mode="train", 
                            src_vocab=None, 
                            tgt_vocab=None)
    
    test_ds = TranslationDataset(df=df_test,
                            src_lang="en", 
                            tgt_lang="fr", 
                            src_vocab_size=500, 
                            tgt_vocab_size=500,
                            min_frequency=20, 
                            mode="test", 
                            src_vocab=train_ds.src_vocab, 
                            tgt_vocab=train_ds.tgt_vocab)
    logging.info(f"Dataset lengths:: train: {len(train_ds)}, test: {len(test_ds)}")
    
    
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_dl = DataLoader(test_ds, batch_size=256, shuffle=True, collate_fn=collate_fn)
    

running on local


INFO:root:Dataset shape: 189114
INFO:root:Dataset shape after filtering: 189114
INFO:root:Train Test Split
INFO:root:Creating Datasets
INFO:root:Creating vocabulary
INFO:root:Creating source vocabulary
INFO:root:Creating target vocabulary
INFO:root:Dataset lengths:: train: 141835, test: 47279


In [19]:
network = Seq2Seq(src_vocab_size=train_ds.src_vocab_size, 
          src_embedding_dim=256, 
          enc_hidden_size=128,
          tgt_vocab_size=train_ds.tgt_vocab_size, 
          tgt_embedding_dim=100, 
          dec_hidden_size=128,
          dec_fc_units=128,
          fc_out=256
  )
network = network.to(device)
loss_func = nn.CrossEntropyLoss(ignore_index=0)
optimizer = Adam(params=network.parameters(), lr=0.001)

model_run =  ModelJob(model=network,
                dataloaders = {"train": train_dl, "test":test_dl},
                model_save_path = model_save_path,
                model_save_name="fr_translation.pth",
                criterion=loss_func,
                optimizer=optimizer,
                n_epochs=20,
                phases=["train", "test"],
                )
logging.info("Started Training")
model_run.train_step()

INFO:root:Started Training


EPOCH: 1 out of 20
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
	MODE: train : LOSS: 2.2616491317749023 : ACCURACY: 0.2454225718975067
||||||||||
	MODE: test : LOSS: 1.8279727697372437 : ACCURACY: 0.20923177897930145
Best epoch : 1
Saving model : fr_translation.pth at models/
EPOCH: 2 out of 20
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
	MODE: train : LOSS: 1.8015875816345215 : ACCURACY: 0.28499871492385864
||||||||||
	MODE: test : LOSS: 1.7092512845993042 : ACCURACY: 0.2178775817155838
Best epoch : 2
Saving model : fr_translation.pth at models/
EPOCH: 3 out of 20
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [20]:
for _ in range(5):
    df_sample = test_ds.df.sample(n=1)
    encoder_input = df_sample["en"].values[0]
    decoder_input = ""
    temperature=1
    for _ in range(10):
        print("_", end="")
        df_text = pd.DataFrame([encoder_input], columns=["en"])
        df_text["fr"] = decoder_input
        predict_ds = TranslationDataset(df=df_text,
                            src_lang="en", 
                            tgt_lang="fr", 
                            src_vocab_size=500, 
                            tgt_vocab_size=500,
                            min_frequency=20, 
                            mode="predict", 
                            src_vocab=train_ds.src_vocab, 
                            tgt_vocab=train_ds.tgt_vocab)
        predict_dl = DataLoader(dataset=predict_ds, 
                                batch_size=1,
                                shuffle=True, 
                                collate_fn=lambda x: collate_fn(x, mode="predict"))
        for idx, batch in enumerate(predict_dl):
            if idx == 0:
                logits, state = model_run.predict_step(predict_dl, if_y=False)
            else:
                logits, state =  model_run.predict_step(predict_dl, h_t_1=state,  
                                                        if_y=False)
            predicted_char_idx = torch.multinomial(F.softmax(logits[0, -1]/temperature), num_samples=1).item()
            predicted_char = train_ds.tgt_vocab.vocab_ctoi.get(predicted_char_idx, "<UNK>")
            if predicted_char == "<EOS>":
                break
            #print(predicted_char_idx, predicted_char)
            if predicted_char != "<UNK>":
                decoder_input = decoder_input + " " + predicted_char
    print()
    print("encoder input ::", encoder_input)
    print("translated output ::", decoder_input)
    print("original output ::", df_sample["fr"].values[0])

__________
encoder input :: I can't find Tom anywhere.
translated output ::  je ne pouvoir pas de Tom
original output :: Je ne trouve Tom nulle part.
__________
encoder input :: I was talking about you.
translated output ::  je parler de vous le de toi
original output :: Je parlais de toi.
__________
encoder input :: I'd rather stand than sit.
translated output ::  je vouloir m asseoir depuis que vous me tenir
original output :: Je préfère rester debout que d'être assis.
__________
encoder input :: You're very observant.
translated output ::  vous être fort
original output :: Vous êtes très observateur.
__________
encoder input :: I'll call back in twenty minutes.
translated output ::  je rappeler en minute
original output :: Je rappellerai dans vingt minutes.
