In [5]:
import pandas as pd
import numpy as np
import math
import spacy
import random
from tqdm import tqdm

In [6]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
#from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, BucketIterator
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
torch.__version__
 

'1.11.0'

In [9]:
torch.cuda.set_per_process_memory_fraction(0.33, 0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")

In [10]:
df = pd.read_csv('.//GERMAN_ENGLISH_TRANSLATION.csv')


In [11]:
df = df[['ENGLISH', 'GERMAN']]

In [12]:

df.head()

Unnamed: 0,ENGLISH,GERMAN
0,hi,hallo
1,hi,gru gott
2,run,lauf
3,wow,potzdonner
4,wow,donnerwetter


In [13]:
train_df, valid_df = train_test_split(df, test_size=0.1, shuffle=True, random_state=28)

In [14]:
len(train_df)

137538

In [15]:
max_len_en = 20
max_len_de = 20
random_test = "my name is gariman,sd,f,,, ,, , isn"

In [16]:
english_token = spacy.load("en_core_web_sm")
german_token = spacy.load("de_core_news_sm")

In [17]:
german = Field(tokenize = german_token, tokenizer_language = 'de', lower = True,
               init_token  = '<sos>', eos_token = '<eos>')

In [18]:
english = Field(tokenize = english_token, lower = True,
              init_token  = '<sos>', eos_token = '<eos>')

In [19]:
#[tok.text for tok in english_token(random_test)]

In [20]:
class Vocabulary:
    def __init__(self, freq_threshold= 2, lang = 'en_core_web_sm',preprocessor=None,
                reverse = False):
        self.itos = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        self.tokenizer = spacy.load(lang)
        self.reverse = reverse
        self.preprocessor = preprocessor
        self.freq_threshold = freq_threshold
        
    def __len__(self):
        return len(self.itos)
    
    def tokenize(self, text):
        if self.reverse:
            return [ token.text.lower() for token in self.tokenizer.tokenizer(text)][::-1]
        else:
            return [ token.text.lower() for token in self.tokenizer.tokenizer(text)]
        
    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = len(self.itos)
        
        for sentence in sentence_list:
            if self.preprocessor:
                sentence = self.preprocessor(sentence)
                
            for word in self.tokenize(sentence):
                if word in frequencies:
                    frequencies[word] += 1
                else:
                    frequencies[word] = 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
                    
                    

    def numericalize(self, text):
        tokenized_text = self.tokenize(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<unk>"]
            for token in tokenized_text
        ]

In [21]:
freq_threshold = 2
en_vocab = Vocabulary(freq_threshold=freq_threshold, reverse=False)
de_vocab = Vocabulary(freq_threshold=freq_threshold, lang="de_core_news_sm",  reverse=False)

In [22]:
en_vocab.build_vocabulary(train_df["ENGLISH"])
de_vocab.build_vocabulary(train_df["GERMAN"])

In [23]:
en_vocab.numericalize("hello everyone")

[2421, 883]

In [24]:
#train_dataset

In [25]:
class CustomTranslationDataset(Dataset):
    def __init__(self, df, en_vocab, de_vocab):
        super().__init__()
        self.df = df
        self.en_vocab = en_vocab
        self.de_vocab = de_vocab
        
    def __len__(self):
        return len(self.df)
    
    def get_numerical(self, sentence, vocab):
        """Numericalize given text using prebuilt vocab."""
        numericalized = [vocab.stoi["<sos>"]]   #vocab here is a class which is made using the voculary class
        numericalized.extend(vocab.numericalize(sentence))
        numericalized.append(vocab.stoi["<eos>"])
        return numericalized
    
    
    def __getitem__(self, index):
        en_numericalized = self.get_numerical(self.df.iloc[index]["ENGLISH"], self.en_vocab)
        de_numericalized = self.get_numerical(self.df.iloc[index]["GERMAN"], self.de_vocab)

        return torch.tensor(de_numericalized), torch.tensor(en_numericalized)

In [26]:
class CustomCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        src = [item[0] for item in batch]
        src = pad_sequence(src, batch_first=False, padding_value = self.pad_idx)
        
        target = [item[1] for item in batch]
        target = pad_sequence(target, batch_first=False, padding_value = self.pad_idx)
        
        return src, target

In [27]:
BATCH_SIZE = 16
train_dataset = CustomTranslationDataset(train_df, en_vocab, de_vocab)
valid_dataset = CustomTranslationDataset(valid_df, en_vocab, de_vocab)

train_loader = DataLoader(dataset = train_dataset,
                          batch_size=BATCH_SIZE,
                          collate_fn=CustomCollate(pad_idx=en_vocab.stoi["<pad>"]),
                          num_workers=0)

valid_loader = DataLoader(dataset = valid_dataset,
                          batch_size=BATCH_SIZE,
                          collate_fn=CustomCollate(pad_idx=en_vocab.stoi["<pad>"]),
                          num_workers=0)

In [47]:
fun_de = np.vectorize(lambda x: de_vocab.itos[x])
fun_en = np.vectorize(lambda x: en_vocab.itos[x])

In [48]:
#this is to check is everything is working or not
total_steps = len(train_loader)
total_steps

8597

In [49]:

#t.next()[1][1].shape, t.next()[0][1].shape

In [30]:
class Encoder(nn.Module):
    def __init__(self, input_size,embedding_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)

        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        outputs, (hidden_state, cell_state) = self.lstm(x)
        
        return hidden_state, cell_state



In [31]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout=0.2):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x, hidden_state, cell_state):
        x = x.unsqueeze(0)
        x = self.embedding(x)
        x = self.dropout(x)
        outputs, (hidden_state, cell_state) = self.lstm(x, (hidden_state, cell_state))
        preds = self.fc(outputs.squeeze(0))
        return preds, hidden_state, cell_state

In [32]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = target.shape[1]
        #(target_len, N)
        target_len = target.shape[0]
        target_vocab_size = len(en_vocab) #doubt here check again
        #ouputs
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        #ecnode source text using encoder
        hidden_state, cell_state = self.encoder(source)
        input = target[0,:] #check again
        for t in range(1, target_len):
            output, hidden_state, cell_state = self.decoder(input, hidden_state, cell_state)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess
            
        return outputs #retunrs an array of all batach sizes

In [33]:
num_epochs = 80
learning_rate = 0.001


In [34]:
load_model = False
input_dim_encoder = len(de_vocab)
input_dim_decoder = len(en_vocab)
output_dim = len(en_vocab)

In [35]:
embedding_size = 124
hidden_dim = 512
num_layers = 3
dropout = 0.5

In [36]:
steps = 0
encoder = Encoder(input_dim_encoder, embedding_size, hidden_dim, num_layers, dropout).to(device)
decoder = Decoder(input_dim_decoder, embedding_size, hidden_dim, num_layers, dropout).to(device)

In [37]:
model = Seq2Seq(encoder, decoder).to(device)

In [38]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(15937, 124)
    (lstm): LSTM(124, 512, num_layers=3, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(9720, 124)
    (lstm): LSTM(124, 512, num_layers=3, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=512, out_features=9720, bias=True)
  )
)

In [39]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 19,186,068 trainable parameters


In [40]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=en_vocab.stoi["<pad>"])

In [41]:
epoch_loss = 0
num_epochs=50

In [42]:
def Train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    model.train()
    for batch_idx, batch in tqdm(enumerate(iterator), ncols = 75, desc ="Train", total = len(iterator)):
        #print(batch_idx)
        src = batch[0].to(device)
        trg = batch[1].to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        epoch_loss += loss.item()
#         if batch_idx % 500 == 0:
#             print(f"epoch = {epoch+1} / {num_epochs}, step {batch_idx+1}/ {total_steps}")
        
    return epoch_loss / len(iterator)


In [43]:
def Eval(model, iterator, criterion):
    epoch_loss = 0
    model.eval()
    for batch_idx, batch in tqdm(enumerate(iterator), ncols = 75, desc ="Valid", total = len(iterator)):
        #print(batch_idx)
         with torch.no_grad():
            src = batch[0].to(device)
            trg = batch[1].to(device)
            
            output = model(src, trg, 0)

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()
#         if batch_idx % 500 == 0:
#             print(f"epoch = {epoch+1} / {num_epochs}, step {batch_idx+1}/ {total_steps}")
        
    return epoch_loss / len(iterator)


In [44]:
seq_len_en = 50

def inference(model, sentence):
    model.eval()
    result = []

    with torch.no_grad():
        sentence = sentence.to(device)
        
        hidden_state, cell_state = model.encoder(sentence)

        # First input to decoder is "<sos>"
        inp = torch.tensor([en_vocab.stoi["<sos>"]]).to(device)

        # Decode the encoded vector using decoder until max length is reached or <eos> is generated.
        for t in range(1, seq_len_en):
            output, hidden_state, cell_state = model.decoder(inp, hidden_state, cell_state)
            pred = output.argmax(1)
            if pred == en_vocab.stoi["<eos>"]:
                break
            result.append(en_vocab.itos[pred.item()])
            inp = pred
            
    return " ".join(result)

In [50]:
model_path = 'D:\\code\\deep learning\\torch\\machine translation\\best_model\\best_model.pt'
model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [51]:
for sample_batch in valid_loader:
    break




best_valid_loss = 2.7257101868735196
sample_source = ' '.join([word for word in fun_de(sample_batch[0][:, 10]) if word not in ["<pad>", "<sos>", "<eos>"]])
sample_target = ' '.join([word for word in fun_en(sample_batch[1][:, 10]) if word not in ["<pad>", "<sos>", "<eos>"]])
print(sample_source)
print(sample_target)
print(inference(model, sample_batch[0][:, 10].reshape(-1, 1)))



for epoch in range(num_epochs):
    print(f"Epoch [{epoch} / {num_epochs}]")
    train_loss = Train(model, train_loader, optimizer, criterion)
    valid_loss = Eval(model, valid_loader, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_path)
        
        
    print(f'\t Train Loss: {train_loss:.10f}')
    print(f'\t Val. Loss: {valid_loss:.10f}')
    print(sample_source)
    print(sample_target)
    print(inference(model, sample_batch[0][:, 10].reshape(-1, 1)))
    

ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i could could shopping you
Epoch [0 / 50]


Train: 100%|███████████████████████████| 8597/8597 [26:37<00:00,  5.38it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:03<00:00, 14.99it/s]


	 Train Loss: 2.3482508123
	 Val. Loss: 2.7896505874
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i could have shopping you
Epoch [1 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:15<00:00,  5.67it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:02<00:00, 15.20it/s]


	 Train Loss: 2.2437910526
	 Val. Loss: 2.7725269126
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i could could with you you
Epoch [2 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:09<00:00,  5.70it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:02<00:00, 15.25it/s]


	 Train Loss: 2.1785800007
	 Val. Loss: 2.7610205160
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i could could her you you
Epoch [3 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:08<00:00,  5.70it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:02<00:00, 15.17it/s]


	 Train Loss: 2.1202134404
	 Val. Loss: 2.7550723088
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i could could her with you
Epoch [4 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:08<00:00,  5.70it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:03<00:00, 15.16it/s]


	 Train Loss: 2.0699768247
	 Val. Loss: 2.7441804246
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i could could able you you
Epoch [5 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:06<00:00,  5.71it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:02<00:00, 15.30it/s]


	 Train Loss: 2.0243652082
	 Val. Loss: 2.7395828707
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i could could gone you
Epoch [6 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:10<00:00,  5.69it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:02<00:00, 15.22it/s]


	 Train Loss: 1.9832825238
	 Val. Loss: 2.7451251467
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i had been gone with you
Epoch [7 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:06<00:00,  5.71it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:02<00:00, 15.22it/s]


	 Train Loss: 1.9458151176
	 Val. Loss: 2.7360986996
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i had ve gone you you
Epoch [8 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:04<00:00,  5.71it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:02<00:00, 15.18it/s]


	 Train Loss: 1.9138402583
	 Val. Loss: 2.7257101869
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i had been with with you
Epoch [9 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:08<00:00,  5.70it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:03<00:00, 15.13it/s]


	 Train Loss: 1.8843591763
	 Val. Loss: 2.7331995455
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i had could with you you
Epoch [10 / 50]


Train: 100%|███████████████████████████| 8597/8597 [25:11<00:00,  5.69it/s]
Valid: 100%|█████████████████████████████| 956/956 [01:05<00:00, 14.55it/s]


	 Train Loss: 1.8531249394
	 Val. Loss: 2.7303715337
ich wunschte ich hatte mit ihr gehen konnen
i wish i could have gone with her
i wish i d could her with you
Epoch [11 / 50]


Train:   1%|▏                            | 44/8597 [00:08<26:38,  5.35it/s]


KeyboardInterrupt: 

In [52]:
best_valid_loss

2.7257101868735196

In [62]:
valid_loss = Eval(model, valid_loader, criterion)

Valid: 100%|█████████████████████████████| 956/956 [00:57<00:00, 16.75it/s]


In [63]:
valid_loss

2.818299176927391