In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import pickle
from nltk.translate.bleu_score import corpus_bleu
import random
from pyvi import ViTokenizer, ViPosTagger
from nltk.translate.bleu_score import sentence_bleu
import spacy
import torchtext
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [40]:
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

english_sentences_train = read_text_file('/home/mediboina.v/Vikash/Deeplearning/neural-machine-translation/data/train.en')
vietnamese_sentences_train = read_text_file('/home/mediboina.v/Vikash/Deeplearning/neural-machine-translation/data/train.vi')

In [41]:
english_sentences_val = read_text_file('/home/mediboina.v/Vikash/Deeplearning/neural-machine-translation/data/tst2013.en')
vietnamese_sentences_val = read_text_file('/home/mediboina.v/Vikash/Deeplearning/neural-machine-translation/data/tst2013.vi')

In [19]:
nlp_en = spacy.load("en_core_web_md")
def tokenize_vietnamese(text):
    return ViTokenizer.tokenize(text).split()

# Function to tokenize English text using spaCy
def tokenize_english(input_texts):
    return [[token.text for token in nlp_en(text)] for text in input_texts]

def tokenize(vietnamese_sentences,english_sentences):


     # Tokenize all sentences
    tokenized_english = tokenize_english(english_sentences)
    tokenized_vietnamese = [tokenize_vietnamese(sentence) for sentence in vietnamese_sentences]

    return tokenized_english,tokenized_vietnamese

In [42]:
tokenized_english, tokenized_vietnamese =tokenize(english_sentences_train,vietnamese_sentences_train)


KeyboardInterrupt: 

In [22]:
# Save tokenized data to pickle files
with open('data/train_en_toknized.pkl', 'wb') as f:
        pickle.dump(tokenized_english, f)

with open('data/train_vi_toknized.pkl', 'wb') as f:
        pickle.dump(tokenized_vietnamese, f)

In [43]:
val_tokenized_english, val_tokenized_vietnamese =tokenize(english_sentences_val,vietnamese_sentences_val)

In [44]:
# Save tokenized data to pickle files
with open('data/test_en_toknized.pkl', 'wb') as f:
        pickle.dump(val_tokenized_english, f)

with open('data/test_vi_toknized.pkl', 'wb') as f:
        pickle.dump(val_tokenized_vietnamese, f)

In [45]:

with open('data/train_en_toknized.pkl', 'rb') as f:
        english_tokenized = pickle.load(f)

    # Load German tokenized sentences from the pickle file
with open('data/train_vi_toknized.pkl', 'rb') as f:
        vi_tokenized = pickle.load(f)

with open('data/test_en_toknized.pkl', 'rb') as f:
        val_english_tokenized = pickle.load(f)

    # Load vi tokenized sentences from the pickle file
with open('data/test_vi_toknized.pkl', 'rb') as f:
        val_vi_tokenized = pickle.load(f)


all_english = english_tokenized + val_english_tokenized 
all_vi = vi_tokenized + val_vi_tokenized
# Create vocabulary
eng_vocab = torchtext.vocab.build_vocab_from_iterator(all_english)
ger_vocab = torchtext.vocab.build_vocab_from_iterator(all_vi)
# Save vocabularies
with open('data/en_vocab.pkl', 'wb') as f:
    pickle.dump(eng_vocab, f)

with open('data/vi_vocab.pkl', 'wb') as f:
    pickle.dump(ger_vocab, f)
len(eng_vocab),len(ger_vocab)


(24367, 55694)

In [46]:
def remove_punctuation_from_tokenized_data(tokenized_data):
    return [[token for token in sentence if token not in [',', '.']] for sentence in tokenized_data]


In [47]:
with open('data/en_vocab.pkl', 'rb') as f:
    eng_vocab = pickle.load(f)

# Load viman tokenized sentences from the pickle file
with open('data/vi_vocab.pkl', 'rb') as f:
    vi_vocab = pickle.load(f)

if '<unk>' not in eng_vocab:
    eng_vocab.insert_token('<unk>', 0)  # Adjust the index if needed

if '<unk>' not in vi_vocab:
    vi_vocab.insert_token('<unk>', 0)  # Adjust the index if needed

In [48]:
def read_all_data_from_pickle(file_name):
    data = []
    try:
        with open(file_name, 'rb') as f:
            while True:
                try:
                    data.extend(pickle.load(f))
                except EOFError:
                    break
    except FileNotFoundError:
        print(f"File not found: {file_name}")
    return data

In [49]:
def preProcess( english_tokenized_file_name,vi_tokenized_file_name,eng_vocab,ger_vocab):
    english_tokenized=read_all_data_from_pickle(english_tokenized_file_name)
    vi_tokenized=read_all_data_from_pickle(vi_tokenized_file_name)

    english_tokenized = remove_punctuation_from_tokenized_data(english_tokenized)
    vi_tokenized = remove_punctuation_from_tokenized_data(vi_tokenized)
    # Convert words to indices
    english_indices = [torch.tensor([eng_vocab[word] if word in eng_vocab else eng_vocab['<unk>'] for word in sentence], dtype=torch.long) for sentence in english_tokenized]
    vi_indices = [torch.tensor([ger_vocab[word] if word in ger_vocab else eng_vocab['<unk>'] for word in sentence], dtype=torch.long) for sentence in vi_tokenized]

    # Pad sequences to the same length
    max_len = max(max(len(seq) for seq in english_indices), max(len(seq) for seq in vi_indices))
    english_padded = pad_sequence([torch.cat([seq, torch.zeros(max_len - len(seq))], dim=0) for seq in english_indices], batch_first=True)
    vi_padded = pad_sequence([torch.cat([seq, torch.zeros(max_len - len(seq))], dim=0) for seq in vi_indices], batch_first=True)

    return english_padded,vi_padded, max_len

In [50]:
train_en_tokenize_file_name='data/train_en_toknized.pkl'
train_vi_tokenize_file_name='data/train_vi_toknized.pkl'
val_en_tokenize_file_name='data/test_en_toknized.pkl'
val_vi_tokenize_file_name='data/test_vi_toknized.pkl'

In [51]:
train_english_padded,train_vi_padded,max_len = preProcess(train_en_tokenize_file_name,train_vi_tokenize_file_name,eng_vocab,ger_vocab)
val_english_padded,val_vi_padded,max_len = preProcess(val_en_tokenize_file_name,val_vi_tokenize_file_name,eng_vocab,ger_vocab)


In [52]:
len(train_english_padded),len(train_vi_padded),len(val_english_padded),len(val_vi_padded)

(133317, 133317, 1268, 1268)

In [53]:
train_english_padded = train_english_padded.to(device)
train_vi_padded = train_vi_padded.to(device)
val_english_padded = val_english_padded.to(device)
val_vi_padded = val_vi_padded.to(device)

In [54]:
# Define the dataset
class TranslationDataset(Dataset):
    def __init__(self, english_data, vi_data):
        self.english_data = english_data
        self.vi_data = vi_data

    def __len__(self):
        return len(self.english_data)

    def __getitem__(self, idx):
        return self.english_data[idx], self.vi_data[idx]



In [55]:
print(len(train_english_padded),len(train_vi_padded))
# Create DataLoader
train_dataset = TranslationDataset(train_english_padded, train_vi_padded)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Create DataLoader
val_dataset = TranslationDataset(val_english_padded, val_vi_padded)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True)

133317 133317


In [56]:

class EncoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super(EncoderLSTM, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)

    def forward(self, x):
        # x shape: (seq_length, N)

        embedding = self.dropout(self.embedding(x.long()))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)
        return hidden, cell

class DecoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout):
        super(DecoderLSTM, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.output_size=output_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) but we want (1, N)
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x.long()))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)
        # predictions shape: (1, N, length_of_vocab)

        predictions = predictions.squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio = 0.5):
        # source shape: (src_len, N)
        # target shape: (trg_len, N)

        trg_len, N = target.shape
        trg_vocab_size = self.decoder.output_size

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, N, trg_vocab_size).to(device)

        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(source)

        # first input to the decoder is the <sos> tokens
        x = target[0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            outputs[t] = output

            # decide if we will use teacher forcing or not
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_forcing_ratio else best_guess
        
        return outputs


In [57]:
# Initialize models
INPUT_DIM = len(eng_vocab)  # Assuming eng_vocab is your English vocabulary
OUTPUT_DIM = len(vi_vocab)  # Assuming ger_vocab is your German vocabulary
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 1024
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
learning_rate=0.001
print(INPUT_DIM,OUTPUT_DIM)

24368 55695


In [62]:

enc = EncoderLSTM(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
dec = DecoderLSTM(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, OUTPUT_DIM, N_LAYERS, DEC_DROPOUT).to(device)
start_epoch=0
model = Seq2Seq(enc, dec).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0) 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
checkpoint = torch.load('data/model_checkpoint.pth')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch'] + 1 

In [59]:
# Training loop
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    
    epoch_loss = 0
    for i, (src, trg) in enumerate(iterator):
        
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        
        output = model(src, trg)
       
        # trg = [trg len, batch size]
        # output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
       
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1).long()
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
def evaluate(model, iterator, criterion,ger_vocab):
    model.eval()
    
    epoch_loss = 0
    predictions, targets = [], []
    with torch.no_grad():
        for _, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg, 0)  # turn off teacher forcing
            

            # Convert output to token indices
            output_indices = output.argmax(2)  # Choose the word with highest probability
            output_sentences = tensor_to_sentence(output_indices[:, 1:], ger_vocab)
            predictions.extend(output_sentences)

            # Convert trg to token strings, skipping <sos> token
            trg_sentences = tensor_to_sentence(trg[:, 1:], ger_vocab)
            targets.extend([[sent] for sent in trg_sentences])  # Wrap each sentence in another list


            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1).long()

            loss = criterion(output, trg)
            epoch_loss += loss.item()
    bleu=corpus_bleu( targets,predictions)
    return epoch_loss / len(iterator) , bleu


In [60]:
def tensor_to_sentence(tensor, vocab, pad_index=0, eos_index=None, sos_index=None):
    itos = vocab.get_itos()
    sentences = []
    
    for i in range(tensor.size(0)):  # Loop over each item in the batch
        sentence = []
        for idx in tensor[i]:
            if idx == pad_index or idx == eos_index or idx == sos_index:
                continue  # Skip pad, eos, and sos tokens
            sentence.append(itos[int(idx.item())])
        sentences.append(sentence)
    return sentences


In [63]:
# Training settings
N_EPOCHS = 50
CLIP = 1
best_val_loss = float('inf')
for epoch in range(start_epoch,N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss, val_bleu = evaluate(model, val_dataloader, criterion,vi_vocab)
    if valid_loss < best_val_loss:
        best_val_loss = valid_loss
        # Save the model
        torch.save(model.state_dict(), 'data/eng_to_vi_translation_model.pth')
        print("Saved Best Model")
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {valid_loss:.3f}, Valid Bleu: {val_bleu:.5f}')


# Save the model
torch.save(model.state_dict(), 'data/eng_to_vi_translation_model_end.pth')

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.32 GiB. GPU 0 has a total capacty of 31.74 GiB of which 273.62 MiB is free. Process 111737 has 16.31 GiB memory in use. Including non-PyTorch memory, this process has 15.16 GiB memory in use. Of the allocated memory 14.16 GiB is allocated by PyTorch, and 68.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [69]:
torch.save(model.state_dict(), 'data/eng_to_vi_translation_model.pth')

In [70]:
 print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {valid_loss:.3f}, Valid Bleu: {val_bleu:.5f}')

Epoch: 01, Train Loss: 6.297, Val. Loss: 6.798, Valid Bleu: 0.00015


In [71]:
state = {
    'epoch': epoch,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    # Include any other information you want
}
torch.save(state, 'data/model_checkpoint.pth')