In [2]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset


In [3]:
dataset = load_dataset("bentrevett/multi30k")

train = dataset['train']
test = dataset['test']
val = dataset['validation']

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 29000/29000 [00:00<00:00, 509723.53 examples/s]
Generating validation split: 100%|██████████| 1014/1014 [00:00<00:00, 44015.32 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 47244.33 examples/s]


In [4]:
df_train = train.to_pandas()
df_test = test.to_pandas()
df_val = val.to_pandas()

In [5]:
df_train.shape,df_test.shape,df_val.shape

((29000, 2), (1000, 2), (1014, 2))

In [6]:
df_train.head()

Unnamed: 0,en,de
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...
3,A man in a blue shirt is standing on a ladder ...,Ein Mann in einem blauen Hemd steht auf einer ...
4,Two men are at the stove preparing food.,Zwei Männer stehen am Herd und bereiten Essen zu.


In [8]:
from transformers import AutoTokenizer

In [None]:
!python -m spacy download  --quiet
!python -m spacy download de --quiet

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import spacy

In [12]:
spacy_english = spacy.load('en_core_web_sm')
spacy_german = spacy.load('de_core_news_sm')

In [20]:
def tokenize_german(text):
    tokens = []
    for token in spacy_german.tokenizer(text):
        # print(token)
        tokens.append(token.text.lower())
    
    return ['<sos>'] + tokens + ['<eos>']  

In [21]:
def tokenize_english(text):
    tokens = []
    for token in spacy_english.tokenizer(text):
        tokens.append(token.text.lower())
    
    return ['<sos>'] + tokens + ['<eos>']

In [23]:
english_tokens = [tokenize_english(sentence) for sentence in df_train['en'].tolist()]
german_tokens = [tokenize_english(sentence) for sentence in df_train['de'].tolist()]

In [24]:
english_tokens

[['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 ['<sos>',
  'several',
  'men',
  'in',
  'hard',
  'hats',
  'are',
  'operating',
  'a',
  'giant',
  'pulley',
  'system',
  '.',
  '<eos>'],
 ['<sos>',
  'a',
  'little',
  'girl',
  'climbing',
  'into',
  'a',
  'wooden',
  'playhouse',
  '.',
  '<eos>'],
 ['<sos>',
  'a',
  'man',
  'in',
  'a',
  'blue',
  'shirt',
  'is',
  'standing',
  'on',
  'a',
  'ladder',
  'cleaning',
  'a',
  'window',
  '.',
  '<eos>'],
 ['<sos>',
  'two',
  'men',
  'are',
  'at',
  'the',
  'stove',
  'preparing',
  'food',
  '.',
  '<eos>'],
 ['<sos>',
  'a',
  'man',
  'in',
  'green',
  'holds',
  'a',
  'guitar',
  'while',
  'the',
  'other',
  'man',
  'observes',
  'his',
  'shirt',
  '.',
  '<eos>'],
 ['<sos>', 'a', 'man', 'is', 'smiling', 'at', 'a', 'stuffed', 'lion', '<eos>'],
 ['<sos>',
  'a',
  'trendy',
  'girl',
  'talking',
  'on',
  'her',
  'cellphon

In [25]:
german_tokens

[['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'i',
  'm',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 ['<sos>',
  'mehrere',
  'männer',
  'mit',
  'schutzhelmen',
  'bedienen',
  'ein',
  'antriebsradsystem',
  '.',
  '<eos>'],
 ['<sos>',
  'ein',
  'kleines',
  'mädchen',
  'klettert',
  'in',
  'ein',
  'spielhaus',
  'aus',
  'holz',
  '.',
  '<eos>'],
 ['<sos>',
  'ein',
  'mann',
  'in',
  'einem',
  'blauen',
  'hemd',
  'steht',
  'auf',
  'einer',
  'leiter',
  'und',
  'putzt',
  'ein',
  'fenster',
  '.',
  '<eos>'],
 ['<sos>',
  'zwei',
  'männer',
  'stehen',
  'am',
  'herd',
  'und',
  'bereiten',
  'essen',
  'zu',
  '.',
  '<eos>'],
 ['<sos>',
  'ein',
  'mann',
  'in',
  'grün',
  'hält',
  'eine',
  'gitarre',
  ',',
  'während',
  'der',
  'andere',
  'mann',
  'sein',
  'hemd',
  'ansieht',
  '.',
  '<eos>'],
 ['<sos>',
  'ein',
  'mann',
  'lächelt',
  'einen',
  'ausgestopften',
  'löwen',
  'an',
  '.',

In [40]:
import re

In [52]:
def build_vocab(tokens):
    vocab = {
        '<pad>': 0,
        '<unk>': 1,
        '<sos>': 2,
        '<eos>': 3
    }
    
    for sentence in tokens:
        for word in sentence:
            if word not in vocab:
                word = re.sub(r'[.,?/!-_~"\';:]+', '', word)
                if word.strip():
                    vocab[word] = len(vocab)
    
    return vocab                

In [53]:
vocab_en = build_vocab(english_tokens)
vocab_de = build_vocab(german_tokens)

In [54]:
len(vocab_en)

9690

In [55]:
len(vocab_de)

17841

In [56]:
vocab_en

{'<pad>': 0,
 '<unk>': 1,
 '<sos>': 2,
 '<eos>': 3,
 'two': 4,
 'young': 5,
 'white': 6,
 'males': 7,
 'are': 8,
 'outside': 9,
 'near': 10,
 'many': 11,
 'bushes': 12,
 'several': 13,
 'men': 14,
 'in': 15,
 'hard': 16,
 'hats': 17,
 'operating': 18,
 'a': 6500,
 'giant': 20,
 'pulley': 21,
 'system': 22,
 'little': 23,
 'girl': 24,
 'climbing': 25,
 'into': 26,
 'wooden': 27,
 'playhouse': 28,
 'man': 29,
 'blue': 30,
 'shirt': 31,
 'is': 32,
 'standing': 33,
 'on': 34,
 'ladder': 35,
 'cleaning': 36,
 'window': 37,
 'at': 38,
 'the': 39,
 'stove': 40,
 'preparing': 41,
 'food': 42,
 'green': 43,
 'holds': 44,
 'guitar': 45,
 'while': 46,
 'other': 47,
 'observes': 48,
 'his': 49,
 'smiling': 50,
 'stuffed': 51,
 'lion': 52,
 'trendy': 53,
 'talking': 54,
 'her': 55,
 'cellphone': 56,
 'gliding': 57,
 'slowly': 58,
 'down': 59,
 'street': 60,
 'woman': 61,
 'with': 62,
 'large': 63,
 'purse': 64,
 'walking': 65,
 'by': 66,
 'gate': 67,
 'boys': 68,
 'dancing': 69,
 'poles': 70,
 'mid

In [57]:
vocab_de

{'<pad>': 0,
 '<unk>': 1,
 '<sos>': 2,
 '<eos>': 3,
 'zwei': 1867,
 'junge': 5,
 'weiße': 6,
 'männer': 7,
 'sind': 8,
 'i': 9,
 'm': 10,
 'freien': 11,
 'in': 12,
 'der': 13,
 'nähe': 14,
 'vieler': 15,
 'büsche': 16,
 'mehrere': 17,
 'mit': 18,
 'schutzhelmen': 19,
 'bedienen': 20,
 'ein': 13548,
 'antriebsradsystem': 22,
 'kleines': 23,
 'mädchen': 24,
 'klettert': 25,
 'spielhaus': 26,
 'aus': 27,
 'holz': 28,
 'mann': 29,
 'einem': 30,
 'blauen': 31,
 'hemd': 32,
 'steht': 33,
 'auf': 13993,
 'einer': 35,
 'leiter': 36,
 'und': 37,
 'putzt': 38,
 'fenster': 39,
 'stehen': 40,
 'am': 16472,
 'herd': 42,
 'bereiten': 43,
 'essen': 44,
 'zu': 45,
 'grün': 46,
 'hält': 47,
 'eine': 48,
 'gitarre': 9456,
 'während': 50,
 'andere': 51,
 'sein': 52,
 'ansieht': 53,
 'lächelt': 54,
 'einen': 55,
 'ausgestopften': 56,
 'löwen': 57,
 'an': 58,
 'schickes': 59,
 'spricht': 60,
 'dem': 61,
 'handy': 62,
 'sie': 63,
 'langsam': 64,
 'die': 65,
 'straße': 66,
 'entlangschwebt': 67,
 'frau': 68,

In [61]:
print([word for word, index in vocab_de.items() if len(word) <= 1])

['i', 'm', 's', 'u', 't', '„', '“', 'e', 'a', 'k', 'r', 'p', '–', '”', 'v', 'h', 'd', 'f', 'à', 'o']


In [66]:
vocab_de_cleaned = {word: index for word, index in vocab_de.items() if len(word) >= 2}

In [68]:
len(vocab_de_cleaned)

17821

In [70]:
print([word for word, index in vocab_de_cleaned.items() if len(word) <= 3])

['in', 'der', 'mit', 'ein', 'aus', 'auf', 'und', 'am', 'zu', 'an', 'dem', 'sie', 'die', 'tor', 'von', 'vor', 'ist', 'das', 'du', 'ich', 'wie', 'hut', 'den', 'um', 'bei', 'arm', 'bis', 'des', 'eis', 'rot', 'für', 'er', 'hat', 'hof', 'ihr', 'bus', 'typ', 'weg', 'tag', 'ihm', 'ab', 'da', 'es', 'lkw', 'tür', 'so', 'tut', 'als', 'ob', 'ins', 'kuh', 'wo', 'uns', 'gap', 'hin', 'see', 'zum', 'zug', 'op', 'zoo', 'ihn', 'mal', 'amc', 'vom', 'zur', 'gut', 'bar', 'rad', 'ton', 'her', 'elf', 'for', 'was', 'wer', 'ort', 'bud', 'bzw', 'new', 'air', 'kai', 'go', 'dj', 'and', 'man', 'fuß', 'übt', 'tim', 'ohr', 'mir', 'mia', 'tee', 'ast', 'ski', 'bad', 'pep', 'heu', 'us', 'the', 'mud', 'los', 'vw', 'ess', 'war', 'iro', 'dhl', 'nur', 'la', 'not', 'sms', 'st', 'nun', 'my', 'em', 'bay', 'bh', 'fu', 'car', 'usa', 'fee', 'cd', 'sei', 'uhr', 'atm', 'wir', 'joe', 'art', 'wii', 'no', 'boa', 'rod', 'pat', 'to', 'dr', 'axt', 'neu', 'qa', 'tun', 'mr', 'ans', 'nah', 'ou', 'uw', 'dre', 'top', 'bau', 'ums', 'end', 's

In [73]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [72]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 32
learning_rate = 0.001
num_epochs = 50

In [78]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]

In [80]:
dataset = CustomDataset(df_train['en'], df_train['de'])
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [83]:
index_to_word_de = { index: word for word, index in vocab_de.items() }
index_to_word_en = { index: word for word, index in vocab_en.items() }

In [85]:
def convert_to_indices(text, mapping):
    numeric = []
    for token in text:
        if token in mapping:
            numeric.append(mapping[token])
        else:
            numeric.append(mapping['<unk>'])
    
    return numeric            

In [89]:
sentences_en = [sentence for sentence in df_train['en']]
sentences_de = [sentence for sentence in df_train['de']]

In [90]:
numeric_sentences_en = [convert_to_indices(sentence, vocab_en) for sentence in sentences_en]
numeric_sentences_de = [convert_to_indices(sentence, vocab_de_cleaned) for sentence in sentences_de]

In [92]:
max_len_en = max(len(sentence) for sentence in numeric_sentences_en)
max_len_de = max(len(sentence) for sentence in numeric_sentences_de)

sequence_size = max(max_len_en, max_len_de)

In [93]:
sequence_size

254

In [94]:
def apply_padding(text):
    return text + ([0]*(sequence_size - len(text)))

In [95]:
padded_numeric_sentences_en = [apply_padding(sentence) for sentence in numeric_sentences_en]
padded_numeric_sentences_de = [apply_padding(sentence) for sentence in numeric_sentences_de]

In [96]:
padded_numeric_sentences_en[0]

[1,
 1,
 5023,
 1,
 1,
 5023,
 2965,
 8548,
 1,
 1,
 1,
 1,
 1,
 172,
 784,
 5008,
 1,
 3895,
 6500,
 1,
 5008,
 9690,
 1,
 6500,
 3019,
 5008,
 1,
 5023,
 2965,
 784,
 9690,
 172,
 9174,
 5008,
 1,
 8548,
 5008,
 6500,
 3019,
 1,
 3895,
 6500,
 8548,
 1,
 1,
 2966,
 2965,
 9690,
 1,
 5008,
 9690,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [97]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.tag = True
        
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)
        self.LSTM = nn.LSTM(self.embedding_size, self.hidden_size, self.num_layers, dropout=p)
        
    def forward(self, x):
        embedding = self.dropout(self.embedding(x))
        outputs, (hidden_state, cell_state) = self.LSTM(embedding)
        
        return hidden_state, cell_state        

In [98]:
input_size_encoder = len(vocab_en)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 3
encoder_dropout = float(0.5)

In [99]:
encoder = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, encoder_dropout).to(device)

In [100]:
print(encoder)

Encoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(9690, 300)
  (LSTM): LSTM(300, 1024, num_layers=3, dropout=0.5)
)


In [101]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
        super().__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.dropout = nn.Dropout(p)
        self.tag = True
        
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)
        self.LSTM = nn.LSTM(self.embedding_size, hidden_size, num_layers, dropout=p)
        
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, x, hidden_state, cell_state):
        x = x.unsqueeze(0)
        
        embedding = self.dropout(self.embedding(x))
        
        outputs, (hidden_state, cell_state) = self.LSTM(embedding, (hidden_state, cell_state))
        
        predictions = self.fc(outputs)
        
        predictions = predictions.squeeze(0)
        
        return predictions, hidden_state, cell_state        
        
        

In [102]:
input_size_decoder = len(vocab_de_cleaned)
decoder_embedding_size = 300
decoder_dropout = float(0.5)
output_size = input_size_decoder

In [103]:
decoder = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, num_layers, decoder_dropout, output_size)

In [104]:
decoder

Decoder(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(17821, 300)
  (LSTM): LSTM(300, 1024, num_layers=3, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=17821, bias=True)
)

In [105]:
import random

In [113]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target):
        batch_size = source.shape[1]
        
        target_len = target.shape[0]
        target_vocab_size = len(vocab_de_cleaned)
        
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device) 
        
        hidden_state_encoder, cell_state_encoder = self.encoder(source)
        
        x = target[0]
        
        for i in range(1, target_len):
            output, hidden_state_decoder, cell_state_decoder = self.decoder(x, hidden_state_encoder, cell_state_encoder) 
            outputs[i] = output
            best_guess = output.argmax(1)
            x = target[i]
            
        return outputs            

In [114]:
model = Seq2Seq(encoder, decoder)
model

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(9690, 300)
    (LSTM): LSTM(300, 1024, num_layers=3, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(17821, 300)
    (LSTM): LSTM(300, 1024, num_layers=3, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=17821, bias=True)
  )
)

In [115]:
PAD_IDX = 0
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [130]:
def train(model, source, target):
    model.train()
    
    epoch_loss = 0
    num_batches = len(source) // batch_size
    
    for batch_index in range(num_batches):
        start = batch_index * batch_size
        end = (batch_index + 1) * batch_size
        
        source_batch = source[start:end]
        target_batch = target[start:end]
        
        print(f"Source batch dtype: {type(source_batch)}, Device: {source_batch.device if hasattr(source_batch, 'device') else 'Not on device'}")
        print(f"Target batch dtype: {type(target_batch)}, Device: {target_batch.device if hasattr(target_batch, 'device') else 'Not on device'}")
        
        
        print(f"Source batch before tensor conversion: {type(source_batch)}")
        print(f"Target batch before tensor conversion: {type(target_batch)}")
        source_batch = torch.tensor(source_batch, dtype=torch.long).to(device)
        target_batch = torch.tensor(target_batch, dtype=torch.long).to(device)
        print(f"Source batch dtype: {source_batch.dtype}, Device: {source_batch.device}")
        print(f"Target batch dtype: {target_batch.dtype}, Device: {target_batch.device}")
        
        optimizer.zero_grad()
        
        output = model(source_batch, target_batch)
        
        print("Model output shape:", output.shape)

        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        target_batch = target_batch[1:].view(-1)
        
        loss = criterion(output, target_batch)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / num_batches        
        

In [131]:
for epoch in range(num_epochs):
    train_loss = train(model, padded_numeric_sentences_en, padded_numeric_sentences_de)
    print(f"Epoch {epoch+1}/{num_epochs}\tTrain Loss: {train_loss:.4f}")

Source batch dtype: <class 'list'>, Device: Not on device
Target batch dtype: <class 'list'>, Device: Not on device
Source batch before tensor conversion: <class 'list'>
Target batch before tensor conversion: <class 'list'>


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
