In [1]:
import json
import pandas as pd
import numpy as np
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

from pickle import load
from numpy import array
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.utils.data
import math

In [2]:
batch_size = 64
max_len = 16
num_heads = 8

In [3]:
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [4]:
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

In [5]:
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

In [6]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [7]:
filename = "D:\data\deu.txt"
doc = load_doc(filename)
pairs = to_pairs(doc)
clean_pairs = clean_pairs(pairs)
save_clean_data(clean_pairs, 'english-german.pkl')
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[hello] => [hallo]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im ok] => [mir gehts gut]
[im ok] => [es geht mir gut]
[no way] => [unmoglich]
[no way] => [da

In [8]:
len(clean_pairs)

152820

In [9]:
for p in clean_pairs:
	if len(p) !=2:
		print(len(p))

In [10]:
from collections import Counter

word_freq = Counter()
for pair in clean_pairs:
	word_freq.update(pair[0].split())
	word_freq.update(pair[1].split())
word_freq

Counter({'tom': 60434,
         'ich': 40537,
         'the': 37212,
         'to': 34851,
         'you': 34260,
         'i': 32855,
         'a': 24014,
         'in': 21454,
         'ist': 21346,
         'nicht': 21081,
         'sie': 19804,
         'is': 18781,
         'du': 17570,
         'das': 17320,
         'was': 16128,
         'zu': 15622,
         'die': 14282,
         'es': 13890,
         'er': 13412,
         'he': 12141,
         'of': 11448,
         'der': 11165,
         'it': 10381,
         'that': 10358,
         'do': 9481,
         'have': 9247,
         'this': 9100,
         'hat': 9077,
         'me': 8858,
         'ein': 8730,
         'dass': 8274,
         'for': 7841,
         'im': 7812,
         'my': 7684,
         'wir': 7604,
         'habe': 7184,
         'an': 7087,
         'mary': 7063,
         'mir': 6942,
         'dont': 6851,
         'auf': 6567,
         'sich': 6529,
         'your': 6405,
         'mit': 6402,
         'are': 

In [11]:
min_word_freq = 3
words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
word_map = {k: v+1 for v, k in enumerate(words)}

#adding special tokens
word_map['<unk>'] = len(word_map) + 1
word_map['<start>'] = len(word_map) + 1
word_map['<end>'] = len(word_map) + 1
word_map['<pad>'] = 0

In [12]:
len(word_map)

16617

In [13]:
word_map['<unk>'], word_map['<start>'], word_map['<end>'], word_map['<pad>']

(16614, 16615, 16616, 0)

In [14]:
with open('word_map_corpus.json', 'w') as j:
	json.dump(word_map, j)

In [15]:
def encode_enc_inp(words, word_map):
	"""
	Encode a list of words into a tensor of word ids.
	
	parameter:
	words: list of words in the sentences
	word_map: dictionary mapping words to indices
	
	Returns:
	list: Encoded german words as a sequence of indices
	"""
	enc_c = [word_map.get(word, word_map['<unk>']) for word in words]
	enc_c += [word_map['<pad>']] * (max_len - len(words)) 
	return enc_c

def encode_dec_inp(words, word_map):
	"""
	Encode translate into a seq of indices using a word-to-index mapping.
	
	parameter:
	words: list of words in the translate sentences
	word_map: dictionary mapping words to indices
	
	Returns:
	"""
	enc_c = ([word_map['<start>']] + [word_map.get(word, word_map['<unk>']) for word in words] + 
			 [word_map['<end>']] + [word_map['<pad>']] * (max_len - len(words))
			 ) 
	return enc_c

In [16]:
#initialize an empty list to store encoded translate pairs
pairs_encoded = []
for pair in clean_pairs:
	english = encode_enc_inp(pair[0], word_map)
	german = encode_dec_inp(pair[1], word_map)
	pairs_encoded.append([english, german])

In [17]:
clean_pairs[1]

array(['hi', 'gru gott'], dtype='<U370')

In [18]:
# Print a sample of the encoded pairs to verify
print("Sample Encoded Pair:")
print(f"English: {pairs_encoded[0][0]}")
print(f"German: {pairs_encoded[0][1]}")


Sample Encoded Pair:
English: [16614, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
German: [16615, 16614, 207, 16614, 16614, 5266, 16616, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [19]:
#saving number coded wordmap
f_name = 'pairs_encoded.json'
with open(f_name, 'w') as p:
	json.dump(pairs_encoded, p)

In [20]:
max_len = 100  # This should be fixed and consistent across training

In [21]:
class TranslationDataset(Dataset):
    def __init__(self):
        self.pairs = json.load(open('pairs_encoded.json'))
        self.dataset_size = len(self.pairs)
    
    def __getitem__(self, index):
        enc_inp = torch.LongTensor(self.pairs[index][0])
        dec = torch.LongTensor(self.pairs[index][1])
        
        dec_inp = dec[:-1]
        dec_out = dec[1:]
        
        # Ensure all sequences are padded or truncated to max_len
        enc_inp = enc_inp[:max_len]
        dec_inp = dec_inp[:max_len]
        dec_out = dec_out[:max_len]

        return enc_inp, dec_inp, dec_out
    
    def __len__(self):
        return self.dataset_size

train_data = TranslationDataset()


In [22]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    enc_inp, dec_inp, dec_out = zip(*batch)

    # Pad all sequences to the fixed max_len
    enc_inp = pad_sequence([torch.tensor(seq).long() for seq in enc_inp], batch_first=True, padding_value=word_map['<pad>'])
    dec_inp = pad_sequence([torch.tensor(seq).long() for seq in dec_inp], batch_first=True, padding_value=word_map['<pad>'])
    dec_out = pad_sequence([torch.tensor(seq).long() for seq in dec_out], batch_first=True, padding_value=word_map['<pad>'])

    # Ensure padding is applied correctly for max_len
    enc_inp = torch.cat([enc_inp, torch.full((enc_inp.size(0), max_len - enc_inp.size(1)), word_map['<pad>'], dtype=torch.long)], dim=1)
    dec_inp = torch.cat([dec_inp, torch.full((dec_inp.size(0), max_len - dec_inp.size(1)), word_map['<pad>'], dtype=torch.long)], dim=1)
    dec_out = torch.cat([dec_out, torch.full((dec_out.size(0), max_len - dec_out.size(1)), word_map['<pad>'], dtype=torch.long)], dim=1)

    return enc_inp, dec_inp, dec_out


In [23]:
train_data = TranslationDataset()
batch_size = 16  # Set batch size (adjust based on memory availability)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [24]:
q_r=train_data[10]
q_r

(tensor([16614, 16614, 16614, 16614,  5266,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 tensor([16615, 16614,   207, 16614, 16614,  5266, 16616,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]),
 tensor([16614,   207, 16614, 16614,  5266, 16616,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]))

In [25]:
rev_word_map = {v: k for k,v in word_map.items()}

In [26]:
rev_word_map[5267]

'gentle'

In [27]:
def tensor_to_sentence(t, clean=False):
	q = t.detach().numpy()
	q_words = " ".join([rev_word_map[v] for v in q])
	if clean:
		q_words = q_words.replace("<pad>", "")
	return q_words

In [28]:
q_words = tensor_to_sentence(q_r[0])
r_words = tensor_to_sentence(q_r[1])
q_words, r_words

('<unk> <unk> <unk> <unk> o <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>',
 '<start> <unk> a <unk> <unk> o <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>')

In [29]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_size, pad_id):
        super(TokenEmbedding, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.init_weights()
        
    def init_weights(self):
        """Initialize embedding weights uniformly within [-0.1, 0.1]."""
        initrange = 0.1
        self.token_embedding.weight.data.uniform_(-initrange, initrange)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass to generate token embeddings.
        
        Args:
            x: Input tensor containing token indices.
        
        Returns:
            Tensor containing token embeddings.
        """
        x_embed = self.token_embedding(x)
        return x_embed


In [30]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        self.d_model = d_model
        self.pe = torch.zeros(max_len, d_model)
        
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        
    def forward(self, x):
        seq_len = x.size(1)  # Get current sequence length
        return self.pe[:, :seq_len].to(x.device)

In [31]:
class Embeddings(nn.Module):
    def __init__(self, vocab, embed_size, max_len):
        super(Embeddings, self).__init__()
        self.token_embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_size, padding_idx=vocab['<pad>'])
        self.pos_embedding = PositionalEncoding(embed_size, max_len + 2)  # Including <start> and <end>
        self.embed_size = embed_size
    
    def forward(self, x):
        token_embed = self.token_embedding(x) * math.sqrt(self.embed_size)  # Scale embeddings
        pos_embed = self.pos_embedding(x)[:, :x.size(1), :]
        return token_embed + pos_embed

In [32]:
class Transformer(nn.Module):
	
	def __init__(self, vocab, d_model=512, n_heads=8, num_encoder_layers=6, num_decoder_layers=6,
						 dim_feedforward=2048, dropout=0.1, max_len=15):
		super(Transformer, self).__init__()
		self.input_embedding = Embeddings(vocab, d_model, max_len)
		self.transformer = nn.Transformer(d_model=d_model,
												  nhead=n_heads,
												  num_encoder_layers=num_encoder_layers,
												  num_decoder_layers=num_decoder_layers,
												  dim_feedforward=dim_feedforward,
												  dropout=dropout,
												  batch_first=True)
		self.project_vocab_layer = nn.Linear(in_features=d_model, out_features=len(vocab))
		self.init_weights()
			
	def init_weights(self):
		initrange = 0.1
		self.project_vocab_layer.bias.data.zero_()
		self.project_vocab_layer.weight.data.uniform_(-initrange, initrange)
	
	def forward(self, enc_input, dec_input):
		x_enc_embed = self.input_embedding(enc_input.long())  # (batch_size, enc_seq_len, d_model)
		x_dec_embed = self.input_embedding(dec_input.long())  # (batch_size, dec_seq_len, d_model)
		
		print(f"encoder embed shape : {x_enc_embed.size()}")
		print(f"decoder embed shape : {x_dec_embed.size()}")
			
				# Masks
		src_key_padding_mask = (enc_input == self.vocab['<pad>']).to(device)  # Shape: (batch_size, enc_seq_len)
		tgt_key_padding_mask = (dec_input == self.vocab['<pad>']).to(device)  # Shape: (batch_size, dec_seq_len)
			
				# Generate tgt_mask based on decoder input length
		tgt_mask = self.transformer.generate_square_subsequent_mask(dec_input.size(1)).to(enc_input.device)
				
				# Forward pass through transformer
		feature = self.transformer(src=x_enc_embed,
								   tgt=x_dec_embed,
								   src_key_padding_mask=src_key_padding_mask,
								   tgt_key_padding_mask=tgt_key_padding_mask,
								   memory_key_padding_mask=src_key_padding_mask,
								   tgt_mask=tgt_mask)
		logits = self.project_vocab_layer(feature)
		return logits

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(word_map, max_len=98).to(device)

for i, (enc_inp, dec_inp, dec_out) in enumerate(train_loader):
    print(f"Batch {i+1}")
    print(f"Encoder input shape: {enc_inp.shape}")
    print(f"Decoder input shape: {dec_inp.shape}")
    print(f"Decoder output shape: {dec_out.shape}")
    
    enc_inp, dec_inp, dec_out = enc_inp.to(device), dec_inp.to(device), dec_out.to(device)
    
    out = model(enc_inp, dec_inp)
    print(f"Model output shape: {out.shape}")
    
    # Ensure shapes are consistent
    assert enc_inp.shape[1] == dec_inp.shape[1], "Mismatch in encoder and decoder sequence lengths!"


Batch 1
Encoder input shape: torch.Size([16, 100])
Decoder input shape: torch.Size([16, 100])
Decoder output shape: torch.Size([16, 100])
encoder embed shape : torch.Size([16, 100, 512])
decoder embed shape : torch.Size([16, 100, 512])


  enc_inp = pad_sequence([torch.tensor(seq).long() for seq in enc_inp], batch_first=True, padding_value=word_map['<pad>'])
  dec_inp = pad_sequence([torch.tensor(seq).long() for seq in dec_inp], batch_first=True, padding_value=word_map['<pad>'])
  dec_out = pad_sequence([torch.tensor(seq).long() for seq in dec_out], batch_first=True, padding_value=word_map['<pad>'])


AttributeError: 'Transformer' object has no attribute 'vocab'