In [1]:
import json
import pandas as pd
import numpy as np
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

from pickle import load
from numpy import array
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.utils.data
import math

In [2]:
def load_doc(filename):
    # open the file as read only
    with open(filename, mode='rt', encoding='utf-8') as file:
        # read all text
        text = file.read()
    return text
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars from each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [3]:
filename = "D:\data\deu.txt"
doc = load_doc(filename)
pairs = to_pairs(doc)
clean_pairs = clean_pairs(pairs)
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[hello] => [hallo]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im ok] => [mir gehts gut]
[im ok] => [es geht mir gut]
[no way] => [unmoglich]
[no way] => [das gibts doch nicht]
[no wa

In [4]:
len(clean_pairs)

152820

In [5]:
for p in clean_pairs:
	if len(p) !=2:
		print(len(p))

In [6]:
from collections import Counter

word_freq = Counter()
for pair in clean_pairs:
	word_freq.update(pair[0].split())
	word_freq.update(pair[1].split())
word_freq

Counter({'tom': 60434,
         'ich': 40537,
         'the': 37212,
         'to': 34851,
         'you': 34260,
         'i': 32855,
         'a': 24014,
         'in': 21454,
         'ist': 21346,
         'nicht': 21081,
         'sie': 19804,
         'is': 18781,
         'du': 17570,
         'das': 17320,
         'was': 16128,
         'zu': 15622,
         'die': 14282,
         'es': 13890,
         'er': 13412,
         'he': 12141,
         'of': 11448,
         'der': 11165,
         'it': 10381,
         'that': 10358,
         'do': 9481,
         'have': 9247,
         'this': 9100,
         'hat': 9077,
         'me': 8858,
         'ein': 8730,
         'dass': 8274,
         'for': 7841,
         'im': 7812,
         'my': 7684,
         'wir': 7604,
         'habe': 7184,
         'an': 7087,
         'mary': 7063,
         'mir': 6942,
         'dont': 6851,
         'auf': 6567,
         'sich': 6529,
         'your': 6405,
         'mit': 6402,
         'are': 

In [7]:
min_word_freq = 3
words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
word_map = {k: v+1 for v, k in enumerate(words)}

#adding special tokens
word_map['<unk>'] = len(word_map) + 1
word_map['<start>'] = len(word_map) + 1
word_map['<end>'] = len(word_map) + 1
word_map['<pad>'] = 0

In [8]:
len(word_map)

16617

In [9]:
word_map['<unk>'], word_map['<start>'], word_map['<end>'], word_map['<pad>']

(16614, 16615, 16616, 0)

In [10]:
with open('word_map_corpus.json', 'w') as j:
	json.dump(word_map, j)

In [11]:
def encode_enc_inp(words, word_map):
    # Get the encoded words
    enc_c = [word_map.get(word, word_map['<unk>']) for word in words]
    # Truncate to max_len
    enc_c = enc_c[:max_len]
    # Pad to max_len
    enc_c += [word_map['<pad>']] * (max_len - len(enc_c))
    return enc_c

def encode_dec_inp(words, word_map):
    # Start with the start token
    enc_c = [word_map['<start>']]
    
    # Encode the words, limiting the length to max_len - 2 to account for <start> and <end>
    enc_c += [word_map.get(word, word_map['<unk>']) for word in words][:max_len - 2]
    
    # Add the end token if there is space
    enc_c.append(word_map['<end>'])
    
    # Pad to max_len + 1 (to include the end token)
    enc_c += [word_map['<pad>']] * (max_len - len(enc_c))
    
    return enc_c

In [12]:
max_len = 20
#initialize an empty list to store encoded translate pairs
# Check the lengths of the encoded pairs
pairs_encoded = []
for pair in clean_pairs:
    english = encode_enc_inp(pair[0], word_map)
    german = encode_dec_inp(pair[1], word_map)
    pairs_encoded.append([english, german])

In [13]:
clean_pairs[1]

array(['hi', 'gru gott'], dtype='<U370')

In [14]:
# Print a sample of the encoded pairs to verify
print("Sample Encoded Pair:")
print(f"English: {pairs_encoded[0][0]}")
print(f"German: {pairs_encoded[0][1]}")


Sample Encoded Pair:
English: [16614, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
German: [16615, 16614, 207, 16614, 16614, 5266, 16616, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [15]:
#saving number coded wordmap
f_name = 'pairs_encoded.json'
with open(f_name, 'w') as p:
	json.dump(pairs_encoded, p)

In [16]:
def tensor_to_sentence(t, clean=False):
	q = t.detach().numpy()
	q_words = " ".join([rev_word_map[v] for v in q])
	if clean:
		q_words = q_words.replace("<pad>", "")
	return q_words

In [17]:
rev_word_map = {v: k for k, v in word_map.items()}  # Reverse mapping for decoding
max_len = 20  # Max length for sequences, including <start> and <end>
batch_size = 32  # Example batch size

# Dataset class
class TranslationDataset(Dataset):
    def __init__(self, word_map, max_len):
        self.word_map = word_map
        self.max_len = max_len
        self.pairs = json.load(open('pairs_encoded.json'))  # Load your encoded pairs
        self.dataset_size = len(self.pairs)
    
    def __getitem__(self, index):
        enc_inp = torch.LongTensor(self.pairs[index][0])  # Assuming pairs contain encoded inputs
        dec = torch.LongTensor(self.pairs[index][1])
        
        dec_inp = dec[:-1]  # Input for decoder
        dec_out = dec[1:]   # Output for decoder
        
        # Ensure all sequences are padded or truncated to max_len
        enc_inp = enc_inp[:self.max_len]
        dec_inp = dec_inp[:self.max_len + 1]  # Adjusted for the decoder input
        dec_out = dec_out[:self.max_len]  # This should still be max_len

        return enc_inp, dec_inp, dec_out
    
    def __len__(self):
        return self.dataset_size

# Token Embedding class
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_size, pad_id):
        super(TokenEmbedding, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.init_weights()
        
    def init_weights(self):
        """Initialize embedding weights uniformly within [-0.1, 0.1]."""
        initrange = 0.1
        self.token_embedding.weight.data.uniform_(-initrange, initrange)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.token_embedding(x)

# Positional Encoding class
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        self.d_model = d_model
        self.pe = torch.zeros(max_len, d_model)
        
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        
    def forward(self, x):
        seq_len = x.size(1)  # Get current sequence length
        return self.pe[:, :seq_len].to(x.device)

# Embeddings class
# In the Embeddings class
class Embeddings(nn.Module):
    def __init__(self, word_map, embed_size, max_len):
        super(Embeddings, self).__init__()
        self.token_embedding = nn.Embedding(num_embeddings=len(word_map), embedding_dim=embed_size, padding_idx=word_map['<pad>'])
        self.pos_embedding = PositionalEncoding(embed_size, max_len + 2)  # Including <start> and <end>
        self.embed_size = embed_size
    
    def forward(self, x):
        token_embed = self.token_embedding(x) * math.sqrt(self.embed_size)  # Scale embeddings
        pos_embed = self.pos_embedding(x)[:, :x.size(1), :]
        return token_embed + pos_embed

class Transformer(nn.Module):
    def __init__(self, word_map, d_model=512, n_heads=8, num_encoder_layers=6, num_decoder_layers=6,
                 dim_feedforward=2048, dropout=0.1, max_len=20):
        super(Transformer, self).__init__()
        self.input_embedding = Embeddings(word_map=word_map, embed_size=d_model, max_len=max_len)
        self.transformer = nn.Transformer(d_model=d_model,
                                           nhead=n_heads,
                                           num_encoder_layers=num_encoder_layers,
                                           num_decoder_layers=num_decoder_layers,
                                           dim_feedforward=dim_feedforward,
                                           dropout=dropout,
                                           batch_first=True)
        self.project_vocab_layer = nn.Linear(d_model, len(word_map))  # Ensure vocab_size is correct

    def forward(self, enc_input, dec_input):
        enc_input = enc_input[:, :max_len]  # Truncate encoder input if necessary
        dec_input = dec_input[:, :max_len + 1]  # Truncate decoder input if necessary

        x_enc_embed = self.input_embedding(enc_input.long())  # (batch_size, enc_seq_len, d_model)

        x_dec_embed = self.input_embedding(dec_input.long())  # (batch_size, dec_seq_len, d_model)

        # Masks
        src_key_padding_mask = (enc_input == word_map['<pad>']).to(enc_input.device)
        tgt_key_padding_mask = (dec_input == word_map['<pad>']).to(enc_input.device)

        tgt_mask = self.transformer.generate_square_subsequent_mask(dec_input.size(1)).to(enc_input.device)

        # Forward pass through transformer
        feature = self.transformer(src=x_enc_embed,
                                   tgt=x_dec_embed,
                                   src_key_padding_mask=src_key_padding_mask,
                                   tgt_key_padding_mask=tgt_key_padding_mask,
                                   memory_key_padding_mask=src_key_padding_mask,
                                   tgt_mask=tgt_mask)

        logits = self.project_vocab_layer(feature)  # Shape should be (batch_size, max_len, vocab_size)
        return logits

In [18]:
train_data = TranslationDataset(word_map, max_len)
train_data[:2]

(tensor([[16614,    17,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [16615, 16614,   207, 16614, 16614,  5266, 16616,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[16614,    17,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[16615, 16614, 16614, 16614, 16614, 16614,  5266, 16614, 16614, 16616,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]))

In [19]:
def encode_sentence(sentence, word_map, max_len):
    # Convert words to their corresponding indices
    encoded = [word_map.get(word, word_map['<unk>']) for word in sentence.split()]
    
    # Add <start> and <end> tokens
    encoded = [word_map['<start>']] + encoded + [word_map['<end>']]
    
    # Pad or truncate to max_len
    if len(encoded) < max_len:
        encoded += [word_map['<pad>']] * (max_len - len(encoded))
    else:
        encoded = encoded[:max_len]
    
    return encoded

# Example usage
sentence = "hello world"
encoded_sentence = encode_sentence(sentence, word_map, max_len=20)
print(encoded_sentence)

[16615, 16, 5175, 16616, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [20]:
from torch import optim

# Define model parameters
embed_size = 64  # Size of the embedding vectors
hidden_size = 256  # Size of the hidden layers
num_layers = 1  # Number of layers in the Transformer
num_heads = 8  # Number of attention heads
dropout = 0.1  # Dropout rate

# Initialize the model
model = Transformer(word_map=word_map, n_heads=num_heads, dropout=dropout)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=word_map['<pad>'])

# Load the dataset
dataset = TranslationDataset(word_map, max_len)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 3  # Number of epochs to train
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    for enc_inp, dec_inp, dec_out in data_loader:
        optimizer.zero_grad()  # Clear previous gradients
        
        # Forward pass
        output = model(enc_inp, dec_inp)  # Output shape should be (batch_size, max_len, vocab_size)

        # Calculate loss
        # Reshape output and dec_out for loss calculation
        output_reshaped = output.view(-1, len(word_map))  # Shape: (batch_size * max_len, vocab_size)
        dec_out_reshaped = dec_out.view(-1)  # Shape: (batch_size * max_len)

        # Calculate the loss
        loss = criterion(output_reshaped, dec_out_reshaped)
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(data_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}')


KeyboardInterrupt



In [None]:
#