In [81]:
import pandas as pd
import pickle
from keras.preprocessing.text import Tokenizer

# Load the dataset
data = pd.read_csv('deu.txt', delimiter='\t', header=None, names=['english', 'german'])

# Clean the data
data['english'] = data['english'].str.strip().str.lower()
data['german'] = data['german'].str.strip().str.lower()

# Add special tokens to the German sentences
data['german'] = '<start> ' + data['german'] + ' <end>'

# Check for NaN values and drop them
data.dropna(inplace=True)

# Split into train and test datasets (80% train, 20% test)
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

# Check the shape of the data
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Create tokenizers
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(train_data['english'])

target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(train_data['german'])

# Save the tokenizers as pickle files
with open('source_tokenizer.pkl', 'wb') as f:
    pickle.dump(source_tokenizer, f)

with open('target_tokenizer.pkl', 'wb') as f:
    pickle.dump(target_tokenizer, f)

print("Tokenizers saved.")


Train data shape: (122256, 2)
Test data shape: (30564, 2)
Tokenizers saved.


In [82]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
import pickle

# Load the dataset
data = pd.read_csv('deu.txt', delimiter='\t', header=None, names=['english', 'german'])

# Clean the data
data['english'] = data['english'].str.strip().str.lower()
data['german'] = data['german'].str.strip().str.lower()

# Add special tokens to the German sentences
data['german'] = '<start> ' + data['german'] + ' <end>'

# Check for NaN values
print("Null values in data:")
print(data.isnull().sum())

# Drop rows with NaN values
data = data.dropna()

# Split into train and test datasets (80% train, 20% test)
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

# Create and fit tokenizers
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(train_data['english'])

target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(train_data['german'])

# Save the tokenizers as pickle files
with open('source_tokenizer.pkl', 'wb') as f:
    pickle.dump(source_tokenizer, f)

with open('target_tokenizer.pkl', 'wb') as f:
    pickle.dump(target_tokenizer, f)


Null values in data:
english    0
german     1
dtype: int64


In [83]:
# Display the first few rows of the modified dataset
print(data[['english', 'german']].head(10))  # Display first 10 rows


   english                       german
1      hi.         <start> hallo! <end>
2      hi.     <start> grüß gott! <end>
3     run!          <start> lauf! <end>
4     wow!    <start> potzdonner! <end>
5     wow!  <start> donnerwetter! <end>
6    fire!         <start> feuer! <end>
7    help!         <start> hilfe! <end>
8    help!       <start> zu hülf! <end>
9    stop!         <start> stopp! <end>
10   wait!         <start> warte! <end>


In [84]:
print(data.head(10))

   english                       german
1      hi.         <start> hallo! <end>
2      hi.     <start> grüß gott! <end>
3     run!          <start> lauf! <end>
4     wow!    <start> potzdonner! <end>
5     wow!  <start> donnerwetter! <end>
6    fire!         <start> feuer! <end>
7    help!         <start> hilfe! <end>
8    help!       <start> zu hülf! <end>
9    stop!         <start> stopp! <end>
10   wait!         <start> warte! <end>


In [87]:
import numpy as np
from keras.utils import pad_sequences
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate

# Convert texts to sequences
train_source_seq = source_tokenizer.texts_to_sequences(train_data['english'])
train_target_seq = target_tokenizer.texts_to_sequences(train_data['german'])

max_source_len = max(len(seq) for seq in train_source_seq)
max_target_len = max(len(seq) for seq in train_target_seq)

train_source_seq = pad_sequences(train_source_seq, maxlen=max_source_len, padding='post')
train_target_seq = pad_sequences(train_target_seq, maxlen=max_target_len, padding='post')

# Prepare decoder input data (shifted)
decoder_input_data = train_target_seq[:, :-1]  # Exclude the last token
decoder_output_data = train_target_seq[:, 1:]  # Exclude the first token
decoder_output_data = np.expand_dims(decoder_output_data, -1)  # dims - dimensions 

# Build the NMT model
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Encoder definition
encoder_inputs = Input(shape=(max_source_len,))
encoder_embedding = Embedding(source_vocab_size, 256, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder definition
decoder_inputs = Input(shape=(max_target_len - 1,))
decoder_embedding = Embedding(target_vocab_size, 256, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Applying attention
attention = Attention()([decoder_outputs, encoder_outputs])
decoder_combined_context = Concatenate(axis=-1)([decoder_outputs, attention])

decoder_dense = Dense(target_vocab_size, activation='softmax')
output = decoder_dense(decoder_combined_context)

# Compile the model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training the model
model.fit([train_source_seq, decoder_input_data], decoder_output_data,
          batch_size=16,
          epochs=20,  # Adjust as needed
          validation_split=0.2)

# Save the model
model.save('nmt_model_with_attention.h5')
print("Model training complete and saved as 'nmt_model_with_attention.h5'")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model training complete and saved as 'nmt_model_with_attention.h5'


In [88]:
print(f"Input sequence shape: {train_source_seq.shape}")
print(f"Target sequence shape: {train_target_seq.shape}")


Input sequence shape: (122256, 48)
Target sequence shape: (122256, 55)


In [89]:
encoder_lstm = LSTM(256,return_sequences=True, return_state=True)

In [90]:
print(target_tokenizer.word_index)


{'start': 1, 'end': 2, 'ich': 3, 'tom': 4, 'ist': 5, 'nicht': 6, 'sie': 7, 'du': 8, 'das': 9, 'zu': 10, 'die': 11, 'es': 12, 'er': 13, 'der': 14, 'in': 15, 'hat': 16, 'ein': 17, 'dass': 18, 'wir': 19, 'habe': 20, 'was': 21, 'mir': 22, 'auf': 23, 'sich': 24, 'mit': 25, 'den': 26, 'eine': 27, 'mich': 28, 'wie': 29, 'ihr': 30, 'war': 31, 'und': 32, 'an': 33, 'haben': 34, 'kann': 35, 'einen': 36, 'maria': 37, 'von': 38, 'sind': 39, 'dem': 40, 'für': 41, 'so': 42, 'als': 43, 'bin': 44, 'sein': 45, 'dich': 46, 'hast': 47, 'noch': 48, 'dir': 49, 'um': 50, 'im': 51, 'aus': 52, 'uns': 53, 'nach': 54, 'sehr': 55, 'etwas': 56, 'wird': 57, 'wenn': 58, 'hier': 59, 'meine': 60, 'mein': 61, 'ihn': 62, 'mehr': 63, 'bitte': 64, 'schon': 65, 'warum': 66, 'vor': 67, 'weiß': 68, 'bist': 69, 'keine': 70, 'werden': 71, 'gehen': 72, 'tun': 73, 'seine': 74, 'wurde': 75, 'nichts': 76, 'diese': 77, 'hatte': 78, 'immer': 79, 'gut': 80, 'viel': 81, 'man': 82, 'ihm': 83, 'nur': 84, 'muss': 85, 'will': 86, 'dieses'

In [None]:
import numpy as np
import pickle
from keras.models import load_model
from keras.utils import pad_sequences


# Load the tokenizers
with open('source_tokenizer.pkl', 'rb') as f:
    source_tokenizer = pickle.load(f)

with open('target_tokenizer.pkl', 'rb') as f:
    target_tokenizer = pickle.load(f)

# Load the trained model
model = load_model('nmt_model.h5')
max_source_len = 48  
max_target_len = 54  

# Create the encoder model to get encoder outputs and states
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Function to apply softmax with temperature
def softmax_with_temperature(logits, temperature=1.0):
    exp_logits = np.exp(logits / temperature)
    return exp_logits / np.sum(exp_logits)

# Function to sample token from predictions
def sample(preds, top_k=3):
    # Get the top k indices
    indices = np.argsort(preds)[-top_k:]  
    probabilities = preds[indices] / np.sum(preds[indices])  # Normalize top-k probabilities
    return np.random.choice(indices, p=probabilities)  # Sample from top-k

def translate_sentence(input_text):
    input_seq = source_tokenizer.texts_to_sequences([input_text])
    if not input_seq or not input_seq[0]:
        return "Input sentence cannot be translated. Please check the input."
        
    input_seq = pad_sequences(input_seq, maxlen=max_source_len, padding='post')
    encoder_outputs, state_h, state_c = encoder_model.predict(input_seq)
    encoder_states = [state_h, state_c]

    start_token = "start"
    target_seq = np.array([[target_tokenizer.word_index[start_token]]])
    translated = ''
    max_repeats = 1  # Maximum times a word can be repeated
    word_count = {}

    while True:
        target_seq = pad_sequences(target_seq, maxlen=max_target_len, padding='post')
        output_tokens = model.predict([input_seq, target_seq])
        
        # Sample from the output with temperature
        output_probs = softmax_with_temperature(output_tokens[0, -1, :], temperature=0.8)
        sampled_token_index = np.random.choice(range(len(output_probs)), p=output_probs)
        sampled_word = target_tokenizer.index_word.get(sampled_token_index, '')

        # Prevent repeating too many times
        if sampled_word in word_count:
            if word_count[sampled_word] >= max_repeats:
                continue  # Skip adding this word if it exceeds max repeats
            word_count[sampled_word] += 1
        else:
            word_count[sampled_word] = 1

        if sampled_word == '<end>' or len(translated.split()) >= (max_target_len - 1):
            break

        translated += ' ' + sampled_word
        target_seq = np.array([[sampled_token_index]])

    return translated.strip()


# Sample usage
if __name__ == "__main__":
    input_sentence = input("Enter an English sentence to translate: ")
    translation = translate_sentence(input_sentence)
    print("Translation:", translation)

In [70]:
import numpy as np
import pandas as pd
import pickle
from keras.models import load_model
from keras.utils import pad_sequences
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


# Load the tokenizers
with open('source_tokenizer.pkl', 'rb') as f:
    source_tokenizer = pickle.load(f)

with open('target_tokenizer.pkl', 'rb') as f:
    target_tokenizer = pickle.load(f)

# Load the trained model
model = load_model('nmt_model.h5')

# Set max lengths for padding (Adjustable)
max_source_len = 50
max_target_len = 50

# Define some test pairs (input sentences and expected output)
# Replace these sentences with your actual test cases and their expected translations
test_sentences = [
    ("Hello", "<start> hallo <end>"),
    ("How are you?", "<start> wie geht es dir? <end>"),
    ("What is your name?", "<start> wie heißt du? <end>"),
    # Add more test sentences as needed
]

# Prepare to collect predictions
predictions = []
true_labels = []

for input_text, expected_translation in test_sentences:
    translated_sentence = translate_sentence(input_text)
    predictions.append(translated_sentence)
    true_labels.append(expected_translation)

# Calculate metrics
def calculate_metrics(true, pred):
    # Flatten
    true_flat = [word for sentence in true for word in sentence.split()]
    pred_flat = [word for sentence in pred for word in sentence.split()]

    # Generate labels from tokenizer's index
    labels = target_tokenizer.word_index.keys()

    # Create confusion matrix
    cm = confusion_matrix(true_flat, pred_flat, labels=labels)
    print("Confusion Matrix:\n", cm)

    # Classification Report
    report = classification_report(true_flat, pred_flat, target_names=labels)
    print("\nClassification Report:\n", report)

    # Accuracy Score
    accuracy = accuracy_score(true_flat, pred_flat)
    print(f"\nAccuracy: {accuracy:.4f}")

    return cm

# Evaluate
confusion_mat = calculate_metrics(true_labels, predictions)

# Plot confusion matrix
def plot_confusion_matrix(cm, labels):
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.show()

# Plotting the confusion matrix
plot_confusion_matrix(confusion_mat, target_tokenizer.word_index.keys())


Initial input sequence: [[1837]]
Padded input sequence: [[1837    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]]


ValueError: in user code:

    File "c:\Users\NUHAYD\anaconda3\envs\tf_env\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\NUHAYD\anaconda3\envs\tf_env\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\NUHAYD\anaconda3\envs\tf_env\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\NUHAYD\anaconda3\envs\tf_env\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "c:\Users\NUHAYD\anaconda3\envs\tf_env\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\NUHAYD\anaconda3\envs\tf_env\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 1 of layer "model_3" is incompatible with the layer: expected shape=(None, 54), found shape=(None, 50)
