**Initialization**
- I put these three lines at the top of each of my notebooks. The first two make the notebook automatically reload imported modules, which prevents problems when re-running the same project, and the third renders visualizations inline within the notebook.

In [1]:
#@ INITIALIZATION: 
# (Re)load the autoreload extension so this cell can be re-run safely.
%reload_ext autoreload
# Mode 2: re-import edited modules automatically before each cell executes.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

**Machine Translation**
- Machine translation is the process of reproducing human translation with machine transduction: a model reads a sequence in one language and outputs the corresponding sequence in another. The transduction process of the original Transformer architecture uses the encoder stack, the decoder stack, and all of the model's parameters to represent a reference sequence.

**Processing Dataset**

In [2]:
%%writefile read.py
#@ PROCESSING THE DATASET:
import pickle
from pickle import dump

import re 
import string
import unicodedata

#@ LOADING INTO MEMORY:
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as one string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous manual open/read/close sequence did not guarantee.
    with open(filename, mode="rt", encoding="utf-8") as file:
        return file.read()

#@ SPLITTING INTO SENTENCES:
def to_sentence(doc):
    """Break a document into its lines.

    Leading and trailing whitespace of the whole document is removed first,
    then the remaining text is split on newline characters. Blank lines in
    the middle of the document are kept as empty strings.

    Args:
        doc: The full document text.

    Returns:
        A list with one string per line.
    """
    trimmed = doc.strip()
    return trimmed.split("\n")

#@ SHORTEST AND LONGEST SENTENCES:
def sentence_lengths(sentences):
    """Report the shortest and longest sentence, measured in words.

    Words are whitespace-separated tokens, so an empty sentence counts as
    zero words.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A ``(minimum, maximum)`` tuple of word counts.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    word_counts = list(map(lambda sentence: len(sentence.split()), sentences))
    shortest = min(word_counts)
    longest = max(word_counts)
    return shortest, longest

#@ CLEANING SENTENCES:
def clean_lines(lines):
    """Normalize raw sentences into lower-case, letters-only text.

    Each line is decomposed with Unicode NFD and reduced to ASCII (characters
    with no ASCII form are dropped), split on whitespace, lower-cased,
    stripped of ASCII punctuation and non-printable characters, and finally
    filtered so that only purely alphabetic tokens survive. That last filter
    removes numbers, but also any token that still mixes letters with digits
    or that became empty after punctuation removal.

    Args:
        lines: Iterable of raw sentence strings.

    Returns:
        A list of cleaned sentences, one per input line, with the surviving
        tokens joined by single spaces.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(raw):
        # Accents become separate combining marks under NFD, which the ASCII
        # encode then discards, leaving the base letter.
        ascii_bytes = unicodedata.normalize("NFD", raw).encode("ascii", "ignore")
        kept = []
        for token in ascii_bytes.decode("UTF-8").split():
            token = token.lower().translate(strip_punctuation)
            token = non_printable.sub('', token)
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return [_clean(raw) for raw in lines]

#@ LOADING, CLEANING AND SAVING BOTH LANGUAGES:
def _process_corpus(label, source_path, output_name):
    """Load one side of the parallel corpus, report its size, clean and pickle it.

    Args:
        label: Language name used in the printed summary line.
        source_path: Path of the raw text file, one sentence per line.
        output_name: Name of the pickle file that receives the cleaned lines.
    """
    sentences = to_sentence(load_doc(source_path))                 # Loading and splitting.
    minlen, maxlen = sentence_lengths(sentences)                   # Shortest and longest.
    print("%s data: sentences=%d, min=%d, max=%d" % (
        label, len(sentences), minlen, maxlen))
    cleaned = clean_lines(sentences)                               # Cleaning sentences.
    # The context manager closes the pickle file even if dump() raises.
    with open(output_name, "wb") as outfile:
        pickle.dump(cleaned, outfile)                              # Storing.
    print(output_name, " saved")                                   # Inspection.


path_file_en = "/content/drive/MyDrive/Data/europarl-v7.fr-en.en"  # English data.
path_file_fr = "/content/drive/MyDrive/Data/europarl-v7.fr-en.fr"  # French data.

# Running each language inside the helper lets its raw text and sentence list
# be released before the next language is loaded.
_process_corpus("English", path_file_en, "English.pkl")
_process_corpus("French", path_file_fr, "French.pkl")

Writing read.py


In [4]:
#@ PROCESSING THE DATA: 
# Runs the read.py script written by the %%writefile cell above; it prints the
# corpus statistics and writes English.pkl and French.pkl.
!python read.py

English data: sentences=2007723, min=0, max=668
English.pkl  saved
French data: sentences=2007723, min=0, max=693
French.pkl  saved


**Preprocessing Dataset**

In [5]:
%%writefile read_clean.py
#@ PREPROCESSING THE DATASET:

import pickle
from pickle import load
from pickle import dump
from collections import Counter

#@ LOADING CLEAN DATASET:
def load_clean_sentences(filename):
    """Load a pickled list of cleaned sentences.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # pipeline produced itself.
    # The context manager closes the handle instead of leaving it to the
    # garbage collector.
    with open(filename, "rb") as infile:
        return load(infile)

#@ SAVING CLEAN DATASET:
def save_clean_sentences(sentences, filename):
    """Pickle a list of sentences to disk and report the file name.

    Args:
        sentences: The object to store (here, a list of sentence strings).
        filename: Path of the pickle file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Closing the file explicitly guarantees the data is flushed to disk
    # before the confirmation message is printed.
    with open(filename, "wb") as outfile:
        dump(sentences, outfile)
    print("Saved: %s" % filename)                   # Inspection.

#@ CREATING SEQUENCE TABLE: VOCABULARY:
def to_vocab(lines):
    """Count how often every whitespace-separated token occurs.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences across
        all lines.
    """
    return Counter(token for line in lines for token in line.split())

#@ PREPROCESSING VOCABULARY:
def trim_vocab(vocab, min_occurance):
    """Keep only the tokens that occur often enough.

    Args:
        vocab: Mapping of token to occurrence count.
        min_occurance: Smallest count (inclusive) a token needs to be kept.

    Returns:
        A set of the tokens whose count is at least ``min_occurance``.
    """
    return {word for word, count in vocab.items() if count >= min_occurance}

#@ PROCESSING OOV WORDS:
def update_dataset(lines, vocab):
    """Replace every out-of-vocabulary token with the marker ``unk``.

    Args:
        lines: Iterable of sentence strings.
        vocab: Container of known tokens; membership is tested with ``in``.

    Returns:
        A list of sentences in which each token not found in ``vocab`` has
        been replaced by ``"unk"``; tokens are re-joined with single spaces.
    """
    return [
        ' '.join(token if token in vocab else "unk" for token in line.split())
        for line in lines
    ]

#@ TRIMMING VOCABULARY AND MARKING OOV WORDS FOR BOTH LANGUAGES:
def _preprocess_language(label, source_name, output_name, min_occurance=5, preview=10):
    """Trim one language's vocabulary, mark rare words and save the result.

    Args:
        label: Language name used in the printed messages.
        source_name: Pickle file holding the cleaned sentences.
        output_name: Pickle file that receives the updated sentences.
        min_occurance: Minimum count a token needs to stay in the vocabulary.
        preview: Maximum number of updated lines to print for inspection.
    """
    lines = load_clean_sentences(source_name)                       # Loading.
    vocab = to_vocab(lines)                                         # Initializing vocabulary.
    print("%s vocabulary: %d" % (label, len(vocab)))                # Inspection.
    vocab = trim_vocab(vocab, min_occurance)                        # Reducing vocabulary.
    print("New %s vocabulary: %d" % (label, len(vocab)))            # Inspection.
    lines = update_dataset(lines, vocab)                            # Processing OOV.
    save_clean_sentences(lines, output_name)
    # Slicing instead of indexing range(10) avoids an IndexError when the
    # dataset holds fewer lines than the preview size.
    for i, line in enumerate(lines[:preview]):
        print("line", i, ":", line)                                 # Inspection.


_preprocess_language("English", "English.pkl", "english_vocab.pkl")
_preprocess_language("French", "French.pkl", "french_vocab.pkl")

Writing read_clean.py


In [6]:
#@ PREPROCESSING THE DATASET: 
# Runs the read_clean.py script written by the %%writefile cell above; it
# prints the vocabulary sizes and writes english_vocab.pkl and french_vocab.pkl.
!python read_clean.py

English vocabulary: 105357
New English vocabulary: 41746
Saved: english_vocab.pkl
line 0 : resumption of the session
line 1 : i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
line 2 : although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
line 3 : you have requested a debate on this subject in the course of the next few days during this partsession
line 4 : in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
line 5 : please rise then for this minute s silence
line 6 : the house rose and observed a minute s silence
line 7 : madam president o

**Bilingual Evaluation Understudy Score (BLEU)**

In [7]:
%%writefile BLEU.py
#@ BILINGUAL EVALUATION UNDERSTUDY:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#@ EXAMPLE 1: TWO REFERENCES, CANDIDATE IDENTICAL TO THE FIRST ONE.
# The second reference previously read ['cat', 'likes' 'milk']; without the
# comma Python concatenates the adjacent literals into the token 'likesmilk'.
reference = [['the', 'cat', 'likes', 'milk'], ['cat', 'likes', 'milk']]
candidate = ['the', 'cat', 'likes', 'milk']
score = sentence_bleu(reference, candidate)
print("Example 1", score)

#@ EXAMPLE 2: ONE REFERENCE, IDENTICAL CANDIDATE.
reference = [['the', 'cat', 'likes', 'milk']]
candidate = ['the', 'cat', 'likes', 'milk']
score = sentence_bleu(reference, candidate)
print("Example 2", score)

#@ EXAMPLE 3: ONE REFERENCE, CANDIDATE DIFFERING IN ONE WORD.
reference = [['the', 'cat', 'likes', 'milk']]
candidate = ['the', 'cat', 'enjoys', 'milk']
score = sentence_bleu(reference, candidate)
print("Example 3", score)

#@ WITHOUT SMOOTHING:
reference = [['je','vous','invite', 'a', 'vous', 'lever','pour', 'cette', 'minute', 'de', 'silence']]
candidate = ['levez','vous','svp','pour', 'cette', 'minute', 'de', 'silence']
score = sentence_bleu(reference, candidate)
print("Without smoothing score", score)

#@ CHENCHERRY SMOOTHING: 
chencherry = SmoothingFunction()
# str.split() yields word tokens. list(<str>) would yield single characters,
# making the smoothed score character-level and not comparable with the
# word-level unsmoothed score printed above.
r1 = "je vous invite a vous lever pour cette minute de silence".split()
candidate = 'levez vous svp pour cette minute de silence'.split()
print("With smoothing score", sentence_bleu([r1], candidate, 
                                            smoothing_function=chencherry.method1))

Writing BLEU.py


In [9]:
#@ BILINGUAL EVALUATION UNDERSTUDY SCORE: UNCOMMENT BELOW:
# !python BLEU.py

**Translations with Trax**

In [11]:
#@ IMPORTING MODULES: UNCOMMENT BELOW:
# !pip install trax
import os
import numpy as np
import trax

#@ IGNORING WARNINGS: 
import warnings
warnings.filterwarnings("ignore")

**Initializing Transformer Model**

In [20]:
#@ CREATING TRANSFORMER MODEL:
# Architecture settings gathered in one place; presumably they have to match
# the checkpoint loaded below -- confirm before changing any value.
transformer_config = dict(
    input_vocab_size=33300,
    d_model=512,
    d_ff=2048,
    n_heads=8,
    n_encoder_layers=6,
    n_decoder_layers=6,
    max_len=2048,
    mode="predict",
)
pretrained_checkpoint = "gs://trax-ml/models/translation/ende_wmt32k.pkl.gz"

model = trax.models.Transformer(**transformer_config)              # Initializing transformer model.
# The trailing semicolon keeps the notebook from echoing the call's return value.
model.init_from_file(pretrained_checkpoint, weights_only=True);    # Initializing pretrained weights.

**Tokenization**

In [21]:
#@ TOKENIZING THE SENTENCE:
# Tokenizer and detokenizer must use the same vocabulary, so the arguments are
# defined once and shared by both calls.
vocab_args = dict(vocab_dir='gs://trax-ml/vocabs/',
                  vocab_file='ende_32k.subword')
sentence = "I am only a machine but I have machine intelligence."           # Initialization.
token_stream = trax.data.tokenize(iter([sentence]), **vocab_args)
tokenized = list(token_stream)[0]                                           # Tokenization.

#@ DECODING FROM TRANSFORMER:
tokenized = tokenized[None, :]                                              # Adding batch dimensions.
# temperature=0.0 presumably selects greedy decoding -- confirm in the Trax docs.
tokenized_translation = trax.supervised.decoding.autoregressive_sample(
    model, tokenized, temperature=0.0)                                      # Initializing decoding.

#@ DE-TOKENIZING AND TRANSLATION:
# [0] selects the single batch entry; [:-1] drops the last generated token
# (presumably the end-of-sequence marker -- confirm).
tokenized_translation = tokenized_translation[0][:-1]
translation = trax.data.detokenize(tokenized_translation, **vocab_args)     # Initializing translation.
print("The sentence:", sentence)                                            # Inspection.
print("The translation:", translation)                                      # Inspection.

The sentence: I am only a machine but I have machine intelligence.
The translation: Ich bin nur eine Maschine, aber ich habe Maschinenübersicht.
