# <u>Chapter 6</u>: Teaching Machines to Translate

In [None]:
import sys
import subprocess
import pkg_resources

# Names of every distribution found in the current working set.
# NOTE(review): pkg_resources is deprecated upstream in favour of
# importlib.metadata — consider migrating.
installed_packages = set(dist.key for dist in pkg_resources.working_set)

# Packages this notebook needs, and the subset that is not installed yet.
required_packages = {'nltk'}
missing_packages = required_packages.difference(installed_packages)

# Install whatever is missing using the pip module of the running
# interpreter; pip's standard output is discarded.
if missing_packages:
    print('Installing the following packages: ' + str(missing_packages))
    python = sys.executable
    pip_command = [python, '-m', 'pip', 'install']
    pip_command.extend(missing_packages)
    subprocess.check_call(pip_command, stdout=subprocess.DEVNULL)

In [None]:
import nltk

# Make sure the tokenizer and part-of-speech tagger resources used below
# are available; without them the calls fail on a fresh environment.
# Both the older and the newer resource identifiers are requested.
# NOTE(review): identifiers differ between NLTK releases — confirm the
# list against the installed NLTK version.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger',
                 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)

# Tokenize the input text.
text = nltk.word_tokenize("The sky is blue")

# Tag every token with its part of speech; as the last expression of the
# cell, the result is displayed by the notebook.
nltk.pos_tag(text)

In [None]:
# Create the grammar that consists of six rules.
# S: sentence, NP: noun phrase, DT: determiner, NN: noun,
# VBZ: verb in the third person singular, JJ: adjective.
analysis_grammar = nltk.CFG.fromstring("""
    S -> NP VBZ JJ	
    NP -> DT NN	
    DT -> 'The'	
    NN -> 'sky'	
    VBZ -> 'is'	
    JJ -> 'blue'
    """)

# Tokens of the sentence to analyze. The list is deliberately not called
# `input`: that name would shadow Python's built-in input() for the rest
# of the session.
tokens = ['The', 'sky', 'is', 'blue']

# Build a chart parser for the grammar.
parser = nltk.ChartParser(analysis_grammar)

# Print every parse tree found for the sentence.
for tree in parser.parse(tokens):
    print(tree)
    #tree.draw()


In [None]:
# Redefine the grammar: it still has six rules, but the lexical rules for
# DT, NN and JJ now list alternatives, so more sentences are covered.
analysis_grammar = nltk.CFG.fromstring("""
    S -> NP VBZ JJ	
    NP -> DT NN	
    DT -> 'The' | 'the'	
    NN -> 'sky' | 'sea'	
    VBZ -> 'is'	
    JJ -> 'blue' | 'red'
    """)

In [None]:
from nltk.parse.generate import generate

# Print at most ten sentences derived from the grammar, one per line,
# with the words of each sentence separated by single spaces.
for sentence in generate(analysis_grammar, n=10):
    print(*sentence)

In [None]:
# Create the dependency grammar that includes three rules.
dependency_grammar = nltk.DependencyGrammar.fromstring("""
    'is' -> 'sky' | 'sea' | 'blue' | 'red'
    'sky' -> 'The' | 'the' 
    'sea' -> 'The' | 'the' 
    """)

# Create the projective dependency parser for that grammar.
pdp = nltk.ProjectiveDependencyParser(dependency_grammar)

# Tokens of the sentence to analyze (not named `input`, which would
# shadow the built-in input() for the rest of the session).
tokens = ['The', 'sky', 'is', 'blue']

# Parse the tokens.
trees = pdp.parse(tokens)

# Print the parse trees.
for tree in trees:
    print(tree)

In [None]:
# Fetch the NLTK models/corpora requested for this step.
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Tokenize the sentence, then tag each token with its part of speech.
text = nltk.word_tokenize("The Aston Martin is blue")
tags = nltk.pos_tag(text)

# Chunk the tagged tokens to find the named entities.
tree = nltk.ne_chunk(tags)

# Draw the tree.
#tree.draw()

In [None]:
# Convert the chunk tree stored in `tree` into its IOB-tagged form.
iob_tags = nltk.tree2conlltags(tree)

# Print the IOB tags.
print(iob_tags)

In [None]:
# Create the grammar string. Every lexical production maps an English
# word to a feature structure whose TEXT value is a French word.
# NOTE(review): the S rule names its features AGR1, ARG2 and ARG3 — the
# AGR/ARG spelling looks inconsistent; confirm whether it is intended.
g = """

# S expansion productions.
S[AGR1=?np, ARG2=?vbz, ARG3=?jj] -> NP[AGR=?np] VBZ[AGR=?vbz] JJ[AGR=?jj]

# NP expansion productions.
NP[AGR=[DT=?dt, NN=?nn]] -> DT[AGR=?dt] NN[AGR=?nn] 

# Lexical productions.
DT[AGR=[TEXT='Le', SEM='determiner']] -> 'The' 
DT[AGR=[TEXT='le', SEM='determiner']] -> 'the' 
NN[AGR=[TEXT='ciel', SEM='noun']] -> 'sky'
NN[AGR=[TEXT='mer', SEM='noun']] -> 'sea'
VBZ[AGR=[TEXT='être', SEM='verb', TENSE='present', NUM='singular']] -> 'is'
JJ[AGR=[TEXT='bleu', SEM='adjective']] -> 'blue'
JJ[AGR=[TEXT='rouge', SEM='adjective']] -> 'red'
"""

# Tokens of the sentence to analyze (not named `input`, which would
# shadow the built-in input() for the rest of the session).
tokens = ['The', 'sky', 'is', 'blue']

# Build the transfer grammar and a feature-based Earley chart parser.
transfer_grammar = nltk.grammar.FeatureGrammar.fromstring(g)
parser = nltk.parse.FeatureEarleyChartParser(transfer_grammar)

# Parse the tokens and print every resulting tree.
trees = parser.parse(tokens)
for tree in trees:
    print(tree)


In [None]:
# Create the grammar string. The lexical productions consume French
# tokens; the verb rule consumes 'être' and carries TEXT='est'.
# NOTE(review): the S rule names its features AGR1, ARG2 and ARG3 — the
# AGR/ARG spelling looks inconsistent; confirm whether it is intended.
g = """

# S expansion productions.
S[AGR1=?np, ARG2=?vbz, ARG3=?jj] -> NP[AGR=?np] VBZ[AGR=?vbz] JJ[AGR=?jj]

# NP expansion productions.
NP[AGR=[DT=?dt, NN=?nn]] -> DT[AGR=?dt] NN[AGR=?nn] 

# Lexical productions.	
DT[AGR=[TEXT='Le']] -> 'Le' 
DT[AGR=[TEXT='le']] -> 'le' 
NN[AGR=[TEXT='ciel']] -> 'ciel'
NN[AGR=[TEXT='mer']] -> 'mer'
VBZ[AGR=[TEXT='est', SEM='verb', TENSE='present', NUM='singular']] -> 'être'
JJ[AGR=[TEXT='bleu']] -> 'bleu'
JJ[AGR=[TEXT='rouge']] -> 'rouge'
"""

# Tokens to analyze (not named `input`, which would shadow the built-in
# input() for the rest of the session).
tokens = ['Le', 'ciel', 'être', 'bleu']

# Build the generation grammar and a feature-based Earley chart parser.
generation_grammar = nltk.grammar.FeatureGrammar.fromstring(g)
parser = nltk.parse.FeatureEarleyChartParser(generation_grammar)

# Parse the tokens and print every resulting tree.
trees = parser.parse(tokens)
for tree in trees:
    print(tree)

In [None]:
from nltk.translate import AlignedSent, Alignment

# Bilingual text: two examples, each pairing a list of English words with
# a list of French words plus an explicit word alignment between them.
bitext = [
    AlignedSent(
        ['blue', 'is', 'The', 'sky'],
        ['Le', 'ciel', 'est', 'bleu'],
        Alignment.fromstring('0-3 1-2 2-0 3-1'),
    ),
    AlignedSent(
        ['yellow', 'is', 'The', 'sun'],
        ['Le', 'soleil', 'est', 'jaune'],
        Alignment.fromstring('0-3 1-2 2-0 3-1'),
    ),
]

# Show the source words (`words`) of the second example.
bitext[1].words

In [None]:
# Show the target words (`mots`) of the second example; as the last
# expression of the cell, the value is displayed by the notebook.
bitext[1].mots

In [None]:
# Show the word alignment of the second example; as the last expression
# of the cell, the value is displayed by the notebook.
bitext[1].alignment

In [None]:
# Download the NLTK `comtrans` corpus.
nltk.download('comtrans')

from nltk.corpus import comtrans

# Get the first example (index 0) from the English/French alignment file.
fe = comtrans.aligned_sents('alignment-en-fr.txt')[0]

# Show the source words (`words`) of that example.
fe.words

In [None]:
# Show the target words (`mots`) of the example held in `fe`.
fe.mots

In [None]:
# Show the word alignment of the example held in `fe`.
fe.alignment

In [None]:
# Get the example at index 52 (zero-based, i.e. the 53rd sentence pair)
# from the English/French alignment file.
fe = comtrans.aligned_sents('alignment-en-fr.txt')[52]

# Show the source words (`words`) of that example.
fe.words

In [None]:
# Show the target words (`mots`) of the example held in `fe`.
fe.mots

In [None]:
# Show the word alignment of the example held in `fe`.
fe.alignment

In [None]:
import nltk.translate.ibm2
from nltk.translate import AlignedSent, Alignment

# Bilingual text: every AlignedSent pairs a list of French words with a
# list of English words; no alignment is supplied here.
bitext = [
    AlignedSent(['petite', 'est', 'la', 'maison'],
                ['the', 'house', 'is', 'small']),
    AlignedSent(['la', 'maison', 'est', 'grande'],
                ['the', 'house', 'is', 'big']),
    AlignedSent(['le', 'livre', 'est', 'petit'],
                ['the', 'book', 'is', 'small']),
    AlignedSent(['la', 'maison'], ['the', 'house']),
    AlignedSent(['le', 'livre'], ['the', 'book']),
    AlignedSent(['un', 'livre'], ['a', 'book']),
]

# Train an IBM Model 2 on the examples; 5 is passed as the second
# constructor argument (the iteration count in NLTK's API).
ibm2 = nltk.translate.ibm2.IBMModel2(bitext, 5)

# Look up the learned translation-table entry for 'livre' / 'book' and
# print it rounded to three decimals.
livre_book_probability = ibm2.translation_table['livre']['book']
print(round(livre_book_probability, 3))

In [None]:
# Pick the third example (index 2) from the bilingual text and show its
# source words (`words`).
test_sentence = bitext[2]
test_sentence.words

In [None]:
# Show the target words (`mots`) of the selected example.
test_sentence.mots

In [None]:
from collections import defaultdict
from math import log
from nltk.translate import PhraseTable
from nltk.translate.stack_decoder import StackDecoder

# Create the phrase table.
phrase_table = PhraseTable()

# Example entries as (source phrase, target phrase, probability); each
# probability is stored in the table as its natural logarithm.
# NOTE(review): ('the', 'it') is added as one two-word target phrase;
# confirm it was not meant as two alternative translations of 'das'.
for source, target, probability in (
    (('das',), ('the', 'it'), 0.4),
    (('das', 'ist'), ('this', 'is'), 0.8),
    (('ein',), ('a',), 0.8),
    (('haus',), ('house',), 1.0),
    (('!',), ('!',), 0.8),
):
    phrase_table.add(source, target, log(probability))

# Log-probabilities of the known uni-grams and bi-grams; looking up any
# other n-gram yields (and stores) the fallback score -999.0.
language_prob = defaultdict(
    lambda: -999.0,
    {
        ('this',): log(0.8),
        ('is',): log(0.6),
        ('a', 'house'): log(0.2),
        ('!',): log(0.1),
    },
)

# Create the language model: an instance of an anonymous class whose two
# methods both answer with the stored score of the given phrase
# (`probability_change` ignores its `context` argument).
language_model = type(
    '',
    (object,),
    {
        'probability_change': lambda self, context, phrase: language_prob[phrase],
        'probability': lambda self, phrase: language_prob[phrase],
    },
)()

# Create the stack decoder from the phrase table and language model built
# above, then translate a tokenized sentence; as the last expression of
# the cell, the result is displayed by the notebook.
stack_decoder = StackDecoder(phrase_table, language_model)
stack_decoder.translate(['das', 'ist', 'ein', 'haus', '!'])

Remove code

In [None]:
from math import log
from nltk.translate import PhraseTable

# The translation model.
phrase_table = PhraseTable()

# Read the phrase-table file. The `with` block guarantees that the file
# is closed once every line has been consumed, or if an error occurs.
with open("./data/phrase-table", "r") as f:

    # Iterate over all lines.
    for line in f:

        # The elements of a line are separated by ' ||| '.
        fields = line.strip().split(' ||| ')

        # Skip blank or malformed lines that lack the three leading
        # elements (source, target, scores) instead of failing on them.
        if len(fields) < 3:
            continue

        # Get the elements we are interested in.
        source, target, probabilities = fields[:3]

        # The first whitespace-separated score is taken as φ(f|e).
        prob = float(probabilities.split()[0])

        # Store the entry with its log-probability.
        # NOTE(review): each phrase is stored as a one-element tuple that
        # holds the whole string; multi-word phrases are not split into
        # tokens — confirm this matches how the decoder looks phrases up.
        phrase_table.add((source,), (target,), log(prob))

In [None]:
import gzip
from collections import defaultdict

# Scores of the n-grams read from the language-model file; looking up an
# n-gram that is not listed yields -999.0.
language_prob = defaultdict(lambda: -999.0)

# Read the gzip-compressed europarl language model line by line.
with gzip.open('./data/europarl.srilm.gz', 'r') as f:
    for raw_line in f:

        # Lines arrive as bytes: decode, trim, and split on tabs.
        fields = raw_line.decode('latin-1').strip().split('\t')

        # Lines with fewer than two tab-separated fields carry no entry.
        if len(fields) < 2:
            continue

        # First field is the score, second the n-gram text; the n-gram is
        # stored whole as the single element of a tuple key.
        prob, ngram = fields[:2]
        language_prob[(ngram,)] = float(prob)

# Create our language model: an instance of an anonymous class whose two
# methods both answer with the stored score of the given phrase
# (`probability_change` ignores its `context` argument).
language_model = type(
    '',
    (object,),
    {
        'probability_change': lambda self, context, phrase: language_prob[phrase],
        'probability': lambda self, phrase: language_prob[phrase],
    },
)()

In [None]:
from nltk.translate.stack_decoder import StackDecoder

# Build a stack decoder from the phrase table and language model loaded
# above and translate a tokenized German sentence to English; as the last
# expression of the cell, the result is displayed by the notebook.
stack_decoder = StackDecoder(phrase_table, language_model)
stack_decoder.translate(['das', 'haus', 'ist', 'klein'])