# 机器翻译与数据集
机器翻译（machine translation）指的是将序列从一种语言自动转换成另一种语言。在使用神经网络进行端到端（end-to-end）学习兴起之前的几十年里，统计学方法在这一领域一直占据主导地位；此后，基于神经网络的端到端学习方法逐渐成为主流。因此，机器翻译可以分为统计机器翻译（statistical machine translation）和神经机器翻译（neural machine translation）。

在这里我们关注端到端的学习方法，使用神经网络来完成翻译任务。

In [41]:
import os 
import torch as t
import sys
sys.path.append("../")
from pltutils import *
# Load the raw English-French corpus and preview its first 79 characters.
# NOTE(review): read_data_nmt is not defined in this file; presumably it comes
# from the `from pltutils import *` star import — confirm.
raw_text=read_data_nmt()
print(raw_text[:79])

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !
Fire


数据预处理

In [42]:
def preprocess_nmt(text: str) -> str:
    """Normalize raw English-French corpus text.

    The text is cleaned in two steps:

    1. Narrow no-break spaces (U+202F) and no-break spaces (U+00A0) are
       replaced with ordinary spaces and everything is lowercased.
    2. A single space is inserted in front of every ``,``, ``.``, ``!`` or
       ``?`` that is not already preceded by a space, so that punctuation
       becomes its own token when the text is later split on spaces.

    Args:
        text: Raw corpus text.

    Returns:
        The normalized text as one string.
    """
    # Built once up front; membership is tested for every character of the
    # corpus, so constructing the set inside the check would be wasteful.
    punctuation = frozenset(",.!?")

    def no_space(char, prev_char):
        # True when `char` is punctuation that still needs a leading space.
        return char in punctuation and prev_char != " "

    # Replace special spaces with plain ones and lowercase.
    text = text.replace("\u202f", " ").replace("\xa0", " ").lower()
    # Insert a space between a word and the punctuation that follows it.
    # The very first character has no predecessor, so it is never prefixed.
    out = [
        " " + char if i > 0 and no_space(char, text[i - 1]) else char
        for i, char in enumerate(text)
    ]
    return "".join(out)

In [43]:
# Normalize the raw corpus and preview the first 90 characters of the result.
text=preprocess_nmt(raw_text)
print(text[:90])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !
fire !	au


词元化

In [44]:
def tokenize_nmt(text: str, num_examples=None):
    """Tokenize the preprocessed English-French corpus.

    Every line is expected to contain a source sentence and its translation
    separated by a horizontal tab. Lines that do not split into exactly two
    tab-separated fields are skipped. Each sentence is split on single
    spaces.

    Args:
        text: Preprocessed corpus text, one sentence pair per line.
        num_examples: Maximum number of leading lines to read. ``None``
            (the default) reads the whole text; ``0`` reads nothing.
            Skipped (malformed) lines still count toward this limit.

    Returns:
        A ``(source, target)`` tuple of two parallel lists; each element is
        the list of tokens of one sentence.
    """
    source, target = [], []
    for i, line in enumerate(text.split("\n")):
        # `i` is zero-based, so stop as soon as i reaches the limit; using
        # `>` here would let one extra line through.
        if num_examples is not None and i >= num_examples:
            break
        # Source and target are separated by a horizontal tab.
        parts = line.split("\t")
        if len(parts) == 2:
            source.append(parts[0].split(" "))
            target.append(parts[1].split(" "))
    return source, target

# Tokenize the whole corpus (no num_examples limit) and display the first
# 20 source/target sentence pairs.
source,target=tokenize_nmt(text)
source[:20],target[:20]


([['go', '.'],
  ['hi', '.'],
  ['run', '!'],
  ['run', '!'],
  ['who', '?'],
  ['wow', '!'],
  ['fire', '!'],
  ['help', '!'],
  ['jump', '.'],
  ['stop', '!'],
  ['stop', '!'],
  ['stop', '!'],
  ['wait', '!'],
  ['wait', '!'],
  ['go', 'on', '.'],
  ['go', 'on', '.'],
  ['go', 'on', '.'],
  ['hello', '!'],
  ['hello', '!'],
  ['i', 'see', '.']],
 [['va', '!'],
  ['salut', '!'],
  ['cours', '!'],
  ['courez', '!'],
  ['qui', '?'],
  ['ça', 'alors', '!'],
  ['au', 'feu', '!'],
  ['à', "l'aide", '!'],
  ['saute', '.'],
  ['ça', 'suffit', '!'],
  ['stop', '!'],
  ['arrête-toi', '!'],
  ['attends', '!'],
  ['attendez', '!'],
  ['poursuis', '.'],
  ['continuez', '.'],
  ['poursuivez', '.'],
  ['bonjour', '!'],
  ['salut', '!'],
  ['je', 'comprends', '.']])

In [45]:
# Build the source-side vocabulary with three reserved special tokens and
# report its size.
# NOTE(review): Vocab is not defined in this file; min_freq=2 presumably drops
# tokens seen fewer than twice — confirm against pltutils.Vocab.
src_vocab=Vocab(source,min_freq=2,reserved_tokens=["<pad>","<bos>","<eos>"])
len(src_vocab)

10012

In [46]:
# Inspect the (token, count) pairs; the output below lists them in descending
# order of count.
src_vocab.token_freqs

[('.', 139392),
 ('i', 45611),
 ('you', 43192),
 ('to', 36718),
 ('the', 33263),
 ('?', 27619),
 ('a', 23973),
 ('is', 16829),
 ('tom', 13990),
 ('that', 12651),
 ('he', 12209),
 ('do', 11292),
 ('of', 11287),
 ('it', 11025),
 ('this', 10385),
 ('in', 10317),
 ('me', 10165),
 ('have', 9698),
 ("don't", 9636),
 (',', 9318),
 ('was', 8842),
 ('my', 8096),
 ('are', 7838),
 ('for', 7745),
 ('your', 7481),
 ('what', 7353),
 ("i'm", 7310),
 ('we', 6959),
 ('be', 6899),
 ('want', 6365),
 ('she', 6318),
 ('not', 6286),
 ('know', 5488),
 ('like', 5426),
 ('on', 5319),
 ('with', 5110),
 ('can', 4586),
 ('his', 4496),
 ('all', 4383),
 ('did', 4335),
 ('at', 4304),
 ("you're", 4227),
 ('how', 4117),
 ('go', 4038),
 ('they', 4018),
 ('him', 3931),
 ('think', 3865),
 ('and', 3448),
 ("it's", 3371),
 ('about', 3351),
 ('time', 3311),
 ("can't", 3293),
 ('here', 3221),
 ('very', 3134),
 ("didn't", 3041),
 ('get', 3034),
 ('there', 3021),
 ('her', 2973),
 ('were', 2904),
 ('as', 2897),
 ('will', 2861),

In [47]:
# Inspect the index-to-token list; in the output below '<unk>' is at index 0,
# followed by the three reserved tokens.
src_vocab.idx_to_token

['<unk>',
 '<pad>',
 '<bos>',
 '<eos>',
 '.',
 'i',
 'you',
 'to',
 'the',
 '?',
 'a',
 'is',
 'tom',
 'that',
 'he',
 'do',
 'of',
 'it',
 'this',
 'in',
 'me',
 'have',
 "don't",
 ',',
 'was',
 'my',
 'are',
 'for',
 'your',
 'what',
 "i'm",
 'we',
 'be',
 'want',
 'she',
 'not',
 'know',
 'like',
 'on',
 'with',
 'can',
 'his',
 'all',
 'did',
 'at',
 "you're",
 'how',
 'go',
 'they',
 'him',
 'think',
 'and',
 "it's",
 'about',
 'time',
 "can't",
 'here',
 'very',
 "didn't",
 'get',
 'there',
 'her',
 'were',
 'as',
 'will',
 'had',
 'if',
 'why',
 'just',
 'up',
 'out',
 'no',
 'has',
 'one',
 'going',
 'would',
 'so',
 'good',
 'need',
 'tell',
 'an',
 'see',
 "i'll",
 'come',
 'when',
 'from',
 'by',
 'really',
 'mary',
 'help',
 'who',
 'please',
 'us',
 "that's",
 'should',
 'could',
 'been',
 "i've",
 'never',
 'more',
 'now',
 'where',
 'take',
 'something',
 'got',
 'too',
 'than',
 'much',
 'make',
 'some',
 "i'd",
 "we're",
 'right',
 'but',
 'work',
 'am',
 'money',
 'an

In [48]:
# Inspect the token-to-index mapping.
src_vocab.token_to_idx

{'<unk>': 0,
 '<pad>': 1,
 '<bos>': 2,
 '<eos>': 3,
 '.': 4,
 'i': 5,
 'you': 6,
 'to': 7,
 'the': 8,
 '?': 9,
 'a': 10,
 'is': 11,
 'tom': 12,
 'that': 13,
 'he': 14,
 'do': 15,
 'of': 16,
 'it': 17,
 'this': 18,
 'in': 19,
 'me': 20,
 'have': 21,
 "don't": 22,
 ',': 23,
 'was': 24,
 'my': 25,
 'are': 26,
 'for': 27,
 'your': 28,
 'what': 29,
 "i'm": 30,
 'we': 31,
 'be': 32,
 'want': 33,
 'she': 34,
 'not': 35,
 'know': 36,
 'like': 37,
 'on': 38,
 'with': 39,
 'can': 40,
 'his': 41,
 'all': 42,
 'did': 43,
 'at': 44,
 "you're": 45,
 'how': 46,
 'go': 47,
 'they': 48,
 'him': 49,
 'think': 50,
 'and': 51,
 "it's": 52,
 'about': 53,
 'time': 54,
 "can't": 55,
 'here': 56,
 'very': 57,
 "didn't": 58,
 'get': 59,
 'there': 60,
 'her': 61,
 'were': 62,
 'as': 63,
 'will': 64,
 'had': 65,
 'if': 66,
 'why': 67,
 'just': 68,
 'up': 69,
 'out': 70,
 'no': 71,
 'has': 72,
 'one': 73,
 'going': 74,
 'would': 75,
 'so': 76,
 'good': 77,
 'need': 78,
 'tell': 79,
 'an': 80,
 'see': 81,
 "i'll":

In [49]:
def truncate_pad(line,num_steps,padding_token):
    """Truncate or pad a token sequence to length ``num_steps``.

    Args:
        line: List of tokens (or token ids) for one sequence.
        num_steps: Target sequence length.
        padding_token: Value appended to sequences that are too short.

    Returns:
        A new list: the first ``num_steps`` items of ``line`` when it is too
        long, otherwise ``line`` followed by as many ``padding_token`` items
        as are needed to reach ``num_steps``.
    """
    missing = num_steps - len(line)
    if missing < 0:
        # Too long: keep only the leading num_steps items.
        return line[:num_steps]
    # Short enough: extend with padding (nothing is added when missing == 0).
    return line + [padding_token] * missing

# Pad the first source sentence to 10 steps using the "<pad>" id (index 1 in
# src_vocab), the same padding token build_array_nmt uses. Padding with
# "<unk>" would fill the tail with index 0 and make padding
# indistinguishable from unknown tokens.
truncate_pad(src_vocab[source[0]],10,src_vocab["<pad>"])

[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]

In [50]:
def build_array_nmt(lines,vocab,num_steps):
    """Convert tokenized sentences into a fixed-width index tensor.

    Each sentence is mapped to ids through ``vocab``, gets the "<eos>" id
    appended, and is then truncated or padded with the "<pad>" id to
    ``num_steps`` entries by ``truncate_pad``.

    Args:
        lines: List of token lists, one per sentence.
        vocab: Object indexable by a token or a list of tokens, returning
            the corresponding id(s).
        num_steps: Number of columns in the resulting tensor.

    Returns:
        A ``(array, valid_len)`` tuple: ``array`` holds one row of ids per
        sentence, and ``valid_len`` holds, per row, the number of entries
        that differ from the "<pad>" id.
    """
    # NOTE(review): this uses the name `torch`, but the imports visible in
    # this file only have `import torch as t`; presumably `torch` arrives via
    # `from pltutils import *` — confirm.
    pad_id = vocab["<pad>"]
    eos_id = vocab["<eos>"]
    rows = []
    for tokens in lines:
        ids = vocab[tokens] + [eos_id]
        rows.append(truncate_pad(ids, num_steps, pad_id))
    array = torch.tensor(rows)
    # Count the non-padding positions along each row.
    is_real = array != pad_id
    valid_len = is_real.type(torch.int32).sum(1)
    return array, valid_len
    

In [51]:
def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return the translation data iterator and both vocabularies.

    The corpus is read, preprocessed and tokenized; separate source and
    target vocabularies are built; and both sides are converted to
    fixed-width index arrays before being handed to ``load_array``.

    Args:
        batch_size: Passed to ``load_array``.
        num_steps: Sequence length every sentence is truncated/padded to.
        num_examples: Passed to ``tokenize_nmt`` to limit how much of the
            corpus is used.

    Returns:
        A ``(data_iter, src_vocab, tgt_vocab)`` tuple. ``data_iter`` is
        whatever ``load_array`` returns for the arrays
        ``(src_array, src_valid_len, tgt_array, tgt_valid_len)``.
    """
    def _vocab_for(sentences):
        # Build a fresh reserved-token list on every call so the two
        # vocabularies never share one list object.
        return Vocab(sentences, min_freq=2,
                     reserved_tokens=['<pad>', '<bos>', '<eos>'])

    corpus = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(corpus, num_examples)
    src_vocab = _vocab_for(source)
    tgt_vocab = _vocab_for(target)
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_iter = load_array(
        (src_array, src_valid_len, tgt_array, tgt_valid_len), batch_size)
    return data_iter, src_vocab, tgt_vocab

In [52]:
# Smoke test: build the dataset with batch_size=2 and num_steps=8, then print
# the tensors and valid lengths of a single mini-batch.
# Note that this rebinds the module-level name src_vocab to the vocabulary
# returned by load_data_nmt.
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X.type(torch.int32))
    print('X的有效长度:', X_valid_len)
    print('Y:', Y.type(torch.int32))
    print('Y的有效长度:', Y_valid_len)
    break  # only the first batch is shown


X: tensor([[111,  23,   4,   3,   1,   1,   1,   1],
        [  6,  18,  47,   4,   3,   1,   1,   1]], dtype=torch.int32)
X的有效长度: tensor([4, 5])
Y: tensor([[ 0,  5,  3,  1,  1,  1,  1,  1],
        [ 6,  7, 34,  4,  3,  1,  1,  1]], dtype=torch.int32)
Y的有效长度: tensor([3, 5])
