In [19]:
import hashlib
import os
import requests
import re
import collections
import tarfile
import zipfile
# Registry of downloadable datasets: name -> (download URL, expected SHA-1 hex digest).
DATA_HUB = dict()
# Base URL that dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

# Register the 'fra-eng' archive together with its expected SHA-1 digest.
DATA_HUB['fra-eng'] = (DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')

def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local file name.

    If a file with the expected name already exists in ``cache_dir`` and its
    SHA-1 digest matches the one registered in DATA_HUB, the cached copy is
    reused and nothing is downloaded.

    Args:
        name: Key into DATA_HUB.
        cache_dir: Directory the file is stored in; created if missing.

    Returns:
        Path of the local file (``cache_dir`` joined with the last URL segment).

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so large files are never fully in memory.
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    # Stream the body to disk chunk by chunk; reading ``r.content`` would
    # buffer the whole download in memory despite ``stream=True``.
    with requests.get(url, stream=True, verify=True) as r:
        # Fail loudly rather than caching an HTTP error page as the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive registered in DATA_HUB and extract it.

    The archive is extracted into the directory that contains the downloaded
    file.

    Args:
        name: Key into DATA_HUB, forwarded to ``download``.
        folder: Optional sub-directory name; when given, the returned path is
            that folder inside the extraction directory.

    Returns:
        ``<extraction dir>/<folder>`` if ``folder`` is given, otherwise the
        archive path with its last extension removed.

    Raises:
        AssertionError: If the file extension is not ``.zip``, ``.tar`` or
            ``.gz``.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        archive = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        archive = tarfile.open(fname, 'r')
    else:
        # Explicit raise (same exception type as before) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # Context manager guarantees the archive handle is closed after extraction.
    with archive:
        archive.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def read_data_nmt():
    """Download and extract the 'fra-eng' dataset and return ``fra.txt`` as text.

    Returns:
        The full contents of ``fra.txt`` as a single string.
    """
    data_dir = download_extract('fra-eng')
    # Decode explicitly as UTF-8: the corpus contains accented characters and
    # the platform default encoding is not UTF-8 everywhere.
    with open(os.path.join(data_dir, 'fra.txt'), 'r', encoding='utf-8') as f:
        return f.read()
# Load the raw corpus and preview its first 75 characters.
raw_text = read_data_nmt()
print(raw_text[:75])

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [20]:
def preprocess_nmt(text):
    """Normalise raw corpus text before tokenisation.

    Replaces narrow no-break spaces (U+202F) and no-break spaces (U+00A0)
    with ordinary spaces, lower-cases the text, and inserts a space in front
    of each of ``, . ! ?`` that is not already preceded by a space, so that
    punctuation becomes a separate whitespace-delimited token.

    Args:
        text: Raw text.

    Returns:
        The normalised text.
    """
    # Built once; the membership test below runs for every character.
    punctuation = frozenset(',.!?')

    def no_space(char, prev_char):
        """Return True if ``char`` is punctuation not preceded by a space."""
        return char in punctuation and prev_char != ' '

    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()

    out = [
        # The first character has no predecessor, so it is never prefixed.
        ' ' + char if i > 0 and no_space(char, text[i - 1]) else char
        for i, char in enumerate(text)
    ]
    return ''.join(out)

# Normalise the raw corpus and preview the first 80 characters of the result.
text = preprocess_nmt(raw_text)
print(text[:80])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !


In [21]:
def tokenize_nmt(text,num_examples=None):
    """Split preprocessed text into source and target token lists.

    Each line is expected to hold a source sentence and a target sentence
    separated by a tab. Lines that do not split into exactly two tab-separated
    parts are skipped, but they still count towards ``num_examples``.

    Args:
        text: Preprocessed corpus, one sentence pair per line.
        num_examples: Maximum number of lines to read. ``None`` (or any
            falsy value such as 0) means no limit.

    Returns:
        A ``(source, target)`` tuple of equally long lists; each element is
        the list of space-separated tokens of one sentence.
    """
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        # ``i`` is zero-based: stop as soon as ``num_examples`` lines have
        # been consumed (``>`` would read one line too many).
        if num_examples and i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

# Tokenise the whole corpus and display the first six sentence pairs.
source,target = tokenize_nmt(text)
source[:6],target[:6]

([['go', '.'],
  ['hi', '.'],
  ['run', '!'],
  ['run', '!'],
  ['who', '?'],
  ['wow', '!']],
 [['va', '!'],
  ['salut', '!'],
  ['cours', '!'],
  ['courez', '!'],
  ['qui', '?'],
  ['ça', 'alors', '!']])

In [22]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is always ``'<unk>'``, followed by the reserved tokens in the
    order given, followed by corpus tokens sorted by descending frequency.

    Attributes set by ``__init__``:
        token_freqs: list of ``(token, count)`` pairs, most frequent first.
        unk: index of the unknown token (0).
        idx_to_token: list mapping index -> token.
        token_to_idx: dict mapping token -> index.
    """

    def __init__(self,tokens=None,min_freq=0,reserved_token=None):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens or list of token lists. ``None`` is
                treated as an empty corpus.
            min_freq: Corpus tokens occurring fewer times than this are left
                out of the vocabulary.
            reserved_token: Extra special tokens placed right after
                ``'<unk>'``. ``None`` means no extra tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_token is None:
            reserved_token = []
        counter = self.count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        self.unk = 0
        # list(...) lets callers pass the reserved tokens as a tuple as well.
        special_tokens = ['<unk>'] + list(reserved_token)

        # Corpus tokens that duplicate a special token are not added again.
        uniq_token = special_tokens + [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in special_tokens
        ]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_token:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def count_corpus(self,tokens):
        """Count token frequencies.

        Args:
            tokens: Flat list of tokens, or a list of token lists, which is
                flattened first.

        Returns:
            ``collections.Counter`` mapping token -> count.
        """
        if len(tokens)==0 or isinstance(tokens[0],list):
            tokens = [token for line in tokens
                      for token in line]
        return collections.Counter(tokens)

    def __len__(self):
        """Return the number of entries, including special tokens."""
        return len(self.idx_to_token)

    def __getitem__(self,indices):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary (for example those dropped by
        ``min_freq``) map to ``self.unk`` instead of raising ``KeyError``.
        """
        if not isinstance(indices,(list,tuple)):
            return self.token_to_idx.get(indices, self.unk)
        return [self.__getitem__(index) for index in indices]

    def to_tokens(self,indices):
        """Map an index, or a list/tuple of indices, back to token(s)."""
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
# Build the source-side vocabulary, dropping tokens seen fewer than twice, and show its size.
src_vocab = Vocab(source,min_freq=2,reserved_token=['<pad>','<bos>','<eos>'])
len(src_vocab)

10012

In [23]:
def truncate_pad(line,num_steps,padding_token):
    """Return ``line`` cut or right-padded to exactly ``num_steps`` items.

    Args:
        line: List of items (e.g. token indices).
        num_steps: Target length.
        padding_token: Value appended when ``line`` is too short.

    Returns:
        A list of length ``num_steps``.
    """
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # Too long: keep only the leading num_steps items.
        return line[:num_steps]
    # Short enough: append as many padding items as are missing (maybe zero).
    return line + [padding_token] * shortfall
# Demo: map the first source sentence to indices and pad it to length 10.
truncate_pad(src_vocab[source[0]],10,src_vocab['<pad>'])

[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]

In [24]:
import torch
from torch import nn
def build_array_nmt(lines,vocab,num_steps):
    """Convert tokenised sentences into a fixed-width index tensor.

    Every sentence is mapped to indices through ``vocab``, gets the
    ``'<eos>'`` index appended, and is then truncated or padded with the
    ``'<pad>'`` index to ``num_steps`` entries.

    Args:
        lines: List of token lists.
        vocab: Object supporting ``vocab[token]`` and ``vocab[list_of_tokens]``.
        num_steps: Width of every row in the result.

    Returns:
        ``(array, valid_len)``: the index tensor built from the padded rows,
        and a float32 tensor holding, per row, the number of entries that
        differ from the ``'<pad>'`` index.
    """
    pad_id = vocab['<pad>']
    rows = []
    for sentence in lines:
        indexed = vocab[sentence] + [vocab['<eos>']]
        rows.append(truncate_pad(indexed, num_steps, padding_token=pad_id))
    array = torch.tensor(rows)
    # Count the non-padding positions along each row.
    is_real_token = (array != pad_id).type(torch.float32)
    valid_len = is_real_token.sum(1)
    return array, valid_len

In [25]:
from torch.utils import data
def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Wrap tensors in a PyTorch ``DataLoader``.

    Args:
        data_arrays: Sequence of tensors that are unpacked into a
            ``TensorDataset``.
        batch_size: Number of examples per batch.
        is_train: When true, the loader shuffles the examples.

    Returns:
        A ``DataLoader`` over the resulting dataset.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(
        tensor_dataset,
        batch_size=batch_size,
        shuffle=is_train,
    )
    return loader

In [26]:
def load_data_nmt(batch_size,num_step,num_examples=600):
    """Build a batched iterator and both vocabularies for the translation data.

    Args:
        batch_size: Number of sentence pairs per batch.
        num_step: Fixed length every sentence is truncated or padded to.
        num_examples: Limit forwarded to ``tokenize_nmt``.

    Returns:
        ``(data_iter, src_vocab, tgt_vocab)``. Each batch from ``data_iter``
        holds four tensors: source indices, source valid lengths, target
        indices and target valid lengths.
    """
    special_tokens = ('<pad>', '<bos>', '<eos>')

    corpus = preprocess_nmt(read_data_nmt())
    src_sentences, tgt_sentences = tokenize_nmt(corpus, num_examples)

    # Each side gets its own vocabulary and its own copy of the special tokens.
    src_vocab = Vocab(src_sentences, min_freq=0, reserved_token=list(special_tokens))
    tgt_vocab = Vocab(tgt_sentences, min_freq=0, reserved_token=list(special_tokens))

    src_array, src_valid_len = build_array_nmt(src_sentences, src_vocab, num_steps=num_step)
    tgt_array, tgt_valid_len = build_array_nmt(tgt_sentences, tgt_vocab, num_step)

    data_iter = load_array(
        (src_array, src_valid_len, tgt_array, tgt_valid_len),
        batch_size,
    )
    return data_iter, src_vocab, tgt_vocab

In [27]:
# Build the data iterator with tiny batches and print the first batch as a sanity check.
# NOTE(review): `train_itrer` looks like a typo for `train_iter`; left unchanged here.
train_itrer,src_vocab,tgt_vocab =  load_data_nmt(batch_size=2,num_step=8)

# Each batch unpacks into source indices, source lengths, target indices, target lengths.
for X,X_valid_len ,Y,Y_valid_len in train_itrer:
    print("X:",X)
    print("X.valid :",X_valid_len)
    print("Y:",Y)
    print("Y.valid:",Y_valid_len)
    break  # only the first batch is inspected

X: tensor([[ 68,  60,   4,   3,   1,   1,   1,   1],
        [  6, 197,   4,   3,   1,   1,   1,   1]])
X.valid : tensor([4., 4.])
Y: tensor([[ 64,  53,   4,   3,   1,   1,   1,   1],
        [  6, 280,   4,   3,   1,   1,   1,   1]])
Y.valid: tensor([4., 4.])
