# Translating between Chinese and English with PyTorch

In [1]:
%matplotlib inline
import re, pickle, collections, bcolz, numpy as np, keras, sklearn, math, operator

Using TensorFlow backend.


In [2]:
from gensim.models import word2vec

import torch, torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F



In [3]:
# Root folders of the training and validation data. The trailing '/' is kept
# so that file names can be appended by plain string concatenation.
path = 'data/training/'
dpath = 'data/validation/'

## Prepare corpus

In [4]:
# The two sides of the training corpus share one stem and differ only by
# their language suffix (.en / .zh).
fname = path + 'train'
en_fname, zh_fname = fname + '.en', fname + '.zh'

To make this problem a little simpler so we can train our model more quickly, we'll just learn to translate questions that begin with 'Wh' (e.g. what, why, where, which). Here are our regexps that filter the sentences we want.

### Tokenizing and word2vec

In [5]:
from tqdm import tqdm
import jieba

# Tokenisation patterns; simple_toks() applies them in the order listed here.
re_apos = re.compile(r"(\w)'s\b")         # split a trailing 's off its word
re_mw_punc = re.compile(r"(\w[’'])(\w)")  # break a word after any other apostrophe
re_punc = re.compile("([\"().,;:/_?!—])") # isolate punctuation marks with spaces
re_mult_space = re.compile(r"  *")        # squeeze a run of spaces into one

def simple_toks(sent):
    """Lower-case `sent` and split it into word and punctuation tokens.

    The possessive 's becomes its own token, a word containing another
    apostrophe is cut right after the apostrophe, each punctuation mark
    matched by `re_punc` becomes a separate token, and hyphens are treated
    as spaces.

    Returns:
        list[str]: the tokens, in sentence order (empty list for blank input).
    """
    rewrites = (
        (re_apos, r"\1 's"),
        (re_mw_punc, r"\1 \2"),
        (re_punc, r" \1 "),
    )
    text = sent
    for pattern, replacement in rewrites:
        text = pattern.sub(replacement, text)
    # Hyphens are removed only after punctuation has been spaced out.
    text = text.replace('-', ' ')
    text = re_mult_space.sub(' ', text)
    return text.lower().split()


class MyEnSentences(object):
    """Re-iterable stream of tokenised English sentences read from a text file.

    The file is re-opened on every call to ``__iter__``, so one instance can
    be iterated several times (e.g. by a trainer that makes multiple passes).

    Args:
        dirname: path of the corpus *file* (one sentence per line). Despite
            the name it is passed straight to ``open()``, so it must not be
            a directory.
        encoding: text encoding used to decode the file. Given explicitly so
            that decoding does not depend on the platform's default locale.
    """
    def __init__(self, dirname, encoding='utf-8'):
        self.dirname = dirname
        self.encoding = encoding

    def __iter__(self):
        # `with` guarantees the handle is released when the generator is
        # exhausted or closed, instead of leaking one handle per pass.
        with open(self.dirname, encoding=self.encoding) as corpus_file:
            for line in tqdm(corpus_file):
                yield simple_toks(line)
            
class MyZhSentences(object):
    """Re-iterable stream of jieba-segmented Chinese sentences read from a text file.

    The file is re-opened on every call to ``__iter__``, so one instance can
    be iterated several times (e.g. by a trainer that makes multiple passes).

    Args:
        dirname: path of the corpus *file* (one sentence per line). Despite
            the name it is passed straight to ``open()``, so it must not be
            a directory.
        encoding: text encoding used to decode the file. Given explicitly so
            that decoding does not depend on the platform's default locale.
    """
    def __init__(self, dirname, encoding='utf-8'):
        self.dirname = dirname
        self.encoding = encoding

    def __iter__(self):
        # `with` guarantees the handle is released when the generator is
        # exhausted or closed, instead of leaking one handle per pass.
        with open(self.dirname, encoding=self.encoding) as corpus_file:
            for line in tqdm(corpus_file):
                # jieba.cut() returns a one-shot generator; materialise it so
                # each sentence is a real list (has a length, can be re-read).
                # Whitespace-only tokens such as the trailing newline are
                # dropped, matching what simple_toks() does for English.
                yield [tok for tok in jieba.cut(line) if tok.strip()]

In [6]:
def train_save_wordvec_export(lan ='en'):
    """Train a skip-gram word2vec model on one side of the corpus and save it.

    Args:
        lan: language code. 'en' trains on the English file (`en_fname`);
            any other value trains on the Chinese file (`zh_fname`). The
            value is also used as the prefix of the output file names.

    Side effects:
        Writes two files under 'input/':
          * '<lan>_vocab.pkl'   - pickle of ``model.wv.vocab``
          * '<lan>word2vec.npy' - float32 matrix with one row per vocabulary
            word, in the iteration order of that vocabulary.
        Prints a progress message after training and after saving.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    output = 'input/'+lan
    # Create the output folder up front so a long training run is not lost
    # to a missing directory at save time.
    os.makedirs(os.path.dirname(output), exist_ok=True)

    if lan == 'en':
        corpus = MyEnSentences(en_fname)
    else:
        corpus = MyZhSentences(zh_fname)

    # Only the `word2vec` module is imported by this notebook, so the class
    # must be reached through it (a bare `Word2Vec` is a NameError).
    # NOTE(review): `iter=` and `wv.vocab` look like the gensim 3.x API - confirm
    # the installed version before upgrading.
    model = word2vec.Word2Vec(corpus, min_count=10, sg=1, workers=2, iter=50)
    print('model trained!')

    vocabulary = model.wv.vocab
    # Look vectors up through `model.wv`, consistent with the vocabulary access
    # above (indexing the model object directly is deprecated).
    embeddings = np.array([model.wv[word] for word in vocabulary], dtype=np.float32)

    with open(output+'_vocab.pkl', "wb") as vocab_file:
        pickle.dump(vocabulary, vocab_file)
    # NOTE(review): this name has no '_' separator, unlike '_vocab.pkl'. Kept
    # unchanged because code that loads the file may rely on it - confirm.
    np.save(output+'word2vec.npy',embeddings)
    print('word vector saved!')

In [None]:
# Train and export embeddings for both sides of the parallel corpus,
# English first, then Chinese.
for lan in ('en', 'zh'):
    train_save_wordvec_export(lan=lan)