In [2]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
	"""Read an entire UTF-8 text file into memory.

	Args:
		filename: Path of the file to read.

	Returns:
		The complete contents of the file as one string.
	"""
	# the context manager closes the handle even if read() raises
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

def to_lines(doc):
	"""Return the lines of `doc`, ignoring leading and trailing whitespace.

	The document is trimmed first and then split on newline characters, so
	blank lines in the middle of the document are kept as empty strings.
	"""
	trimmed = doc.strip()
	return trimmed.split('\n')

def clean_data(lines, max_lines=150000):
	"""Normalize raw text lines into lowercase, ASCII, letters-only sentences.

	Each line is Unicode-decomposed and reduced to ASCII, split on white
	space, lowercased, stripped of punctuation and non-printable characters,
	and finally filtered so that only purely alphabetic tokens remain.

	Args:
		lines: Sequence of raw text lines.
		max_lines: Maximum number of leading lines to process. Defaults to
			150000; pass None to process every line.

	Returns:
		A numpy array with one row per processed line, each row holding a
		single cleaned sentence string.
	"""
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	cleaned = list()
	for line in lines[0:max_lines]:
		# Decompose accented characters, then drop everything that is not
		# ASCII. Characters with no ASCII decomposition (such as a ligature)
		# are removed outright rather than transliterated.
		line = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')
		# tokenize on white space, lowercase and strip punctuation
		words = [word.lower().translate(table) for word in line.split()]
		# remove non-printable chars from each token
		words = [re_print.sub('', word) for word in words]
		# drop empty tokens and tokens containing anything but letters
		words = [word for word in words if word.isalpha()]
		# store as a one-element row so the result is a 2-D array
		cleaned.append([' '.join(words)])
	return array(cleaned)

def save_clean_data(sentences, filename):
	"""Pickle `sentences` to `filename` and print a confirmation line.

	Args:
		sentences: Object to serialize (any picklable value).
		filename: Destination path; an existing file is overwritten.
	"""
	# the context manager guarantees the pickle is flushed and the handle closed
	with open(filename, 'wb') as file:
		dump(sentences, file)
	print('Saved: %s' % filename)
 
# load dataset
en_file = 'fr-en/europarl-v7.fr-en.en'
fr_file = 'fr-en/europarl-v7.fr-en.fr'
doc_en = load_doc(en_file)
doc_fr = load_doc(fr_file)
# split each document into lines; the spot check below pairs the English and
# French lines by index
en_line = to_lines(doc_en)
fr_line = to_lines(doc_fr)
# clean sentences
clean_en = clean_data(en_line)
clean_fr = clean_data(fr_line)
# save the cleaned sentences, one pickle per language
save_clean_data(clean_en, 'english.pkl')
save_clean_data(clean_fr, 'french.pkl')
# spot check: show at most 100 pairs, fewer when the corpus is smaller
for i in range(min(100, len(clean_en), len(clean_fr))):
	print('[%s] => [%s]' % (clean_en[i,0], clean_fr[i,0]))

Saved: english.pkl
Saved: french.pkl
[resumption of the session] => [reprise de la session]
[i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period] => [je declare reprise la session du parlement europeen qui avait ete interrompue le vendredi decembre dernier et je vous renouvelle tous mes vux en esperant que vous avez passe de bonnes vacances]
[although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful] => [comme vous avez pu le constater le grand bogue de lan ne sest pas produit en revanche les citoyens dun certain nombre de nos pays ont ete victimes de catastrophes naturelles qui ont vraiment ete terribles]
[you have requested a debate on this subject in the course of the next few days during this partsession] => [vou

In [None]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
 
# load a clean dataset
def load_clean_sentences(filename):
	"""Load and return the object pickled in `filename`.

	Args:
		filename: Path of a pickle file.

	Returns:
		Whatever object was stored in the pickle.
	"""
	# NOTE(review): unpickling can execute arbitrary code; only load files
	# that this project produced itself.
	with open(filename, 'rb') as file:
		return load(file)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle `sentences` to `filename` and print a confirmation line.

	Args:
		sentences: Object to serialize (any picklable value).
		filename: Destination path; an existing file is overwritten.
	"""
	# the context manager guarantees the pickle is flushed and the handle closed
	with open(filename, 'wb') as file:
		dump(sentences, file)
	print('Saved: %s' % filename)
 
# load dataset
raw_dataset_en = load_clean_sentences('english.pkl')
raw_dataset_fr = load_clean_sentences('french.pkl')

# Row i of the English array pairs with row i of the French array, so both
# must be reordered with the SAME permutation; shuffling each array on its
# own would pair every English sentence with an unrelated French one.
if len(raw_dataset_en) != len(raw_dataset_fr):
	raise ValueError('english and french datasets differ in length: %d vs %d'
		% (len(raw_dataset_en), len(raw_dataset_fr)))
order = list(range(len(raw_dataset_en)))
shuffle(order)
# indexing with a list of row numbers relies on the pickles holding numpy
# arrays, which is what the cleaning cell saves
shuffled_en = raw_dataset_en[order]
shuffled_fr = raw_dataset_fr[order]
# split into train/test
n_train = 100000
train_en, test_en = shuffled_en[:n_train], shuffled_en[n_train:]
train_fr, test_fr = shuffled_fr[:n_train], shuffled_fr[n_train:]

# save
save_clean_data(train_en, 'english-train.pkl')
save_clean_data(test_en, 'english-test.pkl')
save_clean_data(train_fr, 'french-train.pkl')
save_clean_data(test_fr, 'french-test.pkl')