In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to local helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Process Training Data
Data Source:
- [news.2013.de.shuffled](http://www.statmt.org/wmt14/training-monolingual-news-crawl/)

In [None]:
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
import langid
import re
import time, threading
# Matches every character that is NOT kept in the cleaned text: anything other
# than the listed Latin/accented letters, digits, whitespace and . ’ , : - &
ac_token = re.compile(r'[^a-zA-ZÄÜÖÓäáàâëéêèöòôóüûŭúůíïğßŞøæœÆç0-9\.’\,\:\-\&\s]')
# Matches an optional "URL:" label plus the http(s):// scheme; the remainder of
# the address is not part of the match.
URL = re.compile(r'(URL)?:? ?https?://', re.IGNORECASE)

def filter_sentence(l):
    """Clean one raw sentence fragment for the embedding corpus.

    Applies a fixed sequence of substitutions to ``l`` and returns the
    resulting string; the input is not modified.
    """
    cleaned = URL.sub('', l)

    early_rules = (
        (r'\'s', ''),         # drop 's
        (r"[\"`\']", ''),     # drop double quotes, backticks and apostrophes
        (r"/", ' or '),       # spell a slash out as " or "
        (r"\(.+?\)", ''),     # drop (parenthesised) spans
        (r"\[.+?\]", ''),     # drop [bracketed] spans
        (r"^\d+\)", ''),      # drop a leading enumerator such as "1)"
    )
    for pattern, replacement in early_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Remove sentence-final . ; ? ! and sentence-initial , : - characters.
    cleaned = re.sub(r'\.+$|;$|\?$|\!$|^,|^:|^-', '', cleaned.strip())
    cleaned = re.sub(r'\s+', ' ', cleaned)
    # Map any ASCII apostrophe to ’ and pad every ’ with spaces.
    cleaned = re.sub(r'\'', '’', cleaned).replace(r'’', ' ’ ')

    # Finally delete every character outside the ac_token whitelist.
    return ac_token.sub('', cleaned.strip())

class Reader(threading.Thread):
    """Worker thread that cleans and tokenises one slice of the corpus file.

    Parameters
    ----------
    file_name : path of the corpus file to read.
    start_pos, end_pos : inclusive offsets of the slice this thread handles.
    output : writable text file object; one line of space-separated tokens is
        written per accepted sentence fragment. The caller owns this object
        and is responsible for closing it.
    """

    def __init__(self, file_name, start_pos, end_pos, output):
        super(Reader, self).__init__()
        self.file_name = file_name
        # File offsets must be whole numbers; coerce in case the caller
        # computed them with true division.
        self.start_pos = int(start_pos)
        self.end_pos = int(end_pos)
        self.output = output

    def run(self):
        """Read the assigned slice line by line and write tokenised German text."""
        # NOTE(review): "unicode_escape" with errors="ignore" is kept from the
        # original; confirm it matches the real encoding of the corpus.
        # The context manager guarantees the handle is released even if
        # classification or tokenisation raises.
        with open(self.file_name, 'r', encoding="unicode_escape", errors="ignore") as fd:
            if self.start_pos != 0:
                # If the slice starts mid-line, skip ahead to the next line
                # start; the loop below keeps reading while a line *starts*
                # at or before end_pos.
                fd.seek(self.start_pos - 1)
                if fd.read(1) != '\n':
                    fd.readline()
                    self.start_pos = fd.tell()
            fd.seek(self.start_pos)

            while self.start_pos <= self.end_pos:
                line = fd.readline()
                # Split the line into fragments on ';' and '?'.
                for semi_part in line.split(';'):
                    for fragment in semi_part.split('?'):
                        part = fragment.strip()
                        if not part:
                            # An empty fragment can never yield tokens, so
                            # skip the language classifier for it.
                            continue
                        if langid.classify(part)[0] != 'de':
                            continue
                        res = word_tokenize(filter_sentence(part), language="german")
                        if len(res) == 0:
                            continue
                        # Every token is followed by a single space,
                        # including the last one before the newline.
                        self.output.write(''.join(tok + ' ' for tok in res))
                        self.output.write('\n')
                self.start_pos = fd.tell()

class Partition(object):
    """Split a file into contiguous byte ranges, one per worker.

    Parameters
    ----------
    file_name : path of the file to partition.
    thread_num : number of ranges to produce (stored as ``block_num``).
    """

    def __init__(self, file_name, thread_num):
        self.file_name = file_name
        self.block_num = thread_num

    def part(self):
        """Return a list of inclusive ``(start_pos, end_pos)`` integer byte ranges.

        The ranges are consecutive and non-overlapping, and the last one
        always ends at ``file_size - 1``. Fewer than ``block_num`` ranges are
        returned if the file is exhausted before the last block is reached.
        """
        # Binary mode: only the size in bytes is needed, and seek() returns
        # the new absolute position.
        with open(self.file_name, 'rb') as fd:
            file_size = fd.seek(0, 2)

        # Floor division keeps every offset an int; true division would
        # produce float positions, which are not valid file offsets.
        block_size = file_size // self.block_num

        pos_list = []
        start_pos = 0
        last_block = self.block_num - 1
        for i in range(self.block_num):
            if i == last_block:
                # The final block absorbs whatever remains.
                pos_list.append((start_pos, file_size - 1))
                break
            if start_pos >= file_size:
                break
            end_pos = min(start_pos + block_size - 1, file_size - 1)
            pos_list.append((start_pos, end_pos))
            start_pos = end_pos + 1
        return pos_list
    
file_name = "raw/news.2013.de.shuffled"
thread_num = 8
p = Partition(file_name, thread_num)
pos = p.part()
t = []
outputs = []

try:
    # One Reader (and one output file) per range actually returned by the
    # partitioner, which may be fewer than thread_num.
    for i, (start_pos, end_pos) in enumerate(pos):
        output = open("processed/" + str(i) + ".txt", 'w', encoding="utf-8-sig")
        outputs.append(output)
        t.append(Reader(file_name, start_pos, end_pos, output))

    for worker in t:
        worker.start()
    for worker in t:
        worker.join()
finally:
    # Close every output so buffered text is flushed to disk before the
    # files are read again.
    for output in outputs:
        output.close()


In [None]:
from utils.helper import merge_files
# Combine the per-thread files under processed/ into one cased corpus file.
# NOTE(review): merge_files is defined in utils.helper and is not visible
# here; presumably it concatenates every file in the directory. Confirm.
merge_files("processed/", "fastText/data/input_cased.txt")

In [None]:
from tqdm import tqdm

# Write a lowercased copy of the cased corpus. The input is streamed line by
# line instead of being loaded with readlines(), so the whole corpus never has
# to fit in memory, and both files are closed even if an error occurs.
with open("fastText/data/input_cased.txt", 'r') as f, \
        open("fastText/data/input.txt", 'w') as output:
    for i in tqdm(f):
        output.write(i.lower())

# Train with fastText

In [None]:
# Run the fastText command-line tool in skip-gram mode on the lowercased
# corpus (fastText/data/input.txt); the remaining flags set the training
# hyper-parameters, and `-output out` names the result.
!./fastText/data/fasttext skipgram -input fastText/data/input.txt -output out -lr 0.025 -dim 100 -ws 5 -epoch 5 -minCount 1 -neg 5 -loss ns -minn 3 -maxn 18 -thread 8 -t 0.0001 -lrUpdateRate 100

# Process Summary Training Data

In [4]:
from tqdm import tqdm
import random
import collections, re
import struct
import langid
import json
from tensorflow.core.example import example_pb2
from nltk.tokenize import sent_tokenize, word_tokenize

# Matches every character that is NOT kept in the summarisation data: anything
# other than the listed letters, digits, whitespace and . , : - & ? ! ;
ac_token = re.compile(r'[^a-zA-ZÄÜÖäáàâëéêèöòôüûŭúůíßæœÆç0-9\.\,\:\-\&\s\?\!\;]')
# Matches an optional "URL:" label plus the http(s):// scheme; the remainder of
# the address is not part of the match.
URL = re.compile(r'(URL)?:? ?https?://', re.IGNORECASE)
# Markers placed around each summary before it is written out.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

def filter_sentence(l):
    """Clean one story or summary line and return the cleaned string.

    Unlike a sentence-level cleaner, final punctuation (. ? ! ;) is kept
    because those characters are in the ``ac_token`` whitelist.
    """
    text = URL.sub('', l)

    for pattern, replacement in (
        (r"\"", ''),          # drop double quotes
        (r"/", ' or '),       # spell a slash out as " or "
        (r"\(.+?\)", ''),     # drop (parenthesised) spans
        (r"\[.+?\]", ''),     # drop [bracketed] spans
        (r"^\d+\)", ''),      # drop a leading enumerator such as "1)"
    ):
        text = re.sub(pattern, replacement, text)

    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'\'s', '', text)
    # Map any ASCII apostrophe to ’ and pad every ’ with spaces.
    text = re.sub(r'\'', '’', text).replace(r'’', ' ’ ')
    # NOTE(review): ’ is not in this ac_token whitelist, so the apostrophes
    # padded above are deleted by the next line. Confirm this is intended.
    text = ac_token.sub('', text)

    # Collapse the whitespace left behind by the deletions.
    return re.sub(r'\s+', ' ', text).strip()

def process_sample(sample):
    """Clean ``sample`` and return its German word tokens as one string.

    Every token is followed by a single space, so a non-empty result ends
    with a trailing space; an input with no tokens yields ``""``.
    """
    tokens = word_tokenize(filter_sentence(sample), language="german")
    return ''.join(token + ' ' for token in tokens)


def write_bin(story, summary, writer):
    """Serialise one story/summary pair as a length-prefixed tf.Example record.

    ``summary`` is wrapped in SENTENCE_START / SENTENCE_END first. ``writer``
    must accept bytes. Returns the non-empty, stripped pieces obtained by
    splitting ``story`` on single spaces.
    """
    story_bytes = story.encode()
    summary_bytes = ("%s %s %s" % (SENTENCE_START, summary, SENTENCE_END)).encode()

    tf_example = example_pb2.Example()
    tf_example.features.feature['story'].bytes_list.value.extend([story_bytes])
    tf_example.features.feature['summary'].bytes_list.value.extend([summary_bytes])
    payload = tf_example.SerializeToString()
    payload_len = len(payload)

    # Record layout: a 'q'-packed length followed by the serialised example.
    writer.write(struct.pack('q', payload_len))
    writer.write(struct.pack('%ds' % payload_len, payload))

    pieces = (piece.strip() for piece in story_bytes.decode().split(' '))
    return [piece for piece in pieces if piece != ""]


def write_json(story, summary, writer):
    """Write one story/summary pair as a single JSON line.

    ``summary`` is wrapped in SENTENCE_START / SENTENCE_END first. Non-ASCII
    characters are written unescaped. Returns the non-empty, stripped pieces
    obtained by splitting ``story`` on single spaces.
    """
    record = {
        'story': story,
        'summary': "%s %s %s" % (SENTENCE_START, summary, SENTENCE_END),
    }
    writer.write(json.dumps(record, ensure_ascii=False) + '\n')

    pieces = (piece.strip() for piece in story.split(' '))
    return [piece for piece in pieces if piece != ""]
 

def process_file(story_root, summary_root, target_name, output_type):
    """Convert parallel story/summary text files into one dataset file plus a vocabulary.

    Parameters
    ----------
    story_root : path of the file holding one story per line.
    summary_root : path of the file holding the matching summary per line.
    target_name : output prefix; writes ``<target_name>.bin`` or
        ``<target_name>.json`` and ``<target_name>_vocab.txt``.
    output_type : ``"bin"`` (via write_bin) or ``"json"`` (via write_json).

    Raises
    ------
    ValueError
        If ``output_type`` is not one of the supported values. Previously
        this case silently produced only an empty vocabulary file.
    """
    if output_type == "bin":
        write_sample, suffix, mode = write_bin, ".bin", 'wb'
    elif output_type == "json":
        write_sample, suffix, mode = write_json, ".json", 'w'
    else:
        raise ValueError("unsupported output_type: %r (expected 'bin' or 'json')" % (output_type,))

    vocab_counter = collections.Counter()
    # All three handles are managed together so none is leaked if
    # processing raises part-way through.
    with open(story_root) as story_f, \
            open(summary_root) as summary_f, \
            open(target_name + suffix, mode) as writer:
        # readlines() is kept so tqdm knows the total and can show progress.
        for sto in tqdm(story_f.readlines()):
            # Summaries are consumed in lock-step with stories; a summary
            # file that runs out early yields empty summaries.
            summ = summary_f.readline().strip()
            vocab_counter.update(
                write_sample(process_sample(sto.lower()), process_sample(summ.lower()), writer))

    # Vocabulary is counted over story tokens only, most frequent first.
    with open(target_name + "_vocab.txt", 'w') as writer:
        for word, count in vocab_counter.most_common():
            writer.write(word + ' ' + str(count) + '\n')

# Convert each split (train, test, val) to JSON lines plus a vocabulary file.
for src_path, tgt_path, split_name in (
    ("raw/summ/train.txt.src", "raw/summ/train.txt.tgt", "train"),
    ("raw/summ/test.txt.src", "raw/summ/test.txt.tgt", "test"),
    ("raw/summ/val.txt.src", "raw/summ/val.txt.tgt", "val"),
):
    process_file(src_path, tgt_path, split_name, output_type="json")


100%|██████████| 220887/220887 [20:25<00:00, 180.23it/s]
100%|██████████| 10701/10701 [01:05<00:00, 164.40it/s]
100%|██████████| 11394/11394 [01:07<00:00, 168.93it/s]
