In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Process Training Data

All raw corpus files are in *raw/*; the processed files are saved in *tokenized/*.

## GerManC

In [None]:
from utils.helper import get_all_files
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
import re, langid

# Negated character class: matches any character that is NOT one of the listed
# letters, a digit, whitespace, or one of . ’ , : - &
# NOTE(review): nothing in this cell references ac_token (filter_GC does not
# apply it, unlike the filter_* functions of the other corpora) — confirm intent.
ac_token = re.compile(r'[^a-zA-ZäáàâëéèöòüûßæœÆ0-9\.’\,\:\-\&\s]')


def swap_GC(line):
    """Normalize GerManC-specific notation in a chunk of raw text.

    Slashes become commas, bracketed ``[...]`` and parenthesized ``(...)``
    spans (within one line) are removed, and the abbreviation ``& c`` / ``&c``
    is spelled out as ``etc``.

    Args:
        line: Raw text (may contain several lines).

    Returns:
        The text with the substitutions applied.
    """
    text = line.replace('/', ',')
    # Order matters: '& c' is first collapsed to '&c', which is then expanded.
    for pattern, replacement in (
        (r'\[.*?\]|\(.*?\)', ''),
        (r'& c', '&c'),
        (r'&c', 'etc'),
    ):
        text = re.sub(pattern, replacement, text)
    return text


def filter_GC(line):
    """Clean one GerManC sentence fragment.

    Removes quote/symbol characters, glues ``- `` back to ``-``, drops ``'s``,
    turns remaining straight apostrophes into ``’``, and trims trailing
    sentence punctuation as well as a leading/trailing comma or leading hyphen.

    Args:
        line: One sentence fragment.

    Returns:
        The cleaned fragment, or ``""`` when exactly one word is left.
    """
    cleaned = line.strip()

    # Character-level clean-up; the substitutions are applied in this order.
    for pattern, replacement in (
        ('[\"§*„ºç’]', ''),
        (r'- ', '-'),
        (r'\'s', ''),
        (r'\'', '’'),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    # Trim punctuation at the edges of the fragment.
    cleaned = re.sub(r'\.+$|;$|\?$|\!$', '', cleaned.strip())
    cleaned = re.sub(r'^,|,$|^-', '', cleaned).strip()

    # A single remaining word is not treated as a sentence.
    word_count = len(cleaned.split())
    return "" if word_count == 1 else cleaned


# Tokenize the GerManC corpus: each file below raw/GerManC/ is split into
# sentences, split further on ';' and '?', cleaned with filter_GC, kept only
# if langid classifies it as German, and written as one space-separated
# token sequence per line.
root_path = "raw/GerManC/"
# Explicit UTF-8: the corpus contains non-ASCII letters, and the platform's
# default text encoding is not guaranteed to be able to represent them.
with open("tokenized/GerManC.txt", 'w', encoding="utf-8") as output:
    for file_name in tqdm(get_all_files(root_path), desc="process files"):
        with open(root_path + file_name, 'r', encoding="utf-8-sig") as f:
            # Each line keeps its own newline and gets a trailing space; built
            # in one pass instead of repeated string concatenation.
            text = ''.join(raw_line + ' ' for raw_line in f)

        for sent in sent_tokenize(swap_GC(text), language='german'):
            for clause in sent.split(';'):
                for part in clause.split('?'):
                    sentence = filter_GC(part).strip()
                    # Skip fragments that langid does not classify as German.
                    if langid.classify(sentence)[0] != 'de':
                        continue
                    res = word_tokenize(sentence, language='german')
                    if not res:
                        continue
                    # Every token is followed by a space, then a newline.
                    output.write(' '.join(res) + ' \n')

## DTA

In [None]:
from utils.helper import get_all_files
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
import langid
import re

# Negated character class: matches any character that is NOT an accepted
# letter, digit, whitespace, or one of . ’ , : - &
# 'Ö' is listed next to 'Ä' and 'Ü' so that capital O-umlaut is no longer
# stripped from words (it was missing from the accepted set).
ac_token = re.compile(r'[^a-zA-ZÄÖÜäáàâëéêèöòôüûŭúůßæœÆç0-9\.’\,\:\-\&\s]')

def swap_DTA(t):
    """Normalize DTA-specific notation in a whole document string.

    Bracketed/parenthesized spans and several marker patterns are replaced by
    newlines, every "- " is removed, and a list of historic glyphs and
    abbreviations is mapped to modern spellings. The replacements run in the
    written order; several of them depend on that order (e.g. '& c' -> '&c'
    -> 'etc', and 'ꝛ c' -> 'ꝛc' -> 'etc' before the bare 'ꝛ' -> 'r').

    Args:
        t: Document text.

    Returns:
        The text with all substitutions applied.
    """
    s = t
    # Replace "[...]" and "(...)" spans (non-greedy, within one line) by a newline.
    s = re.sub(r"\[.*?\]|\(.*?\)",'\n',s)
    # delete redundant
    # (presumably leftover footnote / enumeration markers such as "12]", "3.)",
    # "a)" or "†)" — confirm against the corpus)
    s = re.sub(r"\)[\s\:]*?\(\s+?\d+|\,?\d+\]|\d+\.?\)|[a-z]\)|†\)\:?",'\n',s)
    s = re.sub(r"\*\)",'\n',s)
    # Remove every hyphen followed by a space (presumably undoes hyphenation
    # at line breaks, since the caller joins lines with spaces).
    s = s.replace('- ','')

    # Historic umlaut spellings (vowel + combining mark) -> modern umlauts.
    s = s.replace('uͤ','ü')
    s = s.replace('oͤ','ö')
    s = s.replace('aͤ','ä')
    # Long s -> round s.
    s = s.replace('ſ','s')
    # Letters carrying a tilde are expanded.
    s = s.replace(r'm̃','mm')
    s = s.replace(r'ñ','nn')
    # NOTE(review): the next two search strings look identical when rendered;
    # presumably they differ in Unicode normalization form (precomposed vs.
    # combining). If they are in fact the same string, the second line never
    # matches — confirm.
    s = s.replace(r'ẽ','ee')
    s = s.replace(r'ẽ','e')
    s = s.replace(r'ı','i')
    # Spell out the "etc" abbreviations.
    s = s.replace(r'& c','&c')
    s = s.replace(r'&c','etc')
    s = s.replace(r'ꝛ c','ꝛc')
    s = s.replace(r'ꝛc','etc')
    s = s.replace(r'ꝛ','r')
    # Slashes become commas.
    s = s.replace('/',',')
    return s

def filter_DTA(line):
    """Clean one DTA sentence fragment.

    Removes symbol characters and the headings "Vorrede." / "Inhalt.", turns
    typographic quotes into spaces, trims edge punctuation, collapses
    whitespace, pads ``’`` with spaces, and finally deletes every character
    matched by the module-level ``ac_token`` pattern.

    Args:
        line: One sentence fragment.

    Returns:
        The cleaned fragment, or ``""`` when exactly one word is left.
    """
    s = line

    # Raw string literals: the regex escapes (\(, \), \.) are otherwise invalid
    # Python string escapes and trigger a SyntaxWarning on recent Python
    # versions. The compiled patterns are unchanged.
    s = re.sub(r'["§_—\(\)‒*”]', '', s)
    s = re.sub(r'Vorrede\.|Inhalt\.', '', s)

    s = re.sub(r'“', ' ', s)
    s = re.sub(r'„', ' ', s)

    s = s.strip()
    # Trim trailing sentence punctuation, then a leading/trailing comma or a
    # leading hyphen.
    s = re.sub(r'\.+$|;$|\?$|\!$', '', s)
    s = re.sub(r'^,|,$|^-', '', s)
    s = re.sub(r'\s+', ' ', s)
    s = s.replace(r'’', ' ’ ')

    # Drop every character outside the accepted alphabet.
    s = ac_token.sub('', s)
    s = s.strip()

    # A single remaining word is not treated as a sentence.
    if len(s.split()) == 1:
        return ""

    return s
    
# Tokenize the DTA corpus: raw/dta/ contains one folder per text type; every
# file is normalized with swap_DTA, split into sentences and clauses, cleaned
# with filter_DTA, kept only if langid classifies it as German, and written
# as one space-separated token sequence per line.
root_path = "raw/dta/"

# The output is opened in a `with` block so the handle is closed even when
# processing raises.
# NOTE(review): "utf-8-sig" writes a byte-order mark at the start of the file,
# whereas the other corpus outputs are written without one — confirm that
# merge_files / fastText handle this as intended.
with open("tokenized/DTA.txt", 'w', encoding="utf-8-sig") as output:
    for folder in tqdm(get_all_files(root_path), desc="process by type"):
        for file in get_all_files(root_path + folder):
            with open(root_path + folder + "/" + file, 'r', encoding="utf-8-sig") as f:
                # Join the stripped lines with single spaces.
                text = ' '.join(raw_line.strip() for raw_line in f)

            for sent in sent_tokenize(swap_DTA(text.strip()), language="german"):
                for clause in sent.split(';'):
                    for part in clause.split('?'):
                        # filter_DTA output is sentence-split once more.
                        for k in sent_tokenize(filter_DTA(part.strip()), language="german"):
                            if langid.classify(k.strip())[0] != 'de':
                                continue
                            res = word_tokenize(k, language="german")
                            if not res:
                                continue
                            # Every token is followed by a space, then a newline.
                            output.write(' '.join(res) + ' \n')

## Mannheimer

In [None]:
from utils.helper import get_all_files
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
import langid
import re

# Negated character class: matches any character that is NOT an accepted
# letter, digit, whitespace, or one of . ’ , : - &
# 'Ö' is listed next to 'Ä' and 'Ü' so that capital O-umlaut is no longer
# stripped from words (it was missing from the accepted set).
ac_token = re.compile(r'[^a-zA-ZÄÖÜäáàâëéêèöòôüûŭúůßæœÆç0-9\.’\,\:\-\&\s]')

def swap_Mannheimer(t):
    """Map Mannheimer-corpus abbreviations and historic glyphs to plain letters.

    ``& c`` / ``&c`` becomes ``etc``, long s becomes ``s``, and letters
    carrying an overline/macron are expanded or reduced to the base letter.

    Args:
        t: Text fragment.

    Returns:
        The fragment with all substitutions applied in order.
    """
    substitutions = (
        (r'& c', '&c'),
        (r'&c', 'etc'),
        (r'ſ', 's'),
        (r'm̅', 'mm'),
        (r'n̄', 'nn'),
        (r'ā', 'a'),
        (r'ē', 'e'),
        (r'r̄', 'r'),
    )
    normalized = t
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized
    
def filter_Mannheimer(line):
    """Clean one Mannheimer sentence fragment.

    Removes parenthesized spans, stray parentheses, "<digits> )" markers and
    "- ", deletes a set of symbol characters, trims trailing punctuation,
    collapses whitespace, normalizes apostrophes to a space-padded ``’``,
    turns remaining ``!``, ``?`` and ``;`` into newlines, and finally deletes
    every character matched by the module-level ``ac_token`` pattern.

    Args:
        line: One sentence fragment.

    Returns:
        The cleaned fragment, or ``""`` when exactly one word is left.
    """
    s = line

    # Raw string literals: the regex escapes (\(, \d, \s, \-, \*, \{, \?) are
    # otherwise invalid Python string escapes and trigger a SyntaxWarning on
    # recent Python versions. The compiled patterns are unchanged.
    s = re.sub(r"\(.+?\)|\(|\)|\d+\s+?\)|\- ", '', s)
    s = re.sub(r"[|/„“\*§\{\}]", '', s)
    s = s.strip()
    s = re.sub(r'\.+$|;$|\?$|\!$', '', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'\'', '’', s)
    s = s.replace(r'’', ' ’ ')
    s = re.sub(r'!|\?|;', '\n', s)
    # Drop every character outside the accepted alphabet.
    s = ac_token.sub('', s)
    s = s.strip()

    # A single remaining word is not treated as a sentence.
    if len(s.split()) == 1:
        return ""

    return s


# Tokenize the Mannheimer corpus: the <p>...</p> elements of every file (minus
# the first ten) are stripped of markup, split into sentences and clauses,
# normalized, kept only if langid classifies them as German, cleaned with
# filter_Mannheimer, and written as one space-separated token sequence per line.
root_path = "raw/Mannheimer/"
# Explicit UTF-8: the corpus contains non-ASCII letters, and the platform's
# default text encoding is not guaranteed to be able to represent them.
with open("tokenized/Mannheimer.txt", 'w', encoding="utf-8") as output:
    for file_name in tqdm(get_all_files(root_path), desc="process files"):
        with open(root_path + file_name, 'r', encoding="utf-8-sig") as f:
            # Every stripped line is followed by a single space.
            text = ''.join(raw_line.strip() + ' ' for raw_line in f)

        # The first 10 <p> elements are document info and are skipped.
        paragraphs = re.findall(r"<p.+?</p>", text)[10:]
        # '=' becomes '-', every tag becomes a space, one paragraph per line.
        filtered = ''.join(
            re.sub(r"<.+?>", ' ', paragraph.replace('=', '-')) + '\n'
            for paragraph in paragraphs
        )

        for sentence in sent_tokenize(filtered.strip(), language="german"):
            for clause in sentence.split(';'):
                for part in clause.split('?'):
                    s = swap_Mannheimer(part.strip())
                    # The language check runs on the normalized, not yet
                    # filtered, fragment.
                    if langid.classify(s)[0] != 'de':
                        continue
                    s = filter_Mannheimer(s)
                    res = word_tokenize(s, language="german")
                    if not res:
                        continue
                    # Every token is followed by a space, then a newline.
                    output.write(' '.join(res) + ' \n')

## MercuriusTreebank

In [None]:
from utils.helper import get_all_files
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
import langid
import re

# Negated character class: matches any character that is NOT an accepted
# letter, digit, whitespace, or one of . ’ , : - &
# 'Ö' is listed next to 'Ä' and 'Ü' so that capital O-umlaut is no longer
# stripped from words (it was missing from the accepted set).
ac_token = re.compile(r'[^a-zA-ZÄÖÜäáàâëéêèöòôüûŭúůßæœÆç0-9\.’\,\:\-\&\s]')

def swap_Mercurius(word):
    """Decode the character references found in Mercurius treebank tokens.

    The listed hexadecimal references are replaced by their characters;
    ``&amp;`` and ``&apos;`` are removed altogether. Replacements run in the
    listed order, with ``&amp;`` / ``&apos;`` last.

    Args:
        word: Token text as found in the ``word="..."`` attribute.

    Returns:
        The decoded token.
    """
    entity_table = (
        ("&#x00e4;", 'ä'),
        ("&#x00df;", 'ß'),
        ("&#x00fc;", 'ü'),
        ("&#x00f6;", 'ö'),
        ("&#x00e6;", 'æ'),
        ("&#x00e1;", 'á'),
        ("&#x00e0;", 'à'),
        ("&#x00e8;", 'è'),
        ("&#x00e9;", 'é'),
        ("&#x00c6;", 'Æ'),
        ("&#x00e2;", 'â'),
        ("&#x00dc;", 'Ü'),
        ("&#x00c4;", 'Ä'),
        ("&amp;", ''),
        ("&apos;", ''),
    )
    decoded = word
    for entity, character in entity_table:
        decoded = decoded.replace(entity, character)
    return decoded

def filter_Mercurius(line):
    """Clean one Mercurius sentence.

    Spells out ``& c`` / ``&c`` as ``etc``, doubles a word character that is
    followed by ``~``, removes parenthesized spans and stray parentheses,
    turns ``!``, ``;`` and ``?`` into newlines, removes ``@``, trims trailing
    punctuation, collapses whitespace, normalizes apostrophes to a
    space-padded ``’``, and finally deletes every character matched by the
    module-level ``ac_token`` pattern.

    Args:
        line: One sentence (tokens joined by spaces).

    Returns:
        The cleaned sentence, or ``""`` when exactly one word is left.
    """
    s = line.strip()
    s = re.sub(r'& c', '&c', s)
    s = re.sub(r'&c', 'etc', s)
    # Raw string literals: \g, \( and \? are otherwise invalid Python string
    # escapes and trigger a SyntaxWarning on recent Python versions. The
    # patterns and the replacement template are unchanged.
    s = re.sub(r'(\w)~', r'\g<1>\g<1>', s)
    s = re.sub(r'\(.+?\)|\(|\)', '', s)
    s = re.sub(r'!|;|\?', '\n', s)
    s = re.sub('@', '', s)

    s = s.strip()
    s = re.sub(r'\.+$|;$|\?$|\!$', '', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'\'', '’', s)
    s = s.replace(r'’', ' ’ ')
    # Drop every character outside the accepted alphabet.
    s = ac_token.sub('', s)

    s = s.strip()
    # A single remaining word is not treated as a sentence.
    if len(s.split()) == 1:
        return ""

    return s


# Tokenize the Mercurius treebank: for every <s>...</s> element the word="..."
# attribute of each <t .../> token is extracted and decoded, the words are
# joined into a sentence, cleaned with filter_Mercurius, kept only if langid
# classifies it as German, and written as one space-separated token sequence
# per line.
root_path = "raw/MercuriusTreebank/"
# Explicit UTF-8: the corpus contains non-ASCII letters, and the platform's
# default text encoding is not guaranteed to be able to represent them.
with open("tokenized/MercuriusTreebank.txt", 'w', encoding="utf-8") as output:
    for file_name in tqdm(get_all_files(root_path), desc="process files"):
        with open(root_path + file_name, 'r', encoding="utf-8-sig") as f:
            # Stripped lines are concatenated without a separator.
            text = ''.join(raw_line.strip() for raw_line in f)

        for sentence_xml in re.findall(r"<s.+?</s>", text):
            words = []
            for token_xml in re.findall(r"<t.+?/>", sentence_xml):
                # Cut away everything before and after the word attribute value.
                word = re.sub(r"<t.+?word=\"", '', token_xml)
                word = re.sub(r"\".+?/>", '', word)
                word = swap_Mercurius(word)
                word = word.replace("/", ',')
                words.append(word)

            sentence = filter_Mercurius(' '.join(words).strip())
            if langid.classify(sentence)[0] != 'de':
                continue
            res = word_tokenize(sentence, language="german")
            if not res:
                continue
            # Every token is followed by a space, then a newline.
            output.write(' '.join(res) + ' \n')

# Train with fastText

In [None]:
from utils.helper import merge_files
# Presumably combines the per-corpus files in tokenized/ into the single
# fastText training input.
# NOTE(review): merge_files lives in utils.helper and is not shown here —
# confirm how it concatenates and which encoding it expects.
merge_files("tokenized/","fastText/data/input.txt")

In [None]:
# Train fastText skip-gram embeddings on fastText/data/input.txt with the
# options given on the command line (100 dimensions, window 5, 5 epochs,
# minCount 1, negative sampling, character n-grams of length 3 to 18, 8 threads).
# The model files are written with the prefix out/nonorm.
!./fastText/data/fasttext skipgram -input fastText/data/input.txt -output out/nonorm -lr 0.025 -dim 100 -ws 5 -epoch 5 -minCount 1 -neg 5 -loss ns -minn 3 -maxn 18 -thread 8 -t 0.0001 -lrUpdateRate 100

# Process Summary Testing Data

The normalized test stories and summaries are in *raw/norm/*.

In [10]:
from tqdm import tqdm
import random
import collections, re
import struct
import langid
import json
from tensorflow.core.example import example_pb2
from nltk.tokenize import sent_tokenize, word_tokenize

# Negated character class used by filter_sentence: matches any character that
# is NOT one of the listed letters, a digit, whitespace, or one of . , : - & ? ! ;
# Note that ’ is not in the accepted set, so the ’ that filter_sentence inserts
# for apostrophes is removed again by this pattern.
ac_token = re.compile(r'[^a-zA-ZÄÜÖäáàâëéêèöòôüûŭúůíßæœÆç0-9\.\,\:\-\&\s\?\!\;]')
# Markers that write_bin / write_json put around every summary.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

def filter_sentence(l):
    """Clean one story or summary line of the summarization test data.

    Removes double quotes, turns slashes into commas, drops parenthesized and
    bracketed spans and a leading "<digits>)" marker, collapses whitespace,
    removes ``'s``, converts remaining apostrophes, deletes every character
    matched by the module-level ``ac_token`` pattern, and collapses
    whitespace again.

    Args:
        l: One line of text.

    Returns:
        The cleaned line, stripped of surrounding whitespace.
    """
    text = l
    # Structural clean-up on the untrimmed line.
    for pattern, replacement in (
        (r"\"", ''),
        (r"/", ','),
        (r"\(.+?\)", ''),
        (r"\[.+?\]", ''),
        (r"^\d+\)", ''),
    ):
        text = re.sub(pattern, replacement, text)

    text = text.strip()
    # Whitespace and apostrophe handling.
    for pattern, replacement in (
        (r'\s+', ' '),
        (r'\'s', ''),
        (r'\'', '’'),
    ):
        text = re.sub(pattern, replacement, text)
    text = text.replace(r'’', ' ’ ')

    # Drop characters outside the accepted alphabet, then tidy whitespace.
    text = ac_token.sub('', text)
    return re.sub(r'\s+', ' ', text).strip()

def process_sample(sample):
    """Clean and tokenize one sample.

    Args:
        sample: One line of text.

    Returns:
        The German word tokens of the cleaned line, each followed by a single
        space (so a non-empty result ends with a space).
    """
    tokens = word_tokenize(filter_sentence(sample), language="german")
    return ''.join(token + ' ' for token in tokens)


def write_bin(story, summary, writer):
    """Append one story/summary pair to a binary file as a tf.Example record.

    The summary is wrapped in SENTENCE_START / SENTENCE_END. Each record is
    written as a packed length ('q') followed by the serialized example bytes.

    Args:
        story: Tokenized story text.
        summary: Tokenized summary text.
        writer: Binary file object opened for writing.

    Returns:
        The non-empty, stripped tokens obtained by splitting ``story`` on
        single spaces.
    """
    story_bytes = story.encode()

    # Fix: the wrapped summary itself is encoded; the previous code encoded an
    # undefined name (`abstract`) and raised NameError on every call.
    summary_bytes = ("%s %s %s" % (SENTENCE_START, summary, SENTENCE_END)).encode()

    tf_example = example_pb2.Example()
    tf_example.features.feature['story'].bytes_list.value.extend([story_bytes])
    tf_example.features.feature['summary'].bytes_list.value.extend([summary_bytes])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    # Length prefix first, then the payload of exactly that many bytes.
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))

    tokens = [t.strip() for t in story.split(' ')]  # strip
    return [t for t in tokens if t != ""]  # remove empty


def write_json(story, summary, writer):
    """Append one story/summary pair to a text file as a JSON line.

    The summary is wrapped in SENTENCE_START / SENTENCE_END; non-ASCII
    characters are written unescaped.

    Args:
        story: Tokenized story text.
        summary: Tokenized summary text.
        writer: Text file object opened for writing.

    Returns:
        The non-empty, stripped tokens obtained by splitting ``story`` on
        single spaces.
    """
    wrapped_summary = "%s %s %s" % (SENTENCE_START, summary, SENTENCE_END)
    record = {
        'story': story,
        'summary': wrapped_summary
    }
    writer.write(json.dumps(record, ensure_ascii=False) + '\n')

    stripped = (piece.strip() for piece in story.split(' '))
    return [piece for piece in stripped if piece != ""]
 
    
def process_file(story_root, summary_root, target_name, output_type):
    """Convert parallel story/summary files into a dataset file plus vocabulary.

    Line i of the story file is paired with line i of the summary file; both
    are lower-cased and passed through process_sample before being written.

    Args:
        story_root: Path of the story text file (one story per line).
        summary_root: Path of the summary text file (one summary per line).
        target_name: Output path prefix; ".bin" / ".json" and "_vocab.txt"
            are appended to it.
        output_type: "bin" (records written by write_bin) or "json" (lines
            written by write_json). For any other value no dataset file is
            written and the vocabulary file is empty, as before.

    Returns:
        None. Writes ``target_name`` + ".bin"/".json" and
        ``target_name`` + "_vocab.txt".
    """
    # Pick suffix, file mode and writer function once instead of duplicating
    # the loop for every output type.
    if output_type == "bin":
        suffix, mode, write_sample = ".bin", "wb", write_bin
    elif output_type == "json":
        suffix, mode, write_sample = ".json", "w", write_json
    else:
        write_sample = None

    vocab_counter = collections.Counter()
    # Inputs are opened in a `with` block so they are closed even on errors.
    # "utf-8-sig" matches the other loaders in this notebook and reads plain
    # UTF-8 with or without a byte-order mark.
    with open(story_root, encoding="utf-8-sig") as story_f, \
            open(summary_root, encoding="utf-8-sig") as summary_f:
        if write_sample is not None:
            # Binary mode must not receive an encoding; the JSON lines contain
            # unescaped non-ASCII text and are written as UTF-8.
            encoding = None if mode == "wb" else "utf-8"
            with open(target_name + suffix, mode, encoding=encoding) as writer:
                for sto in tqdm(story_f.readlines()):
                    # An exhausted summary file yields "" for remaining stories.
                    summ = summary_f.readline().strip()
                    vocab_counter.update(
                        write_sample(process_sample(sto.lower()),
                                     process_sample(summ.lower()),
                                     writer))

    # Vocabulary: one "<word> <count>" line per token, most frequent first.
    with open(target_name + "_vocab.txt", 'w', encoding="utf-8") as writer:
        for word, count in vocab_counter.most_common():
            writer.write(word + ' ' + str(count) + '\n')

# Build normed_ancient_de_summ.json and normed_ancient_de_summ_vocab.txt from
# the story/summary files in raw/norm/.
process_file("raw/norm/story.txt", "raw/norm/summary.txt", "normed_ancient_de_summ", output_type="json")

100%|██████████| 100/100 [00:00<00:00, 369.26it/s]
