In [None]:
# Enable IPython's autoreload extension so edits to imported modules
# (e.g. the local `utils` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Extract and Process Training Data

## Using Regular Expression and Tag Punctuations
Data Source:
- [Ming Text](https://github.com/JiangYanting/Pre-modern_Chinese_corpus_dataset), choose files from "小说话本" (novels), "史书" (history) and "兵书" (military)
- *Wanli Gazette* 130 items

All data are in *raw/*. Some of the files in **Ming Text** have no punctuation, so first split the files into punctuated and unpunctuated sets, then tag the unpunctuated ones.

In [None]:
from utils import split_data

# Folder holding the raw corpus files.
data_path = 'raw/'
# Root folder of the punctuation stage; later cells use its
# marked/, unmarked/ and mlmarked/ sub-folders.
punctuation_path = 'punc/'

# Presumably sorts the files under data_path into punctuated / unpunctuated
# sets below punctuation_path -- helper source is not shown here, confirm.
split_data.split_marked_unmarked_files(data_path, punctuation_path)

Tag punctuation with CRFPunctuator from [Jiayan](https://github.com/jiaeyan/Jiayan); the tagged files are saved to *punc/mlmarked/*.

In [None]:
from utils import helper

# Sub-folders of the punctuation stage (punctuation_path is defined in the
# previous cell).
marked_path = punctuation_path + 'marked/'      # files that already carry punctuation
unmarked_path = punctuation_path + 'unmarked/'  # files without punctuation
mlmarked_path = punctuation_path + 'mlmarked/'  # destination for machine-tagged files

# Tag the unpunctuated files and store the results under mlmarked_path.
helper.tag_text(unmarked_path, mlmarked_path)

Filter sentences, keeping only the characters ，：、 and Chinese characters in the range \u4e00-\u9fa5.

In [None]:
from utils import helper

# Both the machine-tagged and the originally punctuated files are written to
# the same output folder.
processed_path = 'processed/'
# NOTE(review): the meaning of the third argument is not visible from this
# notebook; it is False for the machine-tagged files and True for the files
# that were already punctuated -- confirm against utils.helper.process_file.
helper.process_file(mlmarked_path, processed_path, False)
helper.process_file(marked_path, processed_path, True)

# Train with cw2vec
Cut sentences into words and train using [cw2vec](https://github.com/bamtercelboo/cw2vec)

In [None]:
from utils import helper

# Build the cw2vec training input from the filtered files in processed_path;
# the result is written to cw2vec/input.txt, which the training command reads.
helper.separate_words(processed_path, "cw2vec/input.txt")

In [None]:
# Run the cw2vec binary: reads cw2vec/input.txt and cw2vec/feature.txt and
# writes its output under the prefix cw2vec_result/substoke_out
# (see the -input / -infeature / -output flags).
!./cw2vec/word2vec substoke -input cw2vec/input.txt -infeature cw2vec/feature.txt -output cw2vec_result/substoke_out -lr 0.025 -dim 100 -ws 5 -epoch 5 -minCount 1 -neg 5 -loss ns -minn 3 -maxn 18 -thread 8 -t 1e-4 -lrUpdateRate 100

# Process Testing Wanli Data

Wanli data are in *wanli_story_path* and *wanli_summary_path*. The output is written to *ancientChinese.bin* or *ancientChinese.json* (depending on `output_type`) and *ancientChinese_vocab.txt*.

Set *convert* to `True` to convert the text to Traditional Chinese.

In [2]:
from tqdm import tqdm
import collections, re
import struct
import re
import html
import json
import urllib
import w3lib.html
import opencc
from tensorflow.core.example import example_pb2


CHINESE_REGEX = re.compile('[\u4e00-\u9fa5]')
# Matches every character that filter_sentence must drop, i.e. anything
# other than ，：、。？！； and the characters U+4E00..U+9FA5.
ac_token = re.compile('[^，：、\u4e00-\u9fa5。？！；]')
# Markers wrapped around each summary by the writer functions.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'


def filter_sentence(text: str):
    """Normalise punctuation in ``text`` and strip every disallowed character.

    Steps, in order:
      1. remove all whitespace;
      2. turn the half-width marks ``, : ? !`` into their full-width forms;
      3. delete every character matched by ``ac_token``;
      4. resolve adjacent mixed runs of ，、： to a single mark and collapse
         repeated ，、： to one occurrence.

    Returns:
        The cleaned string, or ``""`` when exactly one character is left.
    """
    half_to_full = ((",", "，"), (":", "："), ("?", "？"), ("!", "！"))
    # The rules are order-sensitive: mixed pairs are resolved first, plain
    # repetitions are collapsed last.
    collapse_rules = (
        ("：+，", "，"),
        ("、+，", "，"),
        ("、+：", "："),
        ("，+：", "："),
        ("：+、", "："),
        ("，+、", "，"),
        (r"，+", '，'),
        (r"、+", '、'),
        (r"：+", '：'),
    )

    cleaned = "".join(text.split())
    for half, full in half_to_full:
        cleaned = cleaned.replace(half, full)

    cleaned = ac_token.sub('', cleaned)
    for pattern, replacement in collapse_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # A lone surviving character is treated as noise.
    return "" if len(cleaned) == 1 else cleaned

def process_sample(sample):
    """Clean ``sample`` with ``filter_sentence`` and space-separate its characters.

    Every kept character is followed by a single space, so a non-empty result
    always ends with a trailing space; an empty cleaned string yields ``""``.
    """
    return "".join(char + ' ' for char in filter_sentence(sample))


def write_bin(story, summary, writer):
    """Append one story/summary pair to ``writer`` as a serialized tf Example.

    The summary is wrapped as ``"<SENTENCE_START> summary <SENTENCE_END>"``.
    Each record is written as its byte length (struct format ``'q'``)
    followed by the serialized ``example_pb2.Example`` bytes.

    Args:
        story: space-separated story text.
        summary: space-separated summary text.
        writer: binary file-like object to write the record to.

    Returns:
        The non-empty, stripped tokens obtained by splitting ``story`` on
        single spaces.
    """
    story_bytes = story.encode()
    summary_bytes = ("%s %s %s" % (SENTENCE_START, summary, SENTENCE_END)).encode()

    tf_example = example_pb2.Example()
    features = tf_example.features.feature
    features['story'].bytes_list.value.extend([story_bytes])
    features['summary'].bytes_list.value.extend([summary_bytes])
    serialized = tf_example.SerializeToString()

    # Length prefix first, then the payload itself.
    size = len(serialized)
    writer.write(struct.pack('q', size))
    writer.write(struct.pack('%ds' % size, serialized))

    pieces = (piece.strip() for piece in story_bytes.decode().split(' '))
    return [token for token in pieces if token != ""]


def write_json(story, summary, writer):
    """Append one story/summary pair to ``writer`` as a single JSON line.

    The summary is wrapped as ``"<SENTENCE_START> summary <SENTENCE_END>"``
    and the record is dumped with ``ensure_ascii=False`` so non-ASCII text is
    written verbatim.

    Args:
        story: space-separated story text.
        summary: space-separated summary text.
        writer: text file-like object to write the line to.

    Returns:
        The non-empty, stripped tokens obtained by splitting ``story`` on
        single spaces.
    """
    record = {
        'story': story,
        'summary': "%s %s %s" % (SENTENCE_START, summary, SENTENCE_END),
    }
    writer.write(json.dumps(record, ensure_ascii=False) + '\n')

    pieces = (piece.strip() for piece in story.split(' '))
    return [token for token in pieces if token != ""]


def process_file(story_root, summary_root, convert, target_name, output_type):
    """Convert paired story/summary files into a dataset plus a vocabulary file.

    Line *i* of ``story_root`` is paired with line *i* of ``summary_root``.
    Each pair is cleaned with ``process_sample`` and written to
    ``target_name + ".bin"`` (via ``write_bin``) or ``target_name + ".json"``
    (via ``write_json``). The story tokens returned by the writer are counted
    and dumped to ``target_name + "_vocab.txt"`` as ``"<token> <count>"``
    lines, most frequent first.

    Args:
        story_root: path of the story file, one story per line.
        summary_root: path of the summary file, one summary per line. If it
            has fewer lines than the story file, the missing summaries are
            treated as empty strings.
        convert: when truthy, both texts are passed through the OpenCC
            ``'s2t.json'`` conversion before cleaning.
        target_name: path prefix of the output files.
        output_type: ``"bin"`` or ``"json"``.

    Raises:
        ValueError: if ``output_type`` is neither ``"bin"`` nor ``"json"``
            (previously this silently produced only an empty vocabulary file).
    """
    # Pick suffix, open mode and record writer once instead of duplicating
    # the whole loop per output type.
    if output_type == "bin":
        suffix, mode, write_record = ".bin", "wb", write_bin
    elif output_type == "json":
        suffix, mode, write_record = ".json", "w", write_json
    else:
        raise ValueError(
            "output_type must be 'bin' or 'json', got %r" % (output_type,))

    # Only build the converter when it is actually needed.
    to_traditional = opencc.OpenCC('s2t.json').convert if convert else None

    vocab_counter = collections.Counter()
    # NOTE(review): files are opened with the platform default encoding, as
    # before; consider passing encoding='utf-8' explicitly if the inputs are
    # known to be UTF-8.
    # `with` guarantees all three handles are closed even if a record fails.
    with open(story_root) as story_f, \
            open(summary_root) as summary_f, \
            open(target_name + suffix, mode) as writer:
        # readlines() lets tqdm know the total number of records up front.
        for sto in tqdm(story_f.readlines()):
            summ = summary_f.readline().strip()
            if to_traditional is not None:
                sto = to_traditional(sto)
                summ = to_traditional(summ)
            vocab_counter.update(
                write_record(process_sample(sto), process_sample(summ), writer))

    with open(target_name + "_vocab.txt", 'w') as writer:
        for word, count in vocab_counter.most_common():
            writer.write(word + ' ' + str(count) + '\n')

# The two files are read in parallel: line i of the summary file is the
# summary of line i of the story file.
wanli_story_path = "wanli/story.txt"
wanli_summary_path = "wanli/summary.txt"
# True -> run both texts through OpenCC's 's2t.json' conversion first.
convert = True
# With output_type="json" this writes ancientChinese.json and
# ancientChinese_vocab.txt.
process_file(wanli_story_path, wanli_summary_path, convert, "ancientChinese", output_type="json")

100%|██████████| 101/101 [00:00<00:00, 2550.50it/s]
