In [None]:
# Re-import edited project modules (e.g. utils.helper) automatically before each
# cell executes, so changes to the helper code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Extract and Process Training Data

Data source:
- [news2016zh](https://github.com/brightmart/nlp_chinese_corpus#2%E6%96%B0%E9%97%BB%E8%AF%AD%E6%96%99json%E7%89%88news2016zh)
- [NLPCC 2017 task3](http://tcci.ccf.org.cn/conference/2017/taskdata.php)
- [中文短文本摘要数据集](https://www.jianshu.com/p/8f52352f0748?tdsourcetag=s_pcqq_aiomsg)
- LCSTS

All raw files are in *raw/*; processed files are saved in *processed/*.

## News2016zh

In [None]:
from utils import helper

# Process the validation split first, then the training split, of news2016zh.
# NOTE(review): the arguments look like (source file, output folder, output
# file name) — confirm against utils.helper.process_all_zh_files.
for split in ("valid", "train"):
    helper.process_all_zh_files(
        "raw/new2016zh/news2016zh_%s.txt" % split,
        "processed/",
        "%s.txt" % split,
    )

## NLPCC 2017 text summarization

In [None]:
from utils import helper

# Process the NLPCC 2017 training file that has summaries, then the one without.
# NOTE(review): the arguments look like (source file, output folder, output
# file name) — confirm against utils.helper.process_all_nlpcc_files.
for variant in ("with", "without"):
    helper.process_all_nlpcc_files(
        "raw/nlpcc2017textsummarization/train_%s_summ.txt" % variant,
        "processed/",
        "%s.txt" % variant,
    )

## 新闻标题数据集

In [None]:
from utils import helper 
# Process the news-headline training text.
# NOTE(review): the arguments look like (source file, output folder, output
# file name), i.e. the result lands in processed/news_train.txt — confirm
# against utils.helper.process_all_news_files.
helper.process_all_news_files("raw/新闻标题数据集/train_text.txt", "processed/", "news_train.txt")

## LCSTS

In [None]:
from utils import helper 

# Prerequisite: all LCSTS stories must first be collected into the single file
# raw/LCSTS/lcsts_story.txt. The two `get_lcsts` cells further down in this
# notebook only write per-part files (raw/LCSTS/PART_*story.txt), so they have
# to be run before this cell.
# NOTE(review): nothing visible in this notebook creates lcsts_story.txt itself —
# presumably the per-part story files are concatenated by hand; confirm.
helper.process_all_news_files("raw/LCSTS/lcsts_story.txt", "processed/", "lcsts.txt")

## Merge Files

Merge all processed training files into one and save it to *cw2vec/input.txt*. Set *convert* to True to convert the text to Traditional Chinese. The text is segmented with *Jieba*.

In [None]:
from utils import helper

# `convert` was never defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel. Set it to True to convert the merged text to
# Traditional Chinese.
convert = False

# Arguments: source folder, target file, Traditional-Chinese conversion flag.
helper.merge_files("processed/", "cw2vec/input.txt", convert)

# Training with cw2vec

In [None]:
# Run the cw2vec `word2vec` binary in its "substoke" mode. It reads the merged
# corpus (cw2vec/input.txt) and the feature file (cw2vec/feature.txt) and writes
# its result under the prefix cw2vec_result/substoke_out_train, using
# 100 dimensions, 5 epochs and 8 threads.
# NOTE(review): the cw2vec_result/ directory presumably has to exist before
# this runs — confirm.
!./cw2vec/word2vec substoke -input cw2vec/input.txt -infeature cw2vec/feature.txt -output cw2vec_result/substoke_out_train -lr 0.025 -dim 100 -ws 5 -epoch 5 -minCount 1 -neg 5 -loss ns -minn 3 -maxn 18 -thread 8 -t 1e-4 -lrUpdateRate 100

# Process Summary Training Data

All raw files are in *raw/LCSTS/*; processed files are saved in *summ_processed/*. Set *convert* to True to convert the text to Traditional Chinese.

In [None]:
from tqdm import tqdm
import xmltodict, re

# Split LCSTS into line-aligned story and summary files
def get_lcsts(files=("raw/LCSTS/PART_I.txt", "raw/LCSTS/PART_II.txt")):
    """Split raw LCSTS dumps into parallel story and summary text files.

    Each input file is read as a sequence of ``<doc id=N> ... </doc>`` records
    holding a ``<summary>`` and a ``<short_text>`` element. For every input
    path ``<name>.<ext>`` two files are written next to it, one record per
    line and in input order: ``<name>story.txt`` (the short text) and
    ``<name>summary.txt`` (the summary).

    Args:
        files: Paths of the raw files to split. Defaults to LCSTS PART I and
            PART II.
    """
    import os  # local import: only needed to derive the output file names

    # The opening tag carries a running id (``<doc id=42>``). Normalise it to a
    # bare ``<doc>`` so the extraction patterns below can anchor on it.
    doc_open = re.compile(r"<doc\s+id=\d+\s*>")

    def extract(tag, doc):
        # Drop everything up to and including <tag>, then everything from
        # </tag> to the end of the record.
        head_removed = re.sub(r"<doc>.*?<%s>" % tag, "", doc)
        return re.sub(r"</%s>.*" % tag, "", head_removed)

    for root in files:
        # splitext only removes the final extension, so directories that
        # contain a dot do not truncate the output path.
        name = os.path.splitext(root)[0]
        # One ``with`` per input file: every handle is closed before the next
        # file is opened, and also when processing raises.
        with open(root) as fin, \
                open(name + "story.txt", "w") as f_src, \
                open(name + "summary.txt", "w") as f_trg:
            xml_str = ""
            for line in tqdm(fin.readlines(), desc="process texts"):
                line = line.strip()
                # A record spans several lines; glue the stripped lines
                # together until the closing tag shows up.
                xml_str += line
                if "</doc>" not in line:
                    continue
                xml_str = doc_open.sub("<doc>", xml_str, count=1) \
                    .replace("&raquo", "：") \
                    .replace("<BR/>", "") \
                    .replace("<BR>", "")
                f_trg.write(extract("summary", xml_str) + "\n")
                f_src.write(extract("short_text", xml_str) + "\n")
                xml_str = ""

# Split PART_I and PART_II: writes PART_Istory.txt / PART_Isummary.txt and
# PART_IIstory.txt / PART_IIsummary.txt into raw/LCSTS/.
get_lcsts()

In [None]:
from tqdm import tqdm
import xmltodict, re

# Split LCSTS PART III into story and summary files, keeping well-rated pairs only.
# NOTE: this definition shadows the `get_lcsts` defined in the previous cell.
def get_lcsts(files=("raw/LCSTS/PART_III.txt",), keep_labels=("3", "4", "5")):
    """Split human-rated LCSTS dumps into parallel story and summary files.

    Each input file is read as a sequence of ``<doc id=N> ... </doc>`` records
    holding a ``<human_label>``, a ``<summary>`` and a ``<short_text>``
    element. Only records whose label is in ``keep_labels`` are kept. For every
    input path ``<name>.<ext>`` two files are written next to it, one kept
    record per line and in input order: ``<name>story.txt`` and
    ``<name>summary.txt``.

    Args:
        files: Paths of the raw files to split. Defaults to LCSTS PART III.
        keep_labels: Label strings of the records to keep. Defaults to
            "3", "4" and "5".
    """
    import os  # local import: only needed to derive the output file names

    # Normalise ``<doc id=42>`` to ``<doc>`` with a pattern rather than with a
    # running counter. The counter used before only advanced for kept records,
    # so after the first rejected record the id was never stripped again and
    # every later record was silently dropped.
    doc_open = re.compile(r"<doc\s+id=\d+\s*>")

    def extract(tag, doc):
        # Drop everything up to and including <tag>, then everything from
        # </tag> to the end of the record.
        head_removed = re.sub(r"<doc>.*?<%s>" % tag, "", doc)
        return re.sub(r"</%s>.*" % tag, "", head_removed)

    for root in files:
        # splitext only removes the final extension, so directories that
        # contain a dot do not truncate the output path.
        name = os.path.splitext(root)[0]
        # One ``with`` per input file: every handle is closed before the next
        # file is opened, and also when processing raises.
        with open(root) as fin, \
                open(name + "story.txt", "w") as f_src, \
                open(name + "summary.txt", "w") as f_trg:
            xml_str = ""
            for line in tqdm(fin.readlines(), desc="process texts"):
                line = line.strip()
                # A record spans several lines; glue the stripped lines
                # together until the closing tag shows up.
                xml_str += line
                if "</doc>" not in line:
                    continue
                xml_str = doc_open.sub("<doc>", xml_str, count=1) \
                    .replace("&raquo", "：") \
                    .replace("<BR/>", "") \
                    .replace("<BR>", "")
                if extract("human_label", xml_str) in keep_labels:
                    f_trg.write(extract("summary", xml_str) + "\n")
                    f_src.write(extract("short_text", xml_str) + "\n")
                xml_str = ""

# Split PART_III: writes PART_IIIstory.txt / PART_IIIsummary.txt into raw/LCSTS/,
# keeping only records whose human_label is 3, 4 or 5.
get_lcsts()

In [19]:
from tqdm import tqdm
import collections, re
import struct
import re
import json
import jieba
import opencc
from tensorflow.core.example import example_pb2
from nltk.tokenize import sent_tokenize, word_tokenize
from utils.regu import filter_sentences

# Markers that write_bin / write_json put around every summary
# ("<s> summary </s>").
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

def process_sample(sample):
    """Clean a text sample and return it as space-separated Jieba tokens.

    The sample is first passed through ``filter_sentences`` (imported from
    ``utils.regu``; its exact cleaning rules are not visible here) and then
    segmented with ``jieba.lcut``.

    Args:
        sample: Raw text of one story or summary.

    Returns:
        The tokens joined into one string, each token followed by a single
        space (so a non-empty result ends with a trailing space).
    """
    words = jieba.lcut(filter_sentences(sample))
    return "".join(word + ' ' for word in words)


def write_bin(story, summary, writer):
    """Append one story/summary pair to ``writer`` as a length-prefixed tf.Example.

    The summary is wrapped as ``"<SENTENCE_START> summary <SENTENCE_END>"``.
    Both texts are encoded to bytes and stored under the ``story`` and
    ``summary`` features of a ``tf.Example``. The serialized example is written
    after a native-order ``'q'`` (signed 64-bit) length prefix.

    Args:
        story: Space-separated story tokens.
        summary: Space-separated summary tokens.
        writer: Binary file-like object to append to.

    Returns:
        The non-empty, stripped pieces obtained by splitting ``story`` on
        single spaces.
    """
    story_bytes = story.encode()
    summary_bytes = ("%s %s %s" % (SENTENCE_START, summary, SENTENCE_END)).encode()

    example = example_pb2.Example()
    features = example.features.feature
    features['story'].bytes_list.value.extend([story_bytes])
    features['summary'].bytes_list.value.extend([summary_bytes])
    serialized = example.SerializeToString()

    # Record layout: 8-byte length, then exactly that many payload bytes.
    size = len(serialized)
    writer.write(struct.pack('q', size))
    writer.write(struct.pack('%ds' % size, serialized))

    # Split on single spaces only, so a tab inside a token does not split it.
    stripped = (piece.strip() for piece in story_bytes.decode().split(' '))
    return [piece for piece in stripped if piece != ""]


def write_json(story, summary, writer):
    """Append one story/summary pair to ``writer`` as a single JSON line.

    The summary is wrapped as ``"<SENTENCE_START> summary <SENTENCE_END>"``.
    The record ``{"story": ..., "summary": ...}`` is dumped without ASCII
    escaping, followed by a newline.

    Args:
        story: Space-separated story tokens.
        summary: Space-separated summary tokens.
        writer: Text file-like object to append to.

    Returns:
        The non-empty, stripped pieces obtained by splitting ``story`` on
        single spaces.
    """
    wrapped_summary = "%s %s %s" % (SENTENCE_START, summary, SENTENCE_END)
    record = {'story': story, 'summary': wrapped_summary}
    writer.write(json.dumps(record, ensure_ascii=False) + '\n')

    # Split on single spaces only, so a tab inside a token does not split it.
    stripped = (piece.strip() for piece in story.split(' '))
    return [piece for piece in stripped if piece != ""]


def process_file(story_root, summary_root, target_name, convert, output_type):
    """Turn line-aligned story/summary files into a dataset plus a vocabulary.

    Line *i* of ``story_root`` is paired with line *i* of ``summary_root``.
    Both texts go through ``process_sample`` (after an optional OpenCC
    ``s2t.json`` conversion) and are serialised with ``write_bin`` or
    ``write_json`` into ``target_name + ".bin"`` or ``target_name + ".json"``.
    The story tokens returned by the writer are counted and dumped to
    ``target_name + "_vocab.txt"`` as ``<word> <count>`` lines, most frequent
    first.

    Args:
        story_root: Path of the story file, one story per line.
        summary_root: Path of the summary file, one summary per line.
        target_name: Output path prefix (without extension).
        convert: If true, run both texts through OpenCC ``s2t.json`` before
            processing.
        output_type: ``"bin"`` or ``"json"``.

    Raises:
        ValueError: If ``output_type`` is neither ``"bin"`` nor ``"json"``.
    """
    # Validate up front: previously an unknown type silently produced nothing
    # but an empty vocabulary file.
    if output_type not in ("bin", "json"):
        raise ValueError('output_type must be "bin" or "json", got %r' % (output_type,))

    if output_type == "bin":
        write_example, extension, mode = write_bin, ".bin", "wb"
    else:
        write_example, extension, mode = write_json, ".json", "w"

    if convert:
        # Build the converter only when it is actually needed.
        converter = opencc.OpenCC('s2t.json')

        def prepare(text):
            return process_sample(converter.convert(text))
    else:
        prepare = process_sample

    vocab_counter = collections.Counter()

    # Context managers close all three handles even if a sample fails.
    with open(story_root) as story_f, \
            open(summary_root) as summary_f, \
            open(target_name + extension, mode) as writer:
        for sto in tqdm(story_f.readlines()):
            # If the summary file is shorter than the story file, readline()
            # returns "" and the remaining stories get an empty summary.
            summ = summary_f.readline().strip()
            vocab_counter.update(write_example(prepare(sto), prepare(summ), writer))

    with open(target_name + "_vocab.txt", 'w') as writer:
        for word, count in vocab_counter.most_common():
            writer.write(word + ' ' + str(count) + '\n')

# Build the JSON dataset and vocabulary for each of the three LCSTS parts,
# without Traditional Chinese conversion.
for raw_part, out_part in (("PART_I", "Part1"), ("PART_II", "Part2"), ("PART_III", "Part3")):
    process_file(
        "raw/LCSTS/%sstory.txt" % raw_part,
        "raw/LCSTS/%ssummary.txt" % raw_part,
        "summ_processed/" + out_part,
        False,
        output_type="json",
    )

100%|██████████| 2400591/2400591 [39:31<00:00, 1012.21it/s]
100%|██████████| 10666/10666 [00:10<00:00, 1019.21it/s]
100%|██████████| 725/725 [00:00<00:00, 899.63it/s]
