# 串词抽取——数据准备
0、提前将json数据中的title article 分词，将结果缓存到本地 <br>
1、读取数据，将article与title合并 <br>
2.1 去除最原始的脏数据，比如：文章只有一句“每日更新超多有趣漫画，喜欢请关注”、文章是空的、题目是空的 <br>
2.2 去除标点、停用词  <br>
2.3 根据长度范围，过滤comment article <br>
3、balance数据 <br>
4、对train dec数据集shuffle <br>
5、build dict（基于pre-trian word2vec/基于训练数据集） <br>
6、生成id化的训练集与测试集 <br>

In [1]:
import json
import collections
import pickle

import pandas as pd
from tqdm.autonotebook import tqdm
import os
import random


import jieba
import re

filename_train = r'D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.train.json'
filename_dev = r'D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.dev.json'
filename_test = r'D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.test.json'

filename_cut_train = r'D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.cut.train.json'
filename_cut_dev = r'D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.cut.dev.json'
filename_cut_test = r'D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.cut.test.json'



In [2]:
# 1、加载 train test dev 数据,把article_wb与title_wb合并为article_wb
def load_data(filename):
    data = []
    with open(filename, encoding='utf-8') as fin:
        for line in fin:
            item = json.loads(line)
            title = item['title_es'].strip()
            article = item['body'].strip()
            article_wb = item['article_wb'].strip()
            title_wb = item['title_wb'].strip()
            url = item['url']
            uid = url.split('/')[-1]
            for comment in item['comment']:
                comment_content = ''.join(comment[0])
                upvote = int(comment[1])
                data.append({
                    'article_id': uid,
                    'title': title,
                    'article': article,
                    'comment': comment_content,
                    'upvote': upvote,
                    'article_wb':(article_wb+" "+title_wb).strip()
                })
    return pd.DataFrame(data)

def split_comment(x):
    return len(x.split(" "))


def filter_article(df):
    mask = ((df['article_wb'] == "") | (df['article_wb'] == 'null\n') | (df['article_wb'] == 'null') | (df['title'] == 'null') | (df['title'].isnull()) | (df["article"] == "更多趣味内涵内容，敬请关注微信公众号：漫画精选集（jx2018mh）" )| (df["article"] == "每日更新超多有趣漫画，喜欢请关注") |(df["article"] == "文章已被原作者删除，目前无法查看。")|
            (df["article_wb"].apply(split_comment)<6) | (df["article_wb"].apply(split_comment)>300)|(df["comment"].apply(split_comment)>30))
    df = df[~mask]
    return df

def filter_comment(df):
    mask = ((df['article'] == "") | (df['article'] == 'null\n') | (df['article'] == 'null') | (df['title'] == 'null') | (df['title'].isnull()) | (df["article"] == "更多趣味内涵内容，敬请关注微信公众号：漫画精选集（jx2018mh）" )| (df["article"] == "每日更新超多有趣漫画，喜欢请关注") |(df["article"] == "文章已被原作者删除，目前无法查看。")|
            (df["comment"].apply(split_comment)<3)|(df['comment'] == "")|(df['comment'] == 'null'))
    df = df[~mask]
    return df


def sample_neg(grouped_df, count):
    neg_samples = grouped_df[grouped_df['upvote'] < 10]
    neg_sample_len_lte_100 = neg_samples[neg_samples['comment'].str.len() <= 100]
    neg_sample_len_gt_100 = neg_samples[neg_samples['comment'].str.len() > 100]

    if len(neg_sample_len_lte_100) >= count:
        return neg_sample_len_lte_100.sample(n=count, random_state=1)
    else:
        lte_100_samples = neg_sample_len_lte_100
        gt_100_count = min(len(neg_sample_len_gt_100), count - len(neg_sample_len_lte_100))
        gt_100_samples = neg_sample_len_gt_100.sample(n=gt_100_count, random_state=1)
        return pd.concat([lte_100_samples, gt_100_samples])


def sample_random_neg(grouped, name, su_grouped_df, count):
    names = []
    samples_neg = []
    count_name = count
    while (len(names) == 0):
        if len(grouped.groups.keys()) > count_name:
            names = random.sample(grouped.groups.keys(), count_name)
            if name in names:
                names.remove(name)
        else:
            count_name = len(grouped.groups.keys()) - 1

    article = su_grouped_df.iat[0, 0]
    article_id = su_grouped_df.iat[0, 1]
    title = su_grouped_df.iat[0, 3]
    upvote = 1
    for n in names:
        temp = grouped.get_group(n).sample(n=1, random_state=1)
        temp["article"] = article
        temp["article_id"] = article_id
        temp["title"] = title
        temp["upvote"] = upvote
        samples_neg.append(temp)
    return pd.concat(samples_neg)

def save(df, save_to):
    df.to_csv(save_to, sep='\t', encoding='utf-8')

def load(filename):
    return pd.read_csv(filename, sep='\t', encoding='utf-8')

In [69]:

#  加载停用词，选取的是哈工大停用词的汉字部分
stopwords = []
def fetch_stopwords(file = "./resources/stopwords.txt"):
    with open(file,"r",encoding="utf8") as f:
        textLines = f.readlines()
        for line in textLines:
            stopwords.append(line.strip())
fetch_stopwords()


# 去除标点和停用词（必须先分词，才能执行此方法）
def remove_punctuation(line):
    rule = re.compile("[^a-zA-Z0-9\u4e00-\u9fa5]")
    line = rule.sub('', line)
    if line in stopwords:
        return ""
    return line

# 对article tiltle 分词
class Tokenizer:
    def __init__(self):
        self._cache = dict()

    def cut(self, text):
        if text in self._cache:
            return self._cache[text]
        wb = list(jieba.cut(text))
        wb = ' '.join(wb)
        self._cache[text] = wb
        return wb

# 对评论 去除标点、停用词
class CleanPunStop:
    def __init__(self):
        self._cache = dict()

    def clean(self, text):
        if text in self._cache:
            return self._cache[text]
        comment = text.split(" ")
        comment = list(map(lambda x: remove_punctuation(x), comment))
        while "" in comment:
            comment.remove("")
        comment = ' '.join(comment)
        self._cache[text] = comment
        return comment

tokenizer = Tokenizer()
cleanPunStop = CleanPunStop()
tqdm.pandas(desc='word break:')

def text_to_ids(word_dict, text_wb):
    words = text_wb.split()
    ids = []
    unk_id = word_dict['<unk>']
    for word in words:
        word_id = word_dict.get(word, unk_id)
        ids.append(word_id)
    return ids


def transform_data(word_dict,df, save_to, inputs_length=200, outputs_length=30):
    with open(save_to, 'w', encoding='utf-8') as fout:
        for idx, row in tqdm(df.iterrows()):
            article_wb = row['article_wb']
            comment_wb = row['comment']
            label = 1 if row['upvote'] >= 10 else 0
#             title_ids = text_to_ids(word_dict, title_wb)
            article_ids = text_to_ids(word_dict, article_wb)
            comment_ids = text_to_ids(word_dict, comment_wb)
            inputs = article_ids
            inputs = inputs[0: inputs_length]
            inputs = ' '.join(str(it) for it in inputs)
            targets = comment_ids[0: outputs_length]
            targets = ' '.join(str(it) for it in targets)
            fout.write(f'{label}\t{inputs}\t{targets}\n')
            

def cut_art_title(read_path, write_path):
    from tqdm import tqdm
    with open(read_path, "r", encoding="utf8") as f2:
        with open(write_path, "w+", encoding="utf8") as f1:
            textLines = f2.readlines()
            print("读取原始文件" + read_path)
            for i, line in enumerate(tqdm(textLines)):
                line = json.loads(line)
                line['article_wb'] = " ".join(list(jieba.cut(line['body'].strip())))
                line['title_wb'] = " ".join(list(jieba.cut(line['title_es'].strip())))
                f1.write(json.dumps(line, sort_keys=True, separators=(',', ': '), ensure_ascii=False))
                f1.write("\n")

## 0、提前将json数据中的title article 分词，将结果缓存到本地

In [17]:
# （如果已经执行过，无需再执行）将分词后的结果缓存到本地，以后直接读取
cut_art_title(filename_train,filename_cut_train)
cut_art_title(filename_dev,filename_cut_dev)
cut_art_title(filename_test,filename_cut_test)

读取原始文件D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.train.json


100%|█████████████████████████████████████████████████████████████████████████| 191502/191502 [12:40<00:00, 251.84it/s]


读取原始文件D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.dev.json


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:19<00:00, 254.40it/s]


读取原始文件D:\data\comment_generation\article_commenting_learning_comment_generation\newdata.test.json


100%|█████████████████████████████████████████████████████████████████████████████| 1610/1610 [00:05<00:00, 309.68it/s]


## 1、读取数据，将article与title合并

In [3]:
# 1、加载 train dev 数据，将aritle_wb 与 title_wb 合并为aritcle_wb
print("开始加载 train dev 数据,将article与title合并为aritcle_wb...")
df_train = load_data(filename_cut_train)
df_dev = load_data(filename_cut_dev)

开始加载 train dev 数据,将article与title合并为aritcle_wb...


In [4]:
df_dev.columns.values.tolist()

['article', 'article_id', 'article_wb', 'comment', 'title', 'upvote']

In [5]:
df_train["comment"].describe()

count     5131167
unique    4763025
top           傻 逼
freq         2869
Name: comment, dtype: object

In [6]:
df_train["article_wb"].describe()

count                             5131167
unique                             168749
top       文章 已 被 原作者 删除 ， 目前 无法 查看 。 null
freq                               604000
Name: article_wb, dtype: object

## 2、预处理

### 2.1 去除最原始的脏数据，比如：文章只有一句“每日更新超多有趣漫画，喜欢请关注”、文章是空的、题目是空的

In [7]:
def filter_original(df):
    mask = ((df['article_wb'] == "") | (df['article_wb'] == 'null\n') | (df['article_wb'] == 'null') | (df['article_wb'].isnull()) | (df["article"] == "更多趣味内涵内容，敬请关注微信公众号：漫画精选集（jx2018mh）" )| (df["article"] == "每日更新超多有趣漫画，喜欢请关注") |(df["article"] == "文章已被原作者删除，目前无法查看。")
           )
    df = df[~mask]
    return df

clean1_df_train = filter_original(df_train)
clean1_df_dev = filter_original(df_dev)

In [8]:
clean1_df_train["article_wb"].describe()

count                                               4524725
unique                                               168720
top       击 上面 　 　 免费 订阅 ！ 这 两天 又 一个 新闻 将 韩国 人气 坏 了 ！ 特朗...
freq                                                     84
Name: article_wb, dtype: object

In [9]:
clean1_df_train["comment"].describe()

count     4524725
unique    4218247
top           傻 逼
freq         2477
Name: comment, dtype: object

### 2.2 去除标点、停用词 （耗时22分钟）

In [13]:
# 去除comment article字段的标点、停用词
clean1_df_train["comment"] = clean1_df_train["comment"].progress_apply(cleanPunStop.clean)
clean1_df_train["article_wb"] = clean1_df_train["article_wb"].progress_apply(cleanPunStop.clean)

clean1_df_dev["comment"] = clean1_df_dev["comment"].progress_apply(cleanPunStop.clean)
clean1_df_dev["article_wb"] = clean1_df_dev["article_wb"].progress_apply(cleanPunStop.clean)

HBox(children=(IntProgress(value=0, description='word break:', max=4524725, style=ProgressStyle(description_wi…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


HBox(children=(IntProgress(value=0, description='word break:', max=4524725, style=ProgressStyle(description_wi…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, description='word break:', max=118783, style=ProgressStyle(description_wid…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


HBox(children=(IntProgress(value=0, description='word break:', max=118783, style=ProgressStyle(description_wid…




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
# 删除一些由于去掉停用词而变空的comment 或者 article
def filter_null(df):
    mask = ((df['article_wb'] == "") |  (df['article_wb'] == 'null') | (df['article_wb'].isnull()) | 
           (df['comment'].isnull())|(df['comment'] == "")| (df['comment'] == 'null') )
    df = df[~mask]
    return df

clean2_df_train = filter_null(clean1_df_train)
clean2_df_dev = filter_null(clean1_df_dev)

In [44]:
clean2_df_dev["comment"].describe()

count     118117
unique    112922
top          傻 逼
freq         101
Name: comment, dtype: object

In [16]:
clean2_df_train["article_wb"].describe()

count                                               4499793
unique                                               168668
top       小米 6 还 未 发布 时 已 寄予厚望 雷军 饥饿 营销 制造 小米 6 一机 难求 景象...
freq                                                     84
Name: article_wb, dtype: object

In [40]:
su_grouped_test = clean2_df_dev.groupby(['article_id'])

In [41]:
len(su_grouped_test)

4439

### 2.3 根据长度范围，过滤comment article

In [19]:
# 先看一下长度的分布
a_com_ci = clean2_df_dev['comment'].map(lambda x:x.split(" "))
a_com_ci.str.len().describe()

count    118117.000000
mean          9.819789
std          14.467491
min           1.000000
25%           4.000000
50%           6.000000
75%          11.000000
max        1177.000000
Name: comment, dtype: float64

In [57]:
def filter_article_length(df):
    mask = ((df["article_wb"].apply(split_comment)<20) )
    df = df[~mask]
    return df 

def filter_comment_length(df):
    mask = ((df["comment"].apply(split_comment)<3))
    df = df[~mask]
    return df


In [58]:
cleaned_train_article = filter_article_length(clean2_df_train)
cleaned_dev_article = filter_article_length(clean2_df_dev)

In [59]:
len(cleaned_train_article)

4443663

In [60]:
cleaned_train_article_comment = filter_comment_length(cleaned_train_article)
cleaned_dev_article_comment = filter_comment_length(cleaned_dev_article)

In [61]:
len(cleaned_train_article_comment)

3853273

In [62]:
a_com_ci = cleaned_dev_article_comment['comment'].map(lambda x:x.split(" "))
a_com_ci.str.len().describe()

count    101050.000000
mean         11.014904
std          14.606550
min           3.000000
25%           5.000000
50%           7.000000
75%          13.000000
max        1177.000000
Name: comment, dtype: float64

In [63]:
su_grouped_test = cleaned_dev_article_comment.groupby(['article_id'])
len(su_grouped_test)

4396

## 3、balance数据（>2耗时23分钟，>10耗时10分钟，数据共1710545条，有79320条title_id）

In [75]:
cleaned_df_article_comment['upvote'].describe()

count    1.710545e+06
mean     5.106490e+00
std      3.932079e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      3.000000e+00
max      1.646300e+04
Name: upvote, dtype: float64

In [64]:
# 3(1) 对train数据集进行正负平衡（负例评论是来自其他文章下的评论）
print("开始对数据集进行正负平衡...")
su_grouped = cleaned_train_article_comment.groupby(['article_id'])
su_samples = []
for su_name, su_grouped_df in tqdm(su_grouped):
    su_grouped_pos_df = su_grouped_df[su_grouped_df['upvote'] >= 10]
    su_pos_count = len(su_grouped_pos_df)
    if su_pos_count == 0:
        continue
    su_grouped_neg_df = sample_random_neg(su_grouped, su_name, su_grouped_df, su_pos_count)
    su_samples.append(su_grouped_pos_df)
    su_samples.append(su_grouped_neg_df)

sampled_df_train = pd.concat(su_samples)

# 将正负均衡后的训练集数据保存到本地，以后直接从本地读取即可
save(sampled_df_train, './5resources/train_balance.csv')
print("正负均衡后数据集size：",len(sampled_df_train))

开始对数据集进行正负平衡...


HBox(children=(IntProgress(value=0, max=166751), HTML(value='')))

正负均衡后数据集size： 811557


In [65]:
# 3(2) 对dev数据集进行正负平衡（负例评论是来自其他文章下的评论）
print("开始对dev数据集进行正负平衡...")
su_grouped = cleaned_dev_article_comment.groupby(['article_id'])
su_samples = []
for su_name, su_grouped_df in tqdm(su_grouped):
    su_grouped_pos_df = su_grouped_df[su_grouped_df['upvote'] >= 10]
    su_pos_count = len(su_grouped_pos_df)
    if su_pos_count == 0:
        continue
    su_grouped_neg_df = sample_random_neg(su_grouped, su_name, su_grouped_df, su_pos_count)
    su_samples.append(su_grouped_pos_df)
    su_samples.append(su_grouped_neg_df)

sampled_df_dev = pd.concat(su_samples)

# 将正负均衡后的训练集数据保存到本地，以后直接从本地读取即可
save(sampled_df_dev, './5resources/dev_balance.csv')
print("正负均衡后数据集size：",len(sampled_df_dev))

开始对dev数据集进行正负平衡...


HBox(children=(IntProgress(value=0, max=4396), HTML(value='')))

正负均衡后数据集size： 18799


## 4、对train dec数据集shuffle

In [66]:
train = sampled_df_train.sample(frac=1)
dev= sampled_df_dev.sample(frac=1)

In [67]:
train.columns.values.tolist()

['article', 'article_id', 'article_wb', 'comment', 'title', 'upvote']

## 5、build dict（基于train数据集）

In [68]:
counter = collections.Counter()
for article_wb, comment_wb in tqdm(
        zip(train['article_wb'], train['comment'])):
    for wb in [article_wb, comment_wb]:
        words = wb.split()
        counter.update(words)

counter = counter.most_common()
word_dict = dict()
word_dict['<pad>'] = 0
word_dict['<unk>'] = 1
for word, count in counter:
#     if count < 20:
#         break
    word_dict[word] = len(word_dict)

with open('./5resources/'+str(len(word_dict))+'_word_dict.pkl', 'wb') as f:
    pickle.dump(word_dict, f)
print("词汇表的size：", len(word_dict))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

词汇表的size： 740469


## 5、build dict（基于pre trian word2vec）

In [72]:
from collections import defaultdict
import numpy as np

base_path = "D:\\data\\搜狗新闻word2vec\\"
embeddingFile = "corpus.vector"
embeddingFile = base_path + embeddingFile

def load_word_dict_ms(filename):
    """
    加载词向量文件

    :param filename: 文件名
    :return: embeddings列表和它对应的索引
    """
    embeddings = []
    embedding_pad = []
    embedding_unk = []
    word2idx = defaultdict(list)
    with open(filename, mode="r", encoding="utf-8") as rf:
        lock = True
        for i,line in enumerate(rf):
            if lock == True:
                word2idx["<pad>"] = 0
                embedding_pad = [float(0)]*(int(line.split(" ")[1]))
                embeddings.append(embedding_pad)
                word2idx["<unk>"] = 1
                embedding_unk = np.random.randn(int(line.split(" ")[1])).tolist()
                embeddings.append(embedding_unk)
                lock = False
                continue
            arr = line.split(" ")
            embedding = [float(val) for val in arr[1: ]]
            word2idx[arr[0]] = len(word2idx)
            embeddings.append(embedding)
    return embeddings, word2idx

In [73]:
embeddings, word_dict = load_word_dict_ms(embeddingFile)

## 6、生成id化的训练集与测试集

In [74]:
transform_data(word_dict,train, './5resources/train.csv')
transform_data(word_dict,dev, './5resources/dev.csv')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

## 生成test数据集

In [75]:
# 1、读取数据
df_test = load_data(filename_cut_test)

In [76]:
# 2、预处理
# 2.1清除原始的脏数据
clean1_df_test = filter_original(df_test)

In [77]:
# 2.2去除comment article字段的标点、停用词
clean1_df_test["comment"] = clean1_df_test["comment"].progress_apply(cleanPunStop.clean)
clean1_df_test["article_wb"] = clean1_df_test["article_wb"].progress_apply(cleanPunStop.clean)

HBox(children=(IntProgress(value=0, description='word break:', max=37939, style=ProgressStyle(description_widt…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


HBox(children=(IntProgress(value=0, description='word break:', max=37939, style=ProgressStyle(description_widt…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [79]:
len(clean1_df_test)

37939

In [80]:
cleaned_df_test_article = filter_article_length(clean1_df_test)
cleaned_df_test_article_comment = filter_comment_length(cleaned_df_test_article)

In [81]:
len(cleaned_df_test_article_comment)

35499

In [83]:
# with open('./simplify/newOrder_article200comment30pkl353198upvote2/'+'353198_word_dict.pkl', 'rb') as f:
#     word_dict_test = pickle.load(f)
transform_data(word_dict,cleaned_df_test_article_comment, './5resources_jieduan_pretrian_art200com30up10/test.csv')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))