## 连接es，批量读取wechat数据，划分正负样本

In [2]:
'''
    连接es
'''
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Shared Elasticsearch client; the query helpers below hand it to helpers.scan.
# NOTE(review): 'http://ip:port' looks like a placeholder endpoint — replace with the real host.
es = Elasticsearch('http://ip:port')

In [13]:
'''
    从es里读取的content是html格式，需要解析成文本
'''
def sentencesMaker(html):
    """Convert an HTML fragment into a list of plain-text sentences.

    The HTML is entity-unescaped, split into paragraphs with justext (empty
    stop list), and each paragraph is stripped of non-breaking / ideographic
    spaces and of characters that do not survive a gb2312 and gbk round trip.
    While the pending sentence is shorter than 5 characters the next
    paragraph is appended to it (separated by a space); otherwise the
    pending sentence is emitted and a new one is started.

    Args:
        html: HTML text; None, empty or whitespace-only input yields [].

    Returns:
        list[str]: the sentences collected so far. Parsing is best-effort:
        on any exception the error is logged and whatever was collected
        before the failure is returned.
    """
    sentences = []
    if not html or not html.strip():
        return sentences
    try:
        # html.unescape is the documented entry point; importing it from
        # html.parser only works because that module happens to import it.
        from html import unescape
        import justext

        paragraphs = justext.justext(unescape(html), [])

        cache_sentences = ''

        for p in paragraphs:
            sent = p.text.strip().replace('\xa0', '').replace('\u3000', '')
            # Drop every character that cannot be represented in gb2312 and then in gbk.
            sent = sent.encode('gb2312', 'ignore').decode('gb2312').encode('gbk', 'ignore').decode('gbk')
            if not sent:
                continue

            # A very short pending fragment may be just a name, so merge it
            # with the following paragraph instead of emitting it alone.
            if len(cache_sentences) < 5:
                cache_sentences += ' ' + sent
            else:
                sentences.append(cache_sentences.strip())
                cache_sentences = sent

        if cache_sentences:
            sentences.append(cache_sentences.strip())
    except Exception:
        # The original handler referenced an undefined `logger`, which turned
        # every parse failure into a NameError; use a module logger instead.
        import logging
        logging.getLogger(__name__).exception("sentencesMaker failed to parse html")

    return sentences

In [49]:
# Topic keywords used to pick candidate articles (events, meetings, contests, ...).
topic_words = [
    "互访", "沙龙", "博览会", "展览会", "圆桌",
    "挂牌仪式", "发布会", "演讲", "组委会", "对接会",
    "会议", "理事会", "报告", "分享", "入选",
    "培训", "融资", "大会", "峰会", "年会",
    "高峰论坛", "论坛", "研讨会", "比赛", "大赛",
]

In [50]:
# Sanity check: number of distinct keywords (a duplicate entry would make this smaller than the list length).
print(len(set(topic_words)))

25


In [239]:
'''
    从索引wechat_bl2中获取含有主题关键字的doc的id；
    利用获得的id去索引wechat_ner中检索该doc中的关系rel的个数是否高于自己设置的阈值，高于，正样本；低于，负样本；
    注：从bl2中取的id，去ner中查看rel，会出现返回结果与输入id个数不一致，原因是ner在做rel操作时可能会有延迟，故按当前时刻获取的ner数据为样本；
        bl2索引中也有rel关系，但不准，以ner索引中的为准；
'''

# -*- coding: utf-8 -*-

from elasticsearch import helpers


def search_bl2(from_number, offsize):
    """Scan the 'wechat_bl2' index with the keyword query built from the arguments.

    Returns the result of get_search_result for that query, which callers
    consume as an iterable of hits.
    """
    return get_search_result(
        set_search_optional_bl2(from_number, offsize),
        index='wechat_bl2',
    )

def search_ner(final_results):
    """Scan the 'wechat_ner' index for the documents whose ids are in final_results.

    Returns the result of get_search_result; only the "rel" field is
    requested for each hit.
    """
    return get_search_result(
        set_search_optional_ner(final_results),
        index='wechat_ner',
    )

def get_id_list(es_result):
    """Return the "_id" of every hit in es_result, in iteration order."""
    return [hit["_id"] for hit in es_result]

def get_cleaned_content_list(es_result):
    """Return the cleaned plain-text content of every hit, in iteration order.

    Each hit's ``_source.content`` is passed through sentencesMaker; the
    resulting sentences are joined without a separator and newlines removed.
    """
    return [
        "".join(sentencesMaker(hit["_source"]["content"])).replace("\n", "")
        for hit in es_result
    ]


def get_search_result(es_search_options, index, scroll='5m', doc_type='news', timeout="1m"):
    """Run es_search_options against ``index`` through helpers.scan on the shared client.

    The scroll, doc_type and timeout arguments are forwarded unchanged.
    Returns whatever helpers.scan returns (iterated hit by hit by callers).
    """
    scan_kwargs = dict(
        query=es_search_options,
        scroll=scroll,
        index=index,
        doc_type=doc_type,
        timeout=timeout,
    )
    return helpers.scan(es, **scan_kwargs)


def set_search_optional_bl2(from_number, offsize, keywords="演讲 培训 大会"):
    """Build the search body used to pull candidate documents from a bl2 index.

    A document matches when its title or its content matches ``keywords``
    (two ``match`` clauses under ``bool.should``). Only url, title, abstract
    and content are requested in ``_source``.

    Args:
        from_number: Value stored under the "from" key.
        offsize: Value stored under the "size" key.
        keywords: Space-separated keyword text to match. Defaults to the
            previously hard-coded "演讲 培训 大会", so existing two-argument
            calls build exactly the same query.

    Returns:
        dict: the search body.

    NOTE(review): this body is executed through helpers.scan; the recorded
    run returned 15851 hits for search_bl2(0, 5), so "from"/"size" do not
    appear to act as a page window there — confirm against the scan docs.
    """
    es_search_options = {
        "from": from_number,
        "size": offsize,
        "query": {
            "bool": {
                "should": [
                    {"match": {"title": keywords}},
                    {"match": {"content": keywords}},
                ]
            }
        },
        "_source": {
            "includes": [
                "url",
                "title",
                "abstract",
                "content",
            ]
        },
    }
    return es_search_options

def set_search_optional_ner(final_results):
    """Build an ``ids`` query for final_results that returns only the "rel" field."""
    ids_clause = {"values": final_results}
    return {
        "query": {"ids": ids_clause},
        "_source": ["rel"],
    }


def split_samples(data, min_rel=2):
    """Split hits into negative and positive sample ids by their relation count.

    A hit is positive when its ``_source.rel`` holds at least ``min_rel``
    entries; otherwise (fewer entries, empty, None, or no "rel" field at
    all) it is negative.

    Args:
        data: Iterable of hits, each with "_id" and "_source".
        min_rel: Minimum number of relations for a positive sample.
            Defaults to 2, the threshold that used to be hard-coded.

    Returns:
        tuple[list, list]: (negative_ids, positive_ids), in input order.
    """
    negative_samples = []
    positive_samples = []
    for hit in data:
        # .get(): a hit without a "rel" field counts as negative instead of raising KeyError.
        rel = (hit.get("_source") or {}).get("rel")
        if rel and len(rel) >= min_rel:
            positive_samples.append(hit["_id"])
        else:
            negative_samples.append(hit["_id"])
    return negative_samples, positive_samples


if __name__ == '__main__':
    # Scan bl2 ONCE and keep the hits in a list. The scan result can only be
    # iterated once, and two separate scans may return different hits, which
    # made the id/content pairs built by zip() below fall out of step.
    all_results_bl2 = list(search_bl2(0, 5))
    bl2_ids_results = get_id_list(all_results_bl2)
    bl2_contents_results = get_cleaned_content_list(all_results_bl2)

    print("length of total samples from bl2:", len(bl2_ids_results))

    # Both lists come from the same hit list, so position i is the same document in each.
    ids_contents_dic = zip(bl2_ids_results, bl2_contents_results)
    all_results_ner = search_ner(bl2_ids_results)

    negative_samples, positive_samples = split_samples(all_results_ner)

    print("length of total samples from ner:", len(negative_samples) + len(positive_samples))
    print("length of negative samples:", len(negative_samples))
    print("length of positive samples:", len(positive_samples))

    # Sets give O(1) membership tests; the lists stay available for the counting cells.
    negative_ids = set(negative_samples)
    positive_ids = set(positive_samples)

    # Files are opened in append mode, so repeated runs keep adding rows.
    # Rows are "<label>\t<id>\t<content>"; negative rows carry label "1" and positive rows "0".
    with open("./test_negative_samples.txt", "a", encoding="utf-8") as nf, open("./test_positive_samples.txt", "a", encoding="utf-8") as pf:
        for _id, _content in ids_contents_dic:
            if _id in negative_ids:
                nf.write("1" + "\t" + _id + "\t" + _content + "\n")
            if _id in positive_ids:
                pf.write("0" + "\t" + _id + "\t" + _content + "\n")

length of total samples from bl2: 15851
length of total samples from ner: 15358
length of negative samples: 7975
length of positive samples: 7383


In [114]:
# sum_negative_samples = 1339
# sum_positive_samples = 1275
# total_samples = sum_negative_samples + sum_positive_samples

In [240]:
# Add this run's counts to the running totals.
# NOTE(review): sum_negative_samples / sum_positive_samples must already exist in the
# notebook session; their only initialisation visible here is the commented-out cell above.
sum_negative_samples += len(negative_samples)
sum_positive_samples += len(positive_samples)
total_samples = sum_negative_samples + sum_positive_samples
# Report the accumulated totals and the share of each class.
print("所有总样本个数%d" % total_samples)
print("所有负样本个数%d，占比%.2f" % (sum_negative_samples, (sum_negative_samples / total_samples)))
print("所有正样本个数%d，占比%.2f" % (sum_positive_samples, (sum_positive_samples / total_samples)))

所有总样本个数151485
所有负样本个数89547，占比0.59
所有正样本个数61938，占比0.41


In [None]:
'''
"互访 沙龙" 
length of total samples from bl2: 844
length of total samples from ner: 804
length of negative samples: 397  
length of positive samples: 407
所有总样本个数804
所有负样本个数397，占比0.493781
所有正样本个数407，占比0.506219

"博览会 "
length of total samples from bl2: 915
length of total samples from ner: 870
length of negative samples: 430
length of positive samples: 440
所有总样本个数1674
所有负样本个数827，占比0.494026
所有正样本个数847，占比0.505974

"展览会"
length of total samples from bl2: 967
length of total samples from ner: 940
length of negative samples: 512
length of positive samples: 428
所有总样本个数2614
所有负样本个数1339，占比0.512242
所有正样本个数1275，占比0.487758

"圆桌"
length of total samples from bl2: 1012
length of total samples from ner: 994
length of negative samples: 314
length of positive samples: 680
所有总样本个数3608
所有负样本个数1653，占比0.458149
所有正样本个数1955，占比0.541851

"年会"
length of total samples from bl2: 1152
length of total samples from ner: 1128
length of negative samples: 466
length of positive samples: 662
所有总样本个数4736
所有负样本个数2119，占比0.447424
所有正样本个数2617，占比0.552576

"研讨会"
length of total samples from bl2: 1776
length of total samples from ner: 1720
length of negative samples: 788
length of positive samples: 932
所有总样本个数6456
所有负样本个数2907，占比0.450279
所有正样本个数3549，占比0.549721

"入选"
length of total samples from bl2: 1364
length of total samples from ner: 1316
length of negative samples: 630
length of positive samples: 686
所有总样本个数7772
所有负样本个数3537，占比0.455095
所有正样本个数4235，占比0.544905

"大赛"
length of total samples from bl2: 1767
length of total samples from ner: 1689
length of negative samples: 954
length of positive samples: 735
所有总样本个数9461
所有负样本个数4491，占比0.47
所有正样本个数4970，占比0.53

"理事会"
length of total samples from bl2: 1955
length of total samples from ner: 1895
length of negative samples: 564
length of positive samples: 1331
所有总样本个数11356
所有负样本个数5055，占比0.45
所有正样本个数6301，占比0.55

"组委会"
length of total samples from bl2: 2405
length of total samples from ner: 2306
length of negative samples: 977
length of positive samples: 1329
所有总样本个数13662
所有负样本个数6032，占比0.44
所有正样本个数7630，占比0.56

"峰会"
length of total samples from bl2: 3587
length of total samples from ner: 3499
length of negative samples: 1697
length of positive samples: 1802
所有总样本个数17161
所有负样本个数7729，占比0.45
所有正样本个数9432，占比0.55

"对接会"
length of total samples from bl2: 47877
length of total samples from ner: 46478
length of negative samples: 30533
length of positive samples: 15945
所有总样本个数63639
所有负样本个数38262，占比0.60
所有正样本个数25377，占比0.40

"发布会"
length of total samples from bl2: 25741
length of total samples from ner: 24973
length of negative samples: 15524
length of positive samples: 9449
所有总样本个数88612
所有负样本个数53786，占比0.61
所有正样本个数34826，占比0.39

"报告 分享"
length of total samples from bl2: 27257
length of total samples from ner: 26413
length of negative samples: 16715
length of positive samples: 9698
所有总样本个数115025
所有负样本个数70501，占比0.61
所有正样本个数44524，占比0.39

                        "报告"
                        length of total samples from bl2: 16098
                        length of total samples from ner: 15567
                        length of negative samples: 9711
                        length of positive samples: 5856

                        "分享"
                        length of total samples from bl2: 14930
                        length of total samples from ner: 14472
                        length of negative samples: 8903
                        length of positive samples: 5569
                        
"挂牌仪式 会议 融资"
length of total samples from bl2: 21807
length of total samples from ner: 21102
length of negative samples: 11071
length of positive samples: 10031
所有总样本个数136127
所有负样本个数81572，占比0.60
所有正样本个数54555，占比0.40


"演讲 培训 大会"
length of total samples from bl2: 15851
length of total samples from ner: 15358
length of negative samples: 7975
length of positive samples: 7383
所有总样本个数151485
所有负样本个数89547，占比0.59
所有正样本个数61938，占比0.41


'''

In [242]:
'''
    剔除结果中重复的文章
'''
# Read both inputs before opening the outputs, so a missing input file does
# not leave freshly truncated (empty) result files behind.
with open("./test_negative_samples.txt", "r", encoding="utf-8") as nf, open("./test_positive_samples.txt", "r", encoding="utf-8") as pf:
    nf_data = nf.readlines()
    pf_data = pf.readlines()

# dict.fromkeys removes duplicate lines while keeping first-seen order, so the
# filtered files come out in the same order on every run (a set does not).
unique_nf_data = list(dict.fromkeys(nf_data))
unique_pf_data = list(dict.fromkeys(pf_data))

with open("./filtered_negative_samples.txt", "w", encoding="utf-8") as fnf, open("./filtered_positive_samples.txt", "w", encoding="utf-8") as fpf:
    fnf.writelines(unique_nf_data)
    fpf.writelines(unique_pf_data)

print("negative_samples中所有文件长度：", len(nf_data))
print("filtered_negative_samples长度：", len(unique_nf_data))
print("positive_samples中所有文件长度：", len(pf_data))
print("filtered_positive_samples长度：", len(unique_pf_data))

# Class shares among the de-duplicated rows; skipped when both files are empty.
total_unique = len(unique_pf_data) + len(unique_nf_data)
if total_unique:
    print("正样本占比%.2f" % (len(unique_pf_data) / total_unique))
    print("负样本占比%.2f" % (len(unique_nf_data) / total_unique))

negative_samples中所有文件长度： 89540
filtered_negative_samples长度： 56839
positive_samples中所有文件长度： 61936
filtered_positive_samples长度： 32254
正样本占比0.36
负样本占比0.64


### 扩充负样本的特征，接着从toutiao里取负样本

In [329]:
'''
    从头条取负样本
'''
# -*- coding: utf-8 -*-

from elasticsearch import helpers


def search_bl2(from_number, offsize):
    """Scan the 'toutiao_bl2' index with the query from set_search_optional_bl2."""
    query_body = set_search_optional_bl2(from_number, offsize)
    return get_search_result(query_body, index='toutiao_bl2')

def search_ner(final_results):
    """Scan the 'toutiao_ner' index for the documents whose ids are in final_results."""
    query_body = set_search_optional_ner(final_results)
    return get_search_result(query_body, index='toutiao_ner')

def get_id_list(es_result):
    """Collect the "_id" of each hit, preserving the order of es_result."""
    return [hit["_id"] for hit in es_result]

# def get_cleaned_content_list(es_result):
#     final_result = []
#     for item in es_result:
#         final_result.append("".join(sentencesMaker(item["_source"]["content"])).replace("\n", ""))
#     return final_result


def get_search_result(es_search_options, index, scroll='5m', doc_type='news', timeout="1m"):
    """Execute es_search_options on ``index`` via helpers.scan using the shared client.

    scroll, doc_type and timeout are passed through as given; the
    helpers.scan result is returned directly.
    """
    return helpers.scan(
        es,
        index=index,
        doc_type=doc_type,
        query=es_search_options,
        scroll=scroll,
        timeout=timeout,
    )


def set_search_optional_bl2(from_number, offsize):
    """Build a match-all query that returns only the "content" field.

    from_number and offsize are accepted for signature compatibility but
    are not used in the query body.
    """
    return {
        "query": {"match_all": {}},
        "_source": ["content"],
    }

def set_search_optional_ner(final_results):
    """Build an ``ids`` query over final_results, requesting only the "rel" field."""
    query = {"ids": {"values": final_results}}
    return {"query": query, "_source": ["rel"]}


def split_samples(data, min_rel=2):
    """Partition hits into (negative_ids, positive_ids) by relation count.

    A hit is positive when ``_source.rel`` contains at least ``min_rel``
    items. Hits with fewer items, an empty or None "rel", or no "rel"
    field are negative.

    Args:
        data: Iterable of hits carrying "_id" and "_source".
        min_rel: Positive threshold; the default 2 matches the former
            hard-coded value.

    Returns:
        tuple[list, list]: negative ids and positive ids, in input order.
    """
    negative_samples = []
    positive_samples = []
    for hit in data:
        # A missing "rel" field is treated like an empty one rather than raising KeyError.
        rel = (hit.get("_source") or {}).get("rel")
        target = positive_samples if rel and len(rel) >= min_rel else negative_samples
        target.append(hit["_id"])
    return negative_samples, positive_samples


if __name__ == '__main__':
    # Materialise the bl2 scan once; every hit carries its id and content.
    all_results_bl2 = list(search_bl2(0, 5))
    print("all_results_bl2长度：", len(all_results_bl2))

    # Ids of the bl2 hits, used to look up "rel" in the ner index.
    bl2_ids_results = get_id_list(all_results_bl2)
    print("从bl2获取的id的长度：", len(bl2_ids_results))

    # ner hits for those ids (each carries its id and "rel").
    all_results_ner = list(search_ner(bl2_ids_results))
    print("all_results_ner长度：", len(all_results_ner))

    # Decide negative / positive per document from the ner "rel" field.
    negative_samples, positive_samples = split_samples(all_results_ner)

    print("length of total samples from ner:", len(negative_samples) + len(positive_samples))
    print("length of negative samples:", len(negative_samples))
    print("length of positive samples:", len(positive_samples))

    # Set lookup avoids rescanning the whole negative id list for every bl2 hit.
    negative_ids = set(negative_samples)

    # Write "1\t<id>\t<cleaned content>" for every bl2 hit classified as negative.
    with open("./toutiao_negative_samples.txt", "w", encoding="utf-8") as nf:
        for item in all_results_bl2:
            if item["_id"] not in negative_ids:
                continue
            # .get(): a hit without content yields an empty content column instead of a KeyError.
            content = "".join(sentencesMaker(item["_source"].get("content"))).replace("\n", "")
            nf.write("1" + "\t" + item["_id"] + "\t" + content + "\n")

all_results_bl2长度： 52760
从bl2获取的id的长度： 52760
all_results_ner长度： 52760
length of total samples from ner: 52760
length of negative samples: 40120
length of positive samples: 12640


In [330]:
'''
    查看头条中的非重复的负样本数
'''
# Compare the raw line count with the number of distinct lines.
with open("./toutiao_negative_samples.txt", "r", encoding="utf-8") as nf:
    nf_data = list(nf)
    distinct_lines = set(nf_data)
    print("negative_samples中所有文件长度：", len(nf_data))
    print("filtered_negative_samples长度：", len(distinct_lines))

negative_samples中所有文件长度： 40120
filtered_negative_samples长度： 40120


In [332]:
# Every 4th line index of the 40120-line toutiao negative file.
select_number = list(range(0, 40120, 4))
print(len(select_number))

10030


In [333]:
'''
    计划取10030条数据，去掉一些内容可能为空的，实际从头条取9947条数据
'''
# Note the handle names: `inf` is the OUTPUT file and `outf` is the INPUT file.
with open("./selected_toutiao_negative_samples.txt", "w", encoding="utf-8") as inf, \
        open("./toutiao_negative_samples.txt", "r", encoding="utf-8") as outf:
    nf_data = outf.readlines()
    # Keep a selected line only if it still has all three tab-separated
    # columns after stripping (i.e. its content column is not empty).
    inf.writelines(
        nf_data[idx]
        for idx in select_number
        if len(nf_data[idx].strip().split('\t')) == 3
    )

###  扩充负样本的特征，接着从baidu里取负样本

In [334]:
'''
    从百度取负样本
'''
# -*- coding: utf-8 -*-

from elasticsearch import helpers


def search_bl2(from_number, offsize):
    """Scan the 'baidu_bl2' index with the query from set_search_optional_bl2."""
    return get_search_result(
        set_search_optional_bl2(from_number, offsize),
        index='baidu_bl2',
    )

def search_ner(final_results):
    """Scan the 'baidu_ner' index for the documents whose ids are in final_results."""
    return get_search_result(
        set_search_optional_ner(final_results),
        index='baidu_ner',
    )

def get_id_list(es_result):
    """Return a list with the "_id" of every hit yielded by es_result."""
    return [hit["_id"] for hit in es_result]

# def get_cleaned_content_list(es_result):
#     final_result = []
#     for item in es_result:
#         final_result.append("".join(sentencesMaker(item["_source"]["content"])).replace("\n", ""))
#     return final_result


def get_search_result(es_search_options, index, scroll='5m', doc_type='news', timeout="1m"):
    """Hand the query body to helpers.scan on the shared client and return its result.

    index, scroll, doc_type and timeout are forwarded unchanged.
    """
    options = {
        "query": es_search_options,
        "scroll": scroll,
        "index": index,
        "doc_type": doc_type,
        "timeout": timeout,
    }
    return helpers.scan(es, **options)


def set_search_optional_bl2(from_number, offsize):
    """Return a match-all search body limited to the "content" source field.

    Neither from_number nor offsize influences the returned body.
    """
    match_everything = {"match_all": {}}
    return {"query": match_everything, "_source": ["content"]}

def set_search_optional_ner(final_results):
    """Return an ``ids`` search body for final_results that fetches only "rel"."""
    return {
        "query": {
            "ids": {"values": final_results},
        },
        "_source": ["rel"],
    }


def split_samples(data, min_rel=2):
    """Classify hits as negative or positive from the size of ``_source.rel``.

    Args:
        data: Iterable of hits with "_id" and "_source".
        min_rel: A hit needs at least this many "rel" entries to be
            positive (default 2, the value previously hard-coded).

    Returns:
        tuple[list, list]: (negative_ids, positive_ids) in input order.
        Hits whose "rel" is missing, None or empty are negative.
    """
    negative_samples = []
    positive_samples = []
    for hit in data:
        # Tolerate hits that carry no "rel" field instead of raising KeyError.
        rel = (hit.get("_source") or {}).get("rel")
        if not rel or len(rel) < min_rel:
            negative_samples.append(hit["_id"])
        else:
            positive_samples.append(hit["_id"])
    return negative_samples, positive_samples


if __name__ == '__main__':
    # Keep the bl2 hits (id + content) in a list so they can be reused below.
    all_results_bl2 = list(search_bl2(0, 5))
    print("all_results_bl2长度：", len(all_results_bl2))

    # Extract the ids and use them to fetch "rel" from the ner index.
    bl2_ids_results = get_id_list(all_results_bl2)
    print("从bl2获取的id的长度：", len(bl2_ids_results))

    all_results_ner = list(search_ner(bl2_ids_results))
    print("all_results_ner长度：", len(all_results_ner))

    # Split the ner hits into negative / positive ids by their "rel" field.
    negative_samples, positive_samples = split_samples(all_results_ner)

    print("length of total samples from ner:", len(negative_samples) + len(positive_samples))
    print("length of negative samples:", len(negative_samples))
    print("length of positive samples:", len(positive_samples))

    # Constant-time membership instead of scanning the id list per bl2 hit.
    negative_ids = set(negative_samples)

    # One "1\t<id>\t<cleaned content>" row per negative document.
    with open("./baidu_negative_samples.txt", "w", encoding="utf-8") as nf:
        for item in all_results_bl2:
            if item["_id"] not in negative_ids:
                continue
            # Missing content becomes an empty column rather than a KeyError.
            content = "".join(sentencesMaker(item["_source"].get("content"))).replace("\n", "")
            nf.write("1" + "\t" + item["_id"] + "\t" + content + "\n")

all_results_bl2长度： 45456
从bl2获取的id的长度： 45456
all_results_ner长度： 45391
length of total samples from ner: 45391
length of negative samples: 23997
length of positive samples: 21394


In [335]:
'''
    查看百度中的非重复的负样本数
'''
# Compare the raw line count with the number of distinct lines.
with open("./baidu_negative_samples.txt", "r", encoding="utf-8") as nf:
    nf_data = list(nf)
    distinct_lines = set(nf_data)
    print("negative_samples中所有文件长度：", len(nf_data))
    print("filtered_negative_samples长度：", len(distinct_lines))

negative_samples中所有文件长度： 23997
filtered_negative_samples长度： 23997


In [348]:
# Every 2nd line index of the 23997-line baidu negative file.
select_number = list(range(0, 23997, 2))
print(len(select_number))

11999


In [339]:
'''
    Plan to take 11999 rows from baidu, dropping rows whose content is empty;
    the run below kept all 11999 rows.
'''
# Note the handle names: `inf` is the OUTPUT file and `outf` is the INPUT file.
with open("./selected_baidu_negative_samples.txt", "w", encoding="utf-8") as inf, \
        open("./baidu_negative_samples.txt", "r", encoding="utf-8") as outf:
    nf_data = outf.readlines()
    count = 0
    for idx in select_number:
        row = nf_data[idx]
        # Skip rows that lose a column once stripped (empty content).
        if len(row.strip().split('\t')) != 3:
            continue
        inf.write(row)
        count += 1
    print(count)

11999


In [340]:
# Negative samples gathered from wechat + baidu + toutiao:
# 11999 is the baidu count printed above and 9947 the toutiao count quoted in its selection note;
# 11846 is presumably the wechat share — its source is not shown in this file.
11999 + 9947 + 11846

33792

In [351]:
# Add the 2732 negative samples taken from 36kr to the running total.
33792+2732

36524

In [350]:
# Scratch value: 12738 is also the line count printed for total_positive_samples.txt
# further down — presumably the positive-sample total.
12738

12738

###  扩充负样本的特征，接着从36kr里取负样本

In [342]:
'''
    Fetch negative samples from 36kr
'''
# -*- coding: utf-8 -*-

from elasticsearch import helpers


def search_bl2(from_number, offsize):
    """Scan the '36kr_bl2' index with the query from set_search_optional_bl2."""
    body = set_search_optional_bl2(from_number, offsize)
    return get_search_result(body, index='36kr_bl2')

def search_ner(final_results):
    """Scan the '36kr_ner' index for the documents whose ids are in final_results."""
    body = set_search_optional_ner(final_results)
    return get_search_result(body, index='36kr_ner')

def get_id_list(es_result):
    """Return the "_id" values of the hits in es_result, in order."""
    return [hit["_id"] for hit in es_result]

# def get_cleaned_content_list(es_result):
#     final_result = []
#     for item in es_result:
#         final_result.append("".join(sentencesMaker(item["_source"]["content"])).replace("\n", ""))
#     return final_result


def get_search_result(es_search_options, index, scroll='5m', doc_type='news', timeout="1m"):
    """Run the query body on ``index`` with helpers.scan and the shared client.

    Returns the helpers.scan result; scroll, doc_type and timeout are
    forwarded as given.
    """
    return helpers.scan(
        es,
        query=es_search_options,
        index=index,
        doc_type=doc_type,
        scroll=scroll,
        timeout=timeout,
    )


def set_search_optional_bl2(from_number, offsize):
    """Build a match-all body that fetches only "content".

    The two parameters are ignored by the body that is returned.
    """
    body = {"query": {"match_all": {}}}
    body["_source"] = ["content"]
    return body

def set_search_optional_ner(final_results):
    """Build an ``ids`` body for final_results; only "rel" is returned per hit."""
    body = {"query": {"ids": {"values": final_results}}}
    body["_source"] = ["rel"]
    return body


def split_samples(data, min_rel=2):
    """Return (negative_ids, positive_ids) according to each hit's "rel" count.

    Args:
        data: Iterable of hits with "_id" and "_source".
        min_rel: Number of "rel" entries required for a positive sample;
            defaults to 2, the former hard-coded threshold.

    Returns:
        tuple[list, list]: ids in input order. A hit is negative when
        "rel" is absent, None, empty, or shorter than ``min_rel``.
    """
    negative_samples = []
    positive_samples = []
    for hit in data:
        # .get() keeps hits without a "rel" field from raising KeyError.
        rel = (hit.get("_source") or {}).get("rel")
        is_positive = bool(rel) and len(rel) >= min_rel
        (positive_samples if is_positive else negative_samples).append(hit["_id"])
    return negative_samples, positive_samples


if __name__ == '__main__':
    # Keep the bl2 hits (id + content) in a list so they can be reused below.
    all_results_bl2 = list(search_bl2(0, 5))
    print("all_results_bl2长度：", len(all_results_bl2))

    # Ids from bl2 drive the "rel" lookup in the ner index.
    bl2_ids_results = get_id_list(all_results_bl2)
    print("从bl2获取的id的长度：", len(bl2_ids_results))

    all_results_ner = list(search_ner(bl2_ids_results))
    print("all_results_ner长度：", len(all_results_ner))

    # Negative / positive ids according to the ner "rel" field.
    negative_samples, positive_samples = split_samples(all_results_ner)

    print("length of total samples from ner:", len(negative_samples) + len(positive_samples))
    print("length of negative samples:", len(negative_samples))
    print("length of positive samples:", len(positive_samples))

    # Set membership is O(1); testing against the list rescans it for every hit.
    negative_ids = set(negative_samples)

    # One "1\t<id>\t<cleaned content>" row per negative document.
    with open("./36kr_negative_samples.txt", "w", encoding="utf-8") as nf:
        for item in all_results_bl2:
            if item["_id"] not in negative_ids:
                continue
            # Missing content becomes an empty column rather than a KeyError.
            content = "".join(sentencesMaker(item["_source"].get("content"))).replace("\n", "")
            nf.write("1" + "\t" + item["_id"] + "\t" + content + "\n")

all_results_bl2长度： 4970
从bl2获取的id的长度： 4970
all_results_ner长度： 4970
length of total samples from ner: 4970
length of negative samples: 2732
length of positive samples: 2238


In [343]:
'''
    Count the non-duplicated negative samples taken from 36kr
'''
# Compare the raw line count with the number of distinct lines.
with open("./36kr_negative_samples.txt", "r", encoding="utf-8") as nf:
    nf_data = list(nf)
    distinct_lines = set(nf_data)
    print("negative_samples中所有文件长度：", len(nf_data))
    print("filtered_negative_samples长度：", len(distinct_lines))

negative_samples中所有文件长度： 2732
filtered_negative_samples长度： 2732


In [345]:
'''
    Plan to take all 36kr negative samples, dropping rows whose content is empty;
    the run below kept all 2732 rows.
'''
# Note the handle names: `inf` is the OUTPUT file and `outf` is the INPUT file.
with open("./selected_36kr_negative_samples.txt", "w", encoding="utf-8") as inf, \
        open("./36kr_negative_samples.txt", "r", encoding="utf-8") as outf:
    nf_data = outf.readlines()
    count = 0
    for row in nf_data:
        # Skip rows that lose a column once stripped (empty content).
        if len(row.strip().split('\t')) != 3:
            continue
        inf.write(row)
        count += 1
    print(count)

2732


### 将从toutiao，baidu，36kr，wechat取得的非空非重复的负样本合并，并去除id列
### 将从wechat取得的正样本换成统一名字

In [356]:
# Merge the four selected negative files into one, dropping the id column:
# each "<label>\t<id>\t<content>" row is written as "<label>\t<content>".
with open("./selected_36kr_negative_samples.txt", "r", encoding="utf-8") as kr, \
        open("./selected_baidu_negative_samples.txt", "r", encoding="utf-8") as baidu, \
        open("./selected_toutiao_negative_samples.txt", "r", encoding="utf-8") as toutiao, \
        open("./selected_wechat_negative_samples.txt", "r", encoding="utf-8") as wechat, \
        open("./total_negative_samples.txt", "w", encoding="utf-8") as negative:
    data_kr = kr.readlines()
    data_baidu = baidu.readlines()
    data_toutiao = toutiao.readlines()
    data_wechat = wechat.readlines()
    # Sources are written in this fixed order: 36kr, baidu, toutiao, wechat.
    for source_lines in (data_kr, data_baidu, data_toutiao, data_wechat):
        for line in source_lines:
            columns = line.split("\t")
            negative.write(columns[0] + "\t" + columns[2])

In [357]:
 with open("./total_negative_samples.txt", "r", encoding="utf-8") as negative:
        # Count the rows of the merged negative file.
        data = list(negative)
        print(len(data))

36524


In [358]:
# Copy the wechat positive rows to the unified file name, dropping the id
# column: "<label>\t<id>\t<content>" becomes "<label>\t<content>".
with open("./filtered_wechat_positive_samples.txt", "r", encoding="utf-8") as wechat, \
        open("./total_positive_samples.txt", "w", encoding="utf-8") as positive:
    data_wechat = wechat.readlines()
    for line in data_wechat:
        columns = line.split("\t")
        positive.write(columns[0] + "\t" + columns[2])

In [359]:
 with open("./total_positive_samples.txt", "r", encoding="utf-8") as positive:
        # Count the rows of the unified positive file.
        data = list(positive)
        print(len(data))

12738


### 获取的ids和contents数量不一致问题
#### 因为从es获得的数据是generator形式（只能遍历一次），为了分别获取id和content就先后两次查询es，两次返回的结果数量可能不一致；
#### 之后再用zip把数量不一致的id和content按位置拼接，会导致id与content错位。
#### 改正： 一次获取的es数据，存成list格式；避免使用zip，直接根据id返回对应的content
#### 但是根据bl2的id去ner获取数据时，可能出现数据不一致，因为ner在解析时可能有延迟，但是只要保证id和content对应起来就可以

In [322]:
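'''
    Experiment: scan the same index twice and compare the number of hits returned.
    This note originally said the experiment used 36kr (a small index, about 4933 docs),
    but the code below queries toutiao_bl2, and the recorded counts (52701 / 52708) match toutiao.
'''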
# -*- coding: utf-8 -*-

from elasticsearch import helpers


def search_bl2(from_number, offsize):
    """Scan the 'toutiao_bl2' index with the match-all query for this experiment."""
    return get_search_result(
        set_search_optional_bl2(from_number, offsize),
        index='toutiao_bl2',
    )

def search_ner(final_results):
    """Scan the 'toutiao_ner' index for the documents whose ids are in final_results."""
    return get_search_result(
        set_search_optional_ner(final_results),
        index='toutiao_ner',
    )

def get_id_list(es_result):
    """List the "_id" of each hit in es_result, keeping the original order."""
    return [hit["_id"] for hit in es_result]

def get_cleaned_content_list(es_result):
    """Return one cleaned text string per hit, in iteration order.

    The hit's ``_source.content`` is converted by sentencesMaker, its
    sentences are concatenated with no separator, and newlines are removed.
    """
    cleaned = []
    for hit in es_result:
        sentences = sentencesMaker(hit["_source"]["content"])
        cleaned.append("".join(sentences).replace("\n", ""))
    return cleaned


def get_search_result(es_search_options, index, scroll='5m', doc_type='news', timeout="1m"):
    """Pass the query body and scan settings to helpers.scan on the shared client.

    Returns the helpers.scan result without consuming it.
    """
    scan_kwargs = dict(
        query=es_search_options,
        scroll=scroll,
        index=index,
        doc_type=doc_type,
        timeout=timeout,
    )
    return helpers.scan(es, **scan_kwargs)


def set_search_optional_bl2(from_number, offsize):
    """Build a plain match-all query body (no source filtering).

    from_number and offsize are not used by the returned body.
    """
    return {"query": {"match_all": {}}}

def set_search_optional_ner(final_results):
    """Build an ``ids`` query body for final_results restricted to the "rel" field."""
    ids_query = {"ids": {"values": final_results}}
    return {"query": ids_query, "_source": ["rel"]}


def split_samples(data, min_rel=2):
    """Split hits into negative and positive ids by the length of ``_source.rel``.

    Args:
        data: Iterable of hits with "_id" and "_source".
        min_rel: Minimum "rel" length for a positive sample (default 2,
            the value that was hard-coded before).

    Returns:
        tuple[list, list]: (negative_ids, positive_ids) in input order;
        a missing, None or empty "rel" makes the hit negative.
    """
    negative_samples = []
    positive_samples = []
    for hit in data:
        # A hit lacking "rel" is negative; indexing the key directly raised KeyError.
        rel = (hit.get("_source") or {}).get("rel")
        if rel and len(rel) >= min_rel:
            positive_samples.append(hit["_id"])
            continue
        negative_samples.append(hit["_id"])
    return negative_samples, positive_samples


# Experiment: run the same bl2 scan twice and count the hits of each run, to
# show that two separate scans need not return the same number of documents.
if __name__ == '__main__': 
    all_results_bl2_1 = search_bl2(0, 5)
    
    # Count the hits of the first scan by exhausting it.
    j  = 0
    for i in all_results_bl2_1:
        j  += 1
    print(j)
        
#     bl2_contents_results = get_cleaned_content_list(all_results_bl2_1)
#     print(len(bl2_contents_results))
    
    all_results_bl2_2 = search_bl2(0, 5)
    
    # Count the hits of the second, independent scan.
    k = 0
    for i in all_results_bl2_2:
        k += 1
    print(k)
        
    # The commented-out lines below are the earlier two-scan + zip pipeline,
    # kept for reference only.
#     bl2_ids_results = get_id_list(all_results_bl2_2)
#     print(len(bl2_ids_results))
    
#     print("length of total samples from bl2:", len(bl2_ids_results))
    
#     ids_contents_dic = zip(bl2_ids_results, bl2_contents_results)
    
#     all_results_ner = search_ner(bl2_ids_results)
    
#     negative_samples, positive_samples = split_samples(all_results_ner)
    
#     print("length of total samples from ner:", len(negative_samples) + len(positive_samples))
#     print("length of negative samples:", len(negative_samples))
#     print("length of positive samples:", len(positive_samples))
    
#     with open("./toutiao_negative_samples.txt", "w", encoding="utf-8") as nf:
#         for _id, _content  in ids_contents_dic:
#             if _id in negative_samples:
#                 nf.write("1" + "\t" + _id + "\t" +_content + "\n")

52701
52708


In [162]:
# -*- coding: utf-8 -*-
# import es_client
from elasticsearch import helpers


def search(final_results):
    """Look up the given ids and return the "rel" value of every hit found.

    Builds the ids query, runs it through get_search_result (default
    index), and collects the results with get_result_list.
    """
    query_body = set_search_optional(final_results)
    return get_result_list(get_search_result(query_body))


def get_result_list(es_result):
    """Return the ``_source.rel`` value of every hit, in iteration order."""
    return [hit["_source"]["rel"] for hit in es_result]


def get_search_result(es_search_options, scroll='5m', index='wechat_ner', doc_type='news', timeout="1m"):
    """Run the query body through helpers.scan on the shared client.

    The index defaults to 'wechat_ner'; scroll, doc_type and timeout are
    forwarded unchanged. Returns the helpers.scan result.
    """
    return helpers.scan(
        es,
        index=index,
        doc_type=doc_type,
        query=es_search_options,
        scroll=scroll,
        timeout=timeout,
    )


def set_search_optional(final_results):
    """Build an ``ids`` query body for final_results that returns only "rel"."""
    return {
        "query": {"ids": {"values": final_results}},
        "_source": ["rel"],
    }


if __name__ == '__main__':
    # NOTE(review): final_results is read here before this cell assigns it, so it must
    # already exist in the notebook session (presumably a list of document ids) — confirm.
    # The name is then rebound to the list of "rel" values returned by search().
    final_results = search(final_results)
    print(len(final_results))

48
