In [None]:
'''
    从*.bl2中批量获取四个平台的数据，wechat/baidu/toutiao/36kr，以保证样本特征多样性。
'''

### 按主题词从es里重新获取wechat数据，划分正负样本

In [92]:
# Topic keywords (event / activity vocabulary) used to select articles by title.
topic_words = [
    "大赛", "比赛", "博览会", "研讨会", "论坛",
    "高峰论坛", "年会", "峰会", "大会", "融资",
    "培训", "沙龙", "入选", "分享", "报告",
    "理事会", "会议", "对接会", "组委会", "展览会",
    "演讲", "发布会", "挂牌仪式", "互访", "圆桌",
]

In [93]:
# Sanity check: number of topic keywords (the recorded output of this cell is 25).
len(topic_words)

25

In [41]:
'''
    从es里读取的content是html格式，需要解析成文本
'''
def sentencesMaker(html):
    """Convert an HTML string into a list of plain-text sentence chunks.

    The HTML entities are unescaped, the markup is split into paragraphs with
    ``justext`` (called with an empty stoplist), and each paragraph's text is
    cleaned. Very short fragments are glued onto the following paragraph(s)
    until the running buffer reaches at least 5 characters.

    Args:
        html: Raw HTML text. ``None``, empty and whitespace-only values are
            accepted and yield an empty list.

    Returns:
        A list of stripped text chunks. If any step raises, the error is
        logged and whatever chunks were collected so far are returned.
    """
    sentences = []
    if not html or not html.strip():
        return sentences
    try:
        # Imported inside the try block on purpose: a missing dependency is
        # then logged like any other failure instead of propagating.
        # `html.unescape` is the documented public API (the `html.parser`
        # module merely re-exports it). The import statement resolves the
        # module by name, so the `html` parameter does not interfere.
        from html import unescape
        import justext

        paragraphs = justext.justext(unescape(html), [])

        cache_sentences = ''

        for p in paragraphs:
            # Drop non-breaking and ideographic spaces.
            sent = p.text.strip().replace('\xa0', '').replace('\u3000', '')
            # Round-trip through gb2312 and then gbk with errors ignored, which
            # silently removes every character either codec cannot represent.
            sent = sent.encode('gb2312', 'ignore').decode('gb2312').encode('gbk', 'ignore').decode('gbk')
            if not sent:
                continue

            # A very short buffer may be just a name or similar fragment, so
            # keep appending paragraphs until it holds at least 5 characters.
            if len(cache_sentences) < 5:
                cache_sentences += ' ' + sent
            else:
                sentences.append(cache_sentences.strip())
                cache_sentences = sent

        # Flush whatever is left in the buffer.
        if cache_sentences:
            sentences.append(cache_sentences.strip())
    except Exception as e:
        # NOTE(review): `logger` is not defined in the visible cells; it is
        # assumed to be configured elsewhere in the notebook - TODO confirm.
        logger.error(e)

    return sentences

In [11]:
'''
    连接es
'''
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# NOTE(review): 'http://ip:port' looks like a redacted placeholder rather than a
# usable URL - presumably it must be replaced with the real endpoint before running.
es = Elasticsearch('http://ip:port')

In [401]:
'''
    之前是只要title或者content里包括25个主题词就取出来，现在是必须题目里包括关键词
    取正、负样本，划分依据从之前的rel至少有3个变成rel中不同的per至少有3个
'''

from elasticsearch import helpers


def search_bl2(from_number, off_size):
    """Run the title-keyword query against all indices matching ``*_bl2``.

    Args:
        from_number: Offset of the first hit to return.
        off_size: Maximum number of hits to return.

    Returns:
        The raw search response produced by ``get_search_result``.
    """
    return get_search_result(
        query=set_search_optional_bl2(from_number, off_size),
        index="*_bl2",
    )

def search_ner(final_results, from_number, off_size):
    """Fetch the ``rel`` field of the given document ids from ``*_ner`` indices.

    Args:
        final_results: Document ids to look up.
        from_number: Offset of the first hit to return.
        off_size: Maximum number of hits to return.

    Returns:
        The raw search response produced by ``get_search_result``.
    """
    return get_search_result(
        query=set_search_optional_ner(final_results, from_number, off_size),
        index='*_ner',
    )

def get_id_list(es_result):
    """Return the ``_id`` of every hit in a search response, in response order.

    Args:
        es_result: Search response dict with hits under ``["hits"]["hits"]``.

    Returns:
        A new list of the hits' ``_id`` values.
    """
    return [hit["_id"] for hit in es_result["hits"]["hits"]]

def get_search_result(query, index):
    """Execute ``query`` on ``index`` through the module-level ``es`` client.

    The document type is fixed to ``'news'``.

    Args:
        query: Request body passed to ``es.search``.
        index: Index name or pattern to search.

    Returns:
        Whatever ``es.search`` returns for the request.
    """
    search_kwargs = {"body": query, "index": index, "doc_type": 'news'}
    return es.search(**search_kwargs)

def set_search_optional_bl2(from_number, off_size, title_terms=None):  # the title must contain a keyword
    """Build the search body that selects documents by keywords in ``title``.

    The body is a ``bool``/``must`` query holding a single ``terms`` clause on
    the ``title`` field.

    Args:
        from_number: Offset of the first hit to return (``from``).
        off_size: Maximum number of hits to return (``size``).
        title_terms: Optional iterable of keywords for the ``terms`` clause.
            Defaults to ``["圆桌"]``, the keyword that used to be hard-coded
            here. Pass a longer list (for example the notebook's full topic
            keyword list) to match several keywords in one request instead of
            editing this function for every keyword.

    Returns:
        A dict usable as the request body of a search call.
    """
    if title_terms is None:
        title_terms = ["圆桌"]

    return {
        "from": from_number,
        "size": off_size,
        "query": {
            "bool": {
                "must": [
                    # Copy into a fresh list so the caller's iterable is not shared.
                    {"terms": {"title": list(title_terms)}},
                ]
            }
        },
    }

def set_search_optional_ner(final_results, from_number, off_size):
    """Build the search body that looks documents up by id and returns ``rel``.

    Args:
        final_results: Document ids placed (by reference) in the ``ids`` query.
        from_number: Offset of the first hit to return (``from``).
        off_size: Maximum number of hits to return (``size``).

    Returns:
        A dict usable as the request body of a search call; ``_source`` is
        limited to the ``rel`` field.
    """
    ids_query = {"ids": {"values": final_results}}
    return {
        "from": from_number,
        "size": off_size,
        "query": ids_query,
        "_source": ["rel"],
    }


def split_samples(data, min_pers=3):
    """Split the hits of a ``*_ner`` search response into negative/positive ids.

    A hit is positive when its ``rel`` entries mention at least ``min_pers``
    distinct, non-empty ``per`` values and every entry has a non-empty
    ``title``; otherwise it is negative.

    Missing or empty ``per`` values are not counted as a person, and a missing
    ``title`` is treated the same as an empty one. (Previously a missing
    ``per`` contributed ``None`` to the distinct count and a missing ``title``
    was not recognised as empty.)

    Args:
        data: Search response dict with hits under ``["hits"]["hits"]``; each
            hit needs ``_id`` and ``_source``.
        min_pers: Minimum number of distinct persons for a positive sample.

    Returns:
        A ``(negative_samples, positive_samples)`` tuple of id lists, each in
        response order.
    """
    negative_samples = []
    positive_samples = []
    for hit in data["hits"]["hits"]:
        # `or []` also covers an explicit null `rel` field.
        rels = hit["_source"].get("rel") or []

        # Distinct person names extracted for this document.
        pers = {rel.get("per") for rel in rels if rel.get("per")}
        # True when any relation was extracted without a usable title.
        has_empty_title = any(not rel.get("title") for rel in rels)

        if len(pers) < min_pers or has_empty_title:
            negative_samples.append(hit["_id"])
        else:
            positive_samples.append(hit["_id"])

    return negative_samples, positive_samples

    
    

if __name__ == '__main__':
    # Step 1: fetch the first 1000 hits (at most) of the title-keyword query from *_bl2.
    all_results_bl2 = search_bl2(0, 1000)

    # Step 2: take the ids of the bl2 hits; they are passed to the *_ner lookup to obtain `rel`.
    bl2_ids_results = get_id_list(all_results_bl2)
    print("从bl2获取的id的长度：", len(bl2_ids_results))

    # Step 3: read `rel` for those ids from *_ner and split the ids into negative/positive.
    all_results_ner = search_ner(bl2_ids_results, 0, 1000)
    negative_samples, positive_samples = split_samples(all_results_ner)

    # Build the sets once: they serve both the counts below and the O(1)
    # membership tests in the write loop.
    negative_ids = set(negative_samples)
    positive_ids = set(positive_samples)

    print("length of total samples from ner:", len(negative_ids) + len(positive_ids))
    print("length of negative samples:", len(negative_ids))
    print("length of positive samples:", len(positive_ids))

    def _sample_line(label, item):
        """Format one bl2 hit as ``label<TAB>id<TAB>title//content`` plus newline."""
        # Newlines are removed from the extracted content only, not from the title.
        content = "".join(sentencesMaker(item["_source"]["content"])).replace("\n", "")
        return label + "\t" + item["_id"] + "\t" + item["_source"]["title"] + "//" + content + "\n"

    # Step 4: go back to the bl2 hits to get title/content for each classified id.
    # Files are opened in append mode, so repeated runs accumulate lines.
    # Note the label convention: negative samples are written with "1", positive with "0".
    with open("./negative_samples.txt", "a", encoding="utf-8") as nf, open("./positive_samples.txt", "a", encoding="utf-8") as pf:
        for item in all_results_bl2["hits"]["hits"]:
            if item["_id"] in negative_ids:
                nf.write(_sample_line("1", item))
            if item["_id"] in positive_ids:
                pf.write(_sample_line("0", item))



#     with open("./negative_dasai.txt", "w", encoding="utf-8") as nf, open("./positive_dasai.txt", "w", encoding="utf-8") as pf:  
#         for item in all_results_bl2["hits"]["hits"]:
#             if item["_id"] in negative_samples:
#                 nf.write("1" + "\t" + item["_id"] + "\t" + item["_source"]["title"]+"//" + "".join(sentencesMaker(item["_source"]["content"])).replace("\n", "") + "\n")
#             if item["_id"] in positive_samples:
#                 pf.write("0" + "\t" + item["_id"] + "\t" + item["_source"]["title"]+"//" + "".join(sentencesMaker(item["_source"]["content"])).replace("\n", "") + "\n")

从bl2获取的id的长度： 93
length of total samples from ner: 92
length of negative samples: 62
length of positive samples: 30


In [403]:
'''
    剔除结果中重复的文章
'''
def _unique_and_complete(lines):
    """Deduplicate ``lines`` and select the ones with exactly three fields.

    Returns ``(unique_lines, complete_lines)``. Both keep first-occurrence
    order, so the filtered files are written deterministically. A line is
    complete when, after stripping, it splits into exactly 3 tab-separated
    fields.
    """
    unique_lines = list(dict.fromkeys(lines))
    complete_lines = [line for line in unique_lines if len(line.strip().split("\t")) == 3]
    return unique_lines, complete_lines


def _share(part, total):
    """Return ``part / total``, or 0.0 when ``total`` is zero."""
    return part / total if total else 0.0


# Read both inputs first, so the filtered files are not truncated when an
# input file is missing or unreadable.
with open("./negative_samples.txt", "r", encoding="utf-8") as nf, open("./positive_samples.txt", "r", encoding="utf-8") as pf:
    nf_data = nf.readlines()
    pf_data = pf.readlines()

nf_unique, nf_complete = _unique_and_complete(nf_data)
pf_unique, pf_complete = _unique_and_complete(pf_data)

with open("./filtered_negative_samples.txt", "w", encoding="utf-8") as fnf, open("./filtered_positive_samples.txt", "w", encoding="utf-8") as fpf:
    fnf.writelines(nf_complete)
    fpf.writelines(pf_complete)

# Number of lines actually written to each filtered file.
count1 = len(nf_complete)
count2 = len(pf_complete)

print("negative_samples中所有文件长度：", len(nf_data))
print("filtered_negative_samples长度：", len(nf_unique))
print("负样本剔除空内容之后文件长度：", count1)
print("\n")

print("positive_samples中所有文件长度：", len(pf_data))
print("filtered_positive_samples长度：", len(pf_unique))
print("正样本剔除空内容之后文件长度：", count2)
print("\n")

# Class balance among the deduplicated lines (incomplete lines still included).
unique_total = len(nf_unique) + len(pf_unique)
print("包含空样本的负样本占比：%.2f" % _share(len(nf_unique), unique_total))
print("包含空样本的正样本占比：%.2f" % _share(len(pf_unique), unique_total))
print("\n")

# Class balance among the lines that were actually kept.
kept_total = count1 + count2
print("实际负样本占比：%.2f" % _share(count1, kept_total))
print("实际正样本占比：%.2f" % _share(count2, kept_total))

negative_samples中所有文件长度： 20459
filtered_negative_samples长度： 19389
负样本剔除空内容之后文件长度： 19389


positive_samples中所有文件长度： 4044
filtered_positive_samples长度： 3622
正样本剔除空内容之后文件长度： 3622


包含空样本的负样本占比：0.84
包含空样本的正样本占比：0.16


实际负样本占比：0.84
实际正样本占比：0.16


## 测试获取rel中per

In [190]:
'''
    连接es
'''
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# NOTE(review): the endpoint is hard-coded here and rebinds the global `es`
# client used by the search helpers - consider loading it from configuration.
es = Elasticsearch('http://123.206.13.101:59200')

doc_index = '*_ner'
doc_type = 'news'

# Look up two specific documents by id and return only their `rel` field.
query = {
  "query": {
    "ids":{
      "values": ["MzA3MzI4MjgzMw==_2650742342_5",
      "4643dd433d0c74494430b0147a92d50b"]
    }
  }, 
  "_source":  ["rel"]
  
}


# es.search can return the results as one batch
# helpers.scan still returns everything even when size is set

full_data2 = es.search(index=doc_index, doc_type=doc_type, body=query) 

In [235]:
# For each returned hit, print every `per` value found in its `rel` entries,
# followed by the number of distinct values.
for i in full_data2["hits"]["hits"]:
    name = []  # `per` values of this hit, duplicates included
    print("******************")
    for rel in i['_source'].get('rel',[]):
        print(rel.get('per'))
        name.append(rel.get('per'))
    print(len(set(name)))  # count of distinct `per` values

******************
凌小宁
凌小宁
凌小宁
张宏江
凌小宁
凌小宁
凌小宁
凌小宁
2
******************
王坚
唐文斌
路人王
3


In [258]:
# Print the set of distinct `per` values for each hit; an entry without a
# `per` key contributes None to the set.
for i in full_data2["hits"]["hits"]:
#     print(i)
    pers = set([rels.get("per", None) for rels in i['_source'].get('rel',[])])
    print(pers)

{'凌小宁', '张宏江'}
{'唐文斌', '路人王', '王坚'}


In [317]:
# Collect the distinct `title` values of each hit's `rel` entries.
# NOTE(review): the print below sits outside the loop, so only the titles of
# the last hit are shown and kept in `titles` - confirm this is intended.
for i in full_data2["hits"]["hits"]:
#     print(i)
    titles = set([rels.get("title", None) for rels in i['_source'].get('rel', [])])
print(titles)

{'', '联合创始人', '主席'}


In [318]:
# Print 0 when an empty-string title is present, 1 otherwise.
# Relies on `titles` as left behind by the preceding cell.
if "" in titles:
    print(0)
else:
    print(1)

0
