In [1]:
import json
import os
import re
import sys
from tqdm import tqdm
from tika import parser
import pandas as pd
import uuid
import markdown
from pathlib import Path
from html.parser import HTMLParser
from ast import literal_eval

def df2squad(df, squad_version="v1.1", output_dir=None, filename=None):
    """
     Converts a pandas dataframe with columns ['title', 'body_text_parags'] to a
     dict in SQuAD format and optionally writes it to a json file.

     Every row becomes one entry of ``data``: its title is copied and each
     element of its 'body_text_parags' becomes a paragraph ``context`` with an
     empty ``qas`` list.

     Parameters
    ----------
     df : pandas.DataFrame
         a pandas dataframe with columns ['title', 'body_text_parags'];
         'body_text_parags' holds the list of paragraph strings of the row
         (a single string is treated as one paragraph)
     squad_version : str, optional
         the SQuAD dataset version format (the default is 'v1.1')
     output_dir : str, optional
         directory in which the json file is written, created when missing;
         nothing is written when it is None or empty (the default is None)
     filename : str, optional
         name of the output file without the '.json' extension
         (the default is None, which falls back to 'squad_<squad_version>')

    Returns
    -------
    json_data: dict
        A json object with SQuAD format

    Raises
    ------
    KeyError
        if a row has no 'title' or no 'body_text_parags' entry

    """

    json_data = {"version": squad_version, "data": []}

    # iterrows() has no length, so hand tqdm the row count to get a real bar.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        title = row["title"]
        paragraphs = row["body_text_parags"]
        if isinstance(paragraphs, str):
            # Looping over a bare string would emit one context per character.
            paragraphs = [paragraphs]
        json_data["data"].append(
            {
                "title": title,
                "paragraphs": [{"context": text, "qas": []} for text in paragraphs],
            }
        )

    if output_dir:
        if filename is None:
            # Without this the file would literally be called 'None.json'.
            filename = "squad_{}".format(squad_version)
        os.makedirs(output_dir, exist_ok=True)
        out_path = os.path.join(output_dir, "{}.json".format(filename))
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        with open(out_path, "w", encoding="utf-8") as outfile:
            json.dump(json_data, outfile, ensure_ascii=False)

    return json_data

In [3]:
import  pandas as pd

# Folder holding the dataset files, relative to the working directory.
# Building the paths with Path keeps them valid on any operating system.
data_dir = Path('疫情政务问答助手数据集')

# The files are tab-separated despite their .csv extension.
train_df = pd.read_csv(data_dir / 'NCPPolicies_train_20200301.csv', sep='\t', encoding='utf-8')

# Some context lines carry extra tab characters and so do not parse into the
# expected fields. on_bad_lines='warn' skips them and reports each one, which
# is what error_bad_lines=False did before it was removed in pandas 2.0
# (on_bad_lines needs pandas >= 1.3).
context_df = pd.read_csv(data_dir / 'NCPPolicies_context_20200301.csv', sep='\t', on_bad_lines='warn', encoding='utf-8')

b'Skipping line 134: expected 2 fields, saw 3\nSkipping line 2130: expected 2 fields, saw 3\nSkipping line 2877: expected 2 fields, saw 3\nSkipping line 2955: expected 2 fields, saw 3\nSkipping line 2974: expected 2 fields, saw 3\nSkipping line 3038: expected 2 fields, saw 3\nSkipping line 3052: expected 2 fields, saw 3\nSkipping line 3053: expected 2 fields, saw 3\nSkipping line 3080: expected 2 fields, saw 3\nSkipping line 3086: expected 2 fields, saw 3\nSkipping line 3094: expected 2 fields, saw 3\nSkipping line 3115: expected 2 fields, saw 22\nSkipping line 3180: expected 2 fields, saw 3\nSkipping line 3189: expected 2 fields, saw 3\nSkipping line 3191: expected 2 fields, saw 3\nSkipping line 3197: expected 2 fields, saw 3\nSkipping line 3203: expected 2 fields, saw 3\nSkipping line 3209: expected 2 fields, saw 3\nSkipping line 3215: expected 2 fields, saw 3\nSkipping line 3611: expected 2 fields, saw 3\nSkipping line 3613: expected 2 fields, saw 3\nSkipping line 3716: expected 2 f

In [4]:
# Left-join the training questions onto their documents by docid, then drop
# every row that contains a missing value (this includes documents that
# matched no question).
merge_df = context_df.merge(train_df, on='docid', how='left').dropna()
merge_df.head()

Unnamed: 0,docid,text,id,question,answer
0,edd1413c78e534afb136f36fdc9c9a00,福建：6部门联合出台暖企措施支持复工稳岗 为解决企业复产的用工困难，经省政府同意，省人社厅、...,fdc51a7baeff3fafbae6736422783528,福建联合出台暖企措施支持复工稳岗的部门都有谁？,省人社厅、省工信厅、省教育厅、省财政厅、省交通运输厅、省卫健委
1,edd1413c78e534afb136f36fdc9c9a00,福建：6部门联合出台暖企措施支持复工稳岗 为解决企业复产的用工困难，经省政府同意，省人社厅、...,6b7b0c209ac939afa43f030d67433178,福建政府针对引入本地劳动力的、未经有关机构确认的疫情物资生产企业做何补助？,一次性用工服务奖补标准最高提到每人2000元。
3,8bab952dfa1a367da1b8e7ad864d766b,吉林多措并举保民生 2月18日，省政府新闻办围绕就业和医疗保障等民生问题召开新闻发布会。省...,2c3b39b675493830b4d1f2f4f2a436f9,吉林省各级医疗保障部门为百姓提供哪些便捷服务？,全面实行“非必须、不窗口”经办服务，通过“网上办”“掌上办”“延长时限办”“后期补办”等方式...
4,8bab952dfa1a367da1b8e7ad864d766b,吉林多措并举保民生 2月18日，省政府新闻办围绕就业和医疗保障等民生问题召开新闻发布会。省...,9a9acefa0f533ca99fd174aa89b7f600,吉林省提前预付医保基金总额达到了多少？,1.22亿元
8,a84ad5c0fc7d3755926ec5bf2b2d9dae,重庆出台援企稳岗返还政策 2月16日，记者从市人力社保局获悉，为切实减轻中小企业负担，充分发...,a505ce9785333655aae92fabba500f70,重庆援企稳岗返还政策的申请条件是什么？,依法参加社会保险并足额缴纳2019年度社会保险费（截至2019年12月31日无欠费），企业2...


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(r'chinese-bert-wwm-ext')

# Every distinct character found in the document texts. Sorting fixes the
# order of the additions; plain set iteration order can change between
# interpreter sessions, which would reshuffle the added tokens.
chars = sorted(set(''.join(merge_df['text'].unique())))

# Fetch the vocabulary once instead of on every loop iteration.
known_tokens = tokenizer.get_vocab()

# Quotation marks and the dash are always proposed; the rest are the corpus
# characters the pretrained vocabulary does not know.
add_vocabs = ['’','‘','“','”','—']
for ch in chars:
    # Whitespace is skipped: the vocabulary file is written one token per
    # line and stripped when read back, so such a token would come back as ''.
    if ch.isspace() or ch in known_tokens:
        continue
    add_vocabs.append(ch)
 
 

In [7]:
# Append to voc.txt only the tokens it does not hold yet, so running this cell
# again does not pile up duplicate lines.
with open('voc.txt','a+',encoding = 'utf-8') as f:
    f.seek(0)  # 'a+' opens at the end of the file; rewind to read what is there
    seen = {line.rstrip('\n') for line in f}
    for i in add_vocabs:
        token = str(i)
        if token not in seen:
            # In append mode every write lands at the end of the file.
            f.write(token +'\n')
            seen.add(token)

In [8]:
with open('voc.txt','r',encoding = 'utf-8') as f:
    # strip() turns blank or whitespace-only lines into '', which must not be
    # registered as a token. dict.fromkeys drops repeated lines while keeping
    # the order in which tokens first appear in the file.
    stripped_lines = (line.strip() for line in f)
    add_vocabs = list(dict.fromkeys(token for token in stripped_lines if token))

# The cell displays the value returned by add_tokens.
tokenizer.add_tokens(add_vocabs)

132

In [14]:
def checkAnyStop(token_list, token_stops):
    """
    Tells whether at least one of the stop tokens occurs in token_list.
    """
    for candidate in token_stops:
        if candidate in token_list:
            return True
    return False

def firstFullStopIdx(token_list, token_stops):
    """
    Finds the earliest position at which any of the stop tokens occurs.

    Returns the smallest index of token_list holding one of token_stops,
    or None when none of them is present.
    """
    positions = [token_list.index(mark) for mark in token_stops if mark in token_list]
    if not positions:
        return None
    return min(positions)


# Punctuation marks after which a long text may be cut into chunks.
puncts = ['？', '。', '?', '；','...',"！","!",'，']
# Each mark is represented by the first token the tokenizer yields for it.
# NOTE(review): a multi-character mark such as '...' may be tokenized into
# several pieces, in which case only its first piece is kept here - confirm.
puncts_tokens = list(map(lambda mark: tokenizer.tokenize(mark)[0], puncts))

def splitTokens(tokens, punct_tokens, split_length):
    """
    Splits a token list into chunks of roughly split_length tokens.

    To avoid cutting a sentence in half and losing its meaning, each cut is
    placed right after the stop token closest to position split_length,
    looking both backwards and forwards from that position. A chunk can
    therefore be shorter or longer than split_length. When the two closest
    stops are equally far away, the later one is used.

    Parameters
    ----------
    tokens : list
        the tokens to split
    punct_tokens : list
        the tokens after which a cut is allowed
    split_length : int
        the target number of tokens per chunk, must not be negative

    Returns
    -------
    list
        the chunks, each a list of tokens, in their original order

    Raises
    ------
    ValueError
        if split_length is negative
    """
    if split_length < 0:
        # Negative lengths turn the slices below into end-relative ones and
        # can keep the loop from ever consuming the remaining tokens.
        raise ValueError("split_length must not be negative")

    splitted_tokens = []
    while len(tokens) > 0:
        if len(tokens) < split_length or not checkAnyStop(tokens, punct_tokens):
            # Short enough, or nowhere to cut: the rest is the last chunk.
            splitted_tokens.append(tokens)
            break
        # Scanning the reversed window gives the distance from the window's
        # last token back to the closest stop inside it.
        prev_stop_idx = firstFullStopIdx(tokens[:split_length][::-1], punct_tokens)
        # Distance from the first token after the window to the next stop.
        next_stop_idx = firstFullStopIdx(tokens[split_length:], punct_tokens)
        # checkAnyStop succeeded, so at least one of the two is not None.
        if next_stop_idx is None or (prev_stop_idx is not None and prev_stop_idx < next_stop_idx):
            cut = split_length - prev_stop_idx
        else:
            cut = split_length + next_stop_idx + 1
        splitted_tokens.append(tokens[:cut])
        tokens = tokens[cut:]
    return splitted_tokens

def splitParagraph(text, split_length=512):
    """
    Tokenizes text, cuts the tokens into chunks with splitTokens and returns
    each chunk converted back to a string with all spaces removed.
    """
    chunks = splitTokens(tokenizer.tokenize(text), puncts_tokens, split_length)
    paragraphs = []
    for chunk in chunks:
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        paragraphs.append(chunk_text.replace(' ',''))
    return paragraphs


In [15]:
tqdm.pandas(desc='pandas bar')

# Rename by label instead of overwriting the whole column list: assigning a
# five-name list raises a length mismatch once 'body_text_parags' exists,
# that is, whenever this cell is run a second time.
merge_df = merge_df.rename(columns={'text': 'paragraphs', 'id': 'title'})
# Cut every document into chunks; progress_apply shows the bar set up above.
merge_df['body_text_parags'] = merge_df['paragraphs'].progress_apply(splitParagraph)

merge_df.head()


pandas bar: 100%|█████████████████████████████████████████████████████████████████| 4997/4997 [00:38<00:00, 131.43it/s]


Unnamed: 0,docid,paragraphs,title,question,answer,body_text_parags
0,edd1413c78e534afb136f36fdc9c9a00,福建：6部门联合出台暖企措施支持复工稳岗 为解决企业复产的用工困难，经省政府同意，省人社厅、...,fdc51a7baeff3fafbae6736422783528,福建联合出台暖企措施支持复工稳岗的部门都有谁？,省人社厅、省工信厅、省教育厅、省财政厅、省交通运输厅、省卫健委,[福建：6部门联合出台暖企措施支持复工稳岗为解决企业复产的用工困难，经省政府同意，省人社厅、...
1,edd1413c78e534afb136f36fdc9c9a00,福建：6部门联合出台暖企措施支持复工稳岗 为解决企业复产的用工困难，经省政府同意，省人社厅、...,6b7b0c209ac939afa43f030d67433178,福建政府针对引入本地劳动力的、未经有关机构确认的疫情物资生产企业做何补助？,一次性用工服务奖补标准最高提到每人2000元。,[福建：6部门联合出台暖企措施支持复工稳岗为解决企业复产的用工困难，经省政府同意，省人社厅、...
3,8bab952dfa1a367da1b8e7ad864d766b,吉林多措并举保民生 2月18日，省政府新闻办围绕就业和医疗保障等民生问题召开新闻发布会。省...,2c3b39b675493830b4d1f2f4f2a436f9,吉林省各级医疗保障部门为百姓提供哪些便捷服务？,全面实行“非必须、不窗口”经办服务，通过“网上办”“掌上办”“延长时限办”“后期补办”等方式...,[吉林多措并举保民生2月18日，省政府新闻办围绕就业和医疗保障等民生问题召开新闻发布会。省人...
4,8bab952dfa1a367da1b8e7ad864d766b,吉林多措并举保民生 2月18日，省政府新闻办围绕就业和医疗保障等民生问题召开新闻发布会。省...,9a9acefa0f533ca99fd174aa89b7f600,吉林省提前预付医保基金总额达到了多少？,1.22亿元,[吉林多措并举保民生2月18日，省政府新闻办围绕就业和医疗保障等民生问题召开新闻发布会。省人...
8,a84ad5c0fc7d3755926ec5bf2b2d9dae,重庆出台援企稳岗返还政策 2月16日，记者从市人力社保局获悉，为切实减轻中小企业负担，充分发...,a505ce9785333655aae92fabba500f70,重庆援企稳岗返还政策的申请条件是什么？,依法参加社会保险并足额缴纳2019年度社会保险费（截至2019年12月31日无欠费），企业2...,[重庆出台援企稳岗返还政策2月16日，记者从市人力社保局获悉，为切实减轻中小企业负担，充分发...


In [16]:
# Show the chunks produced for the row whose index label is 0.
merge_df.loc[0, 'body_text_parags']


['福建：6部门联合出台暖企措施支持复工稳岗为解决企业复产的用工困难，经省政府同意，省人社厅、省工信厅、省教育厅、省财政厅、省交通运输厅、省卫健委联合下发通知，出台一系列暖企措施支持疫情防控期间复工稳岗。通知明确，切实发挥各级农民工工作领导小组办公室的统筹协调作用,加强劳务用工有效对接，对具备外出务工条件、可成规模输送到我省用工地，并在出行前14天内及在途没有相关症状的，可由用工地和输出地联合开展“点对点、一站式”直达企业的专门运输。省级公共就业服务机构可与主要劳务输出省份签订劳务协作协议、设立劳务协作工作站，对每个工作站给予一次性10万元就业服务经费补助。鼓励优先聘用本地劳务人员。未经省应对新冠肺炎疫情工作有关机构确认的疫情防控急需物资生产企业引进劳动力的，一次性用工服务奖补标准最高提到每人2000元。对上述企业坚持在生产一线工作的职工，给予每人每天100元的生活补助，纳入一次性用工服务奖补范畴。对春节当月至疫情一级响应结束月，采取稳定职工队伍保持连续生产的企业，给予一次性稳就业奖补。加大失业保险稳岗返还力度，将中小微企业稳岗返还政策裁员率标准调整为不高于上年度全国调查失业率的控制目标，对参保职工30人（含）以下的企业，',
 '裁员率调整为不超过企业参保职工总数的20%。对不裁员或少裁员，符合条件的参保企业，可返还其上年度实际缴纳失业保险费的50%。对受疫情影响面临暂时性生产经营困难且恢复有望、坚持不裁员或少裁员、符合条件的参保企业，按6个月的当地月人均失业保险金和参保职工人数落实失业保险稳岗返还政策。加强职业技能培训，鼓励技工院校学生在符合疫情防控条件下参加实习实训，探索简易岗前技能培训。对企业因生产急需新录用的人员，按每人200元标准一次性给予企业简易岗前技能培训补贴。鼓励实施线上培训，对受疫情影响的企业，在停工期、恢复期组织职工参加各类线上或线下职业培训的，可按规定纳入补贴类培训范围。通知要求，各地要着力提升政策措施的精准度和有效性，提升各类企业享受政策措施的获得感。各类企业要落实落细防控主体责任，严格落实返岗信息登记、班车错峰接送、员工分散用餐、体温监测等具体应对措施，确保复工稳岗和疫情防控两不误。（记者潘园园）']

In [17]:
# Write into the same relative dataset folder the CSV files are read from,
# rather than an absolute path that only exists on one machine.
json_data = df2squad(df=merge_df, squad_version='v1.1',
                     output_dir='疫情政务问答助手数据集',
                     filename='Coronavirus2019'
                                )



4997it [00:00, 6333.57it/s]
