In [1]:
import json
from tqdm import tqdm
import os, re
import numpy as np
import pandas as pd

In [2]:
def delete_tag(s):
    
    s = re.sub('\{IMG:.?.?.?\}', '', s)                    #图片
    s = re.sub(re.compile(r'[a-zA-Z]+://[^\s]+'), '', s)   #网址
    s = re.sub(re.compile('<.*?>'), '', s)                 #网页标签
    s = re.sub(re.compile('&[a-zA-Z]+;?'), ' ', s)         #网页标签
    s = re.sub(re.compile('[a-zA-Z0-9]*[./]+[a-zA-Z0-9./]+[a-zA-Z0-9./]*'), ' ', s)
    s = re.sub("\?{2,}", "", s)
    s = re.sub("\r", "", s)
    s = re.sub("\n", ",", s)
    s = re.sub("\t", ",", s)
    s = re.sub("（", ",", s)
    s = re.sub("）", ",", s)
    s = re.sub("\u3000", "", s)
    s = re.sub(" ", "", s)
    r4 = re.compile('\d{4}[-/]\d{2}[-/]\d{2}')             #日期
    s=re.sub(r4,'某时',s)    
    return s

def cut_sentences(content):
    # 结束符号，包含中文和英文的
    end_flag = ['。',';','；']

    content_len = len(content)
    sentences = []
    tmp_char = ''
    for idx, char in enumerate(content):
        # 拼接字符
        tmp_char += char

        # 判断是否已经到了最后一位
        if (idx + 1) == content_len:
            sentences.append(tmp_char)
            break

        # 判断此字符是否为结束符号
        if char in end_flag:
            # 再判断下一个字符是否为结束符号，如果不是结束符号，则切分句子
            next_idx = idx + 1
            if not content[next_idx] in end_flag:
                sentences.append(tmp_char)
                tmp_char = ''

    return sentences

def metl_data(df):
    z = df.groupby(['uid'])['content'].apply(lambda x:np.concatenate(list(x))).reset_index()
    i = pd.concat([pd.Series(row['uid'], row['content']) for _, row in z.iterrows()]).reset_index()
    i.columns = ['uid','content']
    return i

def get_data():
    train = pd.read_csv('./data/event_entity_train_data_label.csv',sep='\t',header=None,names=['uid','content','content_type','entity'])
    test = pd.read_csv('./data/event_entity_dev_data.csv',sep='\t',header=None,names=['uid','content'])
    train = train[~train.content_type.isnull()].drop_duplicates().reset_index(drop = True)
    train['content'] = train['content'].apply(lambda x:cut_sentences(x))
    train['content'] = list(map(lambda x,y:[i for i in x if y in i], train['content'],train['entity']))
    train_n = metl_data(train)
    train = train_n.merge(train[['uid','entity']],how = 'left')
    test['content'] = test['content'].apply(lambda x:cut_sentences(x))
    test = metl_data(test)
    train['content'] = train['content'].apply(lambda x:delete_tag(x))
    test['content'] = test['content'].apply(lambda x:delete_tag(x))
    
    train['content'] = list(map(lambda x,y:x[x.find(y)-200:x.find(y)+200],train['content'],train['entity']))
    return train, test

In [3]:
train, test = get_data()

In [9]:
train.head()

Unnamed: 0,content,uid,entity
0,皖通科技(002331)实控人杨世宁减持360万股比亚迪预计一季度业绩下滑开盘跌停,2001123,比亚迪
1,上海验配眼镜质量抽查:宝山申视宝铧等店不合格萨博销量下滑过半世爵已资不抵债,2001512,萨博
2,江苏金湖通报疫苗过期三大主因县疾控中心领导班子已全免职山东墨龙(002490)业绩“变脸”及...,2001958,山东墨龙
3,原标题：斐讯0元购陷阱买家数亿资金被套无法提现责任编辑：柯金定,2003028,斐讯
4,"2018年8月27日?-?就无法提现的问题,健康猫给出了以下处理方案:自签订协议起,每6个月...",2004193,健康猫


In [7]:
test.head()

Unnamed: 0,content,uid
0,"比如北京蓝天瑞德环保技术股份有限公司,,，经营业绩异常波动、涉及重大诉讼、公司银行账户被冻结...",2146341
1,03Ø风险中心由此线索深挖出福佑卡车数据造假、财务严重违规操作、多位核心高管知情并串谋、横跨...,2146779
2,2018年1月24日，公司发布了《海南航空控股股份有限公司重大资产重组停牌公告》(公告编号：...,2146933
3,三、被告广州市财京融资担保有限公司对被告鞠佳珍、谭厚兵的上述债务承担连带清偿责任。,2148028
4,4、2014年4月9日，詹庆辉向北京市海淀区人民法院申请司法轮候冻结合慧伟业持有本公司全部股...,2150490


In [5]:
def list_find(list1, list2):
    """在list1中寻找子串list2，如果找到，返回第一个下标；
    如果找不到，返回-1。
    """
    n_list2 = len(list2)
    for i in range(len(list1)):
        if list1[i: i+n_list2] == list2:
            return i
    return -1

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41418 entries, 0 to 41417
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  41418 non-null  object
 1   uid      41418 non-null  int64 
 2   entity   41418 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [21]:
train['tag'] = None
for idx, row in train.iterrows():
#     content = row['content']
#     entity = row['entity']
    
    tag = list('O' * len(row['content']))
#     tag = row['tag']
    start_pos = list_find(row['content'], row['entity'])
    if start_pos != -1:
        end_pos = start_pos + len(row['entity'])
        tag[start_pos] = 'B'
        for i in range(start_pos+1, end_pos):
            tag[i] = 'I' 
    #row['tag'] = ''.join(tag)
    train.at[idx, 'tag'] = tag

In [22]:
train.head()

Unnamed: 0,content,uid,entity,tag
0,皖通科技(002331)实控人杨世宁减持360万股比亚迪预计一季度业绩下滑开盘跌停,2001123,比亚迪,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,上海验配眼镜质量抽查:宝山申视宝铧等店不合格萨博销量下滑过半世爵已资不抵债,2001512,萨博,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,江苏金湖通报疫苗过期三大主因县疾控中心领导班子已全免职山东墨龙(002490)业绩“变脸”及...,2001958,山东墨龙,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,原标题：斐讯0元购陷阱买家数亿资金被套无法提现责任编辑：柯金定,2003028,斐讯,"[O, O, O, O, B, I, O, O, O, O, O, O, O, O, O, ..."
4,"2018年8月27日?-?就无法提现的问题,健康猫给出了以下处理方案:自签订协议起,每6个月...",2004193,健康猫,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [25]:
test['tag'] = None
for idx, row in test.iterrows():
#     content = row['content']
#     entity = row['entity']
    
    tag = list('O' * len(row['content']))
#     tag = row['tag']
    start_pos = list_find(row['content'], row['entity'])
    if start_pos != -1:
        end_pos = start_pos + len(row['entity'])
        tag[start_pos] = 'B'
        for i in range(start_pos+1, end_pos):
            tag[i] = 'I' 
    #row['tag'] = ''.join(tag)
    test.at[idx, 'tag'] = tag

KeyError: 'entity'