In [16]:
import pandas as pd
import re
from collections import Counter

def get_list_kv(re_path, sentence):
    wanted = {}
    re_expressions = pd.read_csv(re_path)['re'].values
    k = 1
    for re_expression in re_expressions:
        finds = re.findall(re_expression, sentence)
        if finds:
            if len(finds[0]) == 2:
                for v in finds:
                    name1 = '套餐{}名称'.format(k)
                    name3 = '套餐{}还剩余'.format(k)
                    k+=1
                    wanted.update({name1:v[0],name3:v[1]})
            elif len(finds[0]) == 3:
                for v in finds:
                    name1 = '套餐{}名称'.format(k)
                    name2 = '套餐{}已使用'.format(k)
                    name3 = '套餐{}还剩余'.format(k)
                    k+=1
                    wanted.update({name1:v[0], name2:v[1],name3:v[2]})
    return wanted


def get_kv(re_path, sentence):
    wanted = []
    re_csv = pd.read_csv(re_path)
    for i in re_csv.index:
        re_expression = re_csv.loc[i]['re']
        find = re.search(re_expression, sentence)
        if find:
            key = re_csv.loc[i]['key']
            cn_key = re_csv.loc[i]['cn_key']
            #shengjian's regular expression's group name is cn_key
            if key.startswith('jsheng'):
                value = find.group(cn_key)
            else:
                value = find.group(key)
                
            wanted.append({'find': find, \
                           'key': key, \
                           'cn_key': cn_key, \
                           'value': value, \
                          'level': re_csv.loc[i]['level']})
    return wanted

def get_result(sentence,kre_path, kre_list_path = None):

    #the output is dict object
    if kre_list_path is not None:
        list_kv = get_list_kv(kre_list_path, sentence)
    else:
        list_kv ={}
    #the output is a dict list
    kv = get_kv(kre_path, sentence)
    
    if len(kv) == 0:
        return list_kv
    kv_df = pd.DataFrame(kv)
    #selected level higher values
    def get_level(df):
        if df.shape[0]< 2:
            return df
        level_values = df.sort_values("level",ascending=False)['level'].values
        return df[df.level == level_values[0]]
    kv_df = kv_df.groupby(['cn_key']).apply(get_level)
    
    #vote same (cn_key, value) ,drop other cn_key but not same values
    def vote(df):
        if df.shape[0]< 3:
            return df
        all_values = df['value'].values
        if len(all_values) == len(set(all_values)):
            return df
        v_c = Counter(all_values)
        most_v = v_c.most_common(1)
        return df[df.value==most_v[0][0]]
    kv_df = kv_df.groupby(['cn_key']).apply(vote)
    
    # drop the same (cn_key, value) item 
    kv_df = kv_df.drop_duplicates(['cn_key', 'value'])
    
    cn_key_list = kv_df['cn_key'].values
    
    #if all (cn_key, value) is unique, return result
    if len(cn_key_list) == len(set(cn_key_list)):
        re_r =  dict(zip(kv_df['cn_key'].values, kv_df['value'].values))
        re_r.update(list_kv)
        return re_r
    
    #choose  min length find when cn_key is same
    def get_minlen_find(df):
        find_length = [len(str(item)) for item in df['find'].values]
        df['find_length'] = find_length
        min_len = min(find_length)
        return df[df.find_length == min_len]
    kv_df= kv_df.groupby(['cn_key']).apply(get_minlen_find)
    
    #if all (cn_key, value) is unique, return result
    cn_key_list = kv_df['cn_key'].values
    if len(cn_key_list) == len(set(cn_key_list)):
        re_r =  dict(zip(kv_df['cn_key'].values, kv_df['value'].values))
        re_r.update(list_kv)
        return re_r
   
    #choose  min length value when cn_key is same
    def get_minlen_value(df):
        find_length = [len(str(item)) for item in df['value'].values]
        df['value_length'] = find_length
        min_len = min(find_length)
        return df[df.value_length == min_len]
    kv_df= kv_df.groupby(['cn_key']).apply(get_minlen_value)
    
    re_r =dict(zip(kv_df['cn_key'].values, kv_df['value'].values))
    re_r.update(list_kv)
    return re_r

In [17]:
raw_sentence = '12月手机账单：12月01日至12月31日，优惠后个人实际消费82.57元，家庭统一支付成员消费13.60元，合计96.17元。回复“11”查询当前余额，查费缴费敬请登录手机营业厅 http://dx.10086.cn/gjdxsy【中国移动】'
result = get_result(raw_sentence, kre_path='data/kre_914.csv')
print('='*100)
for k,v in result.items():
    print('{} : {}'.format(k, v))

[0 0]
[0 0]
[1 0 0 0 0]
截止时间 : 12月31日
消费金额 : 96.17元
账单月份 : 12月


Defaulting to column, but this will raise an ambiguity error in a future version


In [390]:
tracffic_data = pd.read_excel('data/北京中移在线/智能短信模板数据20180827.xls', sheet_name='流量类')

for i in tracffic_data.index:
    sentence = tracffic_data.loc[i]['短信原文']
    if sentence.find('余额-') != -1:
        print('='*40+'%s'%i+'='*40+'\n%s\n'%sentence)
        result = get_result(sentence)
        for k,v in result.items():
            print('-'*50+'\n'+'{}:{}\n'.format(k, v))

【话费流量提醒】：截止01月15日00时00分，您当月话费已消费195.67元，话费余额-28.21元；上月结转至本月的国内流量套餐为388.59M。您的上网套餐流量已用完，超出流量667.00M，超出流量费用合计193.43元，回复KTAXB即可免费办理流量安心包服务，套餐之外流量资费更优惠，次日生效，回复AXB即可了解安心包的服务规则。如需以下服务，请直接回复序号：

--------------------------------------------------
截止时间:01月15日00时00分

--------------------------------------------------
上月不清零国内流量:388.59M

--------------------------------------------------
话费余额:-28.21元

--------------------------------------------------
当前话费:195.67元

--------------------------------------------------
消费金额:195.67元

--------------------------------------------------
流量状态:已用完

--------------------------------------------------
套餐1名称:上网套餐流量

--------------------------------------------------
套餐1还剩余:已用完

话费、流量、账单、积分等信息查询请点击：http://dx.10086.cn/wxgrzy  一、话费使用情况： 您为合帐账户截止10月31日09时00分，您当月合帐账户话费已消费253.64元，话费余额-31.22元； 二、流量使用情况: 您的国内流量套餐已使用完；省内流量套餐已使用完；视频定向流量月包（话费扣费）-24元-爱奇艺已使用0MB，还剩余30720.00M；套餐外流量已使用468.00M，套餐外流量总费用为135.72元，回复KTAXB即可免费办理流量安心包服务，套餐之外流量资费更优惠，次日生效，回复AXB即可了解安心包的服务规则。 您有10元话费待领取！

In [358]:
find_data = []
for i in call_data.index:
    sentence = call_data.loc[i]['短信原文'] 
    #print('='*40+'%s'%i+'='*40+'\n%s\n'%sentence)
    result = get_result(sentence)
    for k,v in result.items():
        #print('-'*50+'\n'+'{}:{}\n'.format(k, v))
        item += '({}:{})'.format(k, v)
    find_data.append(item)
call_data['we_find'] = find_data