In [144]:
import re

class BaseExtractor(object):
    regex_list = [
        re.compile(r"base_regex")
    ]
    tag_name = "base"
    def extract(self, text):
        string = text.lower()
        ret = []
        for regex in self.regex_list:
            for find in regex.finditer(string):
                ret.append([find.start(1), find.end(1), self.tag_name])
        return ret


class VxExtractor(BaseExtractor):
    regex_list = [
        re.compile(r"微信(?:.{0,5}号)?[:：]?([a-zA-z][a-z_A-z_0-9\_-]{5,19})"),
        re.compile(r"wx(?:.{0,5}号)?[:：]?([a-zA-z][a-z_A-z_0-9\_-]{5,19})")
    ]
    tag_name = "vx"
vx_extractor = VxExtractor()


class QQExtractor(BaseExtractor):
    regex_list = [
        re.compile(r"[Qq][Qq]?(?:.{0,5}群)?[:：]?([1-9][0-9]{5,11})(?:(?!@))"),
        re.compile(r"[Qq][Qq]?.{0,2}微信?[:：]?([1-9][0-9]{5,11})(?:(?!@))") # 微信号不能以数字开头，此时应为qq
    ]
    tag_name = "QQ"
qq_extractor = QQExtractor()


# class MobileExtractor(BaseExtractor):
#     regex_list = [
#         # 大陆手机
#         re.compile(r"((?:[(（]\+?86[)）])?1(?:3\d{3}|5[^4\D]\d{2}|8\d{3}|7(?:[0-35-9]\d{2}|4(?:0\d|1[0-2]|9\d))|9[0-35-9]\d{2}|6[2567]\d{2}|4(?:(?:10|4[01])\d{3}|[68]\d{4}|[579]\d{2}))\d{6})"),
#         # 台湾手机
#         re.compile(r"[(（]886[）)]?09\d{8}")
#     ]
#     tag_name = "mobile"
# mobile_extractor = MobileExtractor()


# class EmailExtractor(BaseExtractor):
#     regex_list = [
#         re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})"),
#     ]
#     tag_name = "email"
# email_extractor = EmailExtractor()

In [None]:
# text = "微信:1827967769"
# ret = qq_extractor.extract(text)
# for r in ret:
#     print(ret, text[r[0]:r[1]])

# text = "qq:2308713823"
# ret = qq_extractor.extract(text)
# for r in ret:
#     print(ret, text[r[0]:r[1]])

# text = "微信:hyl222"
# ret = vx_extractor.extract(text)
# for r in ret:
#     print(ret, text[r[0]:r[1]])

In [22]:
def test_extractor(extractor, tag):
    from glob import glob
    import os
    import pandas as pd

    # extractor = vx_extractor
    # tag = "vx"
    train_dir = "../data/train_data"
    pred_set = set()
    labe_set = set()
    for file_path in glob(os.path.join(train_dir, "*.txt")):
        file_id = os.path.basename(file_path).split(".")[0]
        with open(file_path, "r", encoding="utf8") as r:
            text = r.read()
        pred_entities = extractor.extract(text)
        for start, end, pred_tag in pred_entities:
            assert pred_tag == tag, [pred_tag, tag]
            pred_set.add((file_id, tag, str(start), str(end-1), text[start:end]))
        ann_path = "../data/train_label/"  + file_id + ".csv"
        ann_df = pd.read_csv(ann_path, dtype=str)

        records = ann_df.to_records(index=None)
        records = [tuple(record) for record in records if record[1]==tag]
        labe_set.update(records)
    print("预测值和真实值第一条：")
    print(list(pred_set)[0])
    print(list(labe_set)[0])
    print("=" * 66)


    def precision_score(y_true, y_pred, average='micro'):
        true_entities = set(y_true)
        pred_entities = set(y_pred)

        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)

        score = nb_correct / nb_pred if nb_pred > 0 else 0

        return score

    def recall_score(y_true, y_pred, average='micro', suffix=False):
        true_entities = set(y_true)
        pred_entities = set(y_pred)

        nb_correct = len(true_entities & pred_entities)
        nb_true = len(true_entities)

        score = nb_correct / nb_true if nb_true > 0 else 0

        return score

    def f_score(y_true, y_pred, average='micro', suffix=False):
        true_entities = set(y_true)
        pred_entities = set(y_pred)

        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        score = 2 * p * r / (p + r) if p + r > 0 else 0

        return score

    print(f"for {tag}: ")

    precision = precision_score(labe_set, pred_set)
    print(f"precision: {precision}")

    recall = recall_score(labe_set, pred_set)
    print(f"recall: {recall}")

    f_score = f_score(labe_set, pred_set)
    print(f"f_score: {f_score}")

    sorted_func = lambda x: sorted(list(x), key=lambda y: (int(y[0]), int(y[2]), int(y[3])))
    print("在label，不在pred", sorted_func(labe_set-pred_set))
    print("=" * 66)
    print("在pred, 不在label", sorted_func(pred_set-labe_set))
    print("")

In [10]:
test_extractor(vx_extractor, "vx")

('1503', 'vx', '35', '41', 'wwjshow')
('1503', 'vx', '35', '41', 'wwjshow')
for vx: 
precision: 0.9444444444444444
recall: 0.8947368421052632
f_score: 0.918918918918919
在label，不在pred [('118', 'vx', '80', '86', 'lily潘小芬'), ('1801', 'vx', '56', '63', '57150788')]
在pred, 不在label [('1345', 'vx', '60', '68', 'zzz813222')]



In [145]:
test_extractor(qq_extractor, "QQ")

预测值和真实值第一条：
('1334', 'QQ', '44', '52', '490401933')
('1334', 'QQ', '44', '52', '490401933')
for QQ: 
precision: 0.9285714285714286
recall: 0.9285714285714286
f_score: 0.9285714285714286
在label，不在pred [('272', 'QQ', '4', '12', '415392980'), ('1994', 'QQ', '7', '17', '18618193311')]
在pred, 不在label [('318', 'QQ', '10', '18', '772845851'), ('401', 'QQ', '13', '21', '764957359')]



In [197]:
class MobileExtractor(BaseExtractor):
    regex_list = [
        # 大陆手机
        re.compile(r"(?:电话|联系方式|Tel|TEL|手机|Mobile|聯絡|联系|咨询)?(?:.{0,10}(?:号|号码))?[:：]?((?:[(（]\+?86[)）])?1(?:3\d{3}|5[^4\D]\d{2}|8\d{3}|7(?:[0-35-9]\d{2}|4(?:0\d|1[0-2]|9\d))|9[0-35-9]\d{2}|6[2567]\d{2}|4(?:(?:10|4[01])\d{3}|[68]\d{4}|[579]\d{2}))\d{6})(?:(?!@))"),
#         re.compile(r"([\(（]?\+?886[）\)]?\d{8})")
#         re.compile(r"([(（]886[）)](?:[0-9]{8,9}|[0-9\-]{10,11}))"),
        re.compile(r"(?:电话|联系方式|Tel|TEL|手机|Mobile|聯絡|联系|咨询)?(?:.{0,10}(?:号|号码))?[:：]?([＋\+0-9\-\(\)]{11,18})"),
#         re.compile(r"(?:电话|联系方式|Tel|TEL|手机|Mobile)?(?:.{0,5}(?:号|号码))?[:：]?([+0-9\-\(\)]{11,16})")
        re.compile(r"(（\+?[0-9]{2,3}）(?:(?:[1-9][0-9-]{10, 12})|(?:[1-9][0-9]{8-11})))")
    ]
    tag_name = "mobile"
    
    def extract(self, text):
        string = text.lower()
        ret = []
        for regex in self.regex_list:
            for find in regex.finditer(string):
                if string[find.start(1)]=='(' and string[find.end(1)-1]==')':
                    ret.append([find.start(1)+1, find.end(1)-1, self.tag_name])
                elif string[find.start(1)]=='-' and string[find.end(1)-1]=='-':
                    continue
                elif string[find.end(1)-1] not in list('0123456789'):
                    k = find.end(1)-1
                    while k > find.start(1) + 5:
                        if string[k] not in {')', '-', '）', '(', '（'}:
                            break
                        k -= 1
                    ret.append([find.start(1), k, self.tag_name])
                elif string[find.start(1)] in {')', '）', '-'}:
                    ret.append([find.start(1)+1, find.end(1), self.tag_name])
                elif string[find.start(1)] in {'))', '））', '--'}:
                    ret.append([find.start(1)+2, find.end(1), self.tag_name])
                elif string[find.start(1)] in {')))'}:
                    ret.append([find.start(1)+3, find.end(1), self.tag_name])
                elif string[find.end(1)] == '&':
                    continue
                elif len(string[find.start(1): find.end(1)]) == 18 and string[find.start(1)] == '1' and '-' not in string[find.start(1): find.end(1)] and '(' not in string[find.start(1): find.end(1)]:
                    ret.append([find.start(1), find.start(1)+11, self.tag_name])
                    ret.append([find.start(1)+11, find.start(1)+22, self.tag_name])
                elif string[find.start(1): find.start(1)+4] in {'2015','2016','2017','2018','2019','2020'}:
                    continue
                else:
                    ret.append([find.start(1), find.end(1), self.tag_name])
        return ret
    
mobile_extractor = MobileExtractor()

In [198]:
test_extractor(mobile_extractor, "mobile")

预测值和真实值第一条：
('481', 'mobile', '51', '61', '13501379817')
('712', 'mobile', '48', '60', '+886287713553')
for mobile: 
precision: 0.8216783216783217
recall: 0.8483754512635379
f_score: 0.8348134991119004
在label，不在pred [('72', 'mobile', '23', '39', '886-2369-9886#585'), ('253', 'mobile', '34', '50', '（+886）2-2752-1874'), ('288', 'mobile', '34', '46', '（852）94896744'), ('288', 'mobile', '52', '68', '（86）1380-2841-004'), ('318', 'mobile', '10', '18', '772845851'), ('318', 'mobile', '20', '26', '6821155'), ('401', 'mobile', '13', '21', '764957359'), ('432', 'mobile', '42', '63', '1861060699713940087450'), ('437', 'mobile', '41', '53', '（852）23072034'), ('657', 'mobile', '36', '54', '2335-2305/9263-3317'), ('667', 'mobile', '62', '71', '1827967769'), ('889', 'mobile', '22', '25', '2009'), ('905', 'mobile', '57', '69', '13910405280董晨'), ('1083', 'mobile', '27', '34', '23352103'), ('1123', 'mobile', '68', '77', '0988006707'), ('1147', 'mobile', '45', '64', '010-5762608757626088'), ('1161', 'mob

In [136]:
class EmailExtractor(BaseExtractor):
    regex_list = [
        re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.(?:com|cn|hk|net|asia|tw|org|edu))"),
    ]
    tag_name = "email"
    
    def extract(self, text):
        string = text.lower()
        ret = []
        for regex in self.regex_list:
            for find in regex.finditer(string):
                if string[find.start(1)]=='-':
                    ret.append([find.start(1)+1, find.end(1), self.tag_name])
                else:
                    ret.append([find.start(1), find.end(1), self.tag_name])
        return ret
                
email_extractor = EmailExtractor()

In [137]:
test_extractor(email_extractor, "email")

预测值和真实值第一条：
('767', 'email', '43', '60', 'chrissyliu@mac.com')
('767', 'email', '43', '60', 'chrissyliu@mac.com')
for email: 
precision: 0.9221789883268483
recall: 0.915057915057915
f_score: 0.9186046511627907
在label，不在pred [('221', 'email', '17', '35', 'www.starmusichk.com'), ('288', 'email', '69', '89', 'bill_yeung@uih.com.hk'), ('465', 'email', '14', '25', '金晶kinkiemail'), ('559', 'email', '35', '58', 'jeffreychen@vip.sina.com'), ('568', 'email', '44', '67', 'Calvin-c2009@hotmail.com'), ('574', 'email', '7', '25', 'jimmylin@jimmylin.a'), ('712', 'email', '16', '40', 'den02162000@chicgroup.com'), ('1461', 'email', '68', '81', 'info@wowmusic.'), ('1507', 'email', '12', '37', 'yabinstudio@sina.com@music'), ('1521', 'email', '26', '46', 'mailyaoavnu@gmail.com'), ('1531', 'email', '60', '83', "'pamela@dreamstardom.com"), ('1613', 'email', '57', '74', 'www.celinajade.com'), ('1773', 'email', '23', '44', 'rickylin7028@gmail.com'), ('1773', 'email', '67', '86', 'emilykuo77@gmail.com'), ('18

In [199]:
def pred_extractor(extractor, tag):
    from glob import glob
    import os
    import pandas as pd

    # extractor = vx_extractor
    # tag = "vx"
    train_dir = "../data/test_data"
    pred_set = set()
    for file_path in glob(os.path.join(train_dir, "*.txt")):
        file_id = os.path.basename(file_path).split(".")[0]
        with open(file_path, "r", encoding="utf8") as r:
            text = r.read()
        pred_entities = extractor.extract(text)
        for start, end, pred_tag in pred_entities:
            assert pred_tag == tag, [pred_tag, tag]
            pred_set.add((file_id, tag, str(start), str(end-1), text[start:end]))
    print("预测值")
    print(list(pred_set)[0])
    print("=" * 66)
    return pred_set

In [202]:
vx = pred_extractor(vx_extractor, 'vx')
qq = pred_extractor(qq_extractor, 'QQ')
mobile = pred_extractor(mobile_extractor, 'mobile')
email = pred_extractor(email_extractor, 'email')

预测值
('1866', 'vx', '33', '45', 'hermesxiaomei')
预测值
('1100', 'QQ', '19', '27', '116463151')
预测值
('213', 'mobile', '58', '68', '18613866608')
预测值
('1880', 'email', '28', '43', 'x-xxx2012@qq.com')


In [203]:
import pandas as pd

In [None]:
df_ret = {'ID':[],'Category':[],'Pos_b':[],'Pos_e':[],'Privacy':[]}
df_scores = {'strings': [], 'score': []}