In [1]:
# 特征工程部分，大致思路为：统计特征、nlp特征和图特征
# 首先通过做数据预处理，将数据划分为三个版本：原始版本、去掉停用词的版本、文本清洗的版本
import pandas as pd

train_query = pd.read_csv("./train/train.query.tsv", sep="\t", header=None)
train_reply = pd.read_csv("./train/train.reply.tsv", sep="\t", header=None)
test_query = pd.read_csv("./test/test.query.tsv", sep="\t", header=None)
test_reply = pd.read_csv("./test/test.reply.tsv", sep="\t", header=None)
train_query.columns = ["cid", "d1"]
train_reply.columns = ["cid", "rid", "d2", "label"]
train_df = pd.merge(train_query, train_reply, how="left", on="cid")
test_query.columns = ["cid", "d1"]
test_reply.columns = ["cid", "rid", "d2"]
test_df = pd.merge(test_query, test_reply, how="left", on="cid")
train_df.head(5)

Unnamed: 0,cid,d1,rid,d2,label
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1
1,0,采荷一小是分校吧,1,是的,0
2,0,采荷一小是分校吧,2,这是5楼,0
3,1,毛坯吗？,0,因为公积金贷款贷的少,0
4,1,毛坯吗？,1,是呢,0


In [2]:
train_df[train_df['d2'].isnull()]

Unnamed: 0,cid,d1,rid,d2,label
2194,604,您好，请问这个房子周边有哪些学校,3,,0


In [3]:
train_df['d2'] = train_df.d2.fillna("好的")
train_df[train_df['cid'] == 604]

Unnamed: 0,cid,d1,rid,d2,label
2191,604,您好，请问这个房子周边有哪些学校,0,中学附近有一初,1
2192,604,您好，请问这个房子周边有哪些学校,1,一初是重点,1
2193,604,您好，请问这个房子周边有哪些学校,2,有什么可以帮到您,0
2194,604,您好，请问这个房子周边有哪些学校,3,好的,0
2195,604,您好，请问这个房子周边有哪些学校,4,一初只能考,0


In [4]:
# 生成不同版本的数据

# 原始版本
df = pd.concat([train_df, test_df])
df[['d1', 'd2']].to_csv("df_raw.tsv", index = None, sep=' ', header = None)

In [35]:
df = pd.read_csv("train.tsv", sep='\t')

In [38]:
# 去除停用词版本

# 加载停用词（本来想考虑情感词的停用词，但是发现好像用不到，
# 很多句子使用停用词之后变为空的了，这里停用了标点符号和感叹词）
import string


stopwords = [x for x in string.punctuation]
with open("stopwords.txt", encoding='utf-8') as fin:
    for word in fin.readlines():
        stopwords.append(word.strip())
        
def remove_sw(sen):
    for w in stopwords:
        sen = sen.replace(w, "")
    return sen

df_sw_removal = df.copy(deep=True)
df_sw_removal['d1'] = df_sw_removal['d1'].apply(remove_sw)
df_sw_removal['d2'] = df_sw_removal['d2'].apply(remove_sw)
df_sw_removal.to_csv("df_rm_sw.tsv", index = None, sep = '\t',)

In [6]:
# 文本清洗过的数据

# 数据并没有很脏，只不过有比较多的打错字的情况
# 还有部分打成拼音的情况
# 看看有哪些特殊字符
import re

def find_special_tokens(text):
    # 过滤中文字符
    result = re.findall(u'[^\u4e00-\u9fa5]', text)
    # 过滤英文字符
    result = re.findall(r'[^a-zA-Z]', ''.join(result))
    # 过滤数字
    result = re.findall(r'[^\d]', ''.join(result))
    # 过滤中英文标点符号
    result = re.findall(r'[^ !"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{}~·—‘“”…、。《》！（），：；？]', ''.join(result))
    return result

def get_special_tokens():
    special_tokens = []
    sentences = []
    for i, row in df.iterrows():
        res = find_special_tokens(str(row.d1) + str(row.d2))
        special_tokens += res
        if res != []:
            sentences.append((row, res))
    return "".join(set(special_tokens)), sentences

special_tokens, sentences = get_special_tokens()
with open("special_tokens_sentence.txt", 'w', encoding='utf-8') as fin:
    for s in sentences:
        fin.write(str(s[0].cid)+"\t" + str(s[0].rid)+"\t"+str(s[0].d1)+"\t" + str(s[0].d2) + "\t" + " ".join(s[1])+"\n")
# 好家伙，什么奇奇怪怪的符号都有，甚至还有全角的，保存到文件推测这些符号的含义，然后用文字替换
special_tokens

'🉐②😔💰⊙😊√｛😂😆【✪😜ⅤˊＲｖ∨㎡Ｐ×～😅ˋ😝😳👍ｒ＋﹉＆λ☀😲🏠】🏾️＃😖🔑➕😍Ｖω😫ù😌🎊Ｗ％🏻😃😓😁●㥪😘😄😰😋→🈶￼😏｝－❓👌▽'

In [None]:
# 清除特殊符号和打成拼音的汉字
# 里面有全角的字符，好在不多，可以穷举完
# 主要对立面的特殊字符进行标准化，便于训练
import re


def clean_special_tokens(text):
    text = text.replace("😂", "哈哈")
    text = text.replace("👌", "好")
    text = text.replace("😊", "好")
    text = text.replace("😓", "汗")
    text = text.replace("😁", "哈哈")
    text = text.replace("👌🏻", "好")
    text = text.replace(" 🏻", "")
    text = text.replace("ù", "")
    text = text.replace("🈶️", "有")
    text = text.replace("🔑", "钥匙")
    text = text.replace("➕", "加")
    text = text.replace("🏠", "房子")
    text = text.replace("👍", "可以")
    text = text.replace("㎡", "平米")
    text = text.replace("🉐", "得")
    text = text.replace("Ｖ", "v")
    text = text.replace("Ｒ", "r")
    text = text.replace("❓", "？")
    text = text.replace("☀", "")
    text = text.replace("ｖ", "v")
    text = text.replace("ｒ", "r")
    text = text.replace("㥪", "楼")
    text = text.replace("￼", "")
    text = text.replace("＃", "#")
    text = text.replace("√", "对")
    text = text.replace("＋", "加")
    text = text.replace("﹉", "")
    text = text.replace("②", "二")
    text = text.replace("Ｗ", "万")
    text = text.replace("λ", "")
    text = text.replace("nh", "你好")
    text = text.replace("zaima", "在吗")
    text = text.replace("me", "么")
    text = text.replace("ceng", "层")
    text = text.replace("keyi", "可以")
    text = text.replace("taobao", "淘宝")
    text = text.replace("VR", "vr")
    text = text.replace("vR", "vr")
    text = text.replace("Vr", "vr")
    text = text.replace("NAMEPHONE", "NAME / PHONE")
    text = text.replace("l", "")
    text = text.replace("keyitan", "可以谈")
    text = text.replace("be", "")
    text = text.replace("ve", "vr")
    text = text.replace("key", "可以")
    text = text.replace("laile", "来了")
    text = text.replace("haole", "好了")
    text = text.replace("shaodeng", "稍等")
    text = text.replace("ninha", "您好")
    text = text.replace("nihao", "您好")
    text = text.replace("Ｐ", "P")
    text = text.replace("wan", "万")
    text = text.replace("DAU", "带")
    text = text.replace("lou", "楼")
    text = text.replace("kanfang", "看房")
    text = text.replace("is", "")
    text = text.replace("shenm", "")
    text = text.replace("＆", "&")
    text = text.replace("gaosunoi", "告诉你")
    text = text.replace("Va", "vr")
    text = text.replace("hao", "好")
    text = text.replace("ma", "")
    text = text.replace("zengzhi", "增值")
    # url直接清洗
    html = re.compile(r'(https?://)([\da-zA-Z=&\?_\.-]+)\.([a-z=&\?_\.]{2,6})([/\w =&\?_\.-]*)*/?')
    text = re.sub(html, "", text)
    return text

df_cleaned = df.copy(deep=True)
df_cleaned['d1'] = df_cleaned['d1'].apply(clean_special_tokens)
df_cleaned['d2'] = df_cleaned['d2'].apply(clean_special_tokens)
df_cleaned.to_csv("df_cleaned.tsv", index=None, sep="\t")

In [20]:
train = df_cleaned[:len(train_df)]
test = df_cleaned[len(train_df):]
unmatched = pd.read_csv("unmatched.tsv", sep="\t")
for i, row in unmatched.iterrows():
    train.loc[int(row['id']), 'label'] = int(row['label'])
train.to_csv("train.tsv", sep='\t', index=None)
test.to_csv("test.tsv", sep='\t', index=None)

In [7]:

# 生成某个句子的n-gram
def n_gram(x, n=2):
    if len(x) <= (n-1):
        return x
    result = []
    # zip函数在达到最短长度时就停止迭代
    n_grams = set(zip(*[x[i:] for i in range(n)]))
    for n_gram in n_grams:
        result.append("".join(n_gram))
    return result

df_cleaned['d1_unigram'] = df_cleaned['d1'].apply(lambda x: list(str(x)))
df_cleaned['d2_unigram'] = df_cleaned['d2'].apply(lambda x: list(str(x)))
df_cleaned['d1_bigrams'] = df_cleaned['d1'].apply(lambda x: n_gram(list(str(x))))
df_cleaned['d2_bigrams'] = df_cleaned['d2'].apply(lambda x: n_gram(list(str(x))))
df_cleaned['shared_words_unigram'] = df_cleaned.apply(
    lambda x: set(x['d1_unigram']).intersection(set(x['d2_unigram'])),
    axis = 1
)
df_cleaned['shared_words_bigrams'] = df_cleaned.apply(
    lambda x: set(x['d1_bigrams']).intersection(set(x['d2_bigrams'])),
    axis = 1
)

In [8]:
df_cleaned.to_csv("df_share_words.tsv", sep='\t')

In [9]:
# 条件本来应该是：一元语法2个字及以上， 二元语法1个词及以上，就算作有share words
def add_feature(x):
    special_unigram = '是能有对'
    
    unigram = x['shared_words_unigram']
    bigram = x['shared_words_bigrams']
    
    if len(unigram) == 1:
        for _ in special_unigram:
            if _ in unigram:
                return True
        
    if len(unigram) >= 2 and len(bigram) >= 1:
        return True
    
    return False

df_cleaned['add_feature'] = df_cleaned.apply(lambda x: add_feature(x), axis=1)

In [10]:
new_df = df_cleaned.copy(deep=True)

In [11]:
new_df

Unnamed: 0,cid,d1,rid,d2,label,d1_unigram,d2_unigram,d1_bigrams,d2_bigrams,shared_words_unigram,shared_words_bigrams,add_feature
0,0,采荷一小是分校,0,杭州市采荷第一小学钱江苑校区杭州市钱江新城实验学校,1,"[采, 荷, 一, 小, 是, 分, 校]","[杭, 州, 市, 采, 荷, 第, 一, 小, 学, 钱, 江, 苑, 校, 区, 杭, ...","[采荷, 小是, 荷一, 一小, 是分, 分校]","[第一, 采荷, 学校, 城实, 州市, 区杭, 苑校, 校区, 验学, 新城, 小学, 学...","{采, 一, 校, 小, 荷}","{一小, 采荷}",True
1,0,采荷一小是分校,1,是,0,"[采, 荷, 一, 小, 是, 分, 校]",[是],"[采荷, 小是, 荷一, 一小, 是分, 分校]",[是],{是},{},True
2,0,采荷一小是分校,2,这是5楼,0,"[采, 荷, 一, 小, 是, 分, 校]","[这, 是, 5, 楼]","[采荷, 小是, 荷一, 一小, 是分, 分校]","[5楼, 这是, 是5]",{是},{},True
3,1,毛坯,0,因为公积金贷款贷少,0,"[毛, 坯]","[因, 为, 公, 积, 金, 贷, 款, 贷, 少]",[毛坯],"[公积, 因为, 为公, 贷少, 款贷, 积金, 金贷, 贷款]",{},{},False
4,1,毛坯,1,是,0,"[毛, 坯]",[是],[毛坯],[是],{},{},False
...,...,...,...,...,...,...,...,...,...,...,...,...
21580,5998,您好我正在看尚林家园房子,1,有,0,"[您, 好, 我, 正, 在, 看, 尚, 林, 家, 园, 房, 子]",[有],"[尚林, 家园, 房子, 看尚, 林家, 您好, 好我, 在看, 正在, 园房, 我正]",[有],{},{},False
21581,5998,您好我正在看尚林家园房子,2,我带你看看,0,"[您, 好, 我, 正, 在, 看, 尚, 林, 家, 园, 房, 子]","[我, 带, 你, 看, 看]","[尚林, 家园, 房子, 看尚, 林家, 您好, 好我, 在看, 正在, 园房, 我正]","[我带, 看看, 你看, 带你]","{看, 我}",{},False
21582,5999,今天可以安排看房子,0,我约下房东稍后回你,1,"[今, 天, 可, 以, 安, 排, 看, 房, 子]","[我, 约, 下, 房, 东, 稍, 后, 回, 你]","[天可, 排看, 安排, 看房, 房子, 可以, 今天, 以安]","[下房, 房东, 回你, 后回, 东稍, 稍后, 约下, 我约]",{房},{},False
21583,5999,今天可以安排看房子,1,可以看你几点有时间过,1,"[今, 天, 可, 以, 安, 排, 看, 房, 子]","[可, 以, 看, 你, 几, 点, 有, 时, 间, 过]","[天可, 排看, 安排, 看房, 房子, 可以, 今天, 以安]","[点有, 几点, 看你, 可以, 间过, 有时, 时间, 你几, 以看]","{可, 以, 看}",{可以},True


In [13]:
length = len(new_df)
for i, flag in enumerate(new_df['add_feature']):
    print('\r %d/%d'%(i, length), end='')
    if flag:
        df_cleaned.loc[i, 'd1'] = str(df_cleaned.loc[i, 'd1']) +  '@'
        df_cleaned.loc[i, 'd2'] = str(df_cleaned.loc[i, 'd2']) +  '@'
    else:
        df_cleaned.loc[i, 'd1'] = str(df_cleaned.loc[i, 'd1']) +  '&'
        df_cleaned.loc[i, 'd2'] = str(df_cleaned.loc[i, 'd2']) +  '&'

 21584/21585

In [15]:
df_cleaned.to_csv("aug_train.tsv", sep='\t', index=None)

In [49]:
train = pd.read_csv("code/train.tsv", sep='\t')
train['q1'] = df_cleaned['d1']

In [51]:
train.to_csv("code/train.tsv", sep='\t')

In [17]:
# 获取shared words、powerful words、key words
import jieba
import jieba.posseg as psg
import jieba.analyse
from LAC import LAC
lac = LAC(mode='seg')


def fetch_feature(data):
    # 分词、词性标注
    %time data['d1_word_cut'] = data['d1'].apply(lambda x: list(lac.run(str(x))))
    %time data['d2_word_cut'] = data['d2'].apply(lambda x: list(lac.run(str(x))))
#     %time data['d1_pos_tag'] = data['d1'].apply(lambda x: [x.flag for x in list(psg.cut(str(x)))])
#     %time data['d2_pos_tag'] = data['d2'].apply(lambda x: [x.flag for x in list(psg.cut(str(x)))])
    # 回答一般会复述问题的关键词，所以抽取复述的词，而且复述词不能是停用词
#     data['shared_words'] = data.apply(lambda x: [_ for _ in set(x['d1_word_cut']).intersection(set(x['d2_word_cut'])) if _ not in stopwords], axis=1)

In [7]:
# 统计特征
# 文本特征：主要分为问句和回答
from collections import Counter
from time import time

# 共现词比例
def shared_word_proportion(x):
    count_d1 = Counter(x['d1_word_cut'])
    count_d2 = Counter(x['d2_word_cut'])
    n_shared_word_in_d1 = sum([count_d1[w] for w in count_d1 if w in count_d2])
    n_shared_word_in_d2 = sum([count_d2[w] for w in count_d2 if w in count_d1])
    n_total = sum(count_d1.values()) + sum(count_d2.values())
    return 1.0 * (n_shared_word_in_d1 + n_shared_word_in_d2) / n_total

# 动态规划求解编辑距离，时间复杂度是O(n^2)
def edit_distance(str1, str2):
    # 坑爹的二维数组声明方法，如果直接用*就是浅拷贝
    dp = [[0]*(len(str1)+1) for _ in range((len(str2)+1))]
    for i in range(len(str1)+1):
        dp[0][i] = i
    for j in range(len(str2)+1):
        dp[j][0] = j
    for i in range(1, len(str2)+1):
        for j in range(1, len(str1)+1):
            if str2[i-1] == str1[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                dp[i][j] = min(dp[i-1][j-1], dp[i-1][j], dp[i][j-1])+1
    return dp[len(str2)][len(str1)]

# 生成某个句子的n-gram
def n_gram(x, n=2):
    if len(x) <= (n-1):
        return x
    result = []
    # zip函数在达到最短长度时就停止迭代
    n_grams = set(zip(*[x[i:] for i in range(n)]))
    for n_gram in n_grams:
        result.append(" ".join(n_gram))
    return result

# 计算两个句子的Jaccard相似度
def Jaccard(str1, str2):
    s1 = set(str1)
    s2 = set(str2)
    intersection = s1.intersection(s2)
    union = s1.union(s2)
    return 1.0 * len(intersection) / len(union)


# 统计特征，选择了57个特征，后续再加
def statistical_feature(data):
    punctuations = r'^ !"#$%&\'()*+,-./:;<=>?@[]^_`{}~·—‘“”…、。《》！（），：；？【】｛｝～'
    # 统计文本和单词的长度
    %time data['d1_char_length'] = data['d1'].apply(lambda x: len(str(x)))
    %time data['d2_char_length'] = data['d2'].apply(lambda x: len(str(x)))
    %time data['d1_word_length'] = data['d1_word_cut'].apply(lambda x: len(x))
    %time data['d2_word_length'] = data['d2_word_cut'].apply(lambda x: len(x))
    %time data['d1_max_word_length'] = data['d1_word_cut'].apply(lambda x: max([len(_) for _ in x]))
    %time data['d1_mean_word_length'] = data['d1_word_cut'].apply(lambda x: sum([len(_) for _ in x])/len(x))
    %time data['d2_max_word_length'] = data['d2_word_cut'].apply(lambda x: max([len(_) for _ in x]))
    %time data['d2_mean_word_length'] = data['d2_word_cut'].apply(lambda x: sum([len(_) for _ in x])/len(x))
    %time data['char_length_difference'] = data.apply(lambda x: abs(len(str(x['d1'])) - len(str(x['d2']))), axis=1)
    %time data['word_length_difference'] = data.apply(lambda x: abs(len(x['d1_word_cut']) - len(x['d2_word_cut'])), axis=1)
    
    # 统计词性唯一值数量
    %time data['d1_pos_unique_num'] = data['d1_pos_tag'].apply(lambda x: len(set(x)))
    %time data['d1_contain_location'] = data['d1_pos_tag'].apply(lambda x: int('nr' in x))
    %time data['d1_contain_mood_particle'] = data['d1_pos_tag'].apply(lambda x: int('y' in x))
    
    # 统计唯一字符、单词数量以及标点符号数量
    %time data['d1_unique_char_num'] = data['d1'].apply(lambda x: len(set(str(x))))
    %time data['d2_unique_char_num'] = data['d2'].apply(lambda x: len(set(str(x))))
    %time data['d1_unique_word_num'] = data['d1_word_cut'].apply(lambda x: len(set(x)))
    %time data['d2_unique_word_num'] = data['d2_word_cut'].apply(lambda x: len(set(x)))
    data['d1_punc_num'] = data['d1'].apply(lambda x: sum(1 for _ in str(x) if _ in punctuations))
    data['d2_punc_num'] = data['d2'].apply(lambda x: sum(1 for _ in str(x) if _ in punctuations))
    data['d1_punc_category'] = data['d1'].apply(lambda x: len(set([_ for _ in str(x) if _ in punctuations])))
    data['d2_punc_category'] = data['d2'].apply(lambda x: len(set([_ for _ in str(x) if _ in punctuations])))
    
    # 是否包含停用词、字母、数字、emoji等等
    data['d1_contain_stopwords'] = data['d1'].apply(lambda x: 1 if len([_ for _ in str(x) if _ in stopwords]) > 0 else 0)
    data['d2_contain_stopwords'] = data['d2'].apply(lambda x: 1 if len([_ for _ in str(x) if _ in stopwords]) > 0 else 0)
    %time data['d1_contain_alphabet'] = data['d1'].apply(lambda x: 1 if len(re.findall(r'[a-zA-Z]', str(x))) > 0 else 0)
    %time data['d2_contain_alphabet'] = data['d2'].apply(lambda x: 1 if len(re.findall(r'[a-zA-Z]', str(x))) > 0 else 0)
    %time data['d1_contain_number'] = data['d1'].apply(lambda x: 1 if len(re.findall(r'[\d]', str(x))) > 0 else 0)
    %time data['d2_contain_number'] = data['d2'].apply(lambda x: 1 if len(re.findall(r'[\d]', str(x))) > 0 else 0)
    %time data['d1_contain_emoji'] = data['d1'].apply(lambda x: 1 if len(re.findall(u'[\U00010000-\U0010ffff\\uD800-\\uDBFF\\uDC00-\\uDFFF]', str(x))) > 0 else 0)
    %time data['d2_contain_emoji'] = data['d2'].apply(lambda x: 1 if len(re.findall(u'[\U00010000-\U0010ffff\\uD800-\\uDBFF\\uDC00-\\uDFFF]', str(x))) > 0 else 0)
    
    # 问题可以提取的业务特征有：
    # 是否带有“?？”，一般带问号的是问句，问句的特征比较好找，而有些问题是陈述句，这种关系就不太好抽取
    # 特殊句式“是/能/可以...”/“有...不”/“....对吗”+语气词一般回复“是的”，“可以”，“有的”，“嗯嗯”（否定回复一般加个否定词就可以了）
    # 带“多少”的一般要会回复数量关系如：数额、百分比等等，进一步可以抽取：是否含有税、贷款、首付、价格、年等等关键词
    # 带有“怎么样”，“怎么办”，这个回答太过于灵活，比较难抽取
    # 询问位置：“在哪”，这个需要靠词性来推断回复是否含有地点
    # 问很多的：“采光”基于规则也不太好判断
    # 回答可提取的业务特征：是否含有价格、hashTag、疑问词、地点、时间、学校、楼层、百分比、单元、面积、年限、几期以及回复“是的”、“好的”、“可以”
    # 找业务特征的目的：通过问题所问的内容，可以通过回答中某些业务特征找到是否匹配的关系；同理，问题中的业务特征和回答的特征可能也存在，某些关系
    %time data['d1_is_interrogative'] = data['d1'].apply(lambda x: 1 if '?' in str(x) or '？' in str(x) else 0)
    %time data['d1_spcial_statement'] = data['d1'].apply(lambda x: 1 if re.match(r'(是|能|可以|有|对吗)', str(x)) or '不' in str(x)[-2:] else 0)
    %time data['d1_how_many'] = data['d1'].apply(lambda x: 1 if '多少' in str(x) else 0)
    %time data['d1_num_feature'] = data['d1'].apply(lambda x: 1 if re.match(r'(税|贷|款|首付|价格|年|费|优惠)', str(x)) else 0)
    %time data['d1_contain_where'] = data['d1'].apply(lambda x:1 if '在哪' in str(x) else 0)
    
    %time data['d1_contain_price'] = data['d1'].apply(lambda x: 1 if re.match(r'(\d+\.?)\d+((来|几)?)[w|W|万]', str(x)) else 0)
    %time data['d2_contain_price'] = data['d2'].apply(lambda x: 1 if re.match(r'(\d+\.?)\d+((来|几)?)[w|W|万]', str(x)) else 0)

    # 带有hashTag的一般都是客户复制的小区信息来咨询的
    %time data['d1_contain_hashtag'] = data['d1'].apply(lambda x: 1 if re.match(r'#.+#', str(x)) else 0)
    %time data['d2_contain_hashtag'] = data['d2'].apply(lambda x: 1 if re.match(r'#.+#', str(x)) else 0)
    %time data['d1_contain_interrogation'] = data['d1'].apply(lambda x: 1 if re.match(r'(什么|哪儿|哪里|几时|几|多少怎|怎么|怎样|怎么样|如何|吗|呢)', str(x)) else 0)
    %time data['d2_contain_interrogation'] = data['d2'].apply(lambda x: 1 if re.match(r'(什么|哪儿|哪里|几时|几|多少怎|怎么|怎样|怎么样|如何|吗|呢)', str(x)) else 0)

    # 一点有歧义，数据里面一点表示很少的意思比较多，所以去掉了
    %time data['d1_contain_time'] = data['d1'].apply(lambda x: 1 if re.match(r'(今天|明天|上午|中午|下午|晚上|周[二三四五六日末\d]|[\d]+点(半?)|[两三四五六七八九十]点(半?))', str(x)) else 0)
    %time data['d2_contain_time'] = data['d2'].apply(lambda x: 1 if re.match(r'(今天|明天|上午|中午|下午|晚上|周[二三四五六日末\d]|[\d]+点(半?)|[两三四五六七八九十]点(半?))', str(x)) else 0)
    %time data['d1_contain_school'] = data['d1'].apply(lambda x: 1 if re.match(r'([一三三四五六七八九十]+中|校区|(实验|南雅|雅礼)?(中学|小学|附中|幼儿园))', str(x)) else 0)
    %time data['d2_contain_school'] = data['d2'].apply(lambda x: 1 if re.match(r'([一三三四五六七八九十]+中|校区|(实验|南雅|雅礼)?(中学|小学|附中|幼儿园))', str(x)) else 0)
    %time data['d1_contain_floor'] = data['d1'].apply(lambda x: 1 if re.match(r'(([\d一二三四五六七八九十]+(号?)[楼|层])|([一二三四五六七八九十\d]+栋))', str(x)) else 0)
    %time data['d2_contain_floor'] = data['d2'].apply(lambda x: 1 if re.match(r'(([\d一二三四五六七八九十]+(号?)[楼|层])|([一二三四五六七八九十\d]+栋))', str(x)) else 0)
    %time data['d1_contain_percentage'] = data['d1'].apply(lambda x: 1 if re.match(r'(\d+\.?)\d+%', str(x)) else 0)
    %time data['d2_contain_percentage'] = data['d2'].apply(lambda x: 1 if re.match(r'(\d+\.?)\d+%', str(x)) else 0)
    %time data['d1_contain_unit'] = data['d1'].apply(lambda x: 1 if re.match(r'[ABCDEF东南西北一二三四五六七八九十\d]+([边栋]?)单元', str(x)) else 0)
    %time data['d2_contain_unit'] = data['d2'].apply(lambda x: 1 if re.match(r'[ABCDEF东南西北一二三四五六七八九十\d]+([边栋]?)单元', str(x)) else 0)
    %time data['d1_contain_area'] = data['d1'].apply(lambda x: 1 if re.match(r'((\d+\.?)[一二三四五六七八九十两百\d]+(平|平方|平方米|平米)|面积\d+)', str(x)) else 0)
    %time data['d2_contain_area'] = data['d2'].apply(lambda x: 1 if re.match(r'((\d+\.?)[一二三四五六七八九十两百\d]+(平|平方|平方米|平米)|面积\d+)', str(x)) else 0)
    %time data['d1_contain_year'] = data['d1'].apply(lambda x: 1 if re.match(r'[一两二三四五六七八九十半\d]+年', str(x)) else 0)
    %time data['d2_contain_year'] = data['d2'].apply(lambda x: 1 if re.match(r'[一两二三四五六七八九十半\d]+年', str(x)) else 0)
    
    # 文本相似度特征：考虑到本次任务是语义上的匹配，不一定要问句和答句相似，但是大多数问题问句和答句都比较相似，在重复主题和关键字
    # 可以查看共现词、编辑距离、或者其他距离如Jaccard相似度
    %time data['shared_word_proportion'] = data.apply(shared_word_proportion, axis=1)
    %time data['shared_word_num'] = data['shared_words'].apply(lambda x: len(x))
    %time data['jaccard_similarity'] = data.apply(lambda x: Jaccard(x['d1_word_cut'], x['d2_word_cut']), axis=1)
    %time data['jaccard_similarity_bigram'] = data.apply(lambda x: Jaccard(n_gram(x['d1_word_cut']), n_gram(x['d2_word_cut'])), axis=1)
    %time data['jaccard_similarity_trigram'] = data.apply(lambda x: Jaccard(n_gram(x['d1_word_cut'], 3), n_gram(x['d2_word_cut'], 3)), axis=1)
    %time data['edit_distance'] = data.apply(lambda x: edit_distance(str(x['d1']), str(x['d2'])), axis=1)
    %time data['dice_distance'] = data.apply(lambda x: 2.0 * (len(set(x['d1_word_cut']).intersection(set(x['d2_word_cut']))) / (len(set(x['d1_word_cut']))+len(set(x['d2_word_cut'])))), axis=1)

In [18]:
# 拿原始版本做训练
# df_raw = pd.read_csv("df_raw.tsv", sep='\t')
df_raw = df_cleaned
fetch_feature(df_raw)

Wall time: 41.8 s
Wall time: 46 s


In [23]:
statistical_feature(df_raw)

Wall time: 64 ms
Wall time: 63 ms
Wall time: 54 ms
Wall time: 53 ms
Wall time: 173 ms
Wall time: 143 ms
Wall time: 146 ms
Wall time: 124 ms
Wall time: 1.36 s
Wall time: 1.4 s
Wall time: 67 ms
Wall time: 51 ms
Wall time: 53 ms
Wall time: 157 ms
Wall time: 170 ms
Wall time: 78 ms
Wall time: 76 ms
Wall time: 141 ms
Wall time: 147 ms
Wall time: 161 ms
Wall time: 156 ms
Wall time: 156 ms
Wall time: 149 ms
Wall time: 68 ms
Wall time: 172 ms
Wall time: 57.1 ms
Wall time: 126 ms
Wall time: 48 ms
Wall time: 138 ms
Wall time: 142 ms
Wall time: 151 ms
Wall time: 154 ms
Wall time: 165 ms
Wall time: 200 ms
Wall time: 141 ms
Wall time: 158 ms
Wall time: 204 ms
Wall time: 154 ms
Wall time: 154 ms
Wall time: 168 ms
Wall time: 161 ms
Wall time: 156 ms
Wall time: 190 ms
Wall time: 146 ms
Wall time: 184 ms
Wall time: 151 ms
Wall time: 171 ms
Wall time: 179 ms
Wall time: 2.62 s
Wall time: 37 ms
Wall time: 1.88 s
Wall time: 2.95 s
Wall time: 2.74 s
Wall time: 11.1 s
Wall time: 2.42 s


In [24]:
df_raw.to_csv("raw_tmp.tsv", index=None, sep="\t")

In [8]:
# 计算文本的count vector、tfidf vector、word2vec、doc2vec、LDA、kmeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import word2vec, doc2vec
from LAC import LAC

lac = LAC(mode='seg')

# 训练word2vec，输入是List(List(str))
def train_word2vec(data):
    sentences = []
    for i, row in data.iterrows():
        print("\r 读取语料中：{}".format(i), end="")
        sentences.append(lac.run(str(row['d1'])))
        sentences.append(lac.run(str(row['d2'])))
    w2v = word2vec.Word2Vec(
        sentences,
        size=300,
        iter=30,
        window = 5,
        min_count = 0,
        workers = 4,
        sample = 1e-4)
    return w2v


In [10]:
import logging

logging.basicConfig(level=logging.INFO)

w2v = train_word2vec(df_cleaned)

 读取语料中：53681

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 48553 words, keeping 4014 word types


 读取语料中：53756

INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 96801 words, keeping 6132 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 145414 words, keeping 7878 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 195362 words, keeping 9373 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 243708 words, keeping 10750 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 292432 words, keeping 11987 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 341237 words, keeping 13142 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 390430 words, keeping 14325 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #90000, processed 439933 words, keeping 15385 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #100000, processed 489066 words, keeping 16417 word types
INFO:gensim.models.word2vec:PROGRESS: at

In [11]:
w2v.save("w2v.pkt")

INFO:gensim.utils:saving Word2Vec object under w2v.pkt, separately None
INFO:gensim.utils:not storing attribute vectors_norm
INFO:gensim.utils:not storing attribute cum_table
INFO:gensim.utils:saved w2v.pkt


In [None]:
# 分别提取关键词

In [19]:
# 图特征，这里使用共现矩阵和相似度矩阵来构造图
# 先提取关键词，提取关键词采用无监督学习算法，采用：词频统计、tfidf、hits算法和sgrank
from time import time

def key_words_union(k1, k2):
    s1 = set([_[0] for _ in k1])
    s2 = set([_[0] for _ in k2])
    return s1.union(s2)

def key_words_intersection(k1, k2):
    s1 = set([_[0] for _ in k1])
    s2 = set([_[0] for _ in k2])
    return s1.intersection(s2)

# 提取关键字时没有注意较短文本和短文本的关键字提取的效果不同，问题和回答体悟关键字也不同，所以这里对长短文本分开处理
# 短文本一般只有一个核心词，而长一点的文本则有多个
def extract_key_words(text, text_rank = False):
    allow_flag=['a', 'ad', 'ag', 'an', 'b', 'd',
                'df', 'dg', 'eng', 'f', 'g', 'h',
                'i', 'j', 'k', 'l', 'n',
                'ns', 'nt', 'nz',
                's', 't', 'tg','v', 'vd', 'vg', 'vi',
                'vn', 'vq']
    extractor = jieba.analyse.extract_tags if not text_rank else jieba.analyse.textrank
    if len(text) <= 6:
        return extractor(text, topK=1, withWeight=True)
    else:
        return extractor(text, topK=3, withWeight=True, allowPOS=allow_flag)
        

def key_words(data):
    # 两种算法的并集作为关键词
    # 利用tfidf和textrank来获取关键词
    # 有时候两种算法都不能覆盖到关键字，大约有1800多条就人工提取把

    start = time()
    data['key_words_tfidf'] = data.apply(lambda x: extract_key_words(str(x['d1'])+" "+str(x['d2'])), axis=1)
    print("合并问答对TFIDF提取关键词累积耗时:{:.2f}".format(time()-start))
    data['key_words_textrank'] = data.apply(lambda x: extract_key_words(str(x['d1'])+" "+str(x['d2']), text_rank=True), axis=1)
    print("合并问答对TextRank提取关键词累积耗时:{:.2f}".format(time()-start))
    data['d1_key_words_tfidf'] = data['d1'].apply(lambda x: extract_key_words(str(x)))
    print("d1 TFIDF提取关键词累积耗时:{:.2f}".format(time()-start))
    data['d2_key_words_tfidf'] = data['d2'].apply(lambda x: extract_key_words(str(x)))
    print("d2 TFIDF提取关键词累积耗时:{:.2f}".format(time()-start))
    data['d1_key_words_textrank'] = data['d1'].apply(lambda x: extract_key_words(str(x), text_rank=True))
    print("d1 TextRank提取关键词累积耗时:{:.2f}".format(time()-start))
    data['d2_key_words_textrank'] = data['d2'].apply(lambda x: extract_key_words(str(x), text_rank=True))
    print("d2 TextRank提取关键词累积耗时:{:.2f}".format(time()-start))
    data['key_words'] = data.apply(lambda x: key_words_union(x['key_words_tfidf'], x['key_words_textrank']), axis=1)
    print("取两种算法结果并集提取关键词累积耗时:{:.2f}".format(time()-start))
    data['key_words_i'] = data.apply(lambda x: key_words_intersection(x['key_words_tfidf'], x['key_words_textrank']), axis=1)
    print("取两种算法结果交集提取关键词累积耗时:{:.2f}".format(time()-start))

In [20]:
key_words(df_raw)

合并问答对TFIDF提取关键词累积耗时:141.10
合并问答对TextRank提取关键词累积耗时:297.72
d1 TFIDF提取关键词累积耗时:341.87
d2 TFIDF提取关键词累积耗时:408.70
d1 TextRank提取关键词累积耗时:474.30
d2 TextRank提取关键词累积耗时:562.05
取两种算法结果并集提取关键词累积耗时:564.04
取两种算法结果交集提取关键词累积耗时:566.00


In [24]:
df_raw

Unnamed: 0,cid,d1,rid,d2,label,d1_word_cut,d2_word_cut,key_words_tfidf,key_words_textrank,d1_key_words_tfidf,d2_key_words_tfidf,d1_key_words_textrank,d2_key_words_textrank,key_words,key_words_i
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1.0,"[采荷, 一, 小, 是, 分校, 吧]","[杭州市采荷第一小学, 钱江苑校区, ，, 杭州市, 钱江新城实验学校, 。]","[(杭州市, 2.3164871659375), (实验学校, 1.462931634325...","[(杭州市, 1.0), (新城, 0.504449124178066), (校区, 0.4...","[(分校, 4.60009864309), (一小, 3.88909242123)]","[(杭州市, 3.0886495545833337), (实验学校, 1.950575512...","[(分校, 1.0), (一小, 0.9961264494011037)]","[(杭州市, 1.0), (新城, 0.6213251057675828), (校区, 0....","{杭州市, 校区}","{杭州市, 校区}"
1,0,采荷一小是分校吧,1,是的,0.0,"[采荷, 一, 小, 是, 分校, 吧]","[是, 的]","[(分校, 4.60009864309), (一小, 3.88909242123)]","[(分校, 1.0), (一小, 0.9961264494011037)]","[(分校, 4.60009864309), (一小, 3.88909242123)]",[],"[(分校, 1.0), (一小, 0.9961264494011037)]",[],"{一小, 分校}","{一小, 分校}"
2,0,采荷一小是分校吧,2,这是5楼,0.0,"[采荷, 一, 小, 是, 分校, 吧]","[这, 是, 5, 楼]","[(分校, 4.60009864309), (一小, 3.88909242123)]","[(分校, 1.0), (一小, 0.9961264494011037)]","[(分校, 4.60009864309), (一小, 3.88909242123)]","[(这是, 4.29162827639)]","[(分校, 1.0), (一小, 0.9961264494011037)]",[],"{一小, 分校}","{一小, 分校}"
3,1,毛坯吗？,0,因为公积金贷款贷的少,0.0,"[毛坯, 吗, ？]","[因为, 公积金, 贷款, 贷, 的, 少]","[(毛坯, 3.7308758169666665), (公积金, 2.78778112832...","[(贷款, 1.0), (公积金, 0.9961264494011037)]","[(毛坯, 11.1926274509)]","[(公积金, 4.18167169248), (贷款, 2.836784708815)]",[],"[(贷款, 1.0), (公积金, 0.9961264494011037)]","{贷款, 公积金}","{贷款, 公积金}"
4,1,毛坯吗？,1,是呢,0.0,"[毛坯, 吗, ？]","[是, 呢]","[(毛坯, 11.1926274509)]",[],"[(毛坯, 11.1926274509)]",[],[],[],{},{}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53752,13998,这套房子有啥问题吗 我看价格不高,3,租约还有两年,,"[这, 套, 房子, 有, 啥, 问题, 吗, , , 我, 看, 价格, 不, 高]","[租约, 还有, 两年]","[(租约, 2.1066763644), (房子, 1.2631809148720001),...","[(租约, 1.0), (问题, 0.6703672480838158), (房子, 0.6...","[(房子, 2.105301524786667), (价格, 1.5155855171733...","[(租约, 3.511127274)]","[(问题, 1.0), (房子, 0.9961264494011037)]","[(还有, 1.0)]","{房子, 租约}","{房子, 租约}"
53753,13998,这套房子有啥问题吗 我看价格不高,4,都有学位的,,"[这, 套, 房子, 有, 啥, 问题, 吗, , , 我, 看, 价格, 不, 高]","[都, 有, 学位, 的]","[(学位, 2.08790039177), (房子, 1.57897614359), (价格...","[(问题, 1.0), (房子, 0.9961264494011037)]","[(房子, 2.105301524786667), (价格, 1.5155855171733...","[(学位, 8.35160156708)]","[(问题, 1.0), (房子, 0.9961264494011037)]",[],{房子},{房子}
53754,13999,我看看时间吧,0,没有呢,,"[我, 看看, 时间, 吧]","[没有, 呢]","[(看看, 1.8034815473799999), (时间, 1.359846544153...","[(看看, 1.0), (没有, 0.9966849915940917), (时间, 0.9...","[(看看, 2.70522232107)]","[(没有, 3.11282356515)]","[(看看, 1.0)]",[],"{看看, 时间, 没有}","{看看, 时间, 没有}"
53755,13999,我看看时间吧,1,今天新上的,,"[我, 看看, 时间, 吧]","[今天, 新, 上, 的]","[(看看, 1.8034815473799999), (今天, 1.664039425610...","[(看看, 1.0), (时间, 0.9966849915940917), (今天, 0.9...","[(看看, 2.70522232107)]","[(新上, 5.97738375145)]","[(看看, 1.0)]",[],"{看看, 时间, 今天}","{看看, 时间, 今天}"


In [30]:
# 统计共现矩阵，这里统计的是关键词在在一条记录中共现的
# 还有一种基于相似度的图，这里将关键词的相似度来构建出图特征
import networkx as nx
from tqdm import tqdm

graph = nx.Graph()

def fetch_nodes(data):
    key_words = set()
    for i, row in data.iterrows():
        res = row['key_words']
        key_words.update(res)

    # 顶点用数字表示==
    nodes2words = [_ for _ in key_words]
    words2nodes = {word:idx for idx, word in enumerate(nodes2words)}
    return nodes2words, words2nodes


def build_graph(graph, data):
    # 将顶点加入到图中
    graph.add_nodes_from([_ for _ in range(len(nodes2words))])

    # 给图增加边，一条记录的所有关键词两两之间都有边
    for i, row in tqdm(data.iterrows()):
        res = list(row['key_words'])
        if len(res) >= 2:
            for i in range(len(res)-1):
                for j in range(i, len(res)):
                    graph.add_edge(words2nodes[res[i]], words2nodes[res[j]])

nodes2words, words2nodes = fetch_nodes(df_raw)
build_graph(graph, df_raw)

75342it [00:10, 7016.18it/s]


In [31]:
%%time
# 主要提取的特征有：每个关键字的连接边数（度），pagerrank值， hits算法的A和H值，每个关键词的邻居数量

# 所有节点的连通分量
def components(graph):
    max_components = {}
    components = nx.connected_components(graph)
    for component in components:
        for n in component:
            max_components[n] = max(max_components.get(n, 0), len(component))
    return max_components

# hits算法
def hits(graph):
    hits_h, hits_a = nx.hits(graph, max_iter=500)
    return hits_h, hits_a

# 所有单词的度的计算
def degrees(graph):
    max_degrees = {}
    edges = graph.edges()
    for edge in edges:
        for n in edge:
            max_degrees[n] = max_degrees.get(n, 0) + 1
    return max_degrees

# 先求所有关键词的连通分量、hits、度
max_components = components(graph)
hits_h, hits_a = hits(graph)
max_degrees = degrees(graph)
pagerank = nx.pagerank_scipy(graph)

Wall time: 16.7 s


In [32]:
# 计算图特征
def graph_feature(data, graph):
    data['neighbors_num_coo'] = data['key_words'].apply(lambda x: [len(set(graph.neighbors(words2nodes[_]))) for _ in x])
    data['hits_a_coo'] = data['key_words'].apply(lambda x: [hits_a[words2nodes[_]] for _ in x])
    data['hits_h_coo'] = data['key_words'].apply(lambda x: [hits_h[words2nodes[_]] for _ in x])
    data['max_degrees_coo'] = data['key_words'].apply(lambda x: [max_degrees.get(words2nodes[_], 0) for _ in x])
    data['pagerank_coo'] = data['key_words'].apply(lambda x: [pagerank[words2nodes[_]] for _ in x])
    
    # 映射成数值
    data['max_hits_a_coo'] = data['hits_a_coo'].apply(lambda x: max(x) if len(x) > 0 else 0)
    data['mean_hits_a_coo'] = data['hits_a_coo'].apply(lambda x: sum(x)/len(x) if len(x) > 0 else 0)
    data['max_max_degrees_coo'] = data['max_degrees_coo'].apply(lambda x: max(x) if len(x) > 0 else 0)
    data['mean_max_degrees_coo'] = data['max_degrees_coo'].apply(lambda x: sum(x)/len(x) if len(x) > 0 else 0)
    data['max_pagerank_coo'] = data['pagerank_coo'].apply(lambda x: max(x) if len(x) > 0 else 0)
    data['mean_pagerank_coo'] = data['pagerank_coo'].apply(lambda x: sum(x)/len(x) if len(x) > 0 else 0)
    
    data['max_tfidf_coo'] = data['key_words_tfidf'].apply(lambda x: max([_[1] for _ in x]) if len(x) > 0 else 0)
    data['mean_tfidf_coo'] = data['key_words_tfidf'].apply(lambda x: sum([_[1] for _ in x])/len(x) if len(x) > 0 else 0)
    data['max_textrank_coo'] = data['key_words_textrank'].apply(lambda x: max([_[1] for _ in x])  if len(x) > 0 else 0)
    data['mean_textrank_coo'] = data['key_words_textrank'].apply(lambda x: sum([_[1] for _ in x])/len(x) if len(x) > 0 else 0)

graph_feature(df_raw, graph)

In [33]:
# 再基于相似度来构造图模型
# 根据关键字的相似词图构造图特征
sim_graph = nx.Graph()

def fetch_nodes():
    # 顶点用数字表示==
    nodes2words = w2v.wv.index2word
    words2nodes = {word:idx for idx, word in enumerate(nodes2words)}
    return nodes2words, words2nodes


def build_graph(graph):
    # 将顶点加入到图中
    graph.add_nodes_from([_ for _ in range(len(nodes2words))])

    # 给图增加边，一条记录的所有关键词两两之间都有边
    for word in nodes2words:
        similarities = w2v.wv.most_similar(word, topn=3)
        for sim in similarities:
            graph.add_edge(words2nodes[word], words2nodes[sim[0]])

nodes2words, words2nodes = fetch_nodes()
build_graph(sim_graph)

INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


In [34]:
%%time
max_components = components(sim_graph)
hits_h, hits_a = hits(sim_graph)
max_degrees = degrees(sim_graph)
pagerank = nx.pagerank_scipy(sim_graph)

Wall time: 49.5 s


In [35]:
# 计算图特征
def graph_feature(data, graph):
    data['neighbors_num_sim'] = data['key_words'].apply(lambda x: [len(set(graph.neighbors(words2nodes[_]))) if _ in words2nodes else 0. for _ in x])
    data['hits_a_sim'] = data['key_words'].apply(lambda x: [hits_a[words2nodes[_]] if _ in words2nodes else 0.  for _ in x])
    data['hits_h_sim'] = data['key_words'].apply(lambda x: [hits_h[words2nodes[_]] if _ in words2nodes else 0.  for _ in x])
    data['max_degrees_sim'] = data['key_words'].apply(lambda x: [max_degrees.get(words2nodes[_], 0)  if _ in words2nodes else 0  for _ in x])
    data['pagerank_sim'] = data['key_words'].apply(lambda x: [pagerank[words2nodes[_]] if _ in words2nodes else 0.  for _ in x])
    
    # 映射成数值
    data['max_hits_a_sim'] = data['hits_a_sim'].apply(lambda x: max(x) if len(x) > 0 else 0)
    data['mean_hits_a_sim'] = data['hits_a_sim'].apply(lambda x: sum(x)/len(x) if len(x) > 0 else 0)
    data['max_max_degrees_sim'] = data['max_degrees_sim'].apply(lambda x: max(x) if len(x) > 0 else 0)
    data['mean_max_degrees_sim'] = data['max_degrees_sim'].apply(lambda x: sum(x)/len(x) if len(x) > 0 else 0)
    data['max_pagerank_sim'] = data['pagerank_sim'].apply(lambda x: max(x) if len(x) > 0 else 0)
    data['mean_pagerank_sim'] = data['pagerank_sim'].apply(lambda x: sum(x)/len(x) if len(x) > 0 else 0)
    
    data['max_tfidf_sim'] = data['key_words_tfidf'].apply(lambda x: max([_[1] for _ in x]) if len(x) > 0 else 0)
    data['mean_tfidf_sim'] = data['key_words_tfidf'].apply(lambda x: sum([_[1] for _ in x])/len(x) if len(x) > 0 else 0)
    data['max_textrank_sim'] = data['key_words_textrank'].apply(lambda x: max([_[1] for _ in x])  if len(x) > 0 else 0)
    data['mean_textrank_sim'] = data['key_words_textrank'].apply(lambda x: sum([_[1] for _ in x])/len(x) if len(x) > 0 else 0)

graph_feature(df_raw, sim_graph)

In [21]:
# 训练doc2vec，将每一个句子表示成句向量，然后通过句向量和关键词匹配程度寻找特征
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from LAC import LAC
import logging

logging.basicConfig(level=logging.INFO)
lac = LAC(mode='seg')

# 将句子都分好词，然后包装好，生成句子向量
def wrap_sentence(data):
    sentences = []
    # 把回答做句子编码就成
    for i, row in data.iterrows():
        sentences.append(TaggedDocument(lac.run(row['d2']), tags=[i]))
    return sentences

sentences = wrap_sentence(df_cleaned)
d2v = Doc2Vec(sentences, window=5, size=300, sample=1e-3, workers=4, negative=5)
d2v.train(sentences, total_examples=d2v.corpus_count, epochs=30)

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #10000, processed 65462 words (735556/s), 5813 word types, 10000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #20000, processed 132114 words (789530/s), 8771 word types, 20000 tags
INFO:gensim.models.doc2vec:collected 9196 word types and 21585 unique tags from a corpus of 21585 examples and 142375 words
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=5 retains 1764 unique words (19% of original 9196, drops 7432)
INFO:gensim.models.word2vec:effective_min_count=5 leaves 131564 word corpus (92% of original 142375, drops 10811)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 9196 items
INFO:gensim.models.word2vec:sample=0.001 downsamples 68 most-common words
INFO:gensim.models.word2vec:downsampling leav

In [22]:
sentences[:10]

[TaggedDocument(words=['杭州市采荷第一小学', '钱江苑校区', '，', '杭州市', '钱江新城实验学校', '。'], tags=[0]),
 TaggedDocument(words=['是', '的'], tags=[1]),
 TaggedDocument(words=['这', '是', '5', '楼'], tags=[2]),
 TaggedDocument(words=['因为', '公积金', '贷款', '贷', '的', '少'], tags=[3]),
 TaggedDocument(words=['是', '呢'], tags=[4]),
 TaggedDocument(words=['这', '套', '一', '楼', '带', '院', '的', '，', '您', '看看'], tags=[5]),
 TaggedDocument(words=['房本', '都是', '五年', '外', '的'], tags=[6]),
 TaggedDocument(words=['好', '的', '?', '?', '，', '您', '先', '看', '下'], tags=[7]),
 TaggedDocument(words=['您', '是', '首套', '还是', '二套', '呢', '？'], tags=[8]),
 TaggedDocument(words=['所有', '费用', '下来', '654万'], tags=[9])]

In [23]:
# 保存好模型
d2v.save("d2v.pkt")

INFO:gensim.utils:saving Doc2Vec object under d2v.pkt, separately None
INFO:gensim.utils:saved d2v.pkt


In [44]:
# 利用词向量和句子向量之间来度量
import math
import numpy as np
import scipy.stats as stats

def distance_calc(data):
    
    def union(l1, l2):
        s1 = set(_[0] for _ in l1)
        s2 = set(_[0] for _ in l2)
        return s1.union(s2)
    
    def extractor(x):
        res = union(x['d1_key_words_tfidf'], x['d1_key_words_textrank'])
        if res == set():
            ans = list(psg.cut(str(x['d1'])))
            key_words = set([_.word for _ in ans if 'n' in _.flag or 'v' in _.flag or 'm' in _.flag or 'r' in _.flag])
            if key_words == set() and len(str(x['d1']))<=2:
                key_words.add(str(x['d1']))
            return key_words
        else:
            return res
        
    def cosine(w1, w2):
        return w1.dot(w2) / (math.sqrt((w1**2).sum()) * math.sqrt((w2**2).sum()))
    
    def euclidean(w1, w2):
        return math.sqrt(((w1-w2)**2).sum())
    
    def inner_product(w1, w2):
        return np.dot(w1, w2)
        
    # 单词之间的余弦相似度
    def ww_(x, method="cosine"):
        if method == "cosine":
            distance = cosine
        elif method == "euclidean":
            distance = euclidean
        elif "pearson" in method:
            distance = stats.pearsonr
        else:
            distance = inner_product
        cos = 0.
        num = 0
        if x['key_words'] == []:
            return 0.
        else:
            for i in range(len(x['key_words'])-1):
                for j in range(i+1, len(x['key_words'])):
                    w1, w2 = list(x['key_words'])[i], list(x['key_words'])[j]
                    if w1 not in w2v.wv.vocab or w2 not in w2v.wv.vocab:
                        break
                    w1, w2 = w2v.wv[w1], w2v.wv[w2]
                    dist = distance(w1, w2)
                    if method == "pearson_cor":
                        dist = dist[0]
                    elif method == "pearson_pvalue":
                        dist = dist[1]
                    cos += dist
                    num += 1
        return cos/num if num > 0 else 0
    
    def ww_(x, method="cosine"):
        if method == "cosine":
            distance = cosine
        elif method == "euclidean":
            distance = euclidean
        elif "pearson" in method:
            distance = stats.pearsonr
        else:
            distance = inner_product
        cos = 0.
        num = 0
        if x['key_words'] == []:
            return 0.
        else:
            for i in range(len(x['key_words'])-1):
                w1, w2 = list(x['key_words'])[i], x['d1']
                if w1 not in w2v.wv.vocab or w2 not in w2v.wv.vocab:
                    return 0
                w1, w2 = w2v.wv[w1], w2v.wv[w2]
                dist = distance(w1, w2)
                if method == "pearson_cor":
                    dist = dist[0]
                elif method == "pearson_pvalue":
                    dist = dist[1]
                cos += dist
                num += 1
        return cos/num if num > 0 else 0
        
    data['d1_key_words'] = data.apply(lambda x: extractor(x), axis=1)
    # 距离度量系列：余弦相似度、欧氏距离、皮尔逊相关系数
    data['ww_cosine'] =  data.apply(lambda x: ww_(x, method="cosine"), axis=1)
    data['ww_euclidean'] = data.apply(lambda x: ww_(x, method="euclidean"), axis=1)
    data['ww_inner_product'] = data.apply(lambda x: ww_(x, method=""), axis=1)
    data['ww_pearson_cor'] = data.apply(lambda x: ww_(x, method="pearson_cor"), axis=1)
    data['ww_pearson_pvalue'] = data.apply(lambda x: ww_(x, method="pearson_pvalue"), axis=1)
    
    # 计算关键词与文档向量的距离

distance_calc(df_raw)

In [None]:
# 共现矩阵和相似矩阵的构建
coocurance = np.zeros()

In [45]:
df_raw.head(40)

Unnamed: 0,cid,d1,rid,d2,label,d1_word_cut,d2_word_cut,d1_pos_tag,d2_pos_tag,shared_words,...,max_tfidf_sim,mean_tfidf_sim,max_textrank_sim,mean_textrank_sim,d1_key_words,ww_cosine,ww_euclidean,ww_inner_product,ww_pearson_cor,ww_pearson_pvalue
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1.0,"[采荷, 一小, 是, 分校, 吧]","[杭州市, 采荷, 第一, 小学, 钱江苑, 校区, ，, 杭州市, 钱江, 新城, 实验学...","[nr, d, v, n, y]","[ns, nr, m, n, nr, n, x, ns, nr, ns, n, x]",[采荷],...,2.316487,1.666371,1.0,0.667346,"{一小, 分校}",0.721863,6.525649,35.439222,0.721577,1.50332e-21
1,0,采荷一小是分校吧,1,是的,0.0,"[采荷, 一小, 是, 分校, 吧]","[是, 的]","[nr, d, v, n, y]","[v, uj]",[是],...,4.600099,4.244596,1.0,0.998063,"{一小, 分校}",0.814446,6.326831,72.129395,0.814207,2.414553e-72
2,0,采荷一小是分校吧,2,这是5楼,0.0,"[采荷, 一小, 是, 分校, 吧]","[这是, 5, 楼]","[nr, d, v, n, y]","[r, v, m, n]",[],...,4.600099,4.244596,1.0,0.998063,"{一小, 分校}",0.814446,6.326831,72.129395,0.814207,2.414553e-72
3,1,毛坯吗？,0,因为公积金贷款贷的少,0.0,"[毛坯, 吗, ？]","[因为, 公积金, 贷款, 贷, 的, 少]","[n, y, x]","[c, n, n, v, uj, n]",[],...,3.730876,2.803282,1.0,0.998063,{毛坯},0.358034,9.279583,28.669621,0.35798,0.01464102
4,1,毛坯吗？,1,是呢,0.0,"[毛坯, 吗, ？]","[是, 呢]","[n, y, x]","[v, y]",[],...,11.192627,11.192627,0.0,0.0,{毛坯},0.0,0.0,0.0,0.0,0.0
5,1,毛坯吗？,2,这套一楼带院的，您看看,0.0,"[毛坯, 吗, ？]","[这套, 一楼, 带院, 的, ，, 您, 看看]","[n, y, x]","[r, q, n, n, uj, x, zg, v]",[],...,2.988692,2.705259,1.0,0.668145,{毛坯},0.261414,8.436319,9.165813,0.264155,0.2499604
6,1,毛坯吗？,3,房本都是五年外的,0.0,"[毛坯, 吗, ？]","[房本, 都, 是, 五年, 外, 的]","[n, y, x]","[n, d, v, t, f, uj]",[],...,3.984923,3.291877,1.0,0.668145,{毛坯},0.188383,11.473662,14.102968,0.188384,0.1006249
7,1,毛坯吗？,4,好的??，您先看下,0.0,"[毛坯, 吗, ？]","[好, 的, ?, ?, ，, 您, 先, 看, 下]","[n, y, x]","[a, uj, x, x, x, r, d, v]",[],...,5.596314,5.580179,0.0,0.0,{毛坯},0.068315,9.121356,3.014279,0.068243,0.2386186
8,2,你们的佣金费大约是多少和契税是多少。,0,您是首套还是二套呢？,0.0,"[你们, 的, 佣金, 费, 大约, 是, 多少, 和, 契税, 是, 多少, 。]","[您, 是, 首套, 还是, 二套, 呢, ？]","[r, uj, n, v, d, v, m, c, n, v, m, x]","[r, v, m, c, m, y, x]",[是],...,3.263268,2.675451,1.0,0.668145,"{大约, 佣金, 契税}",0.450929,8.67743,25.828073,0.451183,3.532106e-09
9,2,你们的佣金费大约是多少和契税是多少。,1,所有费用下来654万,1.0,"[你们, 的, 佣金, 费, 大约, 是, 多少, 和, 契税, 是, 多少, 。]","[所有, 费用, 下来, 654, 万]","[r, uj, n, v, d, v, m, c, n, v, m, x]","[b, n, t, m, m]",[],...,1.631634,1.35886,1.0,0.779548,"{大约, 佣金, 契税}",0.399413,10.100416,31.169064,0.399436,0.04351321


In [63]:
from sklearn.cluster import KMeans
X = [w2v.wv[_] for k in df_raw.d1_key_words for _ in k if _ != set() and _ in w2v.wv]
kmeans = KMeans(n_clusters=20)
kmeans.fit(X)

In [91]:
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

data = df_raw[:len(train_df)][features]
target = df_raw[:len(train_df)]['label']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
print("Train data length:", len(X_train))
print("Test data length:", len(X_test))

# 转换为Dataset数据格式
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# 参数
params = {'num_leaves': 75, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,#提取的特征比率
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.1,#l1正则
          "verbosity": -1,
          "nthread": -1,#线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc', 'f1'},##评价函数选择
          "random_state": 2019,#随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          }


# 模型训练
gbm = lgb.train(params, lgb_train, num_boost_round=500, valid_sets=lgb_eval, early_stopping_rounds=5)

# 模型保存
gbm.save_model('model.txt')

# 模型加载
gbm = lgb.Booster(model_file='model.txt')

# 模型预测
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred = (y_pred>=0.5).astype(float)

# 模型评估
print('The f1 score of prediction is:', f1_score(y_test.values, y_pred))

Train data length: 17268
Test data length: 4317
[1]	valid_0's auc: 0.750424	valid_0's binary_logloss: 0.56407
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.757913	valid_0's binary_logloss: 0.558262
[3]	valid_0's auc: 0.767214	valid_0's binary_logloss: 0.552647
[4]	valid_0's auc: 0.76969	valid_0's binary_logloss: 0.5477
[5]	valid_0's auc: 0.773798	valid_0's binary_logloss: 0.542925
[6]	valid_0's auc: 0.78068	valid_0's binary_logloss: 0.53828
[7]	valid_0's auc: 0.780022	valid_0's binary_logloss: 0.53426
[8]	valid_0's auc: 0.781914	valid_0's binary_logloss: 0.530246
[9]	valid_0's auc: 0.782775	valid_0's binary_logloss: 0.52656
[10]	valid_0's auc: 0.782727	valid_0's binary_logloss: 0.523058
[11]	valid_0's auc: 0.782382	valid_0's binary_logloss: 0.519847
[12]	valid_0's auc: 0.782734	valid_0's binary_logloss: 0.516732
[13]	valid_0's auc: 0.782552	valid_0's binary_logloss: 0.513861
[14]	valid_0's auc: 0.783923	valid_0's binary_logloss: 0.510812
[15]	valid_0'

In [None]:
# nlp特征：包括主题模型、LDA、kmeans、词表示、句子表示、词向量的各种距离度量等等
def train_lda(all_df,  n_topics=15):
    ## 使用不包含停止词的分词结果
    corpus = all_df['tokens']
    cnt = CountVectorizer()
    cntIf = cnt.fit_transform(corpus)

    lda_path = os.path.join(model_path, 'lda.pkl')

    ## 使用LDA主题模型进行分类
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=150)
    print("正在训练LDA主题模型...")
    lda_pred = lda.fit_transform(cntIf)
    lda_classes = np.argmax(lda_pred, axis=1)
    ## 保存模型
    with open(lda_path, 'wb') as f:
        pickle.dump(lda, f)
    print("LDA主题模型已保存...")

    return lda_classes

In [64]:
# 康康关键词，大致判断有多少主题吧
df_raw.to_csv("tmp.tsv", sep='\t', index=None)