In [1]:
import numpy as np 
import pandas as pd
import time
import datetime
import gc
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectPercentile
import math
from sklearn.metrics import f1_score
import jieba
import jieba.posseg as psg
from collections import Counter
import functools
from gensim.models import word2vec
import Levenshtein


In [2]:
train_df = pd.read_table('../data/oppo_round1_train_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)
train_df['index'] = train_df.index
valid_df = pd.read_table('../data/oppo_round1_vali_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)



In [3]:
def get_float_list(x):
    return_list = []
    for temp in x:
        return_list.append(float(temp))
    return return_list

# 处理跟query_prediction相关的统计特征
def get_query_prediction_feature(df):
    df['query_prediction_dict'] = df['query_prediction'].map(lambda x : eval(x))
    df['query_prediction_keys'] = df['query_prediction_dict'].map(lambda x : list(x.keys()))
    df['query_prediction_values'] = df['query_prediction_dict'].map(lambda x : get_float_list(list(x.values())))
    df['query_prediction_number'] = df['query_prediction_keys'].map(lambda x : len(x))
    df['query_prediction_max'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.max(x))
    df['query_prediction_min'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.min(x))
    df['query_prediction_mean'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.mean(x))
    df['query_prediction_std'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.std(x))
    return df

train_df = get_query_prediction_feature(train_df)
valid_df = get_query_prediction_feature(valid_df)
print(train_df.head())



  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲,

In [4]:
def getBayesSmoothParam(origion_rate):
    origion_rate_mean = origion_rate.mean()
    origion_rate_var = origion_rate.var()
    alpha = origion_rate_mean / origion_rate_var * (origion_rate_mean * (1 - origion_rate_mean) - origion_rate_var)
    beta = (1 - origion_rate_mean) / origion_rate_var * (origion_rate_mean * (1 - origion_rate_mean) - origion_rate_var)
#     print('origion_rate_mean : ', origion_rate_mean)
#     print('origion_rate_var : ', origion_rate_var)
#     print('alpha : ', alpha)
#     print('beta : ', beta)
    return alpha, beta

skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)

# 统计单维度的转化率特征
def get_single_dimension_rate_feature(train_df, valid_df, fea_set):
    for fea in fea_set:
        train_temp_df = pd.DataFrame()
        for index, (train_index, test_index) in enumerate(skf.split(train_df, train_df['label'])):
            temp_df = train_df[[fea, 'label']].iloc[train_index].copy()
            temp_pivot_table = pd.pivot_table(temp_df, index=fea, values='label', aggfunc={len, np.mean, np.sum})
            temp_pivot_table.reset_index(inplace=True)
            temp_pivot_table.rename(columns={'len':fea + '_count', 'mean':fea + '_rate', 'sum':fea + '_click_number'}, inplace=True)
            alpha, beta = getBayesSmoothParam(temp_pivot_table[fea + '_rate'])
            temp_pivot_table[fea + '_rate'] = (temp_pivot_table[fea + '_click_number'] + alpha) / (temp_pivot_table[fea + '_count'] + alpha + beta)
#             del temp_pivot_table[fea + '_click_number']
            fea_df = train_df.iloc[test_index].copy()
            fea_df = pd.merge(fea_df, temp_pivot_table, on=fea, how='left')
#             print(fea_df.head())
            train_temp_df = pd.concat([train_temp_df, fea_df])
        temp_df = train_df[[fea, 'label']].copy()
        temp_pivot_table = pd.pivot_table(temp_df, index=fea, values='label', aggfunc={len, np.mean, np.sum})
        temp_pivot_table.reset_index(inplace=True)
        temp_pivot_table.rename(columns={'len':fea + '_count', 'mean':fea + '_rate', 'sum':fea + '_click_number'}, inplace=True)
        alpha, beta = getBayesSmoothParam(temp_pivot_table[fea + '_rate'])
        temp_pivot_table[fea + '_rate'] = (temp_pivot_table[fea + '_click_number'] + alpha) / (temp_pivot_table[fea + '_count'] + alpha + beta)
#             del temp_pivot_table[fea + '_click_number']
        valid_df = pd.merge(valid_df, temp_pivot_table, on=fea, how='left')
        print(fea + ' : finish!!!')
        train_df = train_temp_df
        train_df.sort_index(by='index', ascending=True, inplace=True)
    return train_df, valid_df
    
fea_set = ['prefix', 'title', 'tag', 'query_prediction']
train_df, valid_df = get_single_dimension_rate_feature(train_df, valid_df, fea_set)
print(train_df.head())




prefix : finish!!!




title : finish!!!
tag : finish!!!
query_prediction : finish!!!
  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
0   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
0     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
2   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
0  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
1  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
0  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
2  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

           

In [5]:
# 统计双维度交叉转化率
def get_jiaocha_dimension_rate_feature(train_df, valid_df, fea_set):
    for i in range(len(fea_set)):
        for j in range((i+1), len(fea_set)):
            fea1 = fea_set[i]
            fea2 = fea_set[j]
            train_temp_df = pd.DataFrame()
            for index, (train_index, test_index) in enumerate(skf.split(train_df, train_df['label'])):
                temp_df = train_df[[fea1, fea2, 'label']].iloc[train_index].copy()
                temp_pivot_table = pd.pivot_table(temp_df, index=[fea1, fea2], values='label', aggfunc={len, np.mean, np.sum})
                temp_pivot_table.reset_index(inplace=True)
                temp_pivot_table.rename(columns={'len':fea1 + '_' + fea2 + '_count', 'mean':fea1 + '_' + fea2 + '_rate', 'sum':fea1 + '_' + fea2 + '_click_number'}, inplace=True)
                alpha, beta = getBayesSmoothParam(temp_pivot_table[fea1 + '_' + fea2 + '_rate'])
                temp_pivot_table[fea1 + '_' + fea2 + '_rate'] = (temp_pivot_table[fea1 + '_' + fea2 + '_click_number'] + alpha) / (temp_pivot_table[fea1 + '_' + fea2 + '_count'] + alpha + beta)
#                 del temp_pivot_table[fea1 + '_' + fea2 + '_click_number']
                fea_df = train_df.iloc[test_index].copy()
                fea_df = pd.merge(fea_df, temp_pivot_table, on=[fea1, fea2], how='left')
                train_temp_df = pd.concat([train_temp_df, fea_df])
            temp_df = train_df[[fea1, fea2, 'label']].copy()
            temp_pivot_table = pd.pivot_table(temp_df, index=[fea1, fea2], values='label', aggfunc={len, np.mean, np.sum})
            temp_pivot_table.reset_index(inplace=True)
            temp_pivot_table.rename(columns={'len':fea1 + '_' + fea2 + '_count', 'mean':fea1 + '_' + fea2 + '_rate', 'sum':fea1 + '_' + fea2 + '_click_number'}, inplace=True)
            alpha, beta = getBayesSmoothParam(temp_pivot_table[fea1 + '_' + fea2 + '_rate'])
            temp_pivot_table[fea1 + '_' + fea2 + '_rate'] = (temp_pivot_table[fea1 + '_' + fea2 + '_click_number'] + alpha) / (temp_pivot_table[fea1 + '_' + fea2 + '_count'] + alpha + beta)
#             del temp_pivot_table[fea1 + '_' + fea2 + '_click_number']
            print(fea1 + '_' + fea2 + ' : finish!!!')
            valid_df = pd.merge(valid_df, temp_pivot_table, on=[fea1, fea2], how='left')
            train_df = train_temp_df
            train_df.sort_index(by='index', ascending=True, inplace=True)
    return train_df, valid_df

jiaocha_fea_set = ['prefix', 'title', 'tag']
train_df, valid_df = get_jiaocha_dimension_rate_feature(train_df, valid_df, jiaocha_fea_set)
print(train_df.head())
    


prefix_title : finish!!!




prefix_tag : finish!!!
title_tag : finish!!!
  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
0   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
0     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
2   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
0  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
1  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
0  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
2  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                             

In [6]:
# 统计一些是否交叉的特征
def get_is_title_in_query_feature(df):
    x = df['title']
    y = df['query_prediction_keys']
    is_title_in_query = np.nan
    if len(y) > 0:
        if x in y:
            is_title_in_query = 1
        else:
            is_title_in_query = 0
    return is_title_in_query

def get_is_prefix_in_title_feature(df):
    x = df['prefix']
    y = df['title']
    is_prefix_in_title = np.nan
    if x in y:
        is_prefix_in_title = 1
    else:
        is_prefix_in_title = 0
    return is_prefix_in_title

train_df['is_title_in_query'] = train_df[['title', 'query_prediction_keys']].apply(get_is_title_in_query_feature, axis = 1)
valid_df['is_title_in_query'] = valid_df[['title', 'query_prediction_keys']].apply(get_is_title_in_query_feature, axis = 1)

train_df['is_prefix_in_title'] = train_df[['prefix', 'title']].apply(get_is_prefix_in_title_feature, axis = 1)
valid_df['is_prefix_in_title'] = valid_df[['prefix', 'title']].apply(get_is_prefix_in_title_feature, axis = 1)



In [7]:
# 统计一些交叉种类特征
def get_jiaocha_type_feature(train_df, valid_df, jiaocha_type_list):
    for jiaocha_type in jiaocha_type_list:
        fea1 = jiaocha_type[0]
        fea2 = jiaocha_type[1]
        temp_df = pd.concat([train_df, valid_df])
        temp_pivot_table = pd.pivot_table(temp_df[[fea1, fea2, 'label']], index=[fea1, fea2], values='label', aggfunc=len)
        temp_pivot_table.reset_index(inplace=True)
        final_pivot_table = pd.pivot_table(temp_pivot_table, index=fea1, values=fea2, aggfunc=len)
        final_pivot_table.reset_index(inplace=True)
        final_pivot_table.rename(columns={fea2 : fea1 + '_' + fea2 + '_types'}, inplace=True)
        train_df = pd.merge(train_df, final_pivot_table[[fea1, fea1 + '_' + fea2 + '_types']], on=fea1, how='left')
        valid_df = pd.merge(valid_df, final_pivot_table[[fea1, fea1 + '_' + fea2 + '_types']], on=fea1, how='left')
    return train_df, valid_df

jiaocha_type_list = [['title', 'tag'], ['prefix', 'tag'], ['tag', 'title'], ['tag', 'prefix'], 
                     ['title', 'prefix'], ['prefix', 'title'], ['tag', 'query_prediction'], ['title', 'query_prediction']]
train_df, valid_df = get_jiaocha_type_feature(train_df, valid_df, jiaocha_type_list)
print(train_df.head())



  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲,

In [8]:
def get_key_len_list(x):
    return_list = []
    for temp in x:
        return_list.append(len(temp))
    return return_list

# 统计一些跟字符串长度相关的特征
def get_string_len_feature(df):
    df['prefix_len'] = df['prefix'].map(lambda x : len(x))
    df['title_len'] = df['title'].map(lambda x : len(x))
    df['query_prediction_key_len_list'] = df['query_prediction_keys'].map(lambda x : get_key_len_list(x))
    df['query_prediction_key_len_max'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.max(x))
    df['query_prediction_key_len_min'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.min(x))
    df['query_prediction_key_len_mean'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.mean(x))
    df['query_prediction_key_len_std'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.std(x))
    df['len_title-prefix'] = df['title_len'] - df['prefix_len']
    df['len_prefix/title'] = df['prefix_len'] / df['title_len']
    df['len_mean-title'] = df['query_prediction_key_len_mean'] - df['title_len']
    df['len_mean/title'] = df['query_prediction_key_len_mean'] / df['title_len']
    del df['query_prediction_key_len_list']
    return df

train_df = get_string_len_feature(train_df)
valid_df = get_string_len_feature(valid_df)
print(train_df.head())



  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲,

In [9]:
# 统计title跟prefix的编辑距离
def get_title_prefix_levenshtein_distance(df):
    title = df['title']
    prefix = df['prefix']
    return Levenshtein.distance(title, prefix)

def get_title_prefix_levenshtein_distance_rate(df):
    title_prefix_leven = df['title_prefix_leven']
    title = df['title']
    return (title_prefix_leven / (len(title) + 3))

train_df['title_prefix_leven'] = train_df[['title', 'prefix']].apply(get_title_prefix_levenshtein_distance, axis=1)
valid_df['title_prefix_leven'] = valid_df[['title', 'prefix']].apply(get_title_prefix_levenshtein_distance, axis=1)

train_df['title_prefix_leven_rate'] = train_df[['title', 'title_prefix_leven']].apply(get_title_prefix_levenshtein_distance_rate, axis=1)
valid_df['title_prefix_leven_rate'] = valid_df[['title', 'title_prefix_leven']].apply(get_title_prefix_levenshtein_distance_rate, axis=1)

print(train_df.head())


  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲,

In [10]:
# 统计title跟query_prediction编辑距离相关的特征
def get_title_query_levenshtein_distance_list(df):
    query_keys_list = df['query_prediction_keys']
    query_values_list = df['query_prediction_values']
    title = df['title']
    return_list = list()
    for i in range(len(query_keys_list)):
        distance = Levenshtein.distance(title, query_keys_list[i])
        return_list.append(distance * query_values_list[i])
    return return_list

def get_title_query_levenshtein_distance_feature(df):
    df['title_query_leven_list'] = df[['query_prediction_keys', 'query_prediction_values', 'title']].apply(get_title_query_levenshtein_distance_list, axis=1)
    df['title_query_leven_sum'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.sum(x))
    df['title_query_leven_max'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.max(x))
    df['title_query_leven_min'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.min(x))
    df['title_query_leven_mean'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.mean(x))
    df['title_query_leven_std'] = df['title_query_leven_list'].map(lambda x : np.nan if len(x) == 0 else np.std(x))
    return df

train_df = get_title_query_levenshtein_distance_feature(train_df)
valid_df = get_title_query_levenshtein_distance_feature(valid_df)
print(train_df.head())


  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲,

In [11]:
#分词方法，调用结巴接口
def jieba_seg_to_list(sentence, pos=False):
    if not pos:
        #不进行词性标注的分词方法
        seg_list = jieba.cut(sentence)
    else:
        #进行词性标注的分词方法
        seg_list = psg.cut(sentence)
    return seg_list

#去除干扰词
def jieba_word_filter(seg_list, pos=False):
    
    filter_list = []
    #根据pos参数选择是否词性过滤
    #不进行词性过滤，则将词性都标记为n，表示全部保留
    for seg in seg_list:
        if not pos:
            word = seg
            flag = 'n'
        else:
            word = seg.word
            flag = seg.flag
        if not flag.startswith('n'):
            continue
        filter_list.append(word)
    return filter_list

def jieba_word_deal(sentence, pos=False):
    #调用上面方式对数据集进行处理，处理后的每条数据仅保留非干扰词
    seg_list = jieba_seg_to_list(sentence, pos)
    filter_list = jieba_word_filter(seg_list, pos)
    return filter_list

def get_prefix_prediction_key_sentences(x):
    prefix_prediction_key_sentences = ""
    for temp in x:
        if len(prefix_prediction_key_sentences) > 0:
            prefix_prediction_key_sentences = prefix_prediction_key_sentences + temp
        else:
            prefix_prediction_key_sentences = temp
    return prefix_prediction_key_sentences

def get_max_query_key_sentences(x):
    if len(x) == 0:
        return ""
    else:
        return max(x, key=x.get)

def get_jieba_word(df):
    df['query_prediction_key_sentences'] = df['query_prediction_keys'].map(lambda x : get_prefix_prediction_key_sentences(x))
    df['query_prediction_key_max_sentences'] = df['query_prediction_dict'].map(lambda x : get_max_query_key_sentences(x))
    df['query_prediction_key_jieba_words'] = df['query_prediction_key_sentences'].map(lambda x : jieba_word_deal(x, False))
    df['query_prediction_key_max_jieba_words'] = df['query_prediction_key_max_sentences'].map(lambda x : jieba_word_deal(x, False))
    df['query_prediction_words'] = df['query_prediction_keys'].map(lambda x : [jieba_word_deal(j, False) for j in x] if len(x) > 0 else np.nan)
    df['title_jieba_words'] = df['title'].map(lambda x : jieba_word_deal(x, False))
    df['prefix_jieba_words'] = df['prefix'].map(lambda x : jieba_word_deal(x, False))
#     del df['query_prediction_key_sentences']
    return df

train_df = get_jieba_word(train_df)
valid_df = get_jieba_word(valid_df)
print(train_df.head())


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/jieba/__init__.py", line 152, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpejumvqsg' -> '/tmp/jieba.cache'
Loading model cost 2.079 seconds.
Prefix dict has been built succesfully.


  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label  index                              query_prediction_dict  \
0  阅读      0      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1      2  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1      3  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0      4  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲,

In [12]:
def word_match_share(df):
    q1words = {}
    q2words = {}
    for word in df[0]:
        q1words[word] = 1
    for word in df[1]:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def jaccard(df):
    wic = set(df[0]).intersection(set(df[1]))
    uw = set(df[0]).union(df[1])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))

def common_words(df):
    return len(set(df[0]).intersection(set(df[1])))

def total_unique_words(df):
    return len(set(df[0]).union(df[1]))

def wc_diff(df):
    return abs(len(df[0]) - len(df[1]))

def wc_ratio(df):
    l1 = len(df[0])*1.0 
    l2 = len(df[1])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique(df):
    return abs(len(set(df[0])) - len(set(df[1])))
    
def wc_ratio_unique(df):
    l1 = len(set(df[0])) * 1.0
    l2 = len(set(df[1]))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2
    
def tfidf_word_match_share(df, weights=None):
    q1words = {}
    q2words = {}
    for word in df[0]:
        q1words[word] = 1
    for word in df[1]:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def deal_word_for_all(train_df, valid_df, fea1, fea2, func, colName):
    train_df[colName] = train_df[[fea1, fea2]].apply(func, axis=1)
    valid_df[colName] = valid_df[[fea1, fea2]].apply(func, axis=1)
    print(colName + ' finish!!!')
    return train_df, valid_df
                   
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

def get_word_statistic_feature(train_df, valid_df, col_list):
    df = pd.concat([train_df[['query_prediction_key_jieba_words', 'title_jieba_words', 'prefix_jieba_words']], valid_df[['query_prediction_key_jieba_words', 'title_jieba_words', 'prefix_jieba_words']]])
    train_qs = pd.Series(df['query_prediction_key_jieba_words'].tolist() + df['title_jieba_words'].tolist() + df['prefix_jieba_words'].tolist())
    words = [x for y in train_qs for x in y]
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}
    for col in col_list:
        fea1 = col[0]
        fea2 = col[1]
        train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, word_match_share, fea1[0] + '_' + fea2[0] + '_word_match')
#         train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, jaccard, fea1[0] + '_' + fea2[0] + '_jaccard')
        train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, common_words, fea1[0] + '_' + fea2[0] + '_common_words')
        train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, total_unique_words, fea1[0] + '_' + fea2[0] + '_total_unique_words')
        train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, wc_diff, fea1[0] + '_' + fea2[0] + '_wc_diff')
        train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, wc_ratio, fea1[0] + '_' + fea2[0] + '_wc_ratio')
        train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, wc_diff_unique, fea1[0] + '_' + fea2[0] + '_wc_diff_unique')
        train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, wc_ratio_unique, fea1[0] + '_' + fea2[0] + '_wc_ratio_unique')
#         f = functools.partial(tfidf_word_match_share, weights=weights)
#         train_df, valid_df = deal_word_for_all(train_df, valid_df, fea1, fea2, f, fea1[0] + '_' + fea2[0] + '_tfidf_word_match_share')
    return train_df, valid_df

col_list = [['query_prediction_key_jieba_words', 'title_jieba_words'], ['prefix_jieba_words', 'title_jieba_words'], ['prefix_jieba_words', 'query_prediction_key_jieba_words']]
train_df, valid_df = get_word_statistic_feature(train_df, valid_df, col_list)
print(train_df.head())


q_t_word_match finish!!!
q_t_common_words finish!!!
q_t_total_unique_words finish!!!
q_t_wc_diff finish!!!
q_t_wc_ratio finish!!!
q_t_wc_diff_unique finish!!!
q_t_wc_ratio_unique finish!!!
p_t_word_match finish!!!
p_t_common_words finish!!!
p_t_total_unique_words finish!!!
p_t_wc_diff finish!!!
p_t_wc_ratio finish!!!
p_t_wc_diff_unique finish!!!
p_t_wc_ratio_unique finish!!!
p_q_word_match finish!!!
p_q_common_words finish!!!
p_q_total_unique_words finish!!!
p_q_wc_diff finish!!!
p_q_wc_ratio finish!!!
p_q_wc_diff_unique finish!!!
p_q_wc_ratio_unique finish!!!
  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010",

In [13]:
# Set values for various parameters
dir_num_features = 30  # Word vector dimensionality                      
dir_min_word_count = 1  # Minimum word count                        
dir_num_workers = 20       # Number of threads to run in parallel
dir_context = 5          # Context window size                                                                                    
dir_downsampling = 1e-3   # Downsample setting for frequent words

word2vec_df = train_df[['query_prediction_words', 'title_jieba_words', 'prefix_jieba_words', 'query_prediction_number']]
word2vec_df.reset_index(inplace=True)
word2vec_list = word2vec_df['title_jieba_words'].tolist() + word2vec_df['prefix_jieba_words'].tolist() + [y for x in word2vec_df['query_prediction_words'][word2vec_df.query_prediction_number > 0] for y in x]
dir_model = word2vec.Word2Vec(word2vec_list, workers=dir_num_workers, \
            size=dir_num_features, min_count = dir_min_word_count, \
            window = dir_context, sample = dir_downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
dir_model.init_sims(replace=True)

dir_word_wv = dir_model.wv


In [14]:
def get_dir_w2v_array(word_list, word_wv, num_features):
    word_vectors = np.zeros((len(word_list), num_features))
    for i in range(len(word_list)):
        if str(word_list[i]) in word_wv.vocab.keys():
            word_vectors[i][:] = word_wv[str(word_list[i])]
    mean_array = np.mean(word_vectors, axis=0)
    return mean_array

train_df['dir_title_jieba_array'] = train_df['title_jieba_words'].map(lambda x : get_dir_w2v_array(x, dir_word_wv, dir_num_features))
valid_df['dir_title_jieba_array'] = valid_df['title_jieba_words'].map(lambda x : get_dir_w2v_array(x, dir_word_wv, dir_num_features))

train_df['dir_prefix_jieba_array'] = train_df['prefix_jieba_words'].map(lambda x : get_dir_w2v_array(x, dir_word_wv, dir_num_features))
valid_df['dir_prefix_jieba_array'] = valid_df['prefix_jieba_words'].map(lambda x : get_dir_w2v_array(x, dir_word_wv, dir_num_features))

train_df['dir_query_max_jieba_array'] = train_df['query_prediction_key_max_jieba_words'].map(lambda x : get_dir_w2v_array(x, dir_word_wv, dir_num_features))
valid_df['dir_query_max_jieba_array'] = valid_df['query_prediction_key_max_jieba_words'].map(lambda x : get_dir_w2v_array(x, dir_word_wv, dir_num_features))


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


In [15]:
def get_dir_t_p_w2v_array(df):
    dir_title_jieba_array = df['dir_title_jieba_array']
    dir_prefix_jieba_array = df['dir_prefix_jieba_array']
    return_list = list()
    if (len(dir_title_jieba_array) == 30) & (len(dir_prefix_jieba_array) == 30):
        for i in range(30):
            return_list.append(dir_title_jieba_array[i] - dir_prefix_jieba_array[i])
    return return_list

def get_dir_t_maxQ_w2v_array(df):
    dir_title_jieba_array = df['dir_title_jieba_array']
    dir_query_max_jieba_array = df['dir_query_max_jieba_array']
    return_list = list()
    if (len(dir_title_jieba_array) == 30) & (len(dir_query_max_jieba_array) == 30):
        for i in range(30):
            return_list.append(dir_title_jieba_array[i] - dir_query_max_jieba_array[i])
    return return_list

train_df['dir_t_p_array'] = train_df.apply(get_dir_t_p_w2v_array, axis=1)
valid_df['dir_t_p_array'] = valid_df.apply(get_dir_t_p_w2v_array, axis=1)

train_df['dir_t_maxQ_array'] = train_df.apply(get_dir_t_maxQ_w2v_array, axis=1)
valid_df['dir_t_maxQ_array'] = valid_df.apply(get_dir_t_maxQ_w2v_array, axis=1)



In [16]:
def get_dir_similarity_feature(df, dir_num_features):
    for i in range(dir_num_features):
        df['dir_t_p_array_' + str(i) + '_fea'] = df['dir_t_p_array'].map(lambda x: np.nan if len(x) < (i+1) else x[i])
        df['dir_t_maxQ_array' + str(i) + '_fea'] = df['dir_t_maxQ_array'].map(lambda x: np.nan if len(x) < (i+1) else x[i])
    return df

train_df = get_dir_similarity_feature(train_df, dir_num_features)
valid_df = get_dir_similarity_feature(valid_df, dir_num_features)


In [17]:
def get_dir_w2v_feature(df, dir_num_features):
    for i in range(dir_num_features):
        df['dir_t_array_' + str(i) + '_fea'] = df['dir_title_jieba_array'].map(lambda x: np.nan if len(x) < (i+1) else x[i])
        df['dir_p_array' + str(i) + '_fea'] = df['dir_prefix_jieba_array'].map(lambda x: np.nan if len(x) < (i+1) else x[i])
        df['dir_maxQ_array' + str(i) + '_fea'] = df['dir_query_max_jieba_array'].map(lambda x: np.nan if len(x) < (i+1) else x[i])
    return df

train_df = get_dir_w2v_feature(train_df, dir_num_features)
valid_df = get_dir_w2v_feature(valid_df, dir_num_features)



In [18]:
# Set values for various parameters
num_features = 500  # Word vector dimensionality                      
min_word_count = 1  # Minimum word count                        
num_workers = 20       # Number of threads to run in parallel
context = 5          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

word2vec_df = train_df[['query_prediction_words', 'title_jieba_words', 'prefix_jieba_words', 'query_prediction_number']]
word2vec_df.reset_index(inplace=True)
word2vec_list = word2vec_df['title_jieba_words'].tolist() + word2vec_df['prefix_jieba_words'].tolist() + [y for x in word2vec_df['query_prediction_words'][word2vec_df.query_prediction_number > 0] for y in x]
model = word2vec.Word2Vec(word2vec_list, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

word_wv = model.wv



In [19]:
def get_w2v_array(word_list, word_wv, num_features):
    word_vectors = np.zeros((len(word_list), num_features))
    for i in range(len(word_list)):
        if str(word_list[i]) in word_wv.vocab.keys():
            word_vectors[i][:] = word_wv[str(word_list[i])]
    mean_array = np.mean(word_vectors, axis=0)
    return mean_array

train_df['title_jieba_array'] = train_df['title_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))
valid_df['title_jieba_array'] = valid_df['title_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))

train_df['prefix_jieba_array'] = train_df['prefix_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))
valid_df['prefix_jieba_array'] = valid_df['prefix_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))




In [20]:
def get_title_prefix_similarity(df, f_similarity):
    title_array = df['title_jieba_array']
    prefix_array = df['prefix_jieba_array']
    similarity = 0
    if f_similarity == 'dot':
        similarity = np.dot(title_array, prefix_array)
    elif f_similarity == 'norm':
        similarity = np.linalg.norm(title_array - prefix_array)
    else:
        similarity = np.dot(title_array,prefix_array) / (np.linalg.norm(title_array) * np.linalg.norm(prefix_array))
    return similarity

# def get_title_query_similarity(df, f_similarity, word_wv, num_features):
#     title_array = df['title_jieba_array']
#     query_prediction_words = df['query_prediction_words']
#     query_prediction_keys = df['query_prediction_keys']
#     query_prediction_dict = df['query_prediction_dict']
#     if len(query_prediction_keys) <= 0:
#         return np.nan
#     similarity = 0
#     if f_similarity == 'dot':
#         i = 0
#         for key in query_prediction_keys:
#             key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
#             similarity = similarity + np.dot(title_array, key_array) * float(query_prediction_dict[key])
#             i = i + 1
#     elif f_similarity == 'norm':
#         i = 0
#         for key in query_prediction_keys:
#             key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
#             similarity = similarity + np.linalg.norm(title_array - key_array) * float(query_prediction_dict[key])
#             i = i + 1
#     else:
#         i = 0
#         for key in query_prediction_keys:
#             key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
#             similarity = similarity + (np.dot(title_array, key_array) / (np.linalg.norm(title_array) * np.linalg.norm(key_array))) * float(query_prediction_dict[key])
#             i = i + 1
#     return similarity

def get_title_query_similarity_list(df, f_similarity, word_wv, num_features):
    title_array = df['title_jieba_array']
    query_prediction_words = df['query_prediction_words']
    query_prediction_keys = df['query_prediction_keys']
    query_prediction_dict = df['query_prediction_dict']
    similarity_list = list()
    if len(query_prediction_keys) <= 0:
        return similarity_list
    if f_similarity == 'dot':
        i = 0
        for key in query_prediction_keys:
            key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
            similarity = np.dot(title_array, key_array) * float(query_prediction_dict[key])
            similarity_list.append(similarity)
            i = i + 1
    elif f_similarity == 'norm':
        i = 0
        for key in query_prediction_keys:
            key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
            similarity = np.linalg.norm(title_array - key_array) * float(query_prediction_dict[key])
            similarity_list.append(similarity)
            i = i + 1
    else:
        i = 0
        for key in query_prediction_keys:
            key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
            similarity = (np.dot(title_array, key_array) / (np.linalg.norm(title_array) * np.linalg.norm(key_array))) * float(query_prediction_dict[key])
            similarity_list.append(similarity)
            i = i + 1
    return similarity_list

def get_similarity_feature(train_df, valid_df):
    f_list = ['dot', 'norm', 'cosine']
    for fun in f_list:
        f_prefix_similarity = functools.partial(get_title_prefix_similarity, f_similarity=fun)
        train_df['title_prefix_' + fun + '_similarity'] = train_df[['title_jieba_array', 'prefix_jieba_array']].apply(f_prefix_similarity, axis=1)
        valid_df['title_prefix_' + fun + '_similarity'] = valid_df[['title_jieba_array', 'prefix_jieba_array']].apply(f_prefix_similarity, axis=1)
#         f_query_similarity = functools.partial(get_title_query_similarity, f_similarity=fun, word_wv=word_wv, num_features=num_features)
#         train_df['title_query_' + fun + '_similarity'] = train_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity, axis=1)
#         valid_df['title_query_' + fun + '_similarity'] = valid_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity, axis=1)
        f_query_similarity_list = functools.partial(get_title_query_similarity_list, f_similarity=fun, word_wv=word_wv, num_features=num_features)
        train_df['title_query_' + fun + '_similarity_list'] = train_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity_list, axis=1)
        valid_df['title_query_' + fun + '_similarity_list'] = valid_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity_list, axis=1)
        train_df['title_query_' + fun + '_similarity'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.sum(x))
        train_df['title_query_' + fun + '_similarity_max'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.max(x))
        train_df['title_query_' + fun + '_similarity_min'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.min(x))
        train_df['title_query_' + fun + '_similarity_mean'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.mean(x))
        train_df['title_query_' + fun + '_similarity_std'] = train_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.std(x))
        valid_df['title_query_' + fun + '_similarity'] = valid_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.sum(x))
        valid_df['title_query_' + fun + '_similarity_max'] = valid_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.max(x))
        valid_df['title_query_' + fun + '_similarity_min'] = valid_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.min(x))
        valid_df['title_query_' + fun + '_similarity_mean'] = valid_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.mean(x))
        valid_df['title_query_' + fun + '_similarity_std'] = valid_df['title_query_' + fun + '_similarity_list'].map(lambda x : np.nan if len(x)==0 else np.std(x))
        print(fun + ' : finish!!!')
    return train_df, valid_df

train_df, valid_df = get_similarity_feature(train_df, valid_df)




dot : finish!!!
norm : finish!!!


  # Remove the CWD from sys.path while we load stuff.
  return umr_maximum(a, axis, None, out, keepdims)
  return umr_minimum(a, axis, None, out, keepdims)


cosine : finish!!!


In [21]:
print(train_df.columns.values)


['prefix' 'query_prediction' 'title' 'tag' 'label' 'index'
 'query_prediction_dict' 'query_prediction_keys' 'query_prediction_values'
 'query_prediction_number' 'query_prediction_max' 'query_prediction_min'
 'query_prediction_mean' 'query_prediction_std' 'prefix_count'
 'prefix_rate' 'prefix_click_number' 'title_count' 'title_rate'
 'title_click_number' 'tag_count' 'tag_rate' 'tag_click_number'
 'query_prediction_count' 'query_prediction_rate'
 'query_prediction_click_number' 'prefix_title_count' 'prefix_title_rate'
 'prefix_title_click_number' 'prefix_tag_count' 'prefix_tag_rate'
 'prefix_tag_click_number' 'title_tag_count' 'title_tag_rate'
 'title_tag_click_number' 'is_title_in_query' 'is_prefix_in_title'
 'title_tag_types' 'prefix_tag_types' 'tag_title_types' 'tag_prefix_types'
 'title_prefix_types' 'prefix_title_types' 'tag_query_prediction_types'
 'title_query_prediction_types' 'prefix_len' 'title_len'
 'query_prediction_key_len_max' 'query_prediction_key_len_min'
 'query_predicti

In [26]:
fea = [
    'query_prediction_number', 'query_prediction_max', 'query_prediction_min', 'query_prediction_mean', 'query_prediction_std',
       'prefix_count', 'prefix_rate',
 'title_count', 'title_rate', 'tag_count', 'tag_rate',
 'query_prediction_count', 'query_prediction_rate', 'prefix_title_count',
 'prefix_title_rate',  'prefix_tag_count', 'prefix_tag_rate',
 'title_tag_count', 'title_tag_rate',
    'prefix_click_number', 'title_click_number', 'query_prediction_click_number', 'prefix_tag_click_number', 
    'prefix_title_click_number', 'title_tag_click_number',
    'is_title_in_query', 'is_prefix_in_title', 
    'title_tag_types', 'prefix_tag_types', 'tag_title_types', 'tag_prefix_types',
 'title_prefix_types', 'prefix_title_types', 'tag_query_prediction_types', 'title_query_prediction_types',
      'prefix_len', 'title_len',
 'query_prediction_key_len_max', 'query_prediction_key_len_min',
 'query_prediction_key_len_mean', 'query_prediction_key_len_std',
 'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title',
    'q_t_word_match', 'q_t_common_words',
 'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio',
 'q_t_wc_diff_unique', 'q_t_wc_ratio_unique',
 'p_t_word_match', 'p_t_common_words',
 'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio',
 'p_t_wc_diff_unique', 'p_t_wc_ratio_unique',
 'p_q_word_match', 'p_q_common_words',
 'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio',
 'p_q_wc_diff_unique', 'p_q_wc_ratio_unique',
    'title_prefix_dot_similarity',
 'title_query_dot_similarity', 'title_prefix_norm_similarity',
 'title_query_norm_similarity', 'title_prefix_cosine_similarity',
 'title_query_cosine_similarity',
    'title_query_dot_similarity_max', 'title_query_dot_similarity_min',
 'title_query_dot_similarity_mean', 'title_query_dot_similarity_std',
    'title_query_norm_similarity_min', 'title_query_norm_similarity_mean',
 'title_query_norm_similarity_std',
    'title_query_cosine_similarity_max', 'title_query_cosine_similarity_min',
 'title_query_cosine_similarity_mean', 'title_query_cosine_similarity_std',
    'title_prefix_leven', 'title_prefix_leven_rate',
 'title_query_leven_sum', 'title_query_leven_max', 'title_query_leven_min',
 'title_query_leven_mean', 'title_query_leven_std',
    
#      'dir_t_array_0_fea', 'dir_p_array0_fea', 'dir_maxQ_array0_fea',
#  'dir_t_array_1_fea', 'dir_p_array1_fea', 'dir_maxQ_array1_fea',
#  'dir_t_array_2_fea', 'dir_p_array2_fea', 'dir_maxQ_array2_fea',
#  'dir_t_array_3_fea', 'dir_p_array3_fea', 'dir_maxQ_array3_fea',
#  'dir_t_array_4_fea', 'dir_p_array4_fea', 'dir_maxQ_array4_fea',
#  'dir_t_array_5_fea', 'dir_p_array5_fea', 'dir_maxQ_array5_fea',
#  'dir_t_array_6_fea', 'dir_p_array6_fea', 'dir_maxQ_array6_fea',
#  'dir_t_array_7_fea', 'dir_p_array7_fea', 'dir_maxQ_array7_fea',
#  'dir_t_array_8_fea', 'dir_p_array8_fea', 'dir_maxQ_array8_fea',
#  'dir_t_array_9_fea', 'dir_p_array9_fea', 'dir_maxQ_array9_fea',
#  'dir_t_array_10_fea', 'dir_p_array10_fea', 'dir_maxQ_array10_fea',
#  'dir_t_array_11_fea', 'dir_p_array11_fea', 'dir_maxQ_array11_fea',
#  'dir_t_array_12_fea', 'dir_p_array12_fea', 'dir_maxQ_array12_fea',
#  'dir_t_array_13_fea', 'dir_p_array13_fea', 'dir_maxQ_array13_fea',
#  'dir_t_array_14_fea', 'dir_p_array14_fea', 'dir_maxQ_array14_fea',
#  'dir_t_array_15_fea', 'dir_p_array15_fea', 'dir_maxQ_array15_fea',
#  'dir_t_array_16_fea', 'dir_p_array16_fea', 'dir_maxQ_array16_fea',
#  'dir_t_array_17_fea', 'dir_p_array17_fea', 'dir_maxQ_array17_fea',
#  'dir_t_array_18_fea', 'dir_p_array18_fea', 'dir_maxQ_array18_fea',
#  'dir_t_array_19_fea', 'dir_p_array19_fea', 'dir_maxQ_array19_fea',
#  'dir_t_array_20_fea', 'dir_p_array20_fea', 'dir_maxQ_array20_fea',
#  'dir_t_array_21_fea', 'dir_p_array21_fea', 'dir_maxQ_array21_fea',
#  'dir_t_array_22_fea', 'dir_p_array22_fea', 'dir_maxQ_array22_fea',
#  'dir_t_array_23_fea', 'dir_p_array23_fea', 'dir_maxQ_array23_fea',
#  'dir_t_array_24_fea', 'dir_p_array24_fea', 'dir_maxQ_array24_fea',
#  'dir_t_array_25_fea', 'dir_p_array25_fea', 'dir_maxQ_array25_fea',
#  'dir_t_array_26_fea', 'dir_p_array26_fea', 'dir_maxQ_array26_fea',
#  'dir_t_array_27_fea', 'dir_p_array27_fea', 'dir_maxQ_array27_fea',
#  'dir_t_array_28_fea', 'dir_p_array28_fea', 'dir_maxQ_array28_fea',
#  'dir_t_array_29_fea', 'dir_p_array29_fea', 'dir_maxQ_array29_fea',
    
#     'dir_t_p_array_0_fea', 'dir_t_maxQ_array0_fea', 'dir_t_p_array_1_fea',
#  'dir_t_maxQ_array1_fea', 'dir_t_p_array_2_fea', 'dir_t_maxQ_array2_fea',
#  'dir_t_p_array_3_fea', 'dir_t_maxQ_array3_fea', 'dir_t_p_array_4_fea',
#  'dir_t_maxQ_array4_fea', 'dir_t_p_array_5_fea', 'dir_t_maxQ_array5_fea',
#  'dir_t_p_array_6_fea', 'dir_t_maxQ_array6_fea', 'dir_t_p_array_7_fea',
#  'dir_t_maxQ_array7_fea', 'dir_t_p_array_8_fea', 'dir_t_maxQ_array8_fea',
#  'dir_t_p_array_9_fea', 'dir_t_maxQ_array9_fea', 'dir_t_p_array_10_fea',
#  'dir_t_maxQ_array10_fea', 'dir_t_p_array_11_fea', 'dir_t_maxQ_array11_fea',
#  'dir_t_p_array_12_fea', 'dir_t_maxQ_array12_fea', 'dir_t_p_array_13_fea',
#  'dir_t_maxQ_array13_fea', 'dir_t_p_array_14_fea', 'dir_t_maxQ_array14_fea',
#  'dir_t_p_array_15_fea', 'dir_t_maxQ_array15_fea', 'dir_t_p_array_16_fea',
#  'dir_t_maxQ_array16_fea', 'dir_t_p_array_17_fea', 'dir_t_maxQ_array17_fea',
#  'dir_t_p_array_18_fea', 'dir_t_maxQ_array18_fea', 'dir_t_p_array_19_fea',
#  'dir_t_maxQ_array19_fea', 'dir_t_p_array_20_fea', 'dir_t_maxQ_array20_fea',
#  'dir_t_p_array_21_fea', 'dir_t_maxQ_array21_fea', 'dir_t_p_array_22_fea',
#  'dir_t_maxQ_array22_fea', 'dir_t_p_array_23_fea', 'dir_t_maxQ_array23_fea',
#  'dir_t_p_array_24_fea', 'dir_t_maxQ_array24_fea', 'dir_t_p_array_25_fea',
#  'dir_t_maxQ_array25_fea', 'dir_t_p_array_26_fea', 'dir_t_maxQ_array26_fea',
#  'dir_t_p_array_27_fea', 'dir_t_maxQ_array27_fea', 'dir_t_p_array_28_fea',
#  'dir_t_maxQ_array28_fea', 'dir_t_p_array_29_fea', 'dir_t_maxQ_array29_fea',
    
      ]


In [27]:
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=127, max_depth=-1, n_estimators=5000, objective='binary',
    subsample=0.8, colsample_bytree=1, subsample_freq=1,
    learning_rate=0.01, random_state=2018, n_jobs=-1
)

valid_df['predicted_score'] = 0

lgb_model.fit(train_df[fea], train_df['label'], eval_set=[(train_df[fea], train_df['label']),
                            (valid_df[fea], valid_df['label'])], early_stopping_rounds=50, eval_metric='auc')
valid_pred = lgb_model.predict_proba(valid_df[fea], num_iteration=lgb_model.best_iteration_)[:, 1]
print(np.mean(valid_pred))

fscore = lgb_model.booster_.feature_importance()
feaNames = lgb_model.booster_.feature_name()
scoreDf = pd.DataFrame(index=feaNames, columns=['importance'], data=fscore)
print(scoreDf.sort_index(by=['importance'], ascending=False))



[1]	valid_0's auc: 0.870449	valid_1's auc: 0.860477
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.871108	valid_1's auc: 0.864553
[3]	valid_0's auc: 0.87155	valid_1's auc: 0.865492
[4]	valid_0's auc: 0.871637	valid_1's auc: 0.865038
[5]	valid_0's auc: 0.871715	valid_1's auc: 0.865244
[6]	valid_0's auc: 0.871844	valid_1's auc: 0.865219
[7]	valid_0's auc: 0.871886	valid_1's auc: 0.864944
[8]	valid_0's auc: 0.872025	valid_1's auc: 0.865179
[9]	valid_0's auc: 0.872161	valid_1's auc: 0.865336
[10]	valid_0's auc: 0.872398	valid_1's auc: 0.865709
[11]	valid_0's auc: 0.872446	valid_1's auc: 0.865949
[12]	valid_0's auc: 0.872471	valid_1's auc: 0.865832
[13]	valid_0's auc: 0.872504	valid_1's auc: 0.865984
[14]	valid_0's auc: 0.872601	valid_1's auc: 0.866228
[15]	valid_0's auc: 0.872665	valid_1's auc: 0.866172
[16]	valid_0's auc: 0.872667	valid_1's auc: 0.866035
[17]	valid_0's auc: 0.872768	valid_1's auc: 0.866252
[18]	valid_0's auc: 0.872825	valid_1's auc: 0.8

[155]	valid_0's auc: 0.876875	valid_1's auc: 0.871427
[156]	valid_0's auc: 0.876892	valid_1's auc: 0.871435
[157]	valid_0's auc: 0.876917	valid_1's auc: 0.871461
[158]	valid_0's auc: 0.87694	valid_1's auc: 0.8715
[159]	valid_0's auc: 0.876961	valid_1's auc: 0.871521
[160]	valid_0's auc: 0.876979	valid_1's auc: 0.871537
[161]	valid_0's auc: 0.877002	valid_1's auc: 0.871567
[162]	valid_0's auc: 0.877019	valid_1's auc: 0.87159
[163]	valid_0's auc: 0.877045	valid_1's auc: 0.871618
[164]	valid_0's auc: 0.87706	valid_1's auc: 0.871632
[165]	valid_0's auc: 0.87708	valid_1's auc: 0.871669
[166]	valid_0's auc: 0.877097	valid_1's auc: 0.871687
[167]	valid_0's auc: 0.877115	valid_1's auc: 0.871701
[168]	valid_0's auc: 0.877137	valid_1's auc: 0.871739
[169]	valid_0's auc: 0.877153	valid_1's auc: 0.87174
[170]	valid_0's auc: 0.877171	valid_1's auc: 0.871766
[171]	valid_0's auc: 0.877192	valid_1's auc: 0.871794
[172]	valid_0's auc: 0.877208	valid_1's auc: 0.871805
[173]	valid_0's auc: 0.877231	valid

[308]	valid_0's auc: 0.879347	valid_1's auc: 0.873079
[309]	valid_0's auc: 0.879361	valid_1's auc: 0.873089
[310]	valid_0's auc: 0.879374	valid_1's auc: 0.873089
[311]	valid_0's auc: 0.879386	valid_1's auc: 0.873088
[312]	valid_0's auc: 0.879399	valid_1's auc: 0.873096
[313]	valid_0's auc: 0.879411	valid_1's auc: 0.873102
[314]	valid_0's auc: 0.879424	valid_1's auc: 0.873107
[315]	valid_0's auc: 0.879434	valid_1's auc: 0.873107
[316]	valid_0's auc: 0.879448	valid_1's auc: 0.873118
[317]	valid_0's auc: 0.87946	valid_1's auc: 0.873119
[318]	valid_0's auc: 0.879472	valid_1's auc: 0.873115
[319]	valid_0's auc: 0.879484	valid_1's auc: 0.873116
[320]	valid_0's auc: 0.8795	valid_1's auc: 0.873131
[321]	valid_0's auc: 0.879512	valid_1's auc: 0.873125
[322]	valid_0's auc: 0.879523	valid_1's auc: 0.87312
[323]	valid_0's auc: 0.879535	valid_1's auc: 0.873125
[324]	valid_0's auc: 0.879548	valid_1's auc: 0.873133
[325]	valid_0's auc: 0.87956	valid_1's auc: 0.873139
[326]	valid_0's auc: 0.879573	val

[461]	valid_0's auc: 0.881318	valid_1's auc: 0.873995
[462]	valid_0's auc: 0.881333	valid_1's auc: 0.874004
[463]	valid_0's auc: 0.881344	valid_1's auc: 0.87401
[464]	valid_0's auc: 0.881356	valid_1's auc: 0.874014
[465]	valid_0's auc: 0.881368	valid_1's auc: 0.874013
[466]	valid_0's auc: 0.881381	valid_1's auc: 0.874027
[467]	valid_0's auc: 0.88139	valid_1's auc: 0.87403
[468]	valid_0's auc: 0.881402	valid_1's auc: 0.87404
[469]	valid_0's auc: 0.881412	valid_1's auc: 0.874044
[470]	valid_0's auc: 0.881423	valid_1's auc: 0.874048
[471]	valid_0's auc: 0.881436	valid_1's auc: 0.874043
[472]	valid_0's auc: 0.881447	valid_1's auc: 0.874044
[473]	valid_0's auc: 0.881459	valid_1's auc: 0.874051
[474]	valid_0's auc: 0.881471	valid_1's auc: 0.874052
[475]	valid_0's auc: 0.881485	valid_1's auc: 0.874058
[476]	valid_0's auc: 0.881499	valid_1's auc: 0.874064
[477]	valid_0's auc: 0.881511	valid_1's auc: 0.874067
[478]	valid_0's auc: 0.881522	valid_1's auc: 0.874073
[479]	valid_0's auc: 0.881535	va

[614]	valid_0's auc: 0.882812	valid_1's auc: 0.874304
[615]	valid_0's auc: 0.882821	valid_1's auc: 0.874303
[616]	valid_0's auc: 0.882829	valid_1's auc: 0.874308
[617]	valid_0's auc: 0.882838	valid_1's auc: 0.874306
[618]	valid_0's auc: 0.882847	valid_1's auc: 0.874309
[619]	valid_0's auc: 0.882854	valid_1's auc: 0.874306
[620]	valid_0's auc: 0.882862	valid_1's auc: 0.874309
[621]	valid_0's auc: 0.882871	valid_1's auc: 0.874306
[622]	valid_0's auc: 0.88288	valid_1's auc: 0.874291
[623]	valid_0's auc: 0.882888	valid_1's auc: 0.87429
[624]	valid_0's auc: 0.882896	valid_1's auc: 0.874288
[625]	valid_0's auc: 0.882904	valid_1's auc: 0.874293
[626]	valid_0's auc: 0.882911	valid_1's auc: 0.874294
[627]	valid_0's auc: 0.882918	valid_1's auc: 0.874292
[628]	valid_0's auc: 0.882925	valid_1's auc: 0.874291
[629]	valid_0's auc: 0.882932	valid_1's auc: 0.874289
[630]	valid_0's auc: 0.88294	valid_1's auc: 0.874291
[631]	valid_0's auc: 0.882946	valid_1's auc: 0.87429
[632]	valid_0's auc: 0.882954	va



In [28]:
valid_df['predicted_score'] = valid_pred

train_prefix_set = set(train_df['prefix'])

valid_df['is_prefix_in_train'] = valid_df['prefix'].map(lambda x : 1 if x in train_prefix_set else 0)
print(np.mean(valid_df[valid_df.is_prefix_in_train == 1]['predicted_score']))
print(np.mean(valid_df[valid_df.is_prefix_in_train == 0]['predicted_score']))


0.3717271210240937
0.4541913149487511


In [29]:
# # 导出预测结果
# def exportResult(df, fileName):
#     df.to_csv('../result/%s.csv' % fileName, header=True, index=False)

# exportResult(valid_df[['predicted_score', 'is_prefix_in_train']], 'valid_29_pred')


In [27]:
valid_prefix0_df = valid_df[valid_df.is_prefix_in_train == 1].copy()

#定义调整函数
def resultAdjustment(result_df, t):
    result_df_temp = result_df.copy()
    result_df_temp['x'] = result_df_temp.predicted_score.map(lambda x: -(math.log(((1 - x) / x), math.e)))
    result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) 
    print(result_df_temp['adjust_result'].mean())
    return result_df_temp['adjust_result']

print('original mean : ', valid_prefix0_df['predicted_score'].mean())
valid_df_after = resultAdjustment(valid_prefix0_df, 0.61455)


original mean :  0.3728237517271243
0.45672466695017666


In [29]:
valid_df['predicted_score'][valid_df.is_prefix_in_train == 1] = valid_df_after
print(np.mean(valid_df['predicted_score'][valid_df.is_prefix_in_train == 0]))
print(np.mean(valid_df['predicted_score'][valid_df.is_prefix_in_train == 1]))


0.456716771801509
0.45672466695017666


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
yuzhi_dict = {}
# 定义搜索方法获取最佳F1对应的阈值
for yuzhi in range(350, 700, 2):
    real_yuzhi = yuzhi / 1000
    valid_df['predicted_label'] = valid_df['predicted_score'].map(lambda x : 1 if x > real_yuzhi else 0)
    f1 = f1_score(valid_df['label'], valid_df['predicted_label'])
    yuzhi_dict[str(real_yuzhi)] = f1
print(yuzhi_dict)


{'0.35': 0.7226205354206408, '0.352': 0.7231884057971014, '0.354': 0.7236888419570574, '0.356': 0.7242314049586777, '0.358': 0.7247355986840652, '0.36': 0.7253583436559901, '0.362': 0.725551910630375, '0.364': 0.7256904360181156, '0.366': 0.7260359238840477, '0.368': 0.7265233374178456, '0.37': 0.7270575097710776, '0.372': 0.7278507937218727, '0.374': 0.7281165919282511, '0.376': 0.7284649240995239, '0.378': 0.7286964739767331, '0.38': 0.7287894843640791, '0.382': 0.729214650228063, '0.384': 0.7295017077197983, '0.386': 0.7301205366263365, '0.388': 0.730686183148851, '0.39': 0.7315783475783475, '0.392': 0.7323043200292263, '0.394': 0.7327412561801867, '0.396': 0.7331987705858066, '0.398': 0.7336106068613708, '0.4': 0.7342390052958784, '0.402': 0.7344272888827345, '0.404': 0.734647652945663, '0.406': 0.7342628190282089, '0.408': 0.7344619666048238, '0.41': 0.7348835048433181, '0.412': 0.7354022239799003, '0.414': 0.735841141099147, '0.416': 0.7358983937243182, '0.418': 0.736148127367092