In [1]:
import numpy as np 
import pandas as pd
import time
import datetime
import gc
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectPercentile
import math
from sklearn.metrics import f1_score
import jieba
import jieba.posseg as psg
from collections import Counter
import functools
from gensim.models import word2vec



In [2]:
train_df = pd.read_table('../data/oppo_round1_train_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)
valid_df = pd.read_table('../data/oppo_round1_vali_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, quoting=3)
test_df = pd.read_table('../data/oppo_round1_test_A_20180929.txt', names=['prefix', 'query_prediction', 'title', 'tag'], header=None, quoting=3)
print(test_df.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
prefix              50000 non-null object
query_prediction    50000 non-null object
title               50000 non-null object
tag                 50000 non-null object
dtypes: object(4)
memory usage: 1.5+ MB
None


In [3]:
def get_float_list(x):
    return_list = []
    for temp in x:
        return_list.append(float(temp))
    return return_list

# 处理跟query_prediction相关的统计特征
def get_query_prediction_feature(df):
    df['query_prediction_dict'] = df['query_prediction'].map(lambda x : eval(x))
    df['query_prediction_keys'] = df['query_prediction_dict'].map(lambda x : list(x.keys()))
    df['query_prediction_values'] = df['query_prediction_dict'].map(lambda x : get_float_list(list(x.values())))
    df['query_prediction_number'] = df['query_prediction_keys'].map(lambda x : len(x))
    df['query_prediction_max'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.max(x))
    df['query_prediction_min'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.min(x))
    df['query_prediction_mean'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.mean(x))
    df['query_prediction_std'] = df['query_prediction_values'].map(lambda x : np.nan if len(x) == 0 else np.std(x))
    return df

train_df = get_query_prediction_feature(train_df)
valid_df = get_query_prediction_feature(valid_df)
test_df = get_query_prediction_feature(test_df)
print(train_df.head())
    


  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label                              query_prediction_dict  \
0  阅读      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲, 小品剧本, 小品搞笑大全, 小品不差钱, 小品搞笑大全剧本,...   
1  [

In [4]:
# 增加一个冷启动特征，根据prefix
prefix_pivot_table = pd.pivot_table(train_df, index='prefix', values='label', aggfunc=len)
prefix_pivot_table.reset_index(inplace=True)
prefix_pivot_table.rename(columns={'label' : 'prefix_number'}, inplace=True)
prefix_repeat_set = set(prefix_pivot_table['prefix'][prefix_pivot_table.prefix_number > 1])
train_df['is_repeat_prefix'] = train_df['prefix'].map(lambda x : 1 if x in prefix_repeat_set else 0)
valid_df['is_repeat_prefix'] = valid_df['prefix'].map(lambda x : 1 if x in prefix_repeat_set else 0)
test_df['is_repeat_prefix'] = test_df['prefix'].map(lambda x : 1 if x in prefix_repeat_set else 0)



In [5]:
def getBayesSmoothParam(origion_rate):
    origion_rate_mean = origion_rate.mean()
    origion_rate_var = origion_rate.var()
    alpha = origion_rate_mean / origion_rate_var * (origion_rate_mean * (1 - origion_rate_mean) - origion_rate_var)
    beta = (1 - origion_rate_mean) / origion_rate_var * (origion_rate_mean * (1 - origion_rate_mean) - origion_rate_var)
    print('origion_rate_mean : ', origion_rate_mean)
    print('origion_rate_var : ', origion_rate_var)
    print('alpha : ', alpha)
    print('beta : ', beta)
    return alpha, beta

# 统计单维度的转化率特征
def get_single_dimension_rate_feature(train_df, valid_df, test_df, fea_set):
    for fea in fea_set:
        temp_df = train_df[[fea, 'label']].copy()
#         temp_df = train_df[[fea, 'label']][train_df.is_repeat_prefix == 1].copy()
        temp_pivot_table = pd.pivot_table(temp_df, index=fea, values='label', aggfunc={len, np.mean, np.sum})
        temp_pivot_table.reset_index(inplace=True)
        temp_pivot_table.rename(columns={'len':fea + '_count', 'mean':fea + '_rate', 'sum':fea + '_click_number'}, inplace=True)
        alpha, beta = getBayesSmoothParam(temp_pivot_table[fea + '_rate'])
        temp_pivot_table[fea + '_rate'] = (temp_pivot_table[fea + '_click_number'] + alpha) / (temp_pivot_table[fea + '_count'] + alpha + beta)
#         del temp_pivot_table[fea + '_click_number']
        train_df = pd.merge(train_df, temp_pivot_table, on=fea, how='left')
        valid_df = pd.merge(valid_df, temp_pivot_table, on=fea, how='left')
        test_df = pd.merge(test_df, temp_pivot_table, on=fea, how='left')
    return train_df, valid_df, test_df
    
fea_set = ['prefix', 'title', 'tag', 'query_prediction']
train_df, valid_df, test_df = get_single_dimension_rate_feature(train_df, valid_df, test_df, fea_set)
print(train_df.head())


origion_rate_mean :  0.40002767378326565
origion_rate_var :  0.09707128998059503
alpha :  0.5890274369157756
beta :  0.8834392835614426
origion_rate_mean :  0.37554872005889733
origion_rate_var :  0.18375012255029244
alpha :  0.10374693833333963
beta :  0.17250733386087524
origion_rate_mean :  0.3155117577588836
origion_rate_var :  0.02654243870294424
alpha :  2.2516679174047067
beta :  4.884889950988419
origion_rate_mean :  0.39952573312087963
origion_rate_var :  0.09550917645024579
alpha :  0.6040238029749967
beta :  0.9078282578589535
  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", 

In [6]:
# 统计双维度交叉转化率
def get_jiaocha_dimension_rate_feature(train_df, valid_df, test_df, fea_set):
    for i in range(len(fea_set)):
        for j in range((i+1), len(fea_set)):
            fea1 = fea_set[i]
            fea2 = fea_set[j]
            temp_df = train_df[[fea1, fea2, 'label']].copy()
            temp_pivot_table = pd.pivot_table(temp_df, index=[fea1, fea2], values='label', aggfunc={len, np.mean, np.sum})
            temp_pivot_table.reset_index(inplace=True)
            temp_pivot_table.rename(columns={'len':fea1 + '_' + fea2 + '_count', 'mean':fea1 + '_' + fea2 + '_rate', 'sum':fea1 + '_' + fea2 + '_click_number'}, inplace=True)
            alpha, beta = getBayesSmoothParam(temp_pivot_table[fea1 + '_' + fea2 + '_rate'])
            temp_pivot_table[fea1 + '_' + fea2 + '_rate'] = (temp_pivot_table[fea1 + '_' + fea2 + '_click_number'] + alpha) / (temp_pivot_table[fea1 + '_' + fea2 + '_count'] + alpha + beta)
#             del temp_pivot_table[fea1 + '_' + fea2 + '_click_number']
            train_df = pd.merge(train_df, temp_pivot_table, on=[fea1, fea2], how='left')
            valid_df = pd.merge(valid_df, temp_pivot_table, on=[fea1, fea2], how='left')
            test_df = pd.merge(test_df, temp_pivot_table, on=[fea1, fea2], how='left')
    return train_df, valid_df, test_df

jiaocha_fea_set = ['prefix', 'title', 'tag']
train_df, valid_df, test_df = get_jiaocha_dimension_rate_feature(train_df, valid_df, test_df, jiaocha_fea_set)
print(train_df.head())
    


origion_rate_mean :  0.3739739185109122
origion_rate_var :  0.18828707562599176
alpha :  0.09102778803323584
beta :  0.15237899390409795
origion_rate_mean :  0.38081147231232
origion_rate_var :  0.1613523282902859
alpha :  0.17569178599550558
beta :  0.2856698030571823
origion_rate_mean :  0.37681270456893634
origion_rate_var :  0.18916100599546115
alpha :  0.09096341835574066
beta :  0.15043878823864787
  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label                              query_prediction_dict  \
0  阅读      0  {'小品大全宋小宝': '0.009', '小品相亲': '

In [7]:
# 统计一些是否交叉的特征
def get_is_title_in_query_feature(df):
    x = df['title']
    y = df['query_prediction_keys']
    is_title_in_query = np.nan
    if len(y) > 0:
        if x in y:
            is_title_in_query = 1
        else:
            is_title_in_query = 0
    return is_title_in_query

def get_is_prefix_in_title_feature(df):
    x = df['prefix']
    y = df['title']
    is_prefix_in_title = np.nan
    if x in y:
        is_prefix_in_title = 1
    else:
        is_prefix_in_title = 0
    return is_prefix_in_title

train_df['is_title_in_query'] = train_df[['title', 'query_prediction_keys']].apply(get_is_title_in_query_feature, axis = 1)
valid_df['is_title_in_query'] = valid_df[['title', 'query_prediction_keys']].apply(get_is_title_in_query_feature, axis = 1)
test_df['is_title_in_query'] = test_df[['title', 'query_prediction_keys']].apply(get_is_title_in_query_feature, axis = 1)

train_df['is_prefix_in_title'] = train_df[['prefix', 'title']].apply(get_is_prefix_in_title_feature, axis = 1)
valid_df['is_prefix_in_title'] = valid_df[['prefix', 'title']].apply(get_is_prefix_in_title_feature, axis = 1)
test_df['is_prefix_in_title'] = test_df[['prefix', 'title']].apply(get_is_prefix_in_title_feature, axis = 1)


In [8]:
# 统计一些交叉种类特征
def get_jiaocha_type_feature(train_df, valid_df, test_df, jiaocha_type_list):
    for jiaocha_type in jiaocha_type_list:
        fea1 = jiaocha_type[0]
        fea2 = jiaocha_type[1]
        temp_pivot_table = pd.pivot_table(train_df[[fea1, fea2, 'label']], index=[fea1, fea2], values='label', aggfunc=len)
        temp_pivot_table.reset_index(inplace=True)
        final_pivot_table = pd.pivot_table(temp_pivot_table, index=fea1, values=fea2, aggfunc=len)
        final_pivot_table.reset_index(inplace=True)
        final_pivot_table.rename(columns={fea2 : fea1 + '_' + fea2 + '_types'}, inplace=True)
        train_df = pd.merge(train_df, final_pivot_table[[fea1, fea1 + '_' + fea2 + '_types']], on=fea1, how='left')
        valid_df = pd.merge(valid_df, final_pivot_table[[fea1, fea1 + '_' + fea2 + '_types']], on=fea1, how='left')
        test_df = pd.merge(test_df, final_pivot_table[[fea1, fea1 + '_' + fea2 + '_types']], on=fea1, how='left')
    return train_df, valid_df, test_df

jiaocha_type_list = [['title', 'tag'], ['prefix', 'tag'], ['tag', 'title'], ['tag', 'prefix'], 
                     ['title', 'prefix'], ['prefix', 'title'], ['tag', 'query_prediction'], ['title', 'query_prediction']]
train_df, valid_df, test_df = get_jiaocha_type_feature(train_df, valid_df, test_df, jiaocha_type_list)
print(train_df.head())



  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label                              query_prediction_dict  \
0  阅读      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲, 小品剧本, 小品搞笑大全, 小品不差钱, 小品搞笑大全剧本,...   
1  [

In [9]:
def get_key_len_list(x):
    return_list = []
    for temp in x:
        return_list.append(len(temp))
    return return_list

# 统计一些跟字符串长度相关的特征
def get_string_len_feature(df):
    df['prefix_len'] = df['prefix'].map(lambda x : len(x))
    df['title_len'] = df['title'].map(lambda x : len(x))
    df['query_prediction_key_len_list'] = df['query_prediction_keys'].map(lambda x : get_key_len_list(x))
    df['query_prediction_key_len_max'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.max(x))
    df['query_prediction_key_len_min'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.min(x))
    df['query_prediction_key_len_mean'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.mean(x))
    df['query_prediction_key_len_std'] = df['query_prediction_key_len_list'].map(lambda x : np.nan if len(x) == 0 else np.std(x))
    df['len_title-prefix'] = df['title_len'] - df['prefix_len']
    df['len_prefix/title'] = df['prefix_len'] / df['title_len']
    df['len_mean-title'] = df['query_prediction_key_len_mean'] - df['title_len']
    df['len_mean/title'] = df['query_prediction_key_len_mean'] / df['title_len']
    del df['query_prediction_key_len_list']
    return df

train_df = get_string_len_feature(train_df)
valid_df = get_string_len_feature(valid_df)
test_df = get_string_len_feature(test_df)
print(train_df.head())



  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label                              query_prediction_dict  \
0  阅读      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲, 小品剧本, 小品搞笑大全, 小品不差钱, 小品搞笑大全剧本,...   
1  [

In [10]:
#分词方法，调用结巴接口
def jieba_seg_to_list(sentence, pos=False):
    if not pos:
        #不进行词性标注的分词方法
        seg_list = jieba.cut(sentence)
    else:
        #进行词性标注的分词方法
        seg_list = psg.cut(sentence)
    return seg_list

#去除干扰词
def jieba_word_filter(seg_list, pos=False):
    
    filter_list = []
    #根据pos参数选择是否词性过滤
    #不进行词性过滤，则将词性都标记为n，表示全部保留
    for seg in seg_list:
        if not pos:
            word = seg
            flag = 'n'
        else:
            word = seg.word
            flag = seg.flag
        if not flag.startswith('n'):
            continue
        filter_list.append(word)
    return filter_list

def jieba_word_deal(sentence, pos=False):
    #调用上面方式对数据集进行处理，处理后的每条数据仅保留非干扰词
    seg_list = jieba_seg_to_list(sentence, pos)
    filter_list = jieba_word_filter(seg_list, pos)
    return filter_list

def get_prefix_prediction_key_sentences(x):
    prefix_prediction_key_sentences = ""
    for temp in x:
        if len(prefix_prediction_key_sentences) > 0:
            prefix_prediction_key_sentences = prefix_prediction_key_sentences + temp
        else:
            prefix_prediction_key_sentences = temp
    return prefix_prediction_key_sentences

def get_max_query_key_sentences(x):
    if len(x) == 0:
        return ""
    else:
        return max(x, key=x.get)

def get_jieba_word(df):
#     df['query_prediction_key_sentences'] = df['query_prediction_keys'].map(lambda x : get_prefix_prediction_key_sentences(x))
#     df['query_prediction_key_sentences'] = df['query_prediction_dict'].map(lambda x : get_max_query_key_sentences(x))
#     df['query_prediction_key_jieba_words'] = df['query_prediction_key_sentences'].map(lambda x : jieba_word_deal(x, False))
    df['query_prediction_words'] = df['query_prediction_keys'].map(lambda x : [jieba_word_deal(j, False) for j in x] if len(x) > 0 else np.nan)
    df['title_jieba_words'] = df['title'].map(lambda x : jieba_word_deal(x, False))
    df['prefix_jieba_words'] = df['prefix'].map(lambda x : jieba_word_deal(x, False))
#     del df['query_prediction_key_sentences']
    return df

train_df = get_jieba_word(train_df)
valid_df = get_jieba_word(valid_df)
test_df = get_jieba_word(test_df)
print(train_df.head())



Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/home/lab-zhao.yinhu/anaconda3/lib/python3.6/site-packages/jieba/__init__.py", line 152, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmp1srf1tvs' -> '/tmp/jieba.cache'
Loading model cost 2.023 seconds.
Prefix dict has been built succesfully.


  prefix                                   query_prediction            title  \
0     小品  {"小品大全宋小宝": "0.009", "小品相亲": "0.012", "小品剧本": ...               小品   
1   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...  HCG大于1368%2C正常吗   
2   1368  {"13688cc赛马会": "0.059", "13685367892": "0.124"...            1368年   
3     银耳  {"银耳汤的功效": "0.012", "银耳为什么不能天天吃": "0.009", "银耳...         银耳红枣汤的做法   
4   月经量少  {"月经量少喝红糖水好吗": "0.010", "月经量少该怎么调理": "0.016", ...         月经量少怎么调理   

  tag  label                              query_prediction_dict  \
0  阅读      0  {'小品大全宋小宝': '0.009', '小品相亲': '0.012', '小品剧本': ...   
1  健康      0  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
2  百科      1  {'13688cc赛马会': '0.059', '13685367892': '0.124'...   
3  菜谱      1  {'银耳汤的功效': '0.012', '银耳为什么不能天天吃': '0.009', '银耳...   
4  百科      0  {'月经量少喝红糖水好吗': '0.010', '月经量少该怎么调理': '0.016', ...   

                               query_prediction_keys  \
0  [小品大全宋小宝, 小品相亲, 小品剧本, 小品搞笑大全, 小品不差钱, 小品搞笑大全剧本,...   
1  [

In [11]:
def word_match_share(df):
    q1words = {}
    q2words = {}
    for word in df[0]:
        q1words[word] = 1
    for word in df[1]:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def jaccard(df):
    wic = set(df[0]).intersection(set(df[1]))
    uw = set(df[0]).union(df[1])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))

def common_words(df):
    return len(set(df[0]).intersection(set(df[1])))

def total_unique_words(df):
    return len(set(df[0]).union(df[1]))

def wc_diff(df):
    return abs(len(df[0]) - len(df[1]))

def wc_ratio(df):
    l1 = len(df[0])*1.0 
    l2 = len(df[1])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique(df):
    return abs(len(set(df[0])) - len(set(df[1])))
    
def wc_ratio_unique(df):
    l1 = len(set(df[0])) * 1.0
    l2 = len(set(df[1]))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2
    
def tfidf_word_match_share(df, weights=None):
    q1words = {}
    q2words = {}
    for word in df[0]:
        q1words[word] = 1
    for word in df[1]:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, func, colName):
    train_df[colName] = train_df[[fea1, fea2]].apply(func, axis=1)
    valid_df[colName] = valid_df[[fea1, fea2]].apply(func, axis=1)
    test_df[colName] = test_df[[fea1, fea2]].apply(func, axis=1)
    print(colName + ' finish!!!')
    return train_df, valid_df, test_df
                   
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

def get_word_statistic_feature(train_df, valid_df, test_df, col_list):
    df = pd.concat([train_df[['query_prediction_key_jieba_words', 'title_jieba_words', 'prefix_jieba_words']], valid_df[['query_prediction_key_jieba_words', 'title_jieba_words', 'prefix_jieba_words']], test_df[['query_prediction_key_jieba_words', 'title_jieba_words', 'prefix_jieba_words']]])
    train_qs = pd.Series(df['query_prediction_key_jieba_words'].tolist() + df['title_jieba_words'].tolist() + df['prefix_jieba_words'].tolist())
    words = [x for y in train_qs for x in y]
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}
    for col in col_list:
        fea1 = col[0]
        fea2 = col[1]
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, word_match_share, fea1[0] + '_' + fea2[0] + '_word_match')
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, jaccard, fea1[0] + '_' + fea2[0] + '_jaccard')
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, common_words, fea1[0] + '_' + fea2[0] + '_common_words')
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, total_unique_words, fea1[0] + '_' + fea2[0] + '_total_unique_words')
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, wc_diff, fea1[0] + '_' + fea2[0] + '_wc_diff')
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, wc_ratio, fea1[0] + '_' + fea2[0] + '_wc_ratio')
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, wc_diff_unique, fea1[0] + '_' + fea2[0] + '_wc_diff_unique')
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, wc_ratio_unique, fea1[0] + '_' + fea2[0] + '_wc_ratio_unique')
        f = functools.partial(tfidf_word_match_share, weights=weights)
        train_df, valid_df, test_df = deal_word_for_all(train_df, valid_df, test_df, fea1, fea2, f, fea1[0] + '_' + fea2[0] + '_tfidf_word_match_share')
    return train_df, valid_df, test_df

col_list = [['query_prediction_key_jieba_words', 'title_jieba_words'], ['prefix_jieba_words', 'title_jieba_words'], ['prefix_jieba_words', 'query_prediction_key_jieba_words']]
train_df, valid_df, test_df = get_word_statistic_feature(train_df, valid_df, test_df, col_list)
print(train_df.head())
                   
                   

KeyError: "['query_prediction_key_jieba_words'] not in index"

In [12]:
# Set values for various parameters
num_features = 500  # Word vector dimensionality                      
min_word_count = 1  # Minimum word count                        
num_workers = 20       # Number of threads to run in parallel
context = 5          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

word2vec_df = pd.concat([train_df[['query_prediction_words', 'title_jieba_words', 'prefix_jieba_words', 'query_prediction_number']], valid_df[['query_prediction_words', 'title_jieba_words', 'prefix_jieba_words', 'query_prediction_number']], test_df[['query_prediction_words', 'title_jieba_words', 'prefix_jieba_words', 'query_prediction_number']]])
word2vec_df.reset_index(inplace=True)
word2vec_list = word2vec_df['title_jieba_words'].tolist() + word2vec_df['prefix_jieba_words'].tolist() + [y for x in word2vec_df['query_prediction_words'][word2vec_df.query_prediction_number > 0] for y in x]
model = word2vec.Word2Vec(word2vec_list, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

word_wv = model.wv




In [13]:
def get_w2v_array(word_list, word_wv, num_features):
    word_vectors = np.zeros((len(word_list), num_features))
    for i in range(len(word_list)):
        word_vectors[i][:] = word_wv[str(word_list[i])]
    mean_array = np.mean(word_vectors, axis=0)
    return mean_array

train_df['title_jieba_array'] = train_df['title_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))
valid_df['title_jieba_array'] = valid_df['title_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))
test_df['title_jieba_array'] = test_df['title_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))

train_df['prefix_jieba_array'] = train_df['prefix_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))
valid_df['prefix_jieba_array'] = valid_df['prefix_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))
test_df['prefix_jieba_array'] = test_df['prefix_jieba_words'].map(lambda x : get_w2v_array(x, word_wv, num_features))


In [14]:
# def get_query_jieba_array_dict(df, word_wv, num_features):
#     word_array_dict = {}
#     query_prediction_words = df['query_prediction_words']
#     query_prediction_keys = df['query_prediction_keys']
#     if len(query_prediction_keys) > 0:
#         for i in range(len(query_prediction_keys)):
#             word_array_dict[query_prediction_keys[i]] = get_w2v_array(query_prediction_words[i], word_wv, num_features)
#     return word_array_dict

def get_title_prefix_similarity(df, f_similarity):
    title_array = df['title_jieba_array']
    prefix_array = df['prefix_jieba_array']
    similarity = 0
    if f_similarity == 'dot':
        similarity = np.dot(title_array, prefix_array)
    elif f_similarity == 'norm':
        similarity = np.linalg.norm(title_array - prefix_array)
    else:
        similarity = np.dot(title_array,prefix_array) / (np.linalg.norm(title_array) * np.linalg.norm(prefix_array))
    return similarity

def get_title_query_similarity(df, f_similarity, word_wv, num_features):
    title_array = df['title_jieba_array']
    query_prediction_words = df['query_prediction_words']
    query_prediction_keys = df['query_prediction_keys']
    query_prediction_dict = df['query_prediction_dict']
    if len(query_prediction_keys) <= 0:
        return np.nan
    similarity = 0
    if f_similarity == 'dot':
        i = 0
        for key in query_prediction_keys:
            key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
            similarity = similarity + np.dot(title_array, key_array) * float(query_prediction_dict[key])
            i = i + 1
    elif f_similarity == 'norm':
        i = 0
        for key in query_prediction_keys:
            key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
            similarity = similarity + np.linalg.norm(title_array - key_array) * float(query_prediction_dict[key])
            i = i + 1
    else:
        i = 0
        for key in query_prediction_keys:
            key_array = get_w2v_array(query_prediction_words[i], word_wv, num_features)
            similarity = similarity + (np.dot(title_array, key_array) / (np.linalg.norm(title_array) * np.linalg.norm(key_array))) * float(query_prediction_dict[key])
            i = i + 1
    return similarity

# f_query_jieba_array_dict = functools.partial(get_query_jieba_array_dict, word_wv=word_wv, num_features=num_features)
# train_df['query_jieba_array_dict'] = train_df[['query_prediction_words', 'query_prediction_keys']].apply(f_query_jieba_array_dict, axis=1)
# valid_df['query_jieba_array_dict'] = valid_df[['query_prediction_words', 'query_prediction_keys']].apply(f_query_jieba_array_dict, axis=1)
# test_df['query_jieba_array_dict'] = test_df[['query_prediction_words', 'query_prediction_keys']].apply(f_query_jieba_array_dict, axis=1)
# print(train_df.head())

def get_similarity_feature(train_df, valid_df, test_df):
    f_list = ['dot', 'norm', 'cosine']
    for fun in f_list:
        f_prefix_similarity = functools.partial(get_title_prefix_similarity, f_similarity=fun)
        train_df['title_prefix_' + fun + '_similarity'] = train_df[['title_jieba_array', 'prefix_jieba_array']].apply(f_prefix_similarity, axis=1)
        valid_df['title_prefix_' + fun + '_similarity'] = valid_df[['title_jieba_array', 'prefix_jieba_array']].apply(f_prefix_similarity, axis=1)
        test_df['title_prefix_' + fun + '_similarity'] = test_df[['title_jieba_array', 'prefix_jieba_array']].apply(f_prefix_similarity, axis=1)
        f_query_similarity = functools.partial(get_title_query_similarity, f_similarity=fun, word_wv=word_wv, num_features=num_features)
        train_df['title_query_' + fun + '_similarity'] = train_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity, axis=1)
        valid_df['title_query_' + fun + '_similarity'] = valid_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity, axis=1)
        test_df['title_query_' + fun + '_similarity'] = test_df[['title_jieba_array', 'query_prediction_words', 'query_prediction_keys', 'query_prediction_dict']].apply(f_query_similarity, axis=1)
        print(fun + ' : finish!!!')
    return train_df, valid_df, test_df

train_df, valid_df, test_df = get_similarity_feature(train_df, valid_df, test_df)
    


dot : finish!!!
norm : finish!!!
cosine : finish!!!


In [15]:
print(train_df.columns.values)



['prefix' 'query_prediction' 'title' 'tag' 'label' 'query_prediction_dict'
 'query_prediction_keys' 'query_prediction_values'
 'query_prediction_number' 'query_prediction_max' 'query_prediction_min'
 'query_prediction_mean' 'query_prediction_std' 'is_repeat_prefix'
 'prefix_count' 'prefix_rate' 'prefix_click_number' 'title_count'
 'title_rate' 'title_click_number' 'tag_count' 'tag_rate'
 'tag_click_number' 'query_prediction_count' 'query_prediction_rate'
 'query_prediction_click_number' 'prefix_title_count' 'prefix_title_rate'
 'prefix_title_click_number' 'prefix_tag_count' 'prefix_tag_rate'
 'prefix_tag_click_number' 'title_tag_count' 'title_tag_rate'
 'title_tag_click_number' 'is_title_in_query' 'is_prefix_in_title'
 'title_tag_types' 'prefix_tag_types' 'tag_title_types' 'tag_prefix_types'
 'title_prefix_types' 'prefix_title_types' 'tag_query_prediction_types'
 'title_query_prediction_types' 'prefix_len' 'title_len'
 'query_prediction_key_len_max' 'query_prediction_key_len_min'
 'que

In [18]:
from scipy.stats import mode, pearsonr

# 计算转化皮尔森系数
def getFeaPearsonr(df, cols):
    resultDf = pd.DataFrame(index=cols, columns=['pearsonr','p_values'])
    for c in cols:
        tempDf = df.dropna(subset=[c])
        result = pearsonr(tempDf[c].values, tempDf['label'].values)
        resultDf.loc[c,:] = result
    return resultDf

cols = ['query_prediction_number', 'query_prediction_max', 'query_prediction_min', 'query_prediction_mean', 'query_prediction_std',
       'is_repeat_prefix', 'prefix_count', 'prefix_rate',
 'title_count', 'title_rate', 'tag_count', 'tag_rate',
 'query_prediction_count', 'query_prediction_rate', 'prefix_title_count',
 'prefix_title_rate',  'prefix_tag_count', 'prefix_tag_rate',
 'title_tag_count', 'title_tag_rate',
    'prefix_click_number', 'title_click_number', 'query_prediction_click_number', 'prefix_tag_click_number', 
    'prefix_title_click_number', 'title_tag_click_number',
    'is_title_in_query', 'is_prefix_in_title', 
    'tag_title_types', 'tag_prefix_types', 'title_prefix_types', 'prefix_title_types',
       'prefix_len', 'title_len',
 'query_prediction_key_len_max', 'query_prediction_key_len_min',
 'query_prediction_key_len_mean', 'query_prediction_key_len_std',
 'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title',
#        'q_t_word_match', 'q_t_jaccard', 'q_t_common_words',
#  'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio',
#  'q_t_wc_diff_unique', 'q_t_wc_ratio_unique', 'q_t_tfidf_word_match_share',
#  'p_t_word_match', 'p_t_jaccard', 'p_t_common_words',
#  'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio',
#  'p_t_wc_diff_unique', 'p_t_wc_ratio_unique', 'p_t_tfidf_word_match_share',
#  'p_q_word_match', 'p_q_jaccard', 'p_q_common_words',
#  'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio',
#  'p_q_wc_diff_unique', 'p_q_wc_ratio_unique', 'p_q_tfidf_word_match_share',
        'title_prefix_dot_similarity',
 'title_query_dot_similarity', 'title_prefix_norm_similarity',
 'title_query_norm_similarity', 'title_prefix_cosine_similarity',
 'title_query_cosine_similarity',
       ]
resultDf = getFeaPearsonr(train_df, cols)
print(resultDf)



                                  pearsonr      p_values
query_prediction_number         -0.0162134  2.29532e-116
query_prediction_max            0.00897021   9.35332e-37
query_prediction_min             0.0079775    2.0096e-29
query_prediction_mean            0.0117999   2.59087e-62
query_prediction_std             0.0078163   2.59257e-28
is_repeat_prefix                -0.0173927  1.30175e-133
prefix_count                   -0.00603852   1.34422e-17
prefix_rate                       0.237067             0
title_count                    -0.00321917   5.29863e-06
title_rate                        0.636893             0
tag_count                         0.040607             0
tag_rate                          0.171706             0
query_prediction_count         -0.00522041   1.54971e-13
query_prediction_rate             0.233632             0
prefix_title_count               0.0030172   1.98149e-05
prefix_title_rate                 0.681393             0
prefix_tag_count               

In [16]:
fea = [
    'query_prediction_number', 'query_prediction_max', 'query_prediction_min', 'query_prediction_mean', 'query_prediction_std',
       'is_repeat_prefix', 'prefix_count', 'prefix_rate',
 'title_count', 'title_rate', 'tag_count', 'tag_rate',
 'query_prediction_count', 'query_prediction_rate', 'prefix_title_count',
 'prefix_title_rate',  'prefix_tag_count', 'prefix_tag_rate',
 'title_tag_count', 'title_tag_rate',
    'prefix_click_number', 'title_click_number', 'query_prediction_click_number', 'prefix_tag_click_number', 
    'prefix_title_click_number', 'title_tag_click_number',
    'is_title_in_query', 'is_prefix_in_title', 
    'title_tag_types', 'prefix_tag_types', 'tag_title_types', 'tag_prefix_types',
 'title_prefix_types', 'prefix_title_types', 'tag_query_prediction_types', 'title_query_prediction_types',
      'prefix_len', 'title_len',
 'query_prediction_key_len_max', 'query_prediction_key_len_min',
 'query_prediction_key_len_mean', 'query_prediction_key_len_std',
 'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title',
#     'q_t_word_match', 'q_t_jaccard', 'q_t_common_words',
#  'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio',
#  'q_t_wc_diff_unique', 'q_t_wc_ratio_unique', 'q_t_tfidf_word_match_share',
#  'p_t_word_match', 'p_t_jaccard', 'p_t_common_words',
#  'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio',
#  'p_t_wc_diff_unique', 'p_t_wc_ratio_unique', 'p_t_tfidf_word_match_share',
#  'p_q_word_match', 'p_q_jaccard', 'p_q_common_words',
#  'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio',
#  'p_q_wc_diff_unique', 'p_q_wc_ratio_unique', 'p_q_tfidf_word_match_share',
    'title_prefix_dot_similarity',
 'title_query_dot_similarity', 'title_prefix_norm_similarity',
 'title_query_norm_similarity', 'title_prefix_cosine_similarity',
 'title_query_cosine_similarity',
      ]



In [36]:
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=32, max_depth=-1, n_estimators=5000, objective='binary',
    subsample=0.8, colsample_bytree=1, subsample_freq=1,
    learning_rate=0.01, random_state=2018, n_jobs=-1
)

valid_df['predicted_score'] = 0

lgb_model.fit(train_df[fea], train_df['label'], eval_set=[(train_df[fea], train_df['label']),
                            (valid_df[fea], valid_df['label'])], early_stopping_rounds=50, eval_metric='auc')
valid_pred = lgb_model.predict_proba(valid_df[fea], num_iteration=lgb_model.best_iteration_)[:, 1]
print(np.mean(valid_pred))

fscore = lgb_model.booster_.feature_importance()
feaNames = lgb_model.booster_.feature_name()
scoreDf = pd.DataFrame(index=feaNames, columns=['importance'], data=fscore)
print(scoreDf.sort_index(by=['importance'], ascending=False))


[1]	valid_0's auc: 0.921196	valid_1's auc: 0.789941
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.92168	valid_1's auc: 0.791152
[3]	valid_0's auc: 0.921731	valid_1's auc: 0.791153
[4]	valid_0's auc: 0.921759	valid_1's auc: 0.791219
[5]	valid_0's auc: 0.921895	valid_1's auc: 0.791868
[6]	valid_0's auc: 0.921915	valid_1's auc: 0.791853
[7]	valid_0's auc: 0.923303	valid_1's auc: 0.784389
[8]	valid_0's auc: 0.923334	valid_1's auc: 0.783834
[9]	valid_0's auc: 0.92346	valid_1's auc: 0.783601
[10]	valid_0's auc: 0.923568	valid_1's auc: 0.783928
[11]	valid_0's auc: 0.92359	valid_1's auc: 0.783646
[12]	valid_0's auc: 0.923612	valid_1's auc: 0.783826
[13]	valid_0's auc: 0.923622	valid_1's auc: 0.784327
[14]	valid_0's auc: 0.923635	valid_1's auc: 0.783861
[15]	valid_0's auc: 0.923662	valid_1's auc: 0.783813
[16]	valid_0's auc: 0.923678	valid_1's auc: 0.783823
[17]	valid_0's auc: 0.923704	valid_1's auc: 0.783858
[18]	valid_0's auc: 0.923738	valid_1's auc: 0.783



In [19]:
valid_df['predicted_score'] = valid_pred
print(np.mean(valid_df['predicted_score'][valid_df.is_repeat_prefix == 0]))
print(np.mean(valid_df['predicted_score'][valid_df.is_repeat_prefix == 1]))
print(np.mean(valid_df['label'][valid_df.is_repeat_prefix == 0]))
print(np.mean(valid_df['label'][valid_df.is_repeat_prefix == 1]))


0.1795042485057676
0.4037811010487482
0.3771786726195587
0.37003624227570203


In [22]:
valid_prefix0_df = valid_df[valid_df.is_repeat_prefix == 0].copy()

#定义调整函数
def resultAdjustment(result_df, t):
    result_df_temp = result_df.copy()
    result_df_temp['x'] = result_df_temp.predicted_score.map(lambda x: -(math.log(((1 - x) / x), math.e)))
    result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) 
    print(result_df_temp['adjust_result'].mean())
    return result_df_temp['adjust_result']

print('original mean : ', valid_prefix0_df['predicted_score'].mean())
valid_df_after = resultAdjustment(valid_prefix0_df, 1.22085)



original mean :  0.1795042485057676
0.4034150988840599


In [23]:
valid_df['predicted_score'][valid_df.is_repeat_prefix == 0] = valid_df_after
print(np.mean(valid_df['predicted_score'][valid_df.is_repeat_prefix == 0]))
print(np.mean(valid_df['predicted_score'][valid_df.is_repeat_prefix == 1]))
print(np.mean(valid_df['label'][valid_df.is_repeat_prefix == 0]))
print(np.mean(valid_df['label'][valid_df.is_repeat_prefix == 1]))



0.4034150988840599
0.4037811010487482
0.3771786726195587
0.37003624227570203


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
yuzhi_dict = {}
# 定义搜索方法获取最佳F1对应的阈值
for yuzhi in range(350, 450, 2):
    real_yuzhi = yuzhi / 1000
    valid_df['predicted_label'] = valid_df['predicted_score'].map(lambda x : 1 if x > real_yuzhi else 0)
    f1 = f1_score(valid_df['label'], valid_df['predicted_label'])
    yuzhi_dict[str(real_yuzhi)] = f1
print(yuzhi_dict)



{'0.35': 0.6884588053337792, '0.352': 0.6883901867692885, '0.354': 0.6886324123918645, '0.356': 0.6890555100497764, '0.358': 0.7003655252020008, '0.36': 0.7006958322297931, '0.362': 0.7008946873417417, '0.364': 0.7009248750332037, '0.366': 0.7012892769889456, '0.368': 0.708543946606468, '0.37': 0.7087867547716722, '0.372': 0.709702853200611, '0.374': 0.7130736017672457, '0.376': 0.7146073099785001, '0.378': 0.7210792928770805, '0.38': 0.722459169874129, '0.382': 0.7220522475876677, '0.384': 0.7224778481844715, '0.386': 0.7228833192923337, '0.388': 0.7223571654873799, '0.39': 0.7218458933107536, '0.392': 0.7211862383617602, '0.394': 0.7210414452709885, '0.396': 0.7080971995687393, '0.398': 0.7077681191517634, '0.4': 0.7075327843854834, '0.402': 0.7073644486375492, '0.404': 0.7069751919012126, '0.406': 0.706478747702078, '0.408': 0.706401272996287, '0.41': 0.7063729743345742, '0.412': 0.7060144256405939, '0.414': 0.7056010352491068, '0.416': 0.7056405057587789, '0.418': 0.701452729542617

In [29]:
test_pred = lgb_model.predict_proba(test_df[fea], num_iteration=lgb_model.best_iteration_)[:, 1]
test_df['predicted_score'] = test_pred

test_prefix0_df = test_df[test_df.is_repeat_prefix == 0].copy()

#定义调整函数
def resultAdjustment(result_df, t):
    result_df_temp = result_df.copy()
    result_df_temp['x'] = result_df_temp.predicted_score.map(lambda x: -(math.log(((1 - x) / x), math.e)))
    result_df_temp['adjust_result'] = result_df_temp.x.map(lambda x: 1 / (1 + math.exp(-(x + t)))) 
    print(result_df_temp['adjust_result'].mean())
    return result_df_temp['adjust_result']

print('original mean : ', test_prefix0_df['predicted_score'].mean())
test_df_after = resultAdjustment(test_prefix0_df, 1.21085)




original mean :  0.18111353307202657
0.4024094698755495


In [30]:
test_df['predicted_score'][test_df.is_repeat_prefix == 0] = test_df_after
print(np.mean(test_df['predicted_score'][test_df.is_repeat_prefix == 0]))
print(np.mean(test_df['predicted_score'][test_df.is_repeat_prefix == 1]))


0.4024094698755495
0.40235523040784643


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [31]:
test_df['predicted_label'] = test_df['predicted_score'].map(lambda x : 1 if x > 0.382 else 0)

print(np.mean(valid_df['label']))
print(np.mean(test_df['predicted_label']))


0.3717
0.3898


In [32]:
# 导出预测结果
def exportResult(df, fileName):
    df.to_csv('../result/%s.csv' % fileName, header=False, index=False)

exportResult(test_df[['predicted_label']], 'lgb_keng_10_28')

