In [1]:
import numpy as np 
import pandas as pd
import time
import datetime
import gc
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectPercentile
import math
from sklearn.metrics import f1_score
import jieba
import jieba.posseg as psg
from collections import Counter
import functools
from gensim.models import word2vec
import Levenshtein


In [2]:
# Load the pre-computed feature tables for training and test.
train_dataset = pd.read_csv('../temp/train_online_alldata_df.csv')
test_dataset = pd.read_csv('../temp/test_online_alldata_df.csv')
# In the code visible here, only the row count of this table is used.
test_dataset_29 = pd.read_csv('../temp/test_online_df.csv')

# Keep only the trailing len(test_dataset_29) rows of the combined test table.
# The explicit .copy() makes the result an independent frame, so the column
# assignments made in later cells do not write into a slice of the loaded
# frame (which is what triggers pandas' SettingWithCopyWarning).
test_dataset = test_dataset.iloc[len(test_dataset) - len(test_dataset_29):].copy()


In [3]:
# Columns fed to the model, in training order. Every name must be unique:
# a repeated label makes df[fea] return two identically named columns, which
# is redundant at best and may be rejected by the model library.
fea = [
    # query_prediction_* summary columns
    'query_prediction_number', 'query_prediction_max', 'query_prediction_min',
    'query_prediction_mean', 'query_prediction_std',
    # *_count / *_rate columns
    'prefix_count', 'prefix_rate',
    'title_count', 'title_rate', 'tag_count', 'tag_rate',
    'query_prediction_count', 'query_prediction_rate',
    'prefix_title_count', 'prefix_title_rate',
    'prefix_tag_count', 'prefix_tag_rate',
    'title_tag_count', 'title_tag_rate',
    # *_click_number columns
    'prefix_click_number', 'title_click_number', 'query_prediction_click_number',
    'prefix_tag_click_number', 'prefix_title_click_number', 'title_tag_click_number',
    # is_* flag columns
    'is_title_in_query', 'is_prefix_in_title',
    # *_types columns
    'title_tag_types', 'prefix_tag_types', 'tag_title_types', 'tag_prefix_types',
    'title_prefix_types', 'prefix_title_types',
    'tag_query_prediction_types', 'title_query_prediction_types',
    # length columns
    'prefix_len', 'title_len',
    'query_prediction_key_len_max', 'query_prediction_key_len_min',
    'query_prediction_key_len_mean', 'query_prediction_key_len_std',
    'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title',
    # q_t_* columns
    'q_t_word_match', 'q_t_jaccard', 'q_t_common_words',
    'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio',
    'q_t_wc_diff_unique', 'q_t_wc_ratio_unique', 'q_t_tfidf_word_match_share',
    # p_t_* columns
    'p_t_word_match', 'p_t_jaccard', 'p_t_common_words',
    'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio',
    'p_t_wc_diff_unique', 'p_t_wc_ratio_unique', 'p_t_tfidf_word_match_share',
    # p_q_* columns
    'p_q_word_match', 'p_q_jaccard', 'p_q_common_words',
    'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio',
    'p_q_wc_diff_unique', 'p_q_wc_ratio_unique', 'p_q_tfidf_word_match_share',
    # *_similarity columns
    'title_prefix_dot_similarity', 'title_query_dot_similarity',
    'title_prefix_norm_similarity', 'title_query_norm_similarity',
    'title_prefix_cosine_similarity', 'title_query_cosine_similarity',
    'title_query_dot_similarity_max', 'title_query_dot_similarity_min',
    'title_query_dot_similarity_mean', 'title_query_dot_similarity_std',
    # NOTE(review): the dot and cosine groups list max/min/mean/std, but the
    # norm group has no '_max' entry. 'title_query_norm_similarity_max' may
    # have been intended where a second 'title_prefix_cosine_similarity' used
    # to be listed - TODO confirm that column exists before adding it.
    'title_query_norm_similarity_min', 'title_query_norm_similarity_mean',
    'title_query_norm_similarity_std',
    'title_query_cosine_similarity_max', 'title_query_cosine_similarity_min',
    'title_query_cosine_similarity_mean', 'title_query_cosine_similarity_std',
    # *_leven* columns
    'title_prefix_leven', 'title_prefix_leven_rate',
    'title_query_leven_sum', 'title_query_leven_max', 'title_query_leven_min',
    'title_query_leven_mean', 'title_query_leven_std',
]




In [4]:
# Number of boosting rounds, used for both training and prediction.
NUM_BOOST_ROUNDS = 710

# n_estimators is the scikit-learn-API name for the boosting-round count.
# Do not additionally pass the alias num_boost_round: two different values for
# the same setting leave the effective round count ambiguous.
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=127, max_depth=-1, n_estimators=NUM_BOOST_ROUNDS,
    objective='binary', subsample=0.8, colsample_bytree=1, subsample_freq=1,
    learning_rate=0.01, random_state=2018, n_jobs=-1,
)

# Placeholder column; it is overwritten with the real scores in a later cell.
test_dataset['predicted_score'] = 0

# Fit on the full training table (no eval_set is passed here), then take
# the positive-class probability for every test row.
lgb_model.fit(train_dataset[fea], train_dataset['label'], eval_metric='auc')
test_pred = lgb_model.predict_proba(test_dataset[fea], num_iteration=NUM_BOOST_ROUNDS)[:, 1]
print(np.mean(test_pred))

# Tabulate the booster's feature importances, highest first.
# sort_values replaces the long-deprecated DataFrame.sort_index(by=...).
fscore = lgb_model.booster_.feature_importance()
feaNames = lgb_model.booster_.feature_name()
scoreDf = pd.DataFrame(index=feaNames, columns=['importance'], data=fscore)
print(scoreDf.sort_values(by='importance', ascending=False))






0.37901486345508767
                                  importance
prefix_title_rate                      10894
prefix_tag_rate                         8246
prefix_rate                             5820
query_prediction_rate                   5368
prefix_click_number                     4187
title_tag_rate                          3935
prefix_title_count                      2837
title_rate                              2708
prefix_title_click_number               2300
prefix_tag_click_number                 2220
tag_rate                                1976
prefix_title_types                      1937
is_title_in_query                       1784
title_tag_count                         1681
prefix_tag_count                        1671
prefix_count                            1331
query_prediction_count                  1284
tag_count                               1191
query_prediction_click_number           1115
prefix_tag_types                        1114
title_tag_click_number             



In [7]:
# Attach the model's positive-class probabilities to the test rows.
test_dataset['predicted_score'] = test_pred

# Flag each test row: 1 if its prefix also occurs in the training data, else 0.
train_prefix_set = set(train_dataset['prefix'])
test_dataset['is_prefix_in_train'] = test_dataset['prefix'].map(
    lambda prefix: int(prefix in train_prefix_set)
)

# Mean predicted score for the seen-prefix rows, then for the unseen-prefix rows.
print(test_dataset.loc[test_dataset['is_prefix_in_train'] == 1, 'predicted_score'].mean())
print(test_dataset.loc[test_dataset['is_prefix_in_train'] == 0, 'predicted_score'].mean())




0.3742659758197111
0.44615346651564886


In [10]:
# Independent copy of the test rows whose prefix also occurs in the training data.
test_prefix0_df = test_dataset.loc[test_dataset['is_prefix_in_train'] == 1].copy()

# Score adjustment function
def resultAdjustment(result_df, t, eps=1e-15):
    """Shift predicted probabilities by a constant offset in log-odds space.

    Each score p is mapped to sigmoid(logit(p) + t). The mean of the adjusted
    scores is printed and the adjusted scores are returned.

    Args:
        result_df: DataFrame with a numeric 'predicted_score' column holding
            probabilities. It is not modified.
        t: Offset added to the log-odds; a positive value raises the scores.
        eps: Scores are clipped to [eps, 1 - eps] before the logit is taken,
            so a score of exactly 0 or 1 cannot cause a division by zero or
            a log(0).

    Returns:
        pandas Series named 'adjust_result', indexed like result_df.
    """
    # Clip first: logit(0) and logit(1) are undefined.
    scores = result_df['predicted_score'].astype(float).clip(lower=eps, upper=1 - eps)
    log_odds = -np.log((1 - scores) / scores)
    adjusted = (1 / (1 + np.exp(-(log_odds + t)))).rename('adjust_result')
    print(adjusted.mean())
    return adjusted

# Report the mean score of the selected rows, then shift their scores by a
# fixed log-odds offset.
# NOTE(review): judging by the printed means, 0.55585 looks tuned so that the
# adjusted mean matches the mean of the unseen-prefix rows - confirm.
print('original mean : ', test_prefix0_df.predicted_score.mean())
test_df_after = resultAdjustment(result_df=test_prefix0_df, t=0.55585)


original mean :  0.3742659758197111
0.4461603692668604


In [11]:
# Write the adjusted scores back for the rows whose prefix occurs in training.
# A single .loc call (row mask + column label) assigns into test_dataset
# itself; the chained form df[col][mask] = ... may write to a temporary copy
# and raises SettingWithCopyWarning. The assigned Series is aligned on index.
test_dataset.loc[test_dataset['is_prefix_in_train'] == 1, 'predicted_score'] = test_df_after

# Mean score of the unseen-prefix rows, then of the (adjusted) seen-prefix rows.
print(np.mean(test_dataset.loc[test_dataset['is_prefix_in_train'] == 0, 'predicted_score']))
print(np.mean(test_dataset.loc[test_dataset['is_prefix_in_train'] == 1, 'predicted_score']))



0.44615346651564886
0.4461603692668604


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
# Binarise the scores: strictly greater than 0.515 becomes 1, everything else 0.
test_dataset['predicted_label'] = (test_dataset['predicted_score'] > 0.515).astype('int64')
# Share of rows labelled 1.
print(test_dataset['predicted_label'].mean())



0.40802


In [15]:
# Export the prediction results
def exportResult(df, fileName):
    """Write df to '../result/<fileName>.csv' without a header row or index column."""
    target_path = '../result/%s.csv' % fileName
    df.to_csv(target_path, index=False, header=False)

# Only the predicted_label column is written to the result file.
exportResult(df=test_dataset[['predicted_label']], fileName='lgb_yi_alldata_11_3')

