In [1]:
import numpy as np 
import pandas as pd
import time
import datetime
import gc
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectPercentile
import math
from sklearn.metrics import f1_score
import jieba
import jieba.posseg as psg
from collections import Counter
import functools
from gensim.models import word2vec
import Levenshtein


In [2]:
# Read the training and test tables from ../temp, in that order.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('../temp/train_online_df.csv', '../temp/test_online_df.csv')
)



In [3]:
# Names of the columns fed to the model, grouped by feature family.
# Each name must appear exactly once: a repeated name makes `df[fea]` carry a
# duplicated column, which LightGBM rejects as a repeated feature name.
fea = [
    # query_prediction score statistics
    'query_prediction_number', 'query_prediction_max', 'query_prediction_min',
    'query_prediction_mean', 'query_prediction_std',
    # count / rate statistics
    'prefix_count', 'prefix_rate',
    'title_count', 'title_rate', 'tag_count', 'tag_rate',
    'query_prediction_count', 'query_prediction_rate', 'prefix_title_count',
    'prefix_title_rate', 'prefix_tag_count', 'prefix_tag_rate',
    'title_tag_count', 'title_tag_rate',
    # click numbers
    'prefix_click_number', 'title_click_number', 'query_prediction_click_number',
    'prefix_tag_click_number', 'prefix_title_click_number', 'title_tag_click_number',
    # containment flags
    'is_title_in_query', 'is_prefix_in_title',
    # "types" features
    'title_tag_types', 'prefix_tag_types', 'tag_title_types', 'tag_prefix_types',
    'title_prefix_types', 'prefix_title_types', 'tag_query_prediction_types',
    'title_query_prediction_types',
    # length features
    'prefix_len', 'title_len',
    'query_prediction_key_len_max', 'query_prediction_key_len_min',
    'query_prediction_key_len_mean', 'query_prediction_key_len_std',
    'len_title-prefix', 'len_prefix/title', 'len_mean-title', 'len_mean/title',
    # word-overlap features (q_t, p_t, p_q pairs)
    'q_t_word_match', 'q_t_jaccard', 'q_t_common_words',
    'q_t_total_unique_words', 'q_t_wc_diff', 'q_t_wc_ratio',
    'q_t_wc_diff_unique', 'q_t_wc_ratio_unique', 'q_t_tfidf_word_match_share',
    'p_t_word_match', 'p_t_jaccard', 'p_t_common_words',
    'p_t_total_unique_words', 'p_t_wc_diff', 'p_t_wc_ratio',
    'p_t_wc_diff_unique', 'p_t_wc_ratio_unique', 'p_t_tfidf_word_match_share',
    'p_q_word_match', 'p_q_jaccard', 'p_q_common_words',
    'p_q_total_unique_words', 'p_q_wc_diff', 'p_q_wc_ratio',
    'p_q_wc_diff_unique', 'p_q_wc_ratio_unique', 'p_q_tfidf_word_match_share',
    # similarity features
    'title_prefix_dot_similarity',
    'title_query_dot_similarity', 'title_prefix_norm_similarity',
    'title_query_norm_similarity', 'title_prefix_cosine_similarity',
    'title_query_cosine_similarity',
    'title_query_dot_similarity_max', 'title_query_dot_similarity_min',
    'title_query_dot_similarity_mean', 'title_query_dot_similarity_std',
    # NOTE(review): the dot and cosine groups have a `_max` entry but the norm
    # group does not; the slot here previously held a second copy of
    # 'title_prefix_cosine_similarity'. Possibly 'title_query_norm_similarity_max'
    # was intended -- confirm that column exists before adding it.
    'title_query_norm_similarity_min', 'title_query_norm_similarity_mean',
    'title_query_norm_similarity_std',
    'title_query_cosine_similarity_max', 'title_query_cosine_similarity_min',
    'title_query_cosine_similarity_mean', 'title_query_cosine_similarity_std',
    # Levenshtein features
    'title_prefix_leven', 'title_prefix_leven_rate',
    'title_query_leven_sum', 'title_query_leven_max', 'title_query_leven_min',
    'title_query_leven_mean', 'title_query_leven_std',
]



In [4]:
# Number of boosting rounds for the fit on the full training set.
FULL_FIT_ROUNDS = 700

# The round count is passed once, as n_estimators. Supplying both n_estimators
# and its alias num_boost_round leaves the effective number of trees dependent
# on how the installed LightGBM version resolves the conflict.
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=127, max_depth=-1, n_estimators=FULL_FIT_ROUNDS, objective='binary',
    subsample=0.8, colsample_bytree=1, subsample_freq=1,
    learning_rate=0.01, random_state=2018, n_jobs=-1
)

test_dataset['predicted_score'] = 0

# Earlier variant with a validation set and early stopping, kept for reference:
# lgb_model.fit(train_df[fea], train_df['label'], eval_set=[(train_df[fea], train_df['label']),
#                             (valid_df[fea], valid_df['label'])], early_stopping_rounds=50, eval_metric='auc')
lgb_model.fit(train_dataset[fea], train_dataset['label'], eval_metric='auc')
# Column 1 of predict_proba is taken as the score for each test row.
test_pred = lgb_model.predict_proba(test_dataset[fea], num_iteration=FULL_FIT_ROUNDS)[:, 1]
print(np.mean(test_pred))

# Feature importances as reported by the booster, largest first.
fscore = lgb_model.booster_.feature_importance()
feaNames = lgb_model.booster_.feature_name()
scoreDf = pd.DataFrame(index=feaNames, columns=['importance'], data=fscore)
# sort_index(by=...) has been removed from pandas; sort_values is its replacement.
print(scoreDf.sort_values(by=['importance'], ascending=False))





0.3820973024838574
                                   importance
prefix_title_rate                        8445
prefix_tag_rate                          6665
prefix_rate                              5300
title_tag_rate                           3884
query_prediction_rate                    2738
title_rate                               2670
tag_rate                                 2659
prefix_click_number                      2436
prefix_title_count                       1948
prefix_title_click_number                1867
title_tag_count                          1783
tag_count                                1709
prefix_title_types                       1597
q_t_word_match                           1499
prefix_tag_count                         1430
p_t_tfidf_word_match_share               1359
title_query_norm_similarity_std          1337
prefix_tag_click_number                  1293
query_prediction_click_number            1138
title_tag_click_number                   1074
title_query_nor



In [5]:
# Store the model scores on the test frame.
test_dataset['predicted_score'] = test_pred

# Flag each test row with 1 when its prefix also occurs in the training data, else 0.
train_prefix_set = set(train_dataset['prefix'])
test_dataset['is_prefix_in_train'] = test_dataset['prefix'].map(
    lambda prefix: int(prefix in train_prefix_set)
)

# Mean score of the flagged rows first, then of the unflagged rows.
for seen_flag in (1, 0):
    print(np.mean(test_dataset[test_dataset.is_prefix_in_train == seen_flag]['predicted_score']))



0.37606418001418024
0.4446379748502132


In [27]:
# Work on a copy so the labelling experiments do not touch test_dataset.
test_df_copy = test_dataset.copy()

# Each group is an explicit copy of the filtered rows, so adding a column to it
# is a plain assignment and not a write to a slice of test_df_copy
# (which triggers SettingWithCopyWarning).

# Rows whose prefix occurs in training: label 1 when the score exceeds 0.395.
test_df_1 = test_df_copy[test_df_copy.is_prefix_in_train == 1].copy()
test_df_1['predict_label'] = (test_df_1['predicted_score'] > 0.395).astype('int64')
print(np.mean(test_df_1['predict_label']))

# Rows whose prefix does not occur in training: label 1 when the score exceeds 0.5115.
test_df_0 = test_df_copy[test_df_copy.is_prefix_in_train == 0].copy()
test_df_0['predict_label'] = (test_df_0['predicted_score'] > 0.5115).astype('int64')
print(np.mean(test_df_0['predict_label']))



0.4004078857919782
0.4010002273243919


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [28]:
# Start from all-zero labels, then fill each group from its own labelled frame.
test_df_copy['predict_label'] = 0

# A single .loc[mask, column] assignment writes into test_df_copy itself.
# The chained form df[column][mask] = ... may write to a temporary object
# (SettingWithCopyWarning) and is a no-op under pandas copy-on-write.
test_df_copy.loc[test_df_copy.is_prefix_in_train == 1, 'predict_label'] = test_df_1['predict_label']
test_df_copy.loc[test_df_copy.is_prefix_in_train == 0, 'predict_label'] = test_df_0['predict_label']

# Positive-label rate per group, then overall.
print(np.mean(test_df_copy.loc[test_df_copy.is_prefix_in_train == 1, 'predict_label']))
print(np.mean(test_df_copy.loc[test_df_copy.is_prefix_in_train == 0, 'predict_label']))
print(np.mean(test_df_copy['predict_label']))



0.4004078857919782
0.4010002273243919
0.40046


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [29]:
# Export prediction results.
def exportResult(df, fileName):
    """Write ``df`` as CSV to ``../result/<fileName>.csv``.

    The file is written without a header row and without the index.

    Args:
        df: object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        fileName: base name of the output file, without the ``.csv`` suffix.
    """
    output_path = '../result/%s.csv' % fileName
    df.to_csv(output_path, header=False, index=False)

# Export only the 'predict_label' column of test_df_copy under the name 'lgb_wen_11_1'.
exportResult(test_df_copy[['predict_label']], 'lgb_wen_11_1')



In [35]:
# test_df_copy['predict_label'] = test_df_copy['predicted_score'].map(lambda x : 1 if x > 0.394 else 0)
# print(np.mean(test_df_copy['predict_label']))



0.41028


In [36]:
# exportResult(test_df_copy[['predict_label']], 'lgb_wen_10_30')


In [6]:
# Independent copy of the test rows flagged is_prefix_in_train == 1.
# (The variable name says "prefix0" although the filter selects flag 1.)
test_prefix0_df = test_dataset.loc[test_dataset['is_prefix_in_train'] == 1].copy()

# Define the adjustment function.
def resultAdjustment(result_df, t):
    """Shift the scores in ``result_df['predicted_score']`` by ``t`` in log-odds space.

    Each score p is mapped to sigmoid(logit(p) + t). The mean of the adjusted
    scores is printed as a side effect. ``result_df`` is not modified.

    Scores of exactly 0 or 1 have infinite log-odds; they map to 0 and 1
    respectively for any finite ``t`` and do not raise.

    Args:
        result_df: DataFrame with a numeric 'predicted_score' column.
        t: additive offset applied to the log-odds.

    Returns:
        pandas Series named 'adjust_result', indexed like ``result_df``.
    """
    scores = np.asarray(result_df['predicted_score'], dtype=float)
    # divide: p == 0 gives (1 - p) / p == inf, and p == 1 gives log(0) == -inf.
    # over: a very negative shifted log-odds makes exp() overflow to inf,
    #       which still yields the correct limit of 0 after the division.
    with np.errstate(divide='ignore', over='ignore'):
        log_odds = -np.log((1 - scores) / scores)
        adjusted = 1 / (1 + np.exp(-(log_odds + t)))
    adjust_result = pd.Series(adjusted, index=result_df.index, name='adjust_result')
    print(adjust_result.mean())
    return adjust_result

# Mean score of the selected rows before the adjustment.
print('original mean : ', test_prefix0_df['predicted_score'].mean())
# Shift those scores by t = 0.49985 in log-odds space; resultAdjustment prints the new mean.
# NOTE(review): 0.49985 looks hand-tuned so the adjusted mean lines up with the
# mean score of the is_prefix_in_train == 0 rows -- confirm.
test_df_after = resultAdjustment(test_prefix0_df, 0.49985)




original mean :  0.37606418001418024
0.4446454488950832


In [7]:
# Write the adjusted scores back onto the rows flagged is_prefix_in_train == 1.
# A single .loc[mask, column] assignment updates test_dataset itself. The
# chained form df[column][mask] = ... triggers SettingWithCopyWarning and is a
# no-op under pandas copy-on-write.
test_dataset.loc[test_dataset.is_prefix_in_train == 1, 'predicted_score'] = test_df_after

# Mean score of the unflagged rows, then of the (now adjusted) flagged rows.
print(np.mean(test_dataset.loc[test_dataset.is_prefix_in_train == 0, 'predicted_score']))
print(np.mean(test_dataset.loc[test_dataset.is_prefix_in_train == 1, 'predicted_score']))


0.4446379748502132
0.4446454488950832


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
# # Export prediction scores
# def exportScore(df, fileName):
#     df.to_csv('../result/%s.csv' % fileName, header=True, index=False)

# exportScore(test_dataset[['is_prefix_in_train', 'predicted_score']], 'keng_score')


In [11]:
# Label a row 1 when its score is strictly greater than 0.509, otherwise 0.
score_above_cutoff = test_dataset['predicted_score'].gt(0.509)
test_dataset['predicted_label'] = score_above_cutoff.astype('int64')
# Share of rows labelled 1.
print(np.mean(test_dataset['predicted_label']))


0.40544


In [12]:
# Export prediction results.
def exportResult(df, fileName):
    """Write ``df`` as CSV to ``../result/<fileName>.csv``.

    The file is written without a header row and without the index.

    Args:
        df: object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        fileName: base name of the output file, without the ``.csv`` suffix.
    """
    output_path = '../result/%s.csv' % fileName
    df.to_csv(output_path, header=False, index=False)

# Export only the 'predicted_label' column of test_dataset under the name 'lgb_keng_11_2'.
exportResult(test_dataset[['predicted_label']], 'lgb_keng_11_2')



In [23]:
# Five-fold model results: fit one model per stratified fold of the training
# data and average the folds' test-set scores.
test_dataset['predicted_score'] = 0

skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
# Boosting rounds to use for each fold, keyed by the fold index as a string.
# NOTE(review): the name suggests these came from earlier early-stopping runs -- confirm.
early_stopping_dict = {'0' : 670, '1' : 560, '2' : 700, '3' : 680, '4' : 680}
for index, (train_index, test_index) in enumerate(skf.split(train_dataset, train_dataset['label'])):
    num_boost_round = early_stopping_dict[str(index)]
    # The round count is passed once, as n_estimators. Supplying it alongside
    # the alias num_boost_round leaves the effective number of trees dependent
    # on how the installed LightGBM version resolves the conflict.
    lgb_model = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=127, max_depth=-1, n_estimators=num_boost_round, objective='binary',
        subsample=0.8, colsample_bytree=1, subsample_freq=1,
        learning_rate=0.01, random_state=2018, n_jobs=-1
    )
    # skf.split yields positional indices, so both X and y are selected with
    # .iloc; label-based lookup only agrees when the index is 0..n-1.
    lgb_model.fit(
        train_dataset[fea].iloc[train_index],
        train_dataset['label'].iloc[train_index],
        eval_metric='auc',
    )
    test_pred = lgb_model.predict_proba(test_dataset[fea], num_iteration=num_boost_round)[:, 1]

    print('test mean:', test_pred.mean())
    test_dataset['predicted_score'] = test_dataset['predicted_score'] + test_pred

# Average over the number of folds actually configured on the splitter.
test_dataset['predicted_score'] = test_dataset['predicted_score'] / skf.get_n_splits()
mean = test_dataset['predicted_score'].mean()
print('mean:', mean)





test mean: 0.38187212610870797




test mean: 0.3810200871274055




test mean: 0.3818233445511438




test mean: 0.38186220531599846




test mean: 0.3818072348278654
mean: 0.3816769995862319


In [25]:
# Number of test rows flagged is_prefix_in_train == 0.
print((test_dataset['is_prefix_in_train'] == 0).sum())
# Mean score of the flag-1 rows, then of the flag-0 rows.
print(np.mean(test_dataset.loc[test_dataset['is_prefix_in_train'] == 1, 'predicted_score']))
print(np.mean(test_dataset.loc[test_dataset['is_prefix_in_train'] == 0, 'predicted_score']))



4399
0.3757038099658089
0.44359639487622804


In [29]:
# Independent copy of the test rows flagged is_prefix_in_train == 1.
# (The variable name says "prefix0" although the filter selects flag 1.)
test_prefix0_df = test_dataset.loc[test_dataset['is_prefix_in_train'] == 1].copy()

# Define the adjustment function.
def resultAdjustment(result_df, t):
    """Shift the scores in ``result_df['predicted_score']`` by ``t`` in log-odds space.

    Each score p is mapped to sigmoid(logit(p) + t). The mean of the adjusted
    scores is printed as a side effect. ``result_df`` is not modified.

    Scores of exactly 0 or 1 have infinite log-odds; they map to 0 and 1
    respectively for any finite ``t`` and do not raise.

    Args:
        result_df: DataFrame with a numeric 'predicted_score' column.
        t: additive offset applied to the log-odds.

    Returns:
        pandas Series named 'adjust_result', indexed like ``result_df``.
    """
    scores = np.asarray(result_df['predicted_score'], dtype=float)
    # divide: p == 0 gives (1 - p) / p == inf, and p == 1 gives log(0) == -inf.
    # over: a very negative shifted log-odds makes exp() overflow to inf,
    #       which still yields the correct limit of 0 after the division.
    with np.errstate(divide='ignore', over='ignore'):
        log_odds = -np.log((1 - scores) / scores)
        adjusted = 1 / (1 + np.exp(-(log_odds + t)))
    adjust_result = pd.Series(adjusted, index=result_df.index, name='adjust_result')
    print(adjust_result.mean())
    return adjust_result

# Mean score of the selected rows before the adjustment.
print('original mean : ', test_prefix0_df['predicted_score'].mean())
# Shift those scores by t = 0.49285 in log-odds space; resultAdjustment prints the new mean.
# NOTE(review): 0.49285 looks hand-tuned so the adjusted mean lines up with the
# mean score of the is_prefix_in_train == 0 rows -- confirm.
test_df_after = resultAdjustment(test_prefix0_df, 0.49285)



original mean :  0.3757038099658089
0.44352972613919045


In [30]:
# Write the adjusted scores back onto the rows flagged is_prefix_in_train == 1.
# A single .loc[mask, column] assignment updates test_dataset itself. The
# chained form df[column][mask] = ... triggers SettingWithCopyWarning and is a
# no-op under pandas copy-on-write.
test_dataset.loc[test_dataset.is_prefix_in_train == 1, 'predicted_score'] = test_df_after

# Mean score of the unflagged rows, then of the (now adjusted) flagged rows.
print(np.mean(test_dataset.loc[test_dataset.is_prefix_in_train == 0, 'predicted_score']))
print(np.mean(test_dataset.loc[test_dataset.is_prefix_in_train == 1, 'predicted_score']))


0.44359639487622804
0.44352972613919045


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [31]:
# Label a row 1 when its score is strictly greater than 0.5, otherwise 0.
score_above_cutoff = test_dataset['predicted_score'].gt(0.5)
test_dataset['predicted_label'] = score_above_cutoff.astype('int64')
# Share of rows labelled 1.
print(np.mean(test_dataset['predicted_label']))



0.40992


In [32]:
# Export only the 'predicted_label' column of test_dataset under the name 'lgb_yi_10_31'.
exportResult(test_dataset[['predicted_label']], 'lgb_yi_10_31')
