In [1]:
import numpy as np 
import pandas as pd
import time
import datetime
import gc
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2, SelectPercentile
import math
from sklearn.metrics import f1_score
import jieba
import jieba.posseg as psg
from collections import Counter
import functools
from gensim.models import word2vec
import Levenshtein


In [2]:
# Load the two prediction files that are combined below. Later cells copy
# columns between the two frames by index, so both files are assumed to hold
# the same rows in the same order.
keng_result_df, yuna_result_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../result/keng_score.csv', '../result/lgb1_select_pred.csv')
)
# Peek at the first rows of the keng predictions.
print(keng_result_df.head())


   is_prefix_in_train  predicted_score
0                   1         0.060153
1                   1         0.702566
2                   1         0.678097
3                   1         0.286608
4                   1         0.092973


In [3]:
# Copy the is_prefix_in_train flag from the keng frame onto the yuna frame.
# Column assignment aligns on the index, so if the two files had different
# row counts the extra rows would silently receive NaN flags (and then fall
# outside both the == 0 and == 1 subsets used below). Fail loudly instead.
if len(yuna_result_df) != len(keng_result_df):
    raise ValueError(
        'prediction files differ in length: %d vs %d'
        % (len(yuna_result_df), len(keng_result_df))
    )
yuna_result_df['is_prefix_in_train'] = keng_result_df['is_prefix_in_train']
# Give the yuna score column the same name the keng frame uses.
yuna_result_df = yuna_result_df.rename(columns={'pred': 'predicted_score'})


In [4]:
# Mean keng score for the rows flagged is_prefix_in_train == 0, then == 1.
print(keng_result_df.loc[keng_result_df['is_prefix_in_train'] == 0, 'predicted_score'].mean())
print(keng_result_df.loc[keng_result_df['is_prefix_in_train'] == 1, 'predicted_score'].mean())


0.4446379748502132
0.4446454488950832


In [5]:
# Split the yuna predictions by prefix flag. The .copy() makes each subset an
# independent frame rather than a slice of yuna_result_df.
yuna_test_prefix0_df = yuna_result_df.loc[yuna_result_df['is_prefix_in_train'] == 0].copy()
yuna_test_prefix1_df = yuna_result_df.loc[yuna_result_df['is_prefix_in_train'] == 1].copy()

# Define the adjustment function
def resultAdjustment(result_df, t):
    """Shift predicted probabilities by ``t`` in log-odds (logit) space.

    Each score ``p`` in ``result_df['predicted_score']`` is mapped to
    ``sigmoid(logit(p) + t)``. A positive ``t`` raises every score, a
    negative ``t`` lowers it. The mean of the adjusted scores is printed.

    Args:
        result_df: DataFrame with a numeric ``predicted_score`` column holding
            probabilities in [0, 1]. It is not modified.
        t: Scalar offset added to the log-odds of every score.

    Returns:
        A Series named ``adjust_result`` with the same index as ``result_df``.
    """
    scores = result_df['predicted_score'].astype(float)
    # sigmoid(logit(p) + t) simplifies to p / (p + (1 - p) * exp(-t)).
    # The closed form needs neither log() nor a division by p, so scores of
    # exactly 0 or 1 map to 0 and 1 instead of raising, and the whole column
    # is computed in one vectorized expression.
    adjusted = scores / (scores + (1.0 - scores) * math.exp(-t))
    adjusted = adjusted.rename('adjust_result')
    print(adjusted.mean())
    return adjusted

print('original mean : ', yuna_test_prefix0_df['predicted_score'].mean())
# Apply a separate log-odds offset to each subset. NOTE(review): the two
# offsets look hand-tuned so that each subset's adjusted mean lands near the
# keng means printed earlier (about 0.4446) — confirm how they were chosen.
yuna_test_df_after0 = resultAdjustment(result_df=yuna_test_prefix0_df, t=-0.0231)
yuna_test_df_after1 = resultAdjustment(result_df=yuna_test_prefix1_df, t=0.49635)


original mean :  0.4493946473263437
0.44464061240463637
0.44462622176804295


In [6]:
# Write the adjusted scores back into the full yuna frame. A single .loc call
# is used instead of chained indexing (df[col][mask] = ...): the chained form
# assigns into an intermediate object, raises SettingWithCopyWarning, and does
# not update the frame at all under pandas Copy-on-Write. The right-hand
# Series keep the row labels of their subsets, so .loc aligns them by index.
yuna_result_df.loc[yuna_result_df['is_prefix_in_train'] == 0, 'predicted_score'] = yuna_test_df_after0
yuna_result_df.loc[yuna_result_df['is_prefix_in_train'] == 1, 'predicted_score'] = yuna_test_df_after1
# Confirm the per-subset means after the write-back.
print(yuna_result_df.loc[yuna_result_df['is_prefix_in_train'] == 0, 'predicted_score'].mean())
print(yuna_result_df.loc[yuna_result_df['is_prefix_in_train'] == 1, 'predicted_score'].mean())


0.44464061240463637
0.44462622176804295


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
# Put both models' scores side by side in the keng frame.
# The rename is guarded so the cell can be re-run safely: the blending cells
# below add a new 'predicted_score' column to this frame, and renaming that
# one as well would leave two columns called 'keng_pred'.
if 'keng_pred' not in keng_result_df.columns:
    keng_result_df.rename(columns={'predicted_score': 'keng_pred'}, inplace=True)
# Index-aligned copy of the adjusted yuna scores.
keng_result_df['yuna_pred'] = yuna_result_df['predicted_score']
print(keng_result_df['yuna_pred'].mean())


0.44462748785625167


In [12]:
def get_max_pred(df):
    """Return the larger of a row's ``keng_pred`` and ``yuna_pred`` values.

    ``df`` is a single row: any object indexable by those two keys, such as
    the Series handed over by ``DataFrame.apply(..., axis=1)``. When
    ``keng_pred`` is not strictly greater (a tie, or a comparison that
    evaluates false such as one involving NaN), ``yuna_pred`` is returned.
    """
    first, second = df['keng_pred'], df['yuna_pred']
    return first if first > second else second
    
# Max blend: keep keng_pred where it is strictly greater than yuna_pred,
# otherwise take yuna_pred. This is the same rule as get_max_pred, evaluated
# on whole columns instead of calling a Python function once per row.
keng_result_df['predicted_score'] = keng_result_df['keng_pred'].where(
    keng_result_df['keng_pred'] > keng_result_df['yuna_pred'],
    keng_result_df['yuna_pred'],
)
print(keng_result_df.head())


   is_prefix_in_train  keng_pred  yuna_pred  predicted_score
0                   1   0.060153   0.060377         0.060377
1                   1   0.702566   0.686186         0.702566
2                   1   0.678097   0.683564         0.683564
3                   1   0.286608   0.264104         0.286608
4                   1   0.092973   0.094173         0.094173


In [30]:
# Equal-weight average of the two models' scores.
# NOTE(review): this overwrites the 'predicted_score' column produced by the
# max blend in the previous cell. The execution counters (In [30] here versus
# In [17] / In [18] below) suggest this cell was run after the export cell —
# confirm which blend the exported file was actually built from.
keng_result_df['predicted_score'] = (
    keng_result_df['keng_pred'].mul(0.5) + keng_result_df['yuna_pred'].mul(0.5)
)
print(keng_result_df.head())


   is_prefix_in_train  keng_pred  yuna_pred  predicted_score
0                   1   0.060153   0.060377         0.060265
1                   1   0.702566   0.686186         0.694376
2                   1   0.678097   0.683564         0.680831
3                   1   0.286608   0.264104         0.275356
4                   1   0.092973   0.094173         0.093573


In [17]:
# Turn the blended score into a 0/1 label: 1 when the score is strictly above
# the 0.519 cut-off, else 0. The comparison is done on the whole column
# rather than through a per-element lambda.
# NOTE(review): 0.519 is an unexplained threshold — confirm how it was chosen.
keng_result_df['predicted_label'] = (keng_result_df['predicted_score'] > 0.519).astype('int64')
# Share of rows labelled 1.
print(keng_result_df['predicted_label'].mean())


0.4081


In [18]:
# Export the prediction results
def exportResult(df, fileName):
    """Write ``df`` to ``../result/<fileName>.csv`` with no header row and no index column.

    Args:
        df: Object exposing a pandas-style ``to_csv`` method.
        fileName: File name without extension, substituted into the output path.
    """
    target_path = '../result/%s.csv' % fileName
    df.to_csv(target_path, header=False, index=False)

# Export only the 0/1 label column. The double brackets keep it a one-column
# DataFrame, so the file has one label per line.
exportResult(df=keng_result_df[['predicted_label']], fileName='keng_yuna_ronghe_11_5')
