In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

In [2]:
def _read_split(path):
    """Read one raw data split (no header row) into a five-column DataFrame."""
    return pd.read_table(
        path,
        names=['prefix', 'query_prediction', 'title', 'tag', 'label'],
        header=None,
        # Only the empty string is parsed as NaN; pandas' default NA markers are disabled.
        na_values='',
        keep_default_na=False,
        encoding='utf-8',
        quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as literal text
    )


# Model predictions for the validation split, followed by the three raw splits.
valid_result = pd.read_csv('../../yuna/result/xkl_b_vali.csv')
train_df = _read_split('../../data/oppo_data_ronud2_20181107/data_train.txt')
valid_df = _read_split('../../data/oppo_data_ronud2_20181107/data_vali.txt')
test_df = _read_split('../../data/oppo_data_ronud2_20181107/data_testB.txt')
    


In [3]:
# Stack all three splits, then aggregate `title` with len() per prefix; the
# resulting column is exposed as `prefix_number`.
temp_df = pd.concat([train_df, valid_df, test_df])
temp_df_pivot_table = (
    pd.pivot_table(temp_df, index='prefix', values='title', aggfunc=len)
    .reset_index()
    .rename(columns={'title': 'prefix_number'})
)


In [4]:
# Attach the model predictions (aligned on the row index — assumes valid_result
# has the same row order as valid_df; TODO confirm) and left-join the
# per-prefix counts.
valid_df = (
    valid_df
    .assign(pred=valid_result['pred'])
    .merge(temp_df_pivot_table, on='prefix', how='left')
)



In [5]:
# Mean training label per prefix, exposed as `label_mean`.
train_pivot_table = (
    pd.pivot_table(train_df, index='prefix', values='label', aggfunc=np.mean)
    .reset_index()
    .rename(columns={'label': 'label_mean'})
)


In [6]:
# Left join: validation prefixes with no match in train_pivot_table get NaN for label_mean.
valid_df = valid_df.merge(train_pivot_table, how='left', on='prefix')


In [7]:
def _flag_seen(values, seen):
    """Map each value to 1 if it is a member of `seen`, else 0."""
    return values.map(lambda item: int(item in seen))


# Distinct prefixes / titles that occur in the training split.
train_prefix_set = set(train_df['prefix'])
train_title_set = set(train_df['title'])

# Flag, for both evaluation splits, whether each prefix / title was seen in training.
for split_df in (valid_df, test_df):
    split_df['is_prefix_in'] = _flag_seen(split_df['prefix'], train_prefix_set)
    split_df['is_title_in'] = _flag_seen(split_df['title'], train_title_set)


In [8]:
# Total validation rows, then the number of rows with is_prefix_in == 1,
# is_title_in == 0 and prefix_number <= 3.
print(len(valid_df))
print(len(valid_df.loc[
    valid_df['is_prefix_in'].eq(1)
    & valid_df['is_title_in'].eq(0)
    & valid_df['prefix_number'].le(3)
]))


50000
966


In [15]:
def result_deal(df):
    """Post-process the prediction of a single row.

    `df` is one row (anything indexable by column name) providing
    'is_prefix_in', 'is_title_in', 'prefix_number', 'pred' and 'label_mean'.

    Only rows with is_prefix_in == 1, is_title_in == 0 and prefix_number == 3
    are adjusted:
      * label_mean > 0  -> the prediction is halved;
      * otherwise (including a NaN label_mean, for which `> 0` is False)
        -> the prediction is moved halfway towards 1.
    Every other row gets its prediction back unchanged.
    """
    # Read every field up front so a missing column fails regardless of the row's values.
    prefix_seen = df['is_prefix_in']
    title_seen = df['is_title_in']
    occurrences = df['prefix_number']
    score = df['pred']
    prefix_label_mean = df['label_mean']

    targeted = prefix_seen == 1 and title_seen == 0 and occurrences == 3
    if not targeted:
        return score
    if prefix_label_mean > 0:
        return score / 2
    return score + (1 - score) / 2

# Run result_deal on every validation row (axis=1 hands the function one row at a
# time) and store the adjusted prediction next to the original one.
valid_df['pred_after'] = valid_df.apply(result_deal, axis=1)
        


In [16]:
# Flag the rows whose prediction differs after post-processing, then report how many.
valid_df['is_deal'] = valid_df['pred'].ne(valid_df['pred_after'])
print(len(valid_df.loc[valid_df['is_deal']]))


317


In [17]:
def getPredLabel(predArr, threshold=None, tops=None):
    '''
    Turn prediction scores into 0/1 labels using a threshold.

    Parameters
    ----------
    predArr : numpy array or pandas Series of scores (must support `>=` and
        `.astype`).
    threshold : scores greater than or equal to this value are labelled 1.
        Ignored when `tops` is given.
    tops : optional way to derive the threshold from the scores themselves.
        A value below 1 is treated as a fraction of the rows, a value of 1 or
        more as an absolute row count; the threshold becomes the score of the
        k-th highest row. k is clamped to the range [1, len(predArr)].

    Returns
    -------
    Integer 0/1 labels of the same container type as `predArr`. Because the
    comparison is `>=`, tied scores can make more than k rows positive.

    Raises
    ------
    ValueError if `tops` is not positive, or if neither `threshold` nor
    `tops` is supplied.
    '''
    if tops is not None:
        if tops <= 0:
            raise ValueError('tops must be positive, got {!r}'.format(tops))
        temp = np.sort(np.array(predArr))
        if tops < 1:
            topCount = round(len(temp) * tops)
        else:
            topCount = round(tops)
        # A count of 0 would index temp[-0] == temp[0] (the smallest score) and
        # label every row positive; a count above len(temp) would raise
        # IndexError. Clamp to a valid 1..len(temp) position instead.
        topCount = min(max(int(topCount), 1), len(temp))
        threshold = temp[-topCount]
    if threshold is None:
        # Raise instead of calling exit(), which would terminate the kernel.
        raise ValueError('[Error] could not get threshold value.')
    return (predArr >= threshold).astype(int)

def findF1Threshold(predictList, labelList, thrList=None):
    '''
    Find the decision threshold that gives the best F1 score.

    Parameters
    ----------
    predictList : prediction scores.
    labelList : ground-truth labels; rows equal to 1 count as positive.
    thrList : optional indexable sequence of candidate thresholds. When
        omitted, every distinct prediction score is tried.

    Returns
    -------
    * `thrList` given: the first candidate that reaches the maximum F1.
    * `thrList` omitted: take the 5 thresholds with the highest F1, drop the
      largest of those thresholds and return the mean of the rest (with fewer
      than 5 usable thresholds nothing is dropped).

    Raises
    ------
    ValueError if no candidate threshold yields a true positive.
    '''
    tempDf = pd.DataFrame({'predict': predictList, 'label': labelList})
    scores = tempDf['predict'].values
    isTrue = (tempDf['label'] == 1).values
    trueNum = int(isTrue.sum())

    # Remember whether the caller supplied thresholds *before* filling in the
    # default; testing `thrList is None` afterwards would always be False.
    autoThr = thrList is None
    if autoThr:
        thrList = np.unique(scores)

    keptThr = []
    f1List = []
    for thr in thrList:
        isPred = scores >= thr
        TP = int((isPred & isTrue).sum())
        if TP == 0:
            if autoThr:
                # np.unique returns ascending thresholds, so every later one
                # also has zero true positives.
                break
            # Caller-supplied thresholds may be in any order: skip this one
            # but keep evaluating the rest.
            continue
        precise = TP / int(isPred.sum())
        recall = TP / trueNum
        keptThr.append(thr)
        f1List.append(2 * precise * recall / (precise + recall))

    if not f1List:
        raise ValueError('no threshold produced a true positive; cannot compute F1.')

    if autoThr:
        f1Df = pd.DataFrame({'thr': keptThr, 'f1': f1List}).sort_values(by=['f1', 'thr'], ascending=[False, True])
        # Take the top 5 by F1, drop the largest threshold, then average.
        averThr = f1Df.head(5).sort_values(by=['thr']).head(4)['thr'].mean()
        return averThr
    bestThr = keptThr[f1List.index(max(f1List))]
    return bestThr


In [18]:
# F1 of the raw predictions for thresholds 0.30, 0.31, ..., 0.44.
pred_f1_list = []
for percent in range(30, 45):
    labels_at_thr = getPredLabel(valid_df['pred'], percent * 0.01)
    pred_f1_list.append(f1_score(valid_df['label'], labels_at_thr))
print(pred_f1_list)


[0.7375424304141209, 0.7390985373011142, 0.7396650118312996, 0.7406554681076003, 0.7420597185241536, 0.7449977070309671, 0.7455494317904698, 0.7476580218913322, 0.74749484049034, 0.7478932584269662, 0.7462679014219928, 0.7426820791390984, 0.7422181770901053, 0.740455414930692, 0.7398795053377021]


In [19]:
# F1 of the post-processed predictions over the same 0.30 .. 0.44 threshold sweep.
pred_after_f1_list = []
for percent in range(30, 45):
    labels_at_thr = getPredLabel(valid_df['pred_after'], percent * 0.01)
    pred_after_f1_list.append(f1_score(valid_df['label'], labels_at_thr))
print(pred_after_f1_list)


[0.7374830239927569, 0.7390387084938542, 0.7396338492238439, 0.7405719119340209, 0.7419577260514991, 0.7449068263010524, 0.7454212900865748, 0.7475902871933934, 0.747488811536549, 0.7479185474972414, 0.7462746983074859, 0.7426703390048649, 0.7422063908894724, 0.7404438156619245, 0.7398678996036989]


In [20]:
# Among the adjusted rows: how many end above 0.5 with label 0, and how many
# end below 0.5 with label 1.
print(len(valid_df.loc[
    valid_df['pred_after'].gt(0.5) & valid_df['is_deal'] & valid_df['label'].eq(0)
]))
print(len(valid_df.loc[
    valid_df['pred_after'].lt(0.5) & valid_df['is_deal'] & valid_df['label'].eq(1)
]))



27
27
