In [85]:
import random
from functools import partial

import numpy as np
import pandas as pd
from numba import jit


# Load the prediction dump; columns 2..13 of this frame are later passed to
# get_multi_f1_threshold as the probability matrix.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
data = pd.read_csv("/Users/kun/Downloads/pred_mul_lgb_b20190704_235241.csv")

def exp_multi_f1(pred, int_preds, weights=None, silent=True):
    '''
    Expected ("soft") F1 of hard class assignments, averaged over the classes.

    For every class the score is 2*r / (m + n) where
      n = column sum of ``pred`` for the class,
      m = number of samples assigned to the class in ``int_preds``,
      r = sum of the class's probabilities over the samples assigned to it.

    Label-to-column mapping: when the number of distinct labels equals the
    number of columns, the k-th smallest distinct label refers to column k.
    Otherwise (some class was never assigned) the labels must themselves be
    column indices in ``range(n_classes)``; classes without any assigned sample
    then simply have m = r = 0.

    :param pred: 2-D array-like of shape (n_samples, n_classes)
    :param int_preds: 1-D array-like with one assigned label per sample
    :param weights: optional per-class weights forwarded to ``np.average``
    :param silent: when False, print the score of every class
    :return: the (weighted) average of the per-class scores
    :raises ValueError: if the shapes disagree or a label cannot be mapped to a column
    '''
    pred = np.asarray(pred, dtype=float)
    labels = np.asarray(int_preds)
    if pred.ndim != 2:
        raise ValueError("pred must be a 2-D probability matrix")
    if labels.shape != (pred.shape[0],):
        raise ValueError("int_preds must hold exactly one label per row of pred")

    n_classes = pred.shape[1]
    distinct = np.unique(labels)
    if distinct.size == n_classes:
        class_labels = distinct
    else:
        # Some class is never assigned: fall back to positional labels
        # instead of failing on the column-count mismatch.
        class_labels = np.arange(n_classes)
        if not np.isin(labels, class_labels).all():
            raise ValueError("int_preds contains labels outside range(n_classes)")

    scores = []
    for col, c in enumerate(class_labels):
        assigned = labels == c
        n = pred[:, col].sum()
        m = assigned.sum()
        r = pred[assigned, col].sum()
        denom = m + n
        # A class with no probability mass and no assignments would be 0/0; score it 0.
        f1 = 2 * r / denom if denom != 0 else 0.0
        if not silent:
            print("{}       :   {}".format(c, f1))
        scores.append(f1)
    return np.average(scores, weights=weights)

# Finite difference
# (the stricter @jit(nopython=True) variant is kept disabled)
@jit
def derivative(arg, p):
    """Return 2 * (p * (m + n) - r) / (m + n + 1) / (m + n) for ``arg = (m, n, r)``.

    Algebraically this equals 2*(r + p)/(m + n + 1) - 2*r/(m + n), i.e. the change
    in 2*r/(m + n) when n grows by one and r grows by ``p``.
    """
    true_mass, assigned, hit_mass = arg
    total = true_mass + assigned
    numerator = 2 * (p * total - hit_mass)
    return numerator / (total + 1) / total

def get_multi_f1_threshold_di(preds, int_preds, preds_flag, weights=None):
    '''
    Run one greedy pass that reassigns the non-flagged samples.

    Per class the pass tracks
      m = column sum of ``preds``            (expected number of true members),
      n = number of samples assigned to it   (predicted count),
      r = sum of the class's probabilities over the samples assigned to it,
    so that the class's expected F1 is 2*r / (m + n).  For every sample whose
    flag is falsy it evaluates, for each class, the change of that score if the
    sample were added to the class, 2*(p*(m+n) - r) / (m+n+1) / (m+n), multiplies
    it by the class weight and moves the sample to the class with the largest value.

    NOTE(review): the value for the sample's *current* class is computed with the
    sample still counted in that class; this matches the original algorithm.

    :param preds: 2-D probability matrix of shape (n_samples, n_classes)
    :param int_preds: 1-D integer array of current class indices; modified in place
    :param preds_flag: per-sample booleans; samples with a truthy flag are never moved
    :param weights: per-class weights, defaults to 1 for every class
    :return: ``int_preds`` after the pass (the same object when an ndarray was passed)
    :raises ValueError: if ``weights`` does not have one entry per class
    '''
    preds = np.asarray(preds)
    int_preds = np.asarray(int_preds)
    n_classes = preds.shape[1]

    if weights is None:
        weights = np.ones(n_classes)
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (n_classes,):
            raise ValueError("weights must contain exactly one value per class")

    # Per-class running statistics (m, n, r), sized from the data rather than a fixed 12.
    expected_true = np.empty(n_classes)
    assigned = np.empty(n_classes)
    expected_hits = np.empty(n_classes)
    for k in range(n_classes):
        members = int_preds == k
        expected_true[k] = preds[:, k].sum()        # m: expected number of true members
        assigned[k] = members.sum()                 # n: number of predictions
        expected_hits[k] = preds[members, k].sum()  # r: expected number of correct predictions

    frozen = np.asarray(preds_flag, dtype=bool)
    for i in np.flatnonzero(~frozen):
        row = preds[i]
        total = expected_true + assigned
        # Same arithmetic order as the scalar formula, evaluated for all classes at once.
        gains = 2 * (row * total - expected_hits) / (total + 1) / total * weights
        best = int(np.argmax(gains))
        current = int(int_preds[i])
        if best != current:
            assigned[best] += 1
            expected_hits[best] += row[best]
            assigned[current] -= 1
            expected_hits[current] -= row[current]
            int_preds[i] = best
    return int_preds

# Best class assignment for multi-class F1
def get_multi_f1_threshold(preds, weights=None, n_round=3, confident_threshold=0.65):
    '''
    Start from the argmax assignment and greedily refine it for expected F1.

    :param preds: 2-D probability matrix of shape (n_samples, n_classes)
    :param weights: weight of every class, defaults to 1 (all classes weighted equally)
    :param n_round: number of optimisation passes
    :param confident_threshold: samples whose largest probability exceeds this value
        keep their argmax class and are never reassigned
    :return: 1-D integer array with the chosen class index of every sample
    '''
    preds = np.asarray(preds)
    if weights is None:
        weights = np.ones(preds.shape[1])

    int_preds = preds.argmax(axis=1)
    preds_flag = preds.max(axis=1) > confident_threshold
    for _ in range(n_round):
        int_preds = get_multi_f1_threshold_di(preds, int_preds, preds_flag, weights)
    return int_preds


# import time
# start_time = time.time()
# res_old = get_multi_f1_threshold(data.iloc[:,2:14].values)
# print(time.time() - start_time)

# 期望的分：    0.5465103303103639
# 期望的分：    0.5486109523853776
# 期望的分：    0.5486139530125725
# 15.98586392402649s


# Change 1: everything switched to NumPy and Python built-in data types
# Change 2: numba is used; the derivative function is decorated with jit
# Change 3: functions are defined at the top level instead of being nested
import time
start_time = time.time()
res_numba = get_multi_f1_threshold(data.iloc[:,2:14].values)
print(time.time() - start_time)
# 8.370285987854004s

# res_old is only assigned by the baseline run, which is currently commented out;
# guard the comparison so a fresh "Restart & Run All" does not stop with a NameError.
if "res_old" in globals():
    print(np.mean(res_old == res_numba))
else:
    print("res_old is not defined; skipping the comparison with the baseline result")

In [88]:
# Second timing run of the same call; rebinds the notebook globals
# start_time and res_numba and prints the elapsed wall-clock seconds.
import time
start_time = time.time()
res_numba = get_multi_f1_threshold(data.iloc[:,2:14].values)
print(time.time() - start_time)

7.561756134033203
