In [3]:
import numpy as np
import pandas as pd

In [4]:
%load_ext Cython

In [69]:
# 第一版
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [70]:
y_name = np.random.randint(2, size=(5000, 1))
x_name = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y_name, x_name], axis=1), columns=['y_name', 'x_name'])
result_1 = target_mean_v1(data, 'y_name', 'x_name')

In [71]:
%%timeit
target_mean_v1(data, 'y_name', 'x_name')

1 loop, best of 3: 25 s per loop


### 分析版本1效率损失：group by对此进行改进

In [72]:
# 第二版
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name] 
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        # 核心，去掉x_name对应的y_name,求mean
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [73]:
y_name = np.random.randint(2, size=(500, 1))
x_name = np.random.randint(10, size=(500, 1))
data = pd.DataFrame(np.concatenate([y_name, x_name], axis=1), columns=['y_name', 'x_name'])
result_2 = target_mean_v2(data, 'y_name', 'x_name')

In [74]:
%%timeit
target_mean_v2(data, 'y_name', 'x_name')

10 loops, best of 3: 26.7 ms per loop


In [92]:
# 第三版   改Cython
%%cython -a
import numpy as np
cimport numpy as np
import pandas as pd
cpdef target_mean_v3_cython(x_name, y_name):
    len = y_name.shape[0]
    result = np.zeros(len)
    value_dict = dict()
    count_dict = dict()
    for i in range(len):
        x = x_name[i, 0]
        y = y_name[i, 0]
        if x not in value_dict.keys():
            value_dict[x] = y
            count_dict[x] = 1
        else:
            value_dict[x] += y
            count_dict[x] += 1
    for i in range(len):
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        return result

In [93]:
y_name = np.random.randint(2, size=(5000, 1))
x_name = np.random.randint(10, size=(5000, 1))

In [94]:
%%timeit
result_3_cython = target_mean_v3_cython(x_name,y_name)

100 loops, best of 3: 3.59 ms per loop


In [86]:
# 第四版
%%cython -a

cimport cython
cimport numpy as cnp
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)

cpdef target_mean_v4(cnp.ndarray data):
  result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()
  for i in range(data.shape[0]):
    if data[i][1] not in value_dict:
      value_dict[data[i][1]] = data[i][0]
      count_dict[data[i][1]] = 1
    else:
      value_dict[data[i][1]] += data[i][0]
      count_dict[data[i][1]] += 1
  for i in range(data.shape[0]):
    result[i] = (value_dict[data[i][1]] - data[i][0]) / (count_dict[data[i][1]] - 1)
  return result

In [87]:
y_name = np.random.randint(2, size=(5000, 1))
x_name = np.random.randint(10, size=(5000, 1))
data = np.concatenate([y_name, x_name], axis=1)

In [88]:
%%timeit
result_4_cython = target_mean_v4(data)

100 loops, best of 3: 12.6 ms per loop


In [108]:
# 第五版
%%cython -a
import cython
cimport cython
cimport numpy as cnp
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)

cpdef target_mean_v5(cnp.ndarray[long,ndim=2] data):
  cdef int len =data.shape[0]
  cdef cnp.ndarray[double] result = np.zeros(data.shape[0])
  cdef dict value_dict = {}
  cdef dict count_dict = {}
  cdef int i

  for i in range(len):
    x = data[i, 0]
    y = data[i, 0]
    if x not in value_dict.keys():
      value_dict[x] = y
      count_dict[x] = 1
    else:
      value_dict[x] += y
      count_dict[x] += 1
  for i in range(len):
    result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
  return result

In [109]:
y_name = np.random.randint(2, size=(5000, 1))
x_name = np.random.randint(10, size=(5000, 1))
data = np.concatenate([y_name, x_name], axis=1)

In [110]:
%%timeit
result_5_cython = target_mean_v5(data)

1000 loops, best of 3: 1.11 ms per loop


In [201]:
# 第六版
%%cython
from cython.parallel import prange
import numpy as np
cimport numpy as cnp
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)

cpdef target_mean_v6(cnp.ndarray[long,ndim=2] data):
    cdef int len = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(len,dtype=np.double)
    cdef cnp.ndarray[long] y_name = data[:,0].astype(np.int_)
    cdef cnp.ndarray[long] x_name = data[:,1].astype(np.int_)
    cdef cnp.ndarray[long] value_map = np.zeros(len).astype(np.int_)
    cdef cnp.ndarray[long] count_map = np.zeros(len).astype(np.int_)
    cdef int i
    for i in prange(len, nogil=True):
        value_map[x_name[i]] += y_name[i]
        count_map[x_name[i]] += 1
    for i in prange(len, nogil=True):
        result[i] = (value_map[x_name[i]] - y_name[i]) / (count_map[x_name[i]] - 1)
    return result

In [202]:
y_name = np.random.randint(2, size=(5000, 1))
x_name = np.random.randint(10, size=(5000, 1))
data = np.concatenate([y_name, x_name], axis=1)

In [203]:
%%timeit
result_6_cython = target_mean_v6(data)

The slowest run took 6.72 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 49.9 µs per loop
