# Prepare the test case

In [3]:
import numpy as np
import pandas as pd
# Seed the global NumPy RNG so the benchmark input is identical after every
# Restart & Run All; without it the timings and the equality check below are
# run against different data each time.
np.random.seed(0)
y = np.random.randint(2, size=(5000, 1))   # binary target, values in {0, 1}
x = np.random.randint(10, size=(5000, 1))  # integer category codes, values in 0..9
# Two-column integer frame: column 'y' is the target, column 'x' the grouping key.
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

# baseline version

In [4]:
import numpy as np

def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean of ``y_name`` within each ``x_name`` group.

    For every row, returns the mean of ``y_name`` taken over all *other* rows
    that share the row's ``x_name`` value.

    This is the baseline implementation: it is deliberately kept as a plain
    row-by-row Python loop over the DataFrame so it can serve as the reference
    the optimized version is compared against.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns. Any index is accepted; rows are
        addressed by position, not by label.
    y_name : str
        Name of the numeric target column.
    x_name : str
        Name of the grouping column (values must be hashable).

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(data)``. Rows whose ``x_name`` value
        occurs only once have no "other" rows and get ``NaN``.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)

    # Resolve column positions once so every cell can be read positionally;
    # label-based ``data.loc[i, ...]`` only works when the index is 0..n-1.
    x_col = data.columns.get_loc(x_name)
    y_col = data.columns.get_loc(y_name)

    value_dict = dict()  # group key -> sum of y over the group
    count_dict = dict()  # group key -> number of rows in the group

    # Pass 1: accumulate per-group totals and sizes.
    for i in range(n_rows):
        key = data.iloc[i, x_col]
        value_dict[key] = value_dict.get(key, 0) + data.iloc[i, y_col]
        count_dict[key] = count_dict.get(key, 0) + 1

    # Pass 2: remove each row's own contribution and average the remainder.
    for i in range(n_rows):
        key = data.iloc[i, x_col]
        others = count_dict[key] - 1
        if others == 0:
            # Singleton group: nothing left to average, avoid dividing by zero.
            result[i] = np.nan
        else:
            result[i] = (value_dict[key] - data.iloc[i, y_col]) / others
    return result

## Baseline performance

In [3]:
%%timeit
# Time the pure-Python baseline on the `data` test frame.
target_mean_v2(data, 'y', 'x')

308 ms ± 12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## my solution

In [2]:
%load_ext Cython

In [18]:
%%cython -a

cimport cython
cimport numpy as cnp
import numpy as np


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7(data, str y_name, str x_name):
    """Leave-one-out target mean of ``y_name`` within each ``x_name`` group.

    For every row, returns the mean of ``y_name`` over all *other* rows that
    share the row's ``x_name`` value. Group totals are kept in dense lookup
    tables indexed directly by the ``x_name`` value, so that column must hold
    small non-negative integer codes (the tables have ``max(x) + 1`` slots).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns; each column's ``.values`` must be a
        1-D array of C ``long``.
    y_name : str
        Name of the integer target column.
    x_name : str
        Name of the integer grouping column (non-negative codes).

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``len(data)``. Rows whose ``x_name`` value
        occurs only once get ``NaN``.

    Raises
    ------
    ValueError
        If the grouping column contains a negative value, or (raised by the
        buffer assignment) if a column's dtype is not C ``long``.
    """
    cdef Py_ssize_t n = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(n, dtype=float)

    cdef cnp.ndarray[long] x = data[x_name].values
    cdef cnp.ndarray[long] y = data[y_name].values

    cdef cnp.ndarray[double] sums
    cdef cnp.ndarray[double] counts
    cdef Py_ssize_t i
    cdef long key
    cdef double others
    cdef double nan = np.nan

    # Empty input: nothing to aggregate (and x.min()/x.max() would raise).
    if n == 0:
        return result

    # Bounds checking and negative-index wraparound are disabled above, so the
    # indices must be validated here; otherwise an out-of-range code would read
    # and write outside the lookup tables.
    if x.min() < 0:
        raise ValueError("target_mean_v7 requires non-negative integer codes in column %r" % x_name)

    # Size the tables from the data instead of assuming exactly 10 categories.
    sums = np.zeros(x.max() + 1)
    counts = np.zeros(x.max() + 1)

    # Pass 1: per-code sum of y and row count.
    for i in range(n):
        key = x[i]
        sums[key] += y[i]
        counts[key] += 1

    # Pass 2: subtract the row's own y and average over the remaining rows.
    for i in range(n):
        key = x[i]
        others = counts[key] - 1
        if others > 0:
            result[i] = (sums[key] - y[i]) / others
        else:
            # Singleton group: no other rows to average over.
            result[i] = nan
    return result

## confirm that the result is correct

In [61]:
# assert_array_equal is still an exact comparison, but it reports which elements
# differ on failure and treats NaNs in matching positions as equal, which a bare
# `(a == b).all()` does not.
np.testing.assert_array_equal(
    target_mean_v7(data, 'y', 'x'),
    target_mean_v2(data, 'y', 'x'),
)

## Check the performance of the solution

In [63]:
%%timeit
# Time the Cython implementation on the same `data` frame for comparison.
target_mean_v7(data, 'y', 'x')

47.4 µs ± 577 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
