In [1]:
# Disable the IPython pager
# https://gist.github.com/minrk/7715212
from IPython.core import page
page.page = print

In [2]:
# enable the %lprun magic
%load_ext line_profiler

# 05 - Performance (solutions)

This notebook contains sample solutions to exercises from [exercises/05_performance.ipynb](../exercises/05_performance.ipynb)

Setup (the same as in the exercise notebook)

In [3]:
import math
import numpy as np
import numba 

In [4]:
# Length of the random vectors X and Y used to benchmark the RMSE functions
N = 1_000_000

In [5]:
# Two vectors of N standard-normal samples; no seed is set in the visible cells,
# so the actual values differ between kernel runs.
X = np.random.randn(N)
Y = np.random.randn(N)

In [6]:
@numba.njit
def rmse_numba(x, y):
    """Return the root-mean-square error between ``x`` and ``y``.

    The computation is written as whole-array expressions (no explicit loop)
    and compiled with ``numba.njit``.

    Args:
        x: First array of values.
        y: Second array of values, subtracted element-wise from ``x``.

    Returns:
        Square root of the mean of the squared element-wise differences,
        where the mean divides by ``len(x)``.
    """
    squared_diff = (x - y) ** 2
    mean_squared = squared_diff.sum() / len(x)
    return math.sqrt(mean_squared)

## Parallel execution

Using [`parallel=True`](https://numba.pydata.org/numba-doc/dev/user/parallel.html) in `@njit` can let some operations run in parallel and better utilize the CPU.

In [7]:
@numba.njit(parallel=True)
def rmse_numba_parallel(x, y):
    """Root-mean-square error between ``x`` and ``y``, compiled with ``parallel=True``.

    The body performs the same array arithmetic as ``rmse_numba``; only the
    decorator option differs.

    Args:
        x: First array of values.
        y: Second array of values, subtracted element-wise from ``x``.

    Returns:
        Square root of the mean of the squared element-wise differences,
        where the mean divides by ``len(x)``.
    """
    squared_diff = (x - y) ** 2
    mean_squared = squared_diff.sum() / len(x)
    return math.sqrt(mean_squared)

In [8]:
%timeit rmse_numba(X, Y)

8.74 ms ± 509 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%timeit rmse_numba_parallel(X, Y)

1.03 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Profile and optimize `calc_lift()`

In [10]:
def calc_lift(values, weights):
    """Accumulate a lift score from running shares of ``values`` and ``weights``.

    Walks the two inputs in lockstep. At each step the running total of each
    input is divided by that input's overall sum, and twice the difference of
    the two shares, divided by the total weight, is added to the result.

    This is the baseline implementation that the notebook profiles and then
    optimizes; the code is left exactly as given.

    Args:
        values: Iterable that also provides ``.sum()`` (e.g. a NumPy array).
        weights: Iterable that also provides ``.sum()``, paired element-wise
            with ``values`` via ``zip``.

    Returns:
        The accumulated lift; ``0`` if the inputs are empty.
    """
    # Running totals and their shares of the respective overall sums.
    cm_value = 0
    cm_value_pct = 0
    cm_weight = 0
    cm_weight_pct = 0
    
    lift = 0
    
    for value,weight in zip(values, weights):
        cm_value += value
        # values.sum() is loop-invariant but recomputed on every iteration.
        cm_value_pct = cm_value / values.sum()
        cm_weight += weight
        # weights.sum() is likewise recomputed here and again in the line below.
        cm_weight_pct = cm_weight / weights.sum()
        
        lift += 2 * (cm_value_pct - cm_weight_pct) / weights.sum()
        
    return lift

In [11]:
# Length of the V and W vectors used to benchmark the lift implementations
M = 10_000

In [12]:
# V: M standard-normal sample values; W: uniform weights (all ones)
V = np.random.randn(M)
W = np.ones(M)

In [13]:
%timeit calc_lift(V, W)

888 ms ± 301 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%lprun -f calc_lift calc_lift(V, W)

Timer unit: 1e-07 s

Total time: 1.3794 s
File: <ipython-input-10-115a3bb8d352>
Function: calc_lift at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def calc_lift(values, weights):
     2         1         34.0     34.0      0.0      cm_value = 0
     3         1         18.0     18.0      0.0      cm_value_pct = 0
     4         1         15.0     15.0      0.0      cm_weight = 0
     5         1         15.0     15.0      0.0      cm_weight_pct = 0
     6                                               
     7         1         15.0     15.0      0.0      lift = 0
     8                                               
     9     10001     433167.0     43.3      3.1      for value,weight in zip(values, weights):
    10     10000     281699.0     28.2      2.0          cm_value += value
    11     10000    4111433.0    411.1     29.8          cm_value_pct = cm_value / values.sum()
    12     10000     434137.0     4

As the easiest option, let's just add `@numba.njit`

In [15]:
@numba.njit
def lift_numba(values, weights):
    """Loop-based lift computation compiled with ``numba.njit``.

    The body is the same statement-for-statement loop as ``calc_lift``; only
    the decorator is added. At each step the running total of each input is
    divided by that input's overall sum, and twice the difference of the two
    shares, divided by the total weight, is added to the result.

    Args:
        values: Array of values; must provide ``.sum()``.
        weights: Array of weights paired element-wise with ``values``.

    Returns:
        The accumulated lift.
    """
    # Running totals and their shares of the respective overall sums.
    cm_value = 0
    cm_value_pct = 0
    cm_weight = 0
    cm_weight_pct = 0
    
    lift = 0
    
    for value,weight in zip(values, weights):
        cm_value += value
        # The full-array sums are still written inside the loop body, as in calc_lift.
        cm_value_pct = cm_value / values.sum()
        cm_weight += weight
        cm_weight_pct = cm_weight / weights.sum()
        
        lift += 2 * (cm_value_pct - cm_weight_pct) / weights.sum()
        
    return lift

In [16]:
%timeit lift_numba(V, W)

440 ms ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


We can achieve better results (and with less code) by using functionality already available in NumPy:
* vectorized operations and
* cumulative sum ([`cumsum()`](https://docs.scipy.org/doc/numpy/reference/generated/numpy.cumsum.html))

In [17]:
def lift_numpy(values, weights):
    """Compute the lift score with vectorized array operations.

    Each input is normalized by its own sum and turned into a running share
    with ``cumsum()``. Twice the difference of the two running shares, divided
    by the total weight, is then summed over all positions.

    Args:
        values: Array of values; must support ``.sum()`` and element-wise
            division.
        weights: Array of weights aligned element-wise with ``values``.

    Returns:
        The summed lift as a scalar.
    """
    total_weight = weights.sum()
    value_share = (values / values.sum()).cumsum()
    weight_share = (weights / total_weight).cumsum()
    per_step = 2 * (value_share - weight_share) / total_weight
    return per_step.sum()

In [18]:
%timeit lift_numpy(V, W)

528 µs ± 118 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


As a final experiment, let's try to JIT that

In [19]:
@numba.njit
def lift_numpy_numba(values, weights):
    """Vectorized lift computation compiled with ``numba.njit``.

    Each input is normalized by its own sum and turned into a running share
    with ``cumsum()``. Twice the difference of the two running shares, divided
    by the total weight, is then summed over all positions.

    Args:
        values: Array of values.
        weights: Array of weights aligned element-wise with ``values``.

    Returns:
        The summed lift as a scalar.
    """
    total_weight = weights.sum()
    value_share = (values / values.sum()).cumsum()
    weight_share = (weights / total_weight).cumsum()
    per_step = 2 * (value_share - weight_share) / total_weight
    return per_step.sum()

In [20]:
%timeit lift_numpy_numba(V, W)

252 µs ± 51.2 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
