### competition metric

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import metrics
from tqdm import tqdm

In [3]:
def metric(df, preds, verbose=True):
    
    if verbose:
        iterator = lambda x: tqdm(x)
    else:
        iterator = list
        
    df["prediction"] = list(preds)
    maes = []
    for t in iterator(df.type.unique()):
        y_true = df[df.type==t].scalar_coupling_constant.values
        y_pred = df[df.type==t].prediction.values
        mae = np.log(metrics.mean_absolute_error(y_true, y_pred))
        maes.append(mae)
    return np.mean(maes)

In [4]:
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Fast metric computation for this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric
    """
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [5]:
def mean_log_mae(y_true, y_pred, types, verbose=True):
    if verbose:
        iterator = lambda x: tqdm(x)
    else:
        iterator = list
    
    per_type_data = {
        t : {
            'true': [],
            'pred': []
        } 
        for t in list(set(types))
    }
    for true, pred, t in iterator(zip(y_true, y_pred, types)):
        per_type_data[t]['true'].append(true)
        per_type_data[t]['pred'].append(pred)
        
    maes = []
    for t in iterator(set(types)):
        maes.append(np.log(metrics.mean_absolute_error(per_type_data[t]['true'], per_type_data[t]['pred'])))
        
    return np.mean(maes)
        

In [6]:
def speedup_mean_log_mae(y_true, y_pred, types, verbose=True):
    if verbose:
        iterator = lambda x: tqdm(x)
    else:
        iterator = list
    
    per_type_data = {
        t : {
            'true': [],
            'pred': []
        } 
        for t in list(set(types))
    }
    for true, pred, t in iterator(zip(y_true, y_pred, types)):
        per_type_data[t]['true'].append(true)
        per_type_data[t]['pred'].append(pred)
        
    maes = []
    for t in iterator(set(types)):
        maes.append(
            jit_log_mae(
                np.array(per_type_data[t]['true'], dtype=np.float32),
                np.array(per_type_data[t]['pred'], dtype=np.float32)
            )
        )
        
    return np.mean(maes)
        

In [7]:
from numba import jit, float32

In [8]:
@jit(float32(float32[:], float32[:]))
def jit_log_mae(y_true: np.ndarray, y_pred: np.ndarray):
    n = y_true.shape[0]
    return np.log(np.sum(np.absolute(y_true - y_pred))/n)

In [9]:
@jit
def jit_mean_log_mae(y_true, y_pred, types):
    
    uniq_types: np.ndarray = np.unique(types)
    
    per_type_data = dict()
    for t in uniq_types:
        per_type_data[t] = {
            'true': [],
            'pred': []
        }
    
    for true, pred, t in zip(y_true, y_pred, types):
        per_type_data[t]['true'].append(true)
        per_type_data[t]['pred'].append(pred)
        
    maes = []
    for t in uniq_types:
        maes.append(jit_log_mae(np.array(per_type_data[t]['true'], dtype=np.float32), np.array(per_type_data[t]['pred'], dtype=np.float32)))
        
    return np.mean(maes)
        

In [11]:
train = pd.read_csv("../data/raw/train.csv")

In [12]:
train = train.sample(frac=1000/(len(train))).reset_index(drop=True)

In [13]:
jit_mean_log_mae(train.scalar_coupling_constant, np.zeros(len(train)), train.type)

Compilation is falling back to object mode WITH looplifting enabled because Function "jit_mean_log_mae" failed type inference due to: non-precise type pyobject
[1] During: typing of argument at <ipython-input-9-a9f264e3121e> (4)

File "<ipython-input-9-a9f264e3121e>", line 4:
def jit_mean_log_mae(y_true, y_pred, types):
    <source elided>
    
    uniq_types: np.ndarray = np.unique(types)
    ^

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "jit_mean_log_mae" failed type inference due to: cannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>

File "<ipython-input-9-a9f264e3121e>", line 7:
def jit_mean_log_mae(y_true, y_pred, types):
    <source elided>
    per_type_data = dict()
    for t in uniq_types:
    ^

  @jit

File "<ipython-input-9-a9f264e3121e>", line 2:
@jit
def jit_mean_log_mae(y_true, y_pred, types):
^

  self.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has b

1.9612855417653918

In [14]:
zeros = np.zeros(len(train))

In [19]:
%%timeit -n 1000
mean_log_mae(train.scalar_coupling_constant.values, zeros, train.type.values, verbose=False)

1.42 ms ± 5.94 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
%%timeit -n 1000
group_mean_log_mae(train.scalar_coupling_constant, zeros, train.type)

1.86 ms ± 21.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%%timeit -n 1000
speedup_mean_log_mae(train.scalar_coupling_constant.values, zeros, train.type.values, verbose=False)

610 µs ± 669 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit -n 1000
jit_mean_log_mae(train.scalar_coupling_constant.values, zeros, train.type.values)

Compilation is falling back to object mode WITH looplifting enabled because Function "jit_mean_log_mae" failed type inference due to: non-precise type pyobject
[1] During: typing of argument at <ipython-input-7-a9f264e3121e> (4)

File "<ipython-input-7-a9f264e3121e>", line 4:
def jit_mean_log_mae(y_true, y_pred, types):
    <source elided>
    
    uniq_types: np.ndarray = np.unique(types)
    ^

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "jit_mean_log_mae" failed type inference due to: cannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>

File "<ipython-input-7-a9f264e3121e>", line 7:
def jit_mean_log_mae(y_true, y_pred, types):
    <source elided>
    per_type_data = dict()
    for t in uniq_types:
    ^

  @jit

File "<ipython-input-7-a9f264e3121e>", line 2:
@jit
def jit_mean_log_mae(y_true, y_pred, types):
^

  self.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has b

11.6 ms ± 149 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
%%timeit -n 1000
metric(train, np.zeros(len(train)), verbose=False)

20.5 ms ± 149 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
