### competition metric

In [82]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import metrics
from tqdm import tqdm

In [54]:
def metric(df, preds, verbose=True):
    
    if verbose:
        iterator = lambda x: tqdm(x)
    else:
        iterator = list
        
    df["prediction"] = list(preds)
    maes = []
    for t in iterator(df.type.unique()):
        y_true = df[df.type==t].scalar_coupling_constant.values
        y_pred = df[df.type==t].prediction.values
        mae = np.log(metrics.mean_absolute_error(y_true, y_pred))
        maes.append(mae)
    return np.mean(maes)

In [94]:
def mean_log_mae(y_true, y_pred, types, verbose=True):
    if verbose:
        iterator = lambda x: tqdm(x)
    else:
        iterator = list
    
    per_type_data = {
        t : {
            'true': [],
            'pred': []
        } 
        for t in list(set(types))
    }
    for true, pred, t in iterator(zip(y_true, y_pred, types)):
        per_type_data[t]['true'].append(true)
        per_type_data[t]['pred'].append(pred)
        
    maes = []
    for t in iterator(set(types)):
        maes.append(np.log(metrics.mean_absolute_error(per_type_data[t]['true'], per_type_data[t]['pred'])))
        
    return np.mean(maes)
        

In [133]:
def speedup_mean_log_mae(y_true, y_pred, types, verbose=True):
    if verbose:
        iterator = lambda x: tqdm(x)
    else:
        iterator = list
    
    per_type_data = {
        t : {
            'true': [],
            'pred': []
        } 
        for t in list(set(types))
    }
    for true, pred, t in iterator(zip(y_true, y_pred, types)):
        per_type_data[t]['true'].append(true)
        per_type_data[t]['pred'].append(pred)
        
    maes = []
    for t in iterator(set(types)):
        maes.append(
            jit_log_mae(
                np.array(per_type_data[t]['true'], dtype=np.float32),
                np.array(per_type_data[t]['pred'], dtype=np.float32)
            )
        )
        
    return np.mean(maes)
        

In [125]:
from numba import jit, float32

In [126]:
@jit(float32(float32[:], float32[:]))
def jit_log_mae(y_true: np.ndarray, y_pred: np.ndarray):
    n = y_true.shape[0]
    return np.log(np.sum(np.absolute(y_true - y_pred))/n)

In [127]:
@jit
def jit_mean_log_mae(y_true, y_pred, types):
    
    uniq_types: np.ndarray = np.unique(types)
    
    per_type_data = {}
    for t in uniq_types:
        per_type_data[t] = {
            'true': [],
            'pred': []
        }
    
    for true, pred, t in zip(y_true, y_pred, types):
        per_type_data[t]['true'].append(true)
        per_type_data[t]['pred'].append(pred)
        
    maes = []
    for t in uniq_types:
        maes.append(jit_log_mae(np.array(per_type_data[t]['true'], dtype=np.float32), np.array(per_type_data[t]['pred'], dtype=np.float32)))
        
    return np.mean(maes)
        

In [105]:
train = pd.read_csv("../data/train.csv")

In [106]:
train = train.sample(frac=10000/(len(train))).reset_index(drop=True)

In [129]:
jit_mean_log_mae(train.scalar_coupling_constant, np.zeros(len(train)), train.type)

2.0083879344165325

In [99]:
zeros = np.zeros(len(train))

In [135]:
%%timeit -n 1000
mean_log_mae(train.scalar_coupling_constant.values, zeros, train.type.values, verbose=False)

1.78 ms ± 6.33 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [136]:
%%timeit -n 1000
speedup_mean_log_mae(train.scalar_coupling_constant.values, zeros, train.type.values, verbose=False)

956 µs ± 10.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [137]:
%%timeit -n 1000
jit_mean_log_mae(train.scalar_coupling_constant.values, zeros, train.type.values)

5.67 ms ± 40 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [138]:
%%timeit -n 1000
metric(train, np.zeros(len(train)), verbose=False)

18.8 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
