In [1]:
import os
# Step up one directory so the relative './scripts/...' paths used by the
# %%writefile cells below resolve against the parent of the notebook folder.
# NOTE(review): chdir is relative, so re-running this cell without restarting
# the kernel moves up one more level each time.
os.chdir('./../')

In [2]:
import numpy as np
import pandas as pd

In [4]:
%%writefile './scripts/categorical_transformer.py'
import numpy as np
import pandas as pd

class CategoricalTransformer:
    """Target-encode a two-level ('A' / 'B') categorical column.

    ``fit`` stores the mean of ``y`` over the rows whose column value is
    ``'A'`` and over the rows whose value is ``'B'``. ``transform`` replaces
    ``'A'`` with the first mean and every other value with the second.

    Parameters
    ----------
    index_of_col : int
        Positional index of the categorical column inside ``x``.
    """

    def __init__(self, index_of_col: int):
        self._index_of_col = index_of_col
        self._a = None  # mean of y over rows where the column == 'A'
        self._b = None  # mean of y over rows where the column == 'B'

    @staticmethod
    def _to_array(x):
        """Return ``x`` as a 2-D NumPy array (DataFrames are unwrapped)."""
        return x.values if isinstance(x, pd.DataFrame) else np.asarray(x)

    def fit(self, x, y):
        """Learn the per-category target means.

        Parameters
        ----------
        x : pandas.DataFrame or array-like, 2-D
        y : array-like, 1-D, aligned positionally with the rows of ``x``

        Returns
        -------
        CategoricalTransformer
            ``self``, so calls can be chained.
        """
        column = self._to_array(x)[:, self._index_of_col]
        # Positional alignment: coerce so lists / Series with any index work.
        target = np.asarray(y)
        self._a = target[column == 'A'].mean()
        self._b = target[column == 'B'].mean()
        return self

    def transform(self, x):
        """Return a copy of ``x`` with the categorical column encoded.

        The input is never modified. Any value other than ``'A'`` is mapped
        to the ``'B'`` mean.
        """
        arr = self._to_array(x)
        if arr.dtype.kind in "US":
            # A fixed-width string array would stringify (and truncate) the
            # encoded floats on assignment; promote to object first.
            out = arr.astype(object)
        else:
            out = arr.copy()
        col = self._index_of_col
        out[:, col] = np.where(out[:, col] == 'A', self._a, self._b)
        return out

    def fit_transform(self, x, y):
        """Fit on ``(x, y)`` and return the encoded copy of ``x``."""
        self.fit(x, y)
        return self.transform(x)

Overwriting ./scripts/categorical_transformer.py


In [5]:
%%writefile './scripts/outlier_remover.py'
import numpy as np

class OutlierRemover:
    """Replace per-column outliers with NaN.

    ``fit`` records, for every column, the band
    ``mean - n_std * std`` .. ``mean + n_std * std``. ``transform`` returns a
    float copy of the data in which values outside that band are NaN.

    Parameters
    ----------
    n_std : float, default 6
        Half-width of the accepted band, in standard deviations.
    """

    def __init__(self, n_std: float = 6):
        self._n_std = n_std
        self._lower_lims = {}  # column index -> lower bound
        self._upper_lims = {}  # column index -> upper bound

    @staticmethod
    def _as_float_array(x, copy=False):
        """Return ``x`` as a floating-point array, keeping float dtypes as-is."""
        arr = np.asarray(x)
        dtype = arr.dtype if np.issubdtype(arr.dtype, np.floating) else float
        if copy:
            return np.array(arr, dtype=dtype, copy=True)
        return np.asarray(arr, dtype=dtype)

    def fit(self, x: np.ndarray, y: np.ndarray = None):
        """Compute the per-column bounds from ``x``.

        ``y`` is accepted for API symmetry and ignored. Existing NaNs are
        skipped when computing the statistics; otherwise a single missing
        value would turn both bounds into NaN and disable the filter for that
        column.

        Returns
        -------
        OutlierRemover
            ``self``, so calls can be chained.
        """
        data = self._as_float_array(x)
        means = np.nanmean(data, axis=0)
        spreads = np.nanstd(data, axis=0)
        # Rebuild from scratch so a refit never keeps stale columns.
        self._lower_lims = dict(enumerate(means - self._n_std * spreads))
        self._upper_lims = dict(enumerate(means + self._n_std * spreads))
        return self

    def transform(self, x: np.ndarray):
        """Return a float copy of ``x`` with out-of-band values set to NaN.

        The input is left untouched. Working on a float copy also keeps NaN
        representable when ``x`` has an integer dtype.
        """
        out = self._as_float_array(x, copy=True)
        for i in range(out.shape[1]):
            column = out[:, i]  # view into ``out``; assignment below writes through
            outside = (column > self._upper_lims[i]) | (column < self._lower_lims[i])
            column[outside] = np.nan
        return out

    def fit_transform(self, x: np.ndarray, y: np.ndarray = None):
        """Fit on ``x`` and return the filtered copy of ``x``."""
        self.fit(x, y)
        return self.transform(x)

Writing ./scripts/outlier_remover.py


In [13]:
%%writefile './scripts/loss.py'
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer

def _balanced_log_loss(y_true: pd.Series, y_pred: pd.Series, **kwargs):
    # Extracting class labels from y_true
    y_true = y_true.astype(int)
    if len(y_pred.shape) == 1:
        y_pred = np.array((1-y_pred, y_pred)).T
    
    # Computing the number of observations for each class
    N0 = np.sum(y_true == 0)
    N1 = np.sum(y_true == 1)
    
    # Calculating the inverse prevalence weights
    w0 = 1 / N0
    w1 = 1 / N1
    
    # Rescaling the predicted probabilities
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    y_pred /= y_pred.sum(axis=1, keepdims=True)
    
    # Calculating the logarithmic loss for each class
    log_loss_0 = np.sum((1-y_true) * np.log(y_pred[:, 0])) / N0
    log_loss_1 = np.sum(y_true * np.log(y_pred[:, 1])) / N1
    
    # Computing the balanced logarithmic loss
    balanced_log_loss = (-w0 * log_loss_0 - w1 * log_loss_1)/(w0+w1)
    
    return balanced_log_loss

def get_bal_log_loss():
    """Build a scikit-learn scorer for the balanced log loss.

    The scorer feeds ``predict_proba`` output to ``_balanced_log_loss`` and is
    flagged ``greater_is_better=False`` so scikit-learn negates the value
    (lower loss = better model).

    Returns
    -------
    callable
        The object returned by ``sklearn.metrics.make_scorer``.
    """
    import inspect  # local import: only needed for the signature probe below

    scorer_kwargs = {"greater_is_better": False}
    # ``needs_proba`` was deprecated in scikit-learn 1.4 in favour of
    # ``response_method``. Where ``needs_proba`` is no longer a named
    # parameter it would fall into make_scorer's **kwargs, be forwarded to the
    # metric (which ignores extra kwargs) and the scorer would silently use
    # hard ``predict`` labels. Probe the signature and pick the right spelling.
    if "response_method" in inspect.signature(make_scorer).parameters:
        scorer_kwargs["response_method"] = "predict_proba"
    else:
        scorer_kwargs["needs_proba"] = True
    return make_scorer(_balanced_log_loss, **scorer_kwargs)

Overwriting ./scripts/loss.py
