# In [1]:
def score_stat_ci(
    y_true,
    y_preds,
    score_fun,
    stat_fun=np.mean,
    n_bootstraps=2000,
    confidence_level=0.95,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute a bootstrap percentile confidence interval for a statistic of a score.

    Each bootstrap iteration resamples (with replacement) both the rows of
    y_preds ("readers") and the sample indices, evaluates score_fun for every
    resampled reader on the resampled samples, and reduces the per-reader
    scores with stat_fun. The interval bounds are read from the sorted
    bootstrap statistics.

    Parameters:
        y_true: 1D list or array of labels.
        y_preds: A list of lists or 2D array of predictions corresponding to elements in y_true.
            A 1D input is treated as a single reader.
        score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
        stat_fun: Statistic for which confidence interval is computed. (e.g. np.mean)
        n_bootstraps: The number of bootstraps. (default: 2000)
        confidence_level: Confidence level for computing confidence interval. (default: 0.95)
        seed: Random seed for reproducibility. (default: None)
        reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we need at least one positive and one negative sample. (default: True)

        return: numpy array [lower confidence bound, upper confidence bound]

    Raises:
        ValueError: if y_true is empty, if a row of y_preds does not have the
            same length as y_true, or if no bootstrap sample was accepted.
    """
    y_true = np.asarray(y_true)
    y_preds = np.atleast_2d(y_preds)

    n_samples = len(y_true)
    n_readers = len(y_preds)
    if n_samples == 0:
        raise ValueError("y_true must contain at least one sample")
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if any(len(row) != n_samples for row in y_preds):
        raise ValueError("every row of y_preds must have the same length as y_true")

    # A private RandomState draws the same stream as np.random.seed(seed)
    # followed by np.random.randint, without touching the global RNG state.
    rng = np.random.RandomState(seed)

    scores = []
    for _ in range(n_bootstraps):
        readers = rng.randint(0, n_readers, n_readers)
        indices = rng.randint(0, n_samples, n_samples)
        y_boot = y_true[indices]
        if reject_one_class_samples and len(np.unique(y_boot)) < 2:
            continue
        reader_scores = [score_fun(y_boot, y_preds[r][indices]) for r in readers]
        scores.append(stat_fun(reader_scores))

    if not scores:
        raise ValueError(
            "no valid bootstrap samples were produced; "
            "increase n_bootstraps or check the label distribution"
        )

    sorted_scores = np.sort(scores)
    n_scores = len(sorted_scores)
    alpha = (1.0 - confidence_level) / 2.0

    # round((1 - alpha) * n) can equal n (e.g. n = 10, alpha = 0.025), which
    # would index one past the end; clamp both indices into [0, n - 1].
    lower_idx = min(max(int(round(alpha * n_scores)), 0), n_scores - 1)
    upper_idx = min(max(int(round((1.0 - alpha) * n_scores)), 0), n_scores - 1)

    return np.array([sorted_scores[lower_idx], sorted_scores[upper_idx]])

def pvalue(
    y_true,
    y_preds1,
    y_preds2,
    score_fun,
    stat_fun=np.mean,
    n_bootstraps=2000,
    seed=None,
    two_tailed=True,
    reject_one_class_samples=True,
):
    """
    Compute a bootstrap z-test p-value for the difference between the scores of
    model I and model II predictions.

    Both models are scored with score_fun on the same bootstrap resamples of
    the data. The test statistic is
        Z = (stat_fun(scores I) - stat_fun(scores II)) / std(scores I - scores II)
    where std is the sample standard deviation (ddof=1) of the paired
    bootstrap differences, and the p-value is read from the standard normal
    distribution.

    Parameters:
        y_true: 1D list or array of labels.
        y_preds1: A 1D array of predictions for model I corresponding to elements in y_true.
        y_preds2: A 1D array of predictions for model II corresponding to elements in y_true.
        score_fun: Score function that is compared between the models. (e.g. sklearn.metrics.roc_auc_score)
        stat_fun: Statistic for which p-value is computed. (default : mean)
        n_bootstraps: The number of bootstraps. (default: 2000)
        seed: Random seed for reproducibility. (default: None)
        two_tailed: Whether to use two-tailed test. If False, the one-tailed
            p-value for "model I scores higher than model II" is returned. (default: True)
        reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we need at least one positive and one negative sample. (default: True)

        return: Computed p-value (float in [0, 1]).

    Raises:
        ValueError: if y_true is empty, if the prediction arrays do not have the
            same length as y_true, or if fewer than two bootstrap samples were
            accepted.
    """
    # Convert up front so list inputs support the fancy indexing used below.
    y_true = np.asarray(y_true)
    y_preds1 = np.asarray(y_preds1)
    y_preds2 = np.asarray(y_preds2)

    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y_true must contain at least one sample")
    if len(y_preds1) != n_samples or len(y_preds2) != n_samples:
        raise ValueError("y_preds1 and y_preds2 must have the same length as y_true")

    # Private generator: reproducible for a given seed without reseeding the
    # global numpy RNG.
    rng = np.random.RandomState(seed)

    scores1 = []
    scores2 = []
    for _ in range(n_bootstraps):
        indices = rng.randint(0, n_samples, n_samples)
        y_boot = y_true[indices]
        if reject_one_class_samples and len(np.unique(y_boot)) < 2:
            continue
        # Same resample for both models, so the differences are paired.
        scores1.append(score_fun(y_boot, y_preds1[indices]))
        scores2.append(score_fun(y_boot, y_preds2[indices]))

    if len(scores1) < 2:
        raise ValueError(
            "at least two valid bootstrap samples are required; "
            "increase n_bootstraps or check the label distribution"
        )

    diff = stat_fun(scores1) - stat_fun(scores2)
    spread = np.std(np.subtract(scores1, scores2), ddof=1)

    if spread == 0:
        # Every resample produced the same difference: either no difference at
        # all (Z = 0) or a perfectly consistent one (Z = +/- infinity).
        if diff == 0:
            z = 0.0
        else:
            z = np.inf if diff > 0 else -np.inf
    else:
        z = diff / spread

    # The survival function gives the tail probability; norm.cdf(|Z|) would
    # grow towards 1 as the evidence gets stronger, i.e. it is not a p-value.
    if two_tailed:
        return float(2.0 * norm.sf(abs(z)))
    return float(norm.sf(z))