From feeead419f9d375e79a2f4e6883473652d0fae14 Mon Sep 17 00:00:00 2001
From: "Turkunov Y." <55660526+turkunov@users.noreply.github.com>
Date: Sat, 18 Jan 2025 11:56:57 +0300
Subject: [PATCH 1/3] added q94 ROC AUC

Implementation of ROC Area Under Curve, a metric used for measuring the
predictive quality of a binary classifier.

Input: true labels and predicted probabilities
Output: ROC AUC rounded to 5 decimal places
---
 Problems/94_roc_auc/learn.md    | 64 +++++++++++++++++++++++
 Problems/94_roc_auc/solution.py | 90 +++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 Problems/94_roc_auc/learn.md
 create mode 100644 Problems/94_roc_auc/solution.py

diff --git a/Problems/94_roc_auc/learn.md b/Problems/94_roc_auc/learn.md
new file mode 100644
index 00000000..84c518e4
--- /dev/null
+++ b/Problems/94_roc_auc/learn.md
@@ -0,0 +1,64 @@
+## Overview
+ROC-AUC is a metric for measuring the predictive quality of a binary classifier, with the highest possible value being $1$ and the lowest being $0$.
+
+## $TPR$ and $FPR$
+Consider the simplest case, where we have true binary labels $y_i\in\{0, 1\}$ and the model's predicted labels $\hat{y_i}\in\{0, 1\}$. We call an example labeled $1$ positive and an example labeled $0$ negative. From the $(y_i, \hat{y_i})$ pairs we build a set $Y$, from which we then compute the statistics $TP$ (True Positive), $TN$ (True Negative), $FP$ (False Positive) and $FN$ (False Negative):
+
+| Total population = P + N | Predicted positive (PP) | Predicted negative (PN) |
+|-----------------------------------|----------------------------------------------------|-------------------------------------------------------|
+| **Actual positive (P)** | $TP=\#\{Y\|y_i=\hat{y_i}=1\}$ | $FN=\#\{Y\|y_i=1;\hat{y_i}=0\}$ (also called type II error) |
+| **Actual negative (N)** | $FP=\#\{Y\|y_i=0;\hat{y_i}=1\}$ (also called type I error) | $TN=\#\{Y\|y_i=\hat{y_i}=0\}$ |
+
+This table, also known as the **confusion matrix**, provides an overview of the model's performance on this particular task. With the help of these statistics we can calculate the following rates:
+$$
+TPR=\frac{TP}{TP+FN}\quad(\text{also called recall}) \\
+
+FPR=\frac{FP}{FP+TN}
+$$
+
+Intuitively, **TPR** measures the model's sensitivity to positive cases, i.e. those where the true label is $y_i=1$. In some tasks, for example credit scoring or cancer detection, we may even neglect other metrics in favor of recall, since any $FN$ case can turn out to be a very costly mistake. **FPR**, on the other hand, shows how biased we are towards positive predictions at the expense of the $y_i=0$ cases.
+
+## Threshold
+Now recall that we assumed a vector of predicted labels $\hat{y_i}\in\{0, 1\}$. The model itself, however, does not directly output $0$ or $1$. Instead, we take the probability $z_i$ it produces and compare it with an empirically chosen threshold $t$. For example, for a chosen $t=0.7$ we would have the following decision rule:
+$$
+\hat{y_i}=\begin{cases} 1, & \text{if } z_i\gt 0.7 \\ 0, & \text{otherwise} \end{cases}
+$$
+
+With this idea in mind, we can see that for every $t$ our estimates of $TPR$ and $FPR$ change as well, so we can denote them as $TPR(t)$ and $FPR(t)$. But we also want our model to be robust and not depend on which threshold we choose. That is why, when we need to measure the quality of a model, we often look at the **ROC** curve $TPR(FPR \mid t)$, which shows how $TPR$ and $FPR$ behave under varying thresholds.
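+
+As a quick illustration of the decision rule and the resulting $TPR(t)$ and $FPR(t)$ for a single threshold, here is a minimal NumPy sketch (illustrative only; the helper name is made up for this example, and both classes are assumed to be present):
+
+```python
+import numpy as np
+
+def rates_at_threshold(y_true, probas, t):
+    """Return (TPR, FPR) under the rule: predict 1 whenever the probability reaches t."""
+    y_true = np.asarray(y_true)
+    y_pred = (np.asarray(probas) >= t).astype(int)  # ties at t counted as positive
+
+    tp = np.sum((y_true == 1) & (y_pred == 1))
+    fn = np.sum((y_true == 1) & (y_pred == 0))
+    fp = np.sum((y_true == 0) & (y_pred == 1))
+    tn = np.sum((y_true == 0) & (y_pred == 0))
+
+    return float(tp / (tp + fn)), float(fp / (fp + tn))
+
+print(rates_at_threshold([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8], t=0.7))  # (0.5, 0.0)
+```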
+
+## ROC curve
+Each point of this curve is obtained via the following algorithm:
+$$
+
+
+\begin{array}{l}
+\textbf{Input}: y\_true, y\_proba \text{ (true labels and output probabilities)} \\
+\textbf{Output: } \text{points} \text{ (a set of (x, y) coordinates)} \\
+\text{\textbf{function} roc\_points}(y\_true, y\_proba): \\
+\quad thresholds \leftarrow y\_proba \cup \{0\} \\
+\quad points \leftarrow [\quad ] \\
+\quad \textbf{for } t\in thresholds \text{ in decreasing order} \textbf{ do}: \\
+\quad \quad y \leftarrow TPR(t) \\
+\quad \quad x \leftarrow FPR(t) \\
+\quad \quad \text{points.append}((x, y)) \\
+\quad \textbf{end for} \\
+\quad \textbf{return } points \\
+\textbf{end function}
+\end{array}
+$$
+
+Both coordinates of the ROC curve stay within $[0, 1]$. To see why, first consider the threshold $t=1$. Then no prediction can be assigned the label $1$, so every $\hat{y_i}=0$. Therefore $TP=0\implies TPR=0$ and $FP=0\implies FPR=0$ (all negative examples receive the correct label). On the other hand, if $t=0$, every prediction is assigned the label $1$, so $FN = 0\implies TPR=\frac{TP}{TP}=1$ and $TN = 0 \implies FPR=\frac{FP}{FP}=1$.
+
+The best-case scenario is when the sensitivity is high across thresholds while the bias stays low ($FPR$ does not change or stays around $0$, and $TPR$ is always high). The worst-case scenario is a random model, whose curve follows the $FPR=TPR$ diagonal.
+
+## ROC-AUC
+If you compare the two ROC curves described above, you can see that the area underneath the first one is larger than underneath the second. This is why we usually calculate **ROC-AUC**, the area under the ROC curve. You might think that the larger the AUC, the better the model, but this is in fact a common misconception.
+
+Suppose you have to choose between model #1 with $AUC_{ROC}=0.6$ and model #2 with $AUC_{ROC}=0.3$. The correct answer is actually #2: we can always invert its decision rule in favor of the ROC-AUC, and the $AUC_{ROC}$ of model #2 then becomes $0.7$. Therefore, when looking at ROC-AUC, we should consider how large the **absolute** difference is between $0.5$ (the area of a random model, i.e. the worst case) and the value our model achieves.
+
+## Calculating AUC
+There are various ways of calculating the area under a curve. The most common one, which is also used in scikit-learn, is the trapezoidal rule:
+$$
+\int f(x)\,dx \approx \sum_i\frac{1}{2}\,\Delta x_i\,(f(x_i)+f(x_{i-1})),
+$$
+
+where $\Delta x_i=x_i-x_{i-1}$. This method breaks the total area under the curve into a sum of $90^\circ$-rotated trapezoids, one for every pair of neighbouring points on the curve.
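+
+As a rough sketch of how the trapezoidal rule turns a list of ROC points into an area (illustrative only; the helper name is made up and the points are assumed to be sorted by $FPR$):
+
+```python
+def trapezoid_auc(fpr, tpr):
+    """Area under the piecewise-linear curve through the (fpr[i], tpr[i]) points."""
+    auc = 0.0
+    for i in range(1, len(fpr)):
+        dx = fpr[i] - fpr[i - 1]                 # width of the i-th trapezoid
+        auc += 0.5 * dx * (tpr[i] + tpr[i - 1])  # 0.5 * width * (sum of parallel sides)
+    return auc
+
+print(trapezoid_auc([0, 1], [0, 1]))        # 0.5 -> the random-model diagonal
+print(trapezoid_auc([0, 0, 1], [0, 1, 1]))  # 1.0 -> a perfect classifier
+```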
\ No newline at end of file
diff --git a/Problems/94_roc_auc/solution.py b/Problems/94_roc_auc/solution.py
new file mode 100644
index 00000000..1dd1f841
--- /dev/null
+++ b/Problems/94_roc_auc/solution.py
@@ -0,0 +1,90 @@
+import numpy as np
+
+
+def roc_auc(y_true: list[int], probas: list[float]) -> float:
+    """
+    Parameters
+    ----------
+    y_true : list[int]
+        True binary labels (0 or 1)
+    probas : list[float]
+        Output probabilities of our binary classifier
+
+    Returns
+    -------
+    auc : float
+        ROC AUC rounded to 5 decimal places
+    """
+    # Candidate thresholds: every predicted probability plus 0, in decreasing order
+    thresh = sorted(probas + [0], reverse=True)
+    y_true, probas = np.array(y_true), np.array(probas)
+
+    # The ROC curve starts at (FPR, TPR) = (0, 0)
+    fpr, tpr = [0], [0]
+    auc = 0
+
+    for t in thresh:
+        y_pred = np.where(probas < t, 0, 1)  # predict 1 whenever the probability reaches t
+        tp = ((y_true == 1) & (y_pred == 1)).sum()
+        fn = (y_true == 1).sum() - tp
+
+        fp = (y_pred == 1).sum() - tp
+        tn = (y_true == 0).sum() - fp
+
+        fpr.append(fp / (fp + tn))
+        tpr.append(tp / (tp + fn))
+
+        # Twice the area of the trapezoid between the last two ROC points
+        auc += (fpr[-1] - fpr[-2]) * (tpr[-1] + tpr[-2])
+
+    return round(1/2 * auc, 5)
+
+
+def test_roc_auc():
+    # Test 1
+    y = [0, 0, 1, 1]
+    y_proba = [0.1, 0.4, 0.35, 0.8]
+    assert roc_auc(y, y_proba) == 0.75, 'Test case 1 failed'
+
+    # Test 2
+    y = [1, 1, 1, 0, 1, 0, 0, 0, 1, 1]
+    y_proba = [
+        0.9945685360621648,
+        0.9937332904188113,
+        0.9958526266087151,
+        4.391062222999706e-09,
+        0.9959272720187046,
+        0.10851446498385146,
+        0.001096202856869512,
+        4.995474609174945e-06,
+        0.9921605697799972,
+        0.9826790537446354
+    ]
+    assert roc_auc(y, y_proba) == 1.0, 'Test case 2 failed'
+
+    # Test 3
+    y = [0, 0, 0, 0, 0, 1, 1, 1, 0, 1]
+    y_proba = [
+        0.8318040739657637,
+        0.421445304232661,
+        0.003309769194418868,
+        0.015529393142531172,
+        0.0001635684705459328,
+        0.6988867797464966,
+        0.9534132112895218,
+        0.8471417487716292,
+        0.0005832121647006822,
+        0.9990059733653113
+    ]
+    assert roc_auc(y, y_proba) == 0.95833, 'Test case 3 failed'
+
+    # Test 4
+    y = [0, 0, 1, 1, 1, 0, 1]
+    y_proba = [
+        8.99e-1, 9.95e-1, 5e-3,
+        2.3e-4, 1e-4, 9e-1, 2.1e-4
+    ]
+    assert roc_auc(y, y_proba) == 0.0, 'Test case 4 failed'
+
+    print('All tests passed')
+
+
+if __name__ == '__main__':
+    test_roc_auc()
\ No newline at end of file

From f036168d3a8c85373f4d71a27004a7ff5d637a87 Mon Sep 17 00:00:00 2001
From: "Turkunov Y." <55660526+turkunov@users.noreply.github.com>
Date: Sat, 18 Jan 2025 12:01:00 +0300
Subject: [PATCH 2/3] renamed dir 94_roc_auc -> roc_auc

---
 Problems/{94_roc_auc => roc_auc}/learn.md    | 0
 Problems/{94_roc_auc => roc_auc}/solution.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename Problems/{94_roc_auc => roc_auc}/learn.md (100%)
 rename Problems/{94_roc_auc => roc_auc}/solution.py (100%)

diff --git a/Problems/94_roc_auc/learn.md b/Problems/roc_auc/learn.md
similarity index 100%
rename from Problems/94_roc_auc/learn.md
rename to Problems/roc_auc/learn.md
diff --git a/Problems/94_roc_auc/solution.py b/Problems/roc_auc/solution.py
similarity index 100%
rename from Problems/94_roc_auc/solution.py
rename to Problems/roc_auc/solution.py
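
For reference, the first test case in `solution.py` can be traced by hand with the same threshold sweep the solution uses; the snippet below is a self-contained sketch (it assumes both classes appear in `y_true`):

```python
import numpy as np

y_true = np.array([0, 0, 1, 1])
probas = np.array([0.1, 0.4, 0.35, 0.8])

points = [(0.0, 0.0)]                      # the ROC curve starts at (FPR, TPR) = (0, 0)
for t in sorted(probas.tolist() + [0], reverse=True):
    y_pred = (probas >= t).astype(int)     # label 1 whenever the probability reaches t
    tp = ((y_true == 1) & (y_pred == 1)).sum()
    fp = ((y_true == 0) & (y_pred == 1)).sum()
    points.append((float(fp / (y_true == 0).sum()), float(tp / (y_true == 1).sum())))

print(points)
# (0,0) -> (0,0.5) -> (0.5,0.5) -> (0.5,1) -> (1,1) -> (1,1)
# trapezoids: 0.5*0.5*(0.5+0.5) + 0.5*0.5*(1.0+1.0) = 0.25 + 0.50 = 0.75
```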

From bd19adde8021c6c425e9db8e14d53e055591408f Mon Sep 17 00:00:00 2001
From: "Turkunov Y." <55660526+turkunov@users.noreply.github.com>
Date: Wed, 5 Feb 2025 00:37:54 +0300
Subject: [PATCH 3/3] moved roc auc files to roc_auc branch

---
 Problems/roc_auc/learn.md    | 64 -----------------------
 Problems/roc_auc/solution.py | 90 ---------------------------------
 2 files changed, 154 deletions(-)
 delete mode 100644 Problems/roc_auc/learn.md
 delete mode 100644 Problems/roc_auc/solution.py

diff --git a/Problems/roc_auc/learn.md b/Problems/roc_auc/learn.md
deleted file mode 100644
index 84c518e4..00000000
--- a/Problems/roc_auc/learn.md
+++ /dev/null
@@ -1,64 +0,0 @@
-## Overview
-ROC-AUC is a metric for measuring the predictive quality of a binary classifier, with the highest possible value being $1$ and the lowest being $0$.
-
-## $TPR$ and $FPR$
-Consider the simplest case, where we have true binary labels $y_i\in\{0, 1\}$ and the model's predicted labels $\hat{y_i}\in\{0, 1\}$. We call an example labeled $1$ positive and an example labeled $0$ negative. From the $(y_i, \hat{y_i})$ pairs we build a set $Y$, from which we then compute the statistics $TP$ (True Positive), $TN$ (True Negative), $FP$ (False Positive) and $FN$ (False Negative):
-
-| Total population = P + N | Predicted positive (PP) | Predicted negative (PN) |
-|-----------------------------------|----------------------------------------------------|-------------------------------------------------------|
-| **Actual positive (P)** | $TP=\#\{Y\|y_i=\hat{y_i}=1\}$ | $FN=\#\{Y\|y_i=1;\hat{y_i}=0\}$ (also called type II error) |
-| **Actual negative (N)** | $FP=\#\{Y\|y_i=0;\hat{y_i}=1\}$ (also called type I error) | $TN=\#\{Y\|y_i=\hat{y_i}=0\}$ |
-
-This table, also known as the **confusion matrix**, provides an overview of the model's performance on this particular task. With the help of these statistics we can calculate the following rates:
-$$
-TPR=\frac{TP}{TP+FN}\quad(\text{also called recall}) \\
-
-FPR=\frac{FP}{FP+TN}
-$$
-
-Intuitively, **TPR** measures the model's sensitivity to positive cases, i.e. those where the true label is $y_i=1$. In some tasks, for example credit scoring or cancer detection, we may even neglect other metrics in favor of recall, since any $FN$ case can turn out to be a very costly mistake. **FPR**, on the other hand, shows how biased we are towards positive predictions at the expense of the $y_i=0$ cases.
-
-## Threshold
-Now recall that we assumed a vector of predicted labels $\hat{y_i}\in\{0, 1\}$. The model itself, however, does not directly output $0$ or $1$. Instead, we take the probability $z_i$ it produces and compare it with an empirically chosen threshold $t$. For example, for a chosen $t=0.7$ we would have the following decision rule:
-$$
-\hat{y_i}=\begin{cases} 1, & \text{if } z_i\gt 0.7 \\ 0, & \text{otherwise} \end{cases}
-$$
-
-With this idea in mind, we can see that for every $t$ our estimates of $TPR$ and $FPR$ change as well, so we can denote them as $TPR(t)$ and $FPR(t)$. But we also want our model to be robust and not depend on which threshold we choose. That is why, when we need to measure the quality of a model, we often look at the **ROC** curve $TPR(FPR \mid t)$, which shows how $TPR$ and $FPR$ behave under varying thresholds.
-
-## ROC curve
-Each point of this curve is obtained via the following algorithm:
-$$
-
-
-\begin{array}{l}
-\textbf{Input}: y\_true, y\_proba \text{ (true labels and output probabilities)} \\
-\textbf{Output: } \text{points} \text{ (a set of (x, y) coordinates)} \\
-\text{\textbf{function} roc\_points}(y\_true, y\_proba): \\
-\quad thresholds \leftarrow y\_proba \cup \{0\} \\
-\quad points \leftarrow [\quad ] \\
-\quad \textbf{for } t\in thresholds \text{ in decreasing order} \textbf{ do}: \\
-\quad \quad y \leftarrow TPR(t) \\
-\quad \quad x \leftarrow FPR(t) \\
-\quad \quad \text{points.append}((x, y)) \\
-\quad \textbf{end for} \\
-\quad \textbf{return } points \\
-\textbf{end function}
-\end{array}
-$$
-
-Both coordinates of the ROC curve stay within $[0, 1]$. To see why, first consider the threshold $t=1$. Then no prediction can be assigned the label $1$, so every $\hat{y_i}=0$. Therefore $TP=0\implies TPR=0$ and $FP=0\implies FPR=0$ (all negative examples receive the correct label). On the other hand, if $t=0$, every prediction is assigned the label $1$, so $FN = 0\implies TPR=\frac{TP}{TP}=1$ and $TN = 0 \implies FPR=\frac{FP}{FP}=1$.
-
-The best-case scenario is when the sensitivity is high across thresholds while the bias stays low ($FPR$ does not change or stays around $0$, and $TPR$ is always high). The worst-case scenario is a random model, whose curve follows the $FPR=TPR$ diagonal.
-
-## ROC-AUC
-If you compare the two ROC curves described above, you can see that the area underneath the first one is larger than underneath the second. This is why we usually calculate **ROC-AUC**, the area under the ROC curve. You might think that the larger the AUC, the better the model, but this is in fact a common misconception.
-
-Suppose you have to choose between model #1 with $AUC_{ROC}=0.6$ and model #2 with $AUC_{ROC}=0.3$. The correct answer is actually #2: we can always invert its decision rule in favor of the ROC-AUC, and the $AUC_{ROC}$ of model #2 then becomes $0.7$. Therefore, when looking at ROC-AUC, we should consider how large the **absolute** difference is between $0.5$ (the area of a random model, i.e. the worst case) and the value our model achieves.
-
-## Calculating AUC
-There are various ways of calculating the area under a curve. The most common one, which is also used in scikit-learn, is the trapezoidal rule:
-$$
-\int f(x)\,dx \approx \sum_i\frac{1}{2}\,\Delta x_i\,(f(x_i)+f(x_{i-1})),
-$$
-
-where $\Delta x_i=x_i-x_{i-1}$. This method breaks the total area under the curve into a sum of $90^\circ$-rotated trapezoids, one for every pair of neighbouring points on the curve.
\ No newline at end of file
diff --git a/Problems/roc_auc/solution.py b/Problems/roc_auc/solution.py
deleted file mode 100644
index 1dd1f841..00000000
--- a/Problems/roc_auc/solution.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import numpy as np
-
-
-def roc_auc(y_true: list[int], probas: list[float]) -> float:
-    """
-    Parameters
-    ----------
-    y_true : list[int]
-        True binary labels (0 or 1)
-    probas : list[float]
-        Output probabilities of our binary classifier
-
-    Returns
-    -------
-    auc : float
-        ROC AUC rounded to 5 decimal places
-    """
-    # Candidate thresholds: every predicted probability plus 0, in decreasing order
-    thresh = sorted(probas + [0], reverse=True)
-    y_true, probas = np.array(y_true), np.array(probas)
-
-    # The ROC curve starts at (FPR, TPR) = (0, 0)
-    fpr, tpr = [0], [0]
-    auc = 0
-
-    for t in thresh:
-        y_pred = np.where(probas < t, 0, 1)  # predict 1 whenever the probability reaches t
-        tp = ((y_true == 1) & (y_pred == 1)).sum()
-        fn = (y_true == 1).sum() - tp
-
-        fp = (y_pred == 1).sum() - tp
-        tn = (y_true == 0).sum() - fp
-
-        fpr.append(fp / (fp + tn))
-        tpr.append(tp / (tp + fn))
-
-        # Twice the area of the trapezoid between the last two ROC points
-        auc += (fpr[-1] - fpr[-2]) * (tpr[-1] + tpr[-2])
-
-    return round(1/2 * auc, 5)
-
-
-def test_roc_auc():
-    # Test 1
-    y = [0, 0, 1, 1]
-    y_proba = [0.1, 0.4, 0.35, 0.8]
-    assert roc_auc(y, y_proba) == 0.75, 'Test case 1 failed'
-
-    # Test 2
-    y = [1, 1, 1, 0, 1, 0, 0, 0, 1, 1]
-    y_proba = [
-        0.9945685360621648,
-        0.9937332904188113,
-        0.9958526266087151,
-        4.391062222999706e-09,
-        0.9959272720187046,
-        0.10851446498385146,
-        0.001096202856869512,
-        4.995474609174945e-06,
-        0.9921605697799972,
-        0.9826790537446354
-    ]
-    assert roc_auc(y, y_proba) == 1.0, 'Test case 2 failed'
-
-    # Test 3
-    y = [0, 0, 0, 0, 0, 1, 1, 1, 0, 1]
-    y_proba = [
-        0.8318040739657637,
-        0.421445304232661,
-        0.003309769194418868,
-        0.015529393142531172,
-        0.0001635684705459328,
-        0.6988867797464966,
-        0.9534132112895218,
-        0.8471417487716292,
-        0.0005832121647006822,
-        0.9990059733653113
-    ]
-    assert roc_auc(y, y_proba) == 0.95833, 'Test case 3 failed'
-
-    # Test 4
-    y = [0, 0, 1, 1, 1, 0, 1]
-    y_proba = [
-        8.99e-1, 9.95e-1, 5e-3,
-        2.3e-4, 1e-4, 9e-1, 2.1e-4
-    ]
-    assert roc_auc(y, y_proba) == 0.0, 'Test case 4 failed'
-
-    print('All tests passed')
-
-
-if __name__ == '__main__':
-    test_roc_auc()
\ No newline at end of file
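
As a final sanity check (a sketch only: it assumes `scikit-learn` is installed and that `solution.py` is importable from the working directory), the implementation can be compared against `sklearn.metrics.roc_auc_score` on random inputs:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

from solution import roc_auc  # hypothetical import path; adjust to wherever solution.py lives

rng = np.random.default_rng(0)
for _ in range(100):
    y = rng.integers(0, 2, size=20)
    if y.min() == y.max():          # ROC AUC needs both classes to be present
        continue
    p = rng.random(size=20)
    # roc_auc rounds to 5 decimal places, so allow a small tolerance
    assert abs(roc_auc(y.tolist(), p.tolist()) - roc_auc_score(y, p)) < 1e-4

print('Matches sklearn.metrics.roc_auc_score on random inputs')
```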