From 2f5a119b831b3e4ffa203c3ce82c35e26c4152ee Mon Sep 17 00:00:00 2001 From: Natalia Date: Wed, 16 Aug 2023 14:35:07 +0300 Subject: [PATCH] updated the Multilabel and Pairwise documentation --- .../multilabel/binary_relevance.py | 67 +++++++++-------- .../aggregation/pairwise/bradley_terry.py | 71 ++++++++++--------- crowdkit/aggregation/pairwise/noisy_bt.py | 62 +++++++++------- 3 files changed, 112 insertions(+), 88 deletions(-) diff --git a/crowdkit/aggregation/multilabel/binary_relevance.py b/crowdkit/aggregation/multilabel/binary_relevance.py index be0242b9..5cfc1599 100644 --- a/crowdkit/aggregation/multilabel/binary_relevance.py +++ b/crowdkit/aggregation/multilabel/binary_relevance.py @@ -13,23 +13,35 @@ @attr.s class BinaryRelevance(BaseClassificationAggregator): - r"""Simple aggregation algorithm for multi-label classification. - - Binary Relevance is a straightforward approach for multi-label classification aggregation: - each label is treated as a class in binary classification problem and aggregated separately using - aggregation algorithms for classification, e.g. Majority Vote or Dawid Skene. + r"""The **Binary Relevance** algorithm is a simple aggregation algorithm for the multi-label classification. + + Binary Relevance is a straightforward approach for the multi-label classification aggregation: + each label is represented as a class in the binary classification problem and aggregated separately using + aggregation algorithms for classification (e.g., Majority Vote or Dawid-Skene). Specifically, + for each class label $\lambda_j$, Binary Relevance derives a binary training set $D_j$ from the original + multi-label training set $D$ in the following way: + $$ + D_j = \{(x^i, y_j^i) \mid 1 \le i \le m\}. + $$ + In other words, each multi-label training example $(x^i, y^i)$ is transformed into a binary training example + based on its relevancy to $\lambda_j$. 
{% note info %} - If this method is used for single-label classification, the output of the BinaryRelevance method may differ - from the output of the basic aggregator used for its intended purpose, since each class generates a binary + If this method is used for the single-label classification, the output of the Binary Relevance method may differ + from the output of the basic aggregator used for its intended purpose since each class generates a binary classification task, and therefore it is considered separately. For example, some objects may not have labels. {% endnote %} + M-L. Zhang, Y-K. Li, X-Y. Liu, X. Geng. Binary Relevance for Multi-Label Learning: An Overview. + *Frontiers of Computer Science. Vol. 12*, 2 (2018), 191-202. + + + Args: - base_aggregator: Aggregator instance that will be used for each binary classification. All class parameters - will be copied, except for the results of previous fit. + base_aggregator: The aggregator instance that will be used for each binary classification. All class parameters + will be copied, except for the results of the previous fit. Examples: >>> import pandas as pd @@ -48,15 +60,14 @@ class BinaryRelevance(BaseClassificationAggregator): >>> result = BinaryRelevance(DawidSkene(n_iter=10)).fit_predict(df) Attributes: - labels_ (typing.Optional[pandas.core.series.Series]): Tasks' labels. - A pandas.Series indexed by `task` such that `labels.loc[task]` - is the tasks' aggregated labels. - - aggregators_ (dict[str, BaseClassificationAggregator]): Labels' aggregators matched to classes. - A dictionary that matches aggregators to classes. - The key is the class found in the source data, - and the value is the aggregator used for this class. - The set of keys is all the classes that are in the input data. + labels_ (typing.Optional[pandas.core.series.Series]): The task labels. + The `pandas.Series` data is indexed by `task` so that `labels.loc[task]` is a list of the task aggregated labels. 
+ + aggregators_ (dict[str, BaseClassificationAggregator]): The label aggregators matched to the classes. + It is represented as a dictionary that matches the aggregators to the classes. + The key is a class found in the source data, + and the value is an aggregator used for this class. + The set of keys is all the classes that are used in the input data. """ base_aggregator: BaseClassificationAggregator = attr.ib( # validator=attr.validators.instance_of(BaseClassificationAggregator), @@ -69,12 +80,12 @@ def _any_name_except_a_name_of_an_attribute(self, attribute: Any, value: Any) -> "Aggregator argument should be a classification aggregator" def fit(self, data: pd.DataFrame) -> 'BinaryRelevance': - """Fit the aggregators. + """Fits the model to the training data. Args: - data (DataFrame): Workers' labeling results. - A pandas.DataFrame containing `task`, `worker` and `label` columns. - 'label' column should contain list of labels, e.g. ['tree', 'house', 'car'] + data (DataFrame): The training dataset of workers' labeling results + which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns. + The `label` column should contain a list of labels (e.g., ['tree', 'house', 'car']). Returns: BinaryRelevance: self. @@ -107,15 +118,15 @@ def fit(self, data: pd.DataFrame) -> 'BinaryRelevance': return self def fit_predict(self, data: pd.DataFrame) -> pd.Series: - """Fit the model and return aggregated results. + """Fits the model to the training data and returns the aggregated results. Args: - data (DataFrame): Workers' labeling results. - A pandas.DataFrame containing `task`, `worker` and `label` columns. + data (DataFrame): The training dataset of workers' labeling results + which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns. Returns: - Series: Tasks' labels. - A pandas.Series indexed by `task` such that `labels.loc[task]` - is a list with the task's aggregated labels. 
+ Series: Task labels. + The `pandas.Series` data is indexed by `task` so that `labels.loc[task]` + is a list of the task aggregated labels. """ return self.fit(data).labels_ diff --git a/crowdkit/aggregation/pairwise/bradley_terry.py b/crowdkit/aggregation/pairwise/bradley_terry.py index fbd979f0..8e50e17f 100644 --- a/crowdkit/aggregation/pairwise/bradley_terry.py +++ b/crowdkit/aggregation/pairwise/bradley_terry.py @@ -14,43 +14,45 @@ @attr.s class BradleyTerry(BasePairwiseAggregator): - r"""Bradley-Terry model for pairwise comparisons. - - The model implements the classic algorithm for aggregating pairwise comparisons. - The algorithm constructs an items' ranking based on pairwise comparisons. Given - a pair of two items $i$ and $j$, the probability of $i$ to be ranked higher is, - according to the Bradley-Terry's probabilistic model, + r"""The **Bradley-Terry model for paired comparisons** implements the classic algorithm + for aggregating paired comparisons. The algorithm constructs the ranking of items based on paired comparisons. + Given a pair of two items $i$ and $j$, the probability that $i$ is ranked higher than $j$, + according to the probabilistic Bradley-Terry model, is $$ - P(i > j) = \frac{p_i}{p_i + p_j}. + P(i > j) = \frac{p_i}{p_i + p_j}, $$ - Here $\boldsymbol{p}$ is a vector of positive real-valued parameters that the algorithm optimizes. These - optimization process maximizes the log-likelihood of observed comparisons outcomes by the MM-algorithm: + where $\boldsymbol{p}$ is a vector of the positive real-valued parameters that the algorithm optimizes. This + optimization process maximizes the log-likelihood of the outcomes of the observed comparisons using the MM algorithm: $$ L(\boldsymbol{p}) = \sum_{i=1}^n\sum_{j=1}^n[w_{ij}\ln p_i - w_{ij}\ln (p_i + p_j)], $$ - where $w_{ij}$ denotes the number of comparisons of $i$ and $j$ "won" by $i$. 
+ where $w_{ij}$ denotes the number of times item $i$ has beaten item $j$ and we assume $w_{ii} = 0$ by convention. {% note info %} - The Bradley-Terry model needs the comparisons graph to be **strongly connected**. + The Bradley-Terry model requires the comparison graph to be **strongly connected**. {% endnote %} - David R. Hunter. - MM algorithms for generalized Bradley-Terry models - *Ann. Statist.*, Vol. 32, 1 (2004): 384–406. + David R. Hunter. MM Algorithms for Generalized Bradley-Terry Models. + *Ann. Statist. Vol. 32*, 1 (2004), 384–406. + + + + R. A. Bradley, M. E. Terry. Rank Analysis of Incomplete Block Designs: I. The Method of Paired Comparisons. + *Biometrika. Vol. 39*, 3/4 (1952), 324–345. - Bradley, R. A. and Terry, M. E. - Rank analysis of incomplete block designs. I. The method of paired comparisons. - *Biometrika*, Vol. 39 (1952): 324–345. + Args: - n_iter: A number of optimization iterations. + n_iter: The maximum number of optimization iterations. + tol: The tolerance stopping criterion for iterative methods with a variable number of steps. + The algorithm converges when the loss change is less than the `tol` parameter. Examples: - The Bradley-Terry model needs the data to be a `DataFrame` containing columns - `left`, `right`, and `label`. `left` and `right` contain identifiers of left and - right items respectively, `label` contains identifiers of items that won these + The Bradley-Terry model requires the `DataFrame` data containing columns + `left`, `right`, and `label`. `left` and `right` contain the identifiers of the left and + right items respectively, `label` contains the identifiers of the items that won these comparisons. >>> import pandas as pd @@ -64,8 +66,9 @@ class BradleyTerry(BasePairwiseAggregator): >>> ) Attributes: - scores_ (Series): 'Labels' scores. - A pandas.Series index by labels and holding corresponding label's scores + scores_ (Series): The label scores. 
+ The `pandas.Series` data is indexed by `label` and contains the corresponding label scores. + loss_history_ (List[float]): A list of loss values during training. """ n_iter: int = attr.ib() @@ -74,10 +77,11 @@ class BradleyTerry(BasePairwiseAggregator): loss_history_: List[float] = attr.ib(init=False) def fit(self, data: pd.DataFrame) -> 'BradleyTerry': - """Args: - data (DataFrame): Workers' pairwise comparison results. - A pandas.DataFrame containing `worker`, `left`, `right`, and `label` columns'. - For each row `label` must be equal to either `left` column or `right` column. + """Fits the model to the training data. + Args: + data (DataFrame): The training dataset of workers' paired comparison results + which is represented as the `pandas.DataFrame` data containing `worker`, `left`, `right`, and `label` columns. + Each row `label` must be equal to either the `left` or `right` column. Returns: BradleyTerry: self. @@ -126,14 +130,15 @@ def fit(self, data: pd.DataFrame) -> 'BradleyTerry': return self def fit_predict(self, data: pd.DataFrame) -> pd.Series: - """Args: - data (DataFrame): Workers' pairwise comparison results. - A pandas.DataFrame containing `worker`, `left`, `right`, and `label` columns'. - For each row `label` must be equal to either `left` column or `right` column. + """Fits the model to the training data and returns the aggregated results. + Args: + data (DataFrame): The training dataset of workers' paired comparison results + which is represented as the `pandas.DataFrame` data containing `worker`, `left`, `right`, and `label` columns. + Each row `label` must be equal to either the `left` or `right` column. Returns: - Series: 'Labels' scores. - A pandas.Series index by labels and holding corresponding label's scores + Series: The label scores. + The `pandas.Series` data is indexed by `label` and contains the corresponding label scores. 
""" return self.fit(data).scores_ diff --git a/crowdkit/aggregation/pairwise/noisy_bt.py b/crowdkit/aggregation/pairwise/noisy_bt.py index 57436bc9..a06a84e6 100644 --- a/crowdkit/aggregation/pairwise/noisy_bt.py +++ b/crowdkit/aggregation/pairwise/noisy_bt.py @@ -15,16 +15,22 @@ @attr.s class NoisyBradleyTerry(BasePairwiseAggregator): - r"""Bradley-Terry model for pairwise comparisons with additional parameters. - - This model is a modification of the [Bradley-Terry model](crowdkit.aggregation.pairwise.bradley_terry.BradleyTerry.md) - with parameters for workers' skills (reliability) and biases. - + r"""The **Bradley-Terry model for paired comparisons with the additional parameters** is a modification + of the [Bradley-Terry model](crowdkit.aggregation.pairwise.bradley_terry.BradleyTerry.md) + with the parameters for the workers' skills (reliability) and biases. + + Args: + n_iter: The maximum number of optimization iterations. + tol: The tolerance stopping criterion for iterative methods with a variable number of steps. + The algorithm converges when the loss change is less than the `tol` parameter. + random_state: The seed number for the random initialization. + regularization_ratio: The regularization ratio. + Examples: The following example shows how to aggregate results of comparisons **grouped by some column**. - In the example the two questions `q1` and `q2` are used to group the labeled data. - Temporary data structure is created and the model is applied to it. - The results are splitted in two arrays, and each array contains scores for one of the initial groups. + In the example, two questions `q1` and `q2` are used to group the labeled data. + The temporary data structure is created and the model is applied to it. + The results are divided into two arrays, and each array contains scores for one of the initial groups. 
>>> import pandas as pd >>> from crowdkit.aggregation import NoisyBradleyTerry @@ -39,9 +45,9 @@ class NoisyBradleyTerry(BasePairwiseAggregator): >>> ], >>> columns=['question', 'worker', 'left', 'right', 'label'] >>> ) - >>> # Append question to other columns. After that the data looks like: - >>> # question worker left right label - >>> # 0 q1 w1 (q1, a) (q1, b) (q1, a) + >>> # Append question to other columns. After that, the data looks like: + >>> # question worker left right label + >>> # 0 q1 w1 (q1, a) (q1, b) (q1, a) >>> for col in 'left', 'right', 'label': >>> data[col] = list(zip(data['question'], data[col])) >>> result = NoisyBradleyTerry(n_iter=10).fit_predict(data) @@ -51,12 +57,12 @@ class NoisyBradleyTerry(BasePairwiseAggregator): >>> print(result['q2']['b']) # Score for the item b in the q2 question Attributes: - scores_ (Series): 'Labels' scores. - A pandas.Series index by labels and holding corresponding label's scores - skills_ (Series): workers' skills. - A pandas.Series index by workers and holding corresponding worker's skill - biases_ (Series): Predicted biases for each worker. Indicates the probability of a worker to choose the left item.. - A series of workers' biases indexed by workers + scores_ (Series): The label scores. + The `pandas.Series` data is indexed by `label` and contains the corresponding label scores. + skills_ (Series): The workers' skills. The `pandas.Series` data is indexed by `worker` + and has the corresponding worker skill. + biases_ (Series): The predicted biases for each worker. Indicates the probability of a worker to choose the left item. + The `pandas.Series` data is indexed by `worker` and has the corresponding worker bias. """ n_iter: int = attr.ib(default=100) tol: float = attr.ib(default=1e-5) @@ -68,10 +74,11 @@ class NoisyBradleyTerry(BasePairwiseAggregator): # scores_ def fit(self, data: pd.DataFrame) -> 'NoisyBradleyTerry': - """Args: - data (DataFrame): Workers' pairwise comparison results. 
- A pandas.DataFrame containing `worker`, `left`, `right`, and `label` columns'. - For each row `label` must be equal to either `left` column or `right` column. + """Fits the model to the training data. + Args: + data (DataFrame): The training dataset of workers' paired comparison results + which is represented as the `pandas.DataFrame` data containing `worker`, `left`, `right`, and `label` columns. + Each row `label` must be equal to either the `left` or `right` column. Returns: NoisyBradleyTerry: self. @@ -97,14 +104,15 @@ def fit(self, data: pd.DataFrame) -> 'NoisyBradleyTerry': return self def fit_predict(self, data: pd.DataFrame) -> pd.Series: - """Args: - data (DataFrame): Workers' pairwise comparison results. - A pandas.DataFrame containing `worker`, `left`, `right`, and `label` columns'. - For each row `label` must be equal to either `left` column or `right` column. + """Fits the model to the training data and returns the aggregated results. + Args: + data (DataFrame): The training dataset of workers' paired comparison results + which is represented as the `pandas.DataFrame` data containing `worker`, `left`, `right`, and `label` columns. + Each row `label` must be equal to either the `left` or `right` column. Returns: - Series: 'Labels' scores. - A pandas.Series index by labels and holding corresponding label's scores + Series: The label scores. + The `pandas.Series` data is indexed by `label` and contains the corresponding label scores. """ return self.fit(data).scores_