-
Notifications
You must be signed in to change notification settings - Fork 15
/
majority_vote.py
143 lines (111 loc) · 5.87 KB
/
majority_vote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
__all__ = ['MajorityVote']
from typing import Optional
import attr
import pandas as pd
from ..base import BaseClassificationAggregator
from ..utils import normalize_rows, get_most_probable_labels, get_accuracy, add_skills_to_data, named_series_attrib
@attr.s
class MajorityVote(BaseClassificationAggregator):
    """Majority Vote aggregation algorithm.

    Majority vote is a straightforward approach for categorical aggregation: for each task,
    it outputs the label which has the largest number of responses. Additionally, the majority
    vote can be used when different weights are assigned to workers' votes. In this case, the
    resulting label will be the one with the largest sum of weights.

    {% note info %}

    In case when two or more labels have the largest number of votes, the resulting
    label will be the same for all tasks which have the same set of labels with equal count of votes.

    {% endnote %}

    Args:
        on_missing_skill: How to handle assignments done by workers with unknown skill.
            Possible values:
                * "error" — raise an exception if there is at least one assignment done by user with unknown skill;
                * "ignore" — drop assignments with unknown skill values during prediction. Raise an exception if
                  there is no assignments with known skill for any task;
                * value — default value will be used if skill is missing.
        default_skill: Default worker's weight value. It is forwarded together with
            `on_missing_skill` to `add_skills_to_data` when skills are supplied to `fit`.

    Examples:
        Basic majority voting:
        >>> from crowdkit.aggregation import MajorityVote
        >>> from crowdkit.datasets import load_dataset
        >>> df, gt = load_dataset('relevance-2')
        >>> result = MajorityVote().fit_predict(df)

        Weighted majority vote:
        >>> import pandas as pd
        >>> from crowdkit.aggregation import MajorityVote
        >>> df = pd.DataFrame(
        ...     [
        ...         ['t1', 'p1', 0],
        ...         ['t1', 'p2', 0],
        ...         ['t1', 'p3', 1],
        ...         ['t2', 'p1', 1],
        ...         ['t2', 'p2', 0],
        ...         ['t2', 'p3', 1],
        ...     ],
        ...     columns=['task', 'worker', 'label']
        ... )
        >>> skills = pd.Series({'p1': 0.5, 'p2': 0.7, 'p3': 0.4})
        >>> result = MajorityVote().fit_predict(df, skills)

    Attributes:
        labels_ (typing.Optional[pandas.core.series.Series]): Tasks' labels.
            A pandas.Series indexed by `task` such that `labels.loc[task]`
            is the task's most likely true label.

        skills_ (typing.Optional[pandas.core.series.Series]): workers' skills.
            A pandas.Series indexed by workers and holding the corresponding worker's skill.
            It is recomputed on every `fit` call with `get_accuracy` against the aggregated labels.

        probas_ (typing.Optional[pandas.core.frame.DataFrame]): Tasks' label probability distributions.
            A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
            is the probability of `task`'s true label to be equal to `label`. Each
            probability is between 0 and 1, all task's probabilities should sum up to 1
    """

    # TODO: remove skills_
    skills_: Optional[pd.Series] = named_series_attrib(name='skill')
    probas_: Optional[pd.DataFrame] = attr.ib(init=False)
    # labels_
    on_missing_skill: str = attr.ib(default='error')
    default_skill: Optional[float] = attr.ib(default=None)

    def fit(self, data: pd.DataFrame, skills: Optional[pd.Series] = None) -> 'MajorityVote':
        """Fit the model.

        Args:
            data (DataFrame): Workers' labeling results.
                A pandas.DataFrame containing `task`, `worker` and `label` columns.
            skills (Series): workers' skills.
                A pandas.Series indexed by workers and holding the corresponding worker's skill.
                When omitted, every response counts equally.

        Returns:
            MajorityVote: self.
        """
        # Only these three columns take part in the aggregation.
        data = data[['task', 'worker', 'label']]

        if skills is None:
            # Unweighted vote: score is the number of responses per (task, label).
            scores = data[['task', 'label']].value_counts()
        else:
            # Weighted vote: score is the sum of worker skills per (task, label).
            data = add_skills_to_data(data, skills, self.on_missing_skill, self.default_skill)
            scores = data.groupby(['task', 'label'])['skill'].sum()

        # Pivot to a task x label table (absent pairs score 0) and normalize each row.
        self.probas_ = normalize_rows(scores.unstack('label', fill_value=0))
        self.labels_ = get_most_probable_labels(self.probas_)
        self.skills_ = get_accuracy(data, self.labels_, by='worker')

        return self

    def fit_predict_proba(self, data: pd.DataFrame, skills: Optional[pd.Series] = None) -> pd.DataFrame:
        """Fit the model and return probability distributions on labels for each task.

        Args:
            data (DataFrame): Workers' labeling results.
                A pandas.DataFrame containing `task`, `worker` and `label` columns.
            skills (Series): workers' skills.
                A pandas.Series indexed by workers and holding the corresponding worker's skill.

        Returns:
            DataFrame: Tasks' label probability distributions.
                A pandas.DataFrame indexed by `task` such that `result.loc[task, label]`
                is the probability of `task`'s true label to be equal to `label`. Each
                probability is between 0 and 1, all task's probabilities should sum up to 1
        """
        return self.fit(data, skills).probas_

    def fit_predict(self, data: pd.DataFrame, skills: Optional[pd.Series] = None) -> pd.Series:
        """Fit the model and return aggregated results.

        Args:
            data (DataFrame): Workers' labeling results.
                A pandas.DataFrame containing `task`, `worker` and `label` columns.
            skills (Series): workers' skills.
                A pandas.Series indexed by workers and holding the corresponding worker's skill.

        Returns:
            Series: Tasks' labels.
                A pandas.Series indexed by `task` such that `labels.loc[task]`
                is the task's most likely true label.
        """
        return self.fit(data, skills).labels_