-
Notifications
You must be signed in to change notification settings - Fork 10
/
weighted.py
166 lines (134 loc) · 5.48 KB
/
weighted.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import numpy as np
import pandas as pd
def weight(df, col, w):
    """Scales a DataFrame column by a weight column, row by row.

    Args:
        df: A pandas DataFrame.
        col: Label of the column in the DataFrame to be weighted.
        w: Label of the column holding the weights.

    Returns:
        The elementwise product of ``df[col]`` and ``df[w]``.
    """
    values = df[col]
    weights = df[w]
    return values * weights
def weighted_sum(df, col, w):
    """Totals a DataFrame column after weighting each row.

    Args:
        df: A pandas DataFrame.
        col: Label of the column in the DataFrame to total.
        w: Label of the column holding the weights.

    Returns:
        The sum over rows of ``df[col] * df[w]``.
    """
    products = df[col] * df[w]
    return products.sum()
def weighted_mean(df, col, w):
    """Averages a DataFrame column using per-row weights.

    Args:
        df: A pandas DataFrame.
        col: Label of the column in the DataFrame to average.
        w: Label of the column holding the weights.

    Returns:
        The weighted sum of ``df[col]`` divided by the sum of ``df[w]``.
    """
    numerator = weighted_sum(df, col, w)
    total_weight = df[w].sum()
    return numerator / total_weight
def weighted_quantile(values, quantiles, sample_weight=None,
                      values_sorted=False, old_style=False):
    """Calculates weighted quantiles of a set of values.

    From https://stackoverflow.com/a/29677616/1840471.

    Doesn't exactly match unweighted quantiles of stacked values.
    See stackoverflow.com/q/21844024#comment102342137_29677616.

    Each value is placed at the midpoint of its own weight within the
    cumulative weight distribution, and the requested quantiles are
    linearly interpolated between those positions.

    Args:
        values: array-like with data.
        quantiles: scalar or array-like of quantiles needed, each in [0, 1].
        sample_weight: array-like of the same length as `values`.
            Defaults to None, which weights every value equally.
        values_sorted: bool, if True, then will avoid sorting of
            initial array (`values` must already be in ascending order).
        old_style: if True, will correct output to be consistent
            with numpy.percentile.

    Returns:
        The interpolated quantiles as returned by numpy.interp (an array
        when `quantiles` is array-like).

    Raises:
        AssertionError: If any quantile lies outside [0, 1].
        ValueError: If `sample_weight` and `values` differ in length.
    """
    values = np.array(values)
    quantiles = np.array(quantiles)
    if sample_weight is None:
        sample_weight = np.ones(len(values))
    sample_weight = np.array(sample_weight)
    # Raised explicitly (rather than via `assert`) so the check survives
    # `python -O`; AssertionError is kept so callers see the same type.
    if not (np.all(quantiles >= 0) and np.all(quantiles <= 1)):
        raise AssertionError('quantiles should be in [0, 1]')
    # Without this check, surplus weights would be silently dropped by the
    # fancy-indexing below and a wrong result returned.
    if len(sample_weight) != len(values):
        raise ValueError(
            'sample_weight must have the same length as values '
            '(got {} and {})'.format(len(sample_weight), len(values)))
    if not values_sorted:
        sorter = np.argsort(values)
        values = values[sorter]
        sample_weight = sample_weight[sorter]
    # Position of each value: cumulative weight up to the middle of its own
    # weight.
    weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
    if old_style:
        # To be convenient with numpy.percentile: stretch positions so the
        # smallest value sits at 0 and the largest at 1.
        weighted_quantiles -= weighted_quantiles[0]
        weighted_quantiles /= weighted_quantiles[-1]
    else:
        weighted_quantiles /= np.sum(sample_weight)
    return np.interp(quantiles, weighted_quantiles, values)
def weighted_median(df, col, w):
    """Computes the weighted median of a column in a DataFrame.

    Args:
        df: A pandas DataFrame.
        col: Label of the column in the DataFrame.
        w: Label of the column holding the weights.

    Returns:
        The 0.5 weighted quantile of ``df[col]``, weighted by ``df[w]``.
    """
    midpoint = 0.5
    return weighted_quantile(df[col], midpoint, df[w])
def add_weighted_quantiles(df, col, w):
    """Adds weighted quantile columns for a column of a DataFrame.

    The DataFrame is first sorted by `col` in place. These columns are then
    added, each named `col` followed by the suffix shown:
    * _percentile_exact: 100 times the cumulative share of total weight.
    * _percentile: Integer percentile (ceiling).
    * _2percentile: Integer percentile (ceiling, for each two percentiles).
    * _ventile: Integer percentile (ceiling, for each five percentiles).
    * _decile: Integer decile.
    * _quintile: Integer quintile.
    * _quartile: Integer quartile.

    Rows whose `col` value is not >= 0 get an exact percentile of 0, and
    therefore 0 in every derived column.

    Args:
        df: A pandas DataFrame.
        col: A string indicating the column in the DataFrame to calculate.
        w: Weight column.

    Returns:
        Nothing. The DataFrame is sorted and columns are added in place.
    """
    df.sort_values(by=col, inplace=True)
    exact_name = col + '_percentile_exact'
    weights = df[w]
    exact = 100 * weights.cumsum() / weights.sum()
    # Integer arrays can't hold NaN, so rows failing `col >= 0` are given 0.
    exact = np.where(df[col] >= 0, exact, 0)
    # Reduce top record, otherwise it's incorrectly rounded up.
    exact = np.where(exact >= 99.99999, 99.99999, exact)
    df[exact_name] = exact
    # (suffix, bin width in percentile points, multiplier for the bin index)
    bins = (
        ('_percentile', 1, 1),
        ('_2percentile', 2, 2),
        ('_ventile', 5, 5),
        ('_decile', 10, 1),
        ('_quintile', 20, 1),
        ('_quartile', 25, 1),
    )
    for suffix, width, scale in bins:
        df[col + suffix] = scale * np.ceil(df[exact_name] / width).astype(int)
def quantile_chg(v1, v2, w1=None, w2=None, q=np.arange(0.1, 1, 0.1)):
    """Creates a table with two sets of weighted quantiles.

    Args:
        v1: First set of values.
        v2: Second set of values.
        w1: First set of weights. Defaults to equal weight.
        w2: Second set of weights. Defaults to equal weight.
        q: Quantiles, each in [0, 1]. Defaults to decile boundaries.
            The default array is shared across calls and is never modified.

    Returns:
        DataFrame with two rows (first for `v1`, second for `v2`) and a
        column for each quantile. Column labels come from `ordinal_label`
        applied to the quantile times 100, and ' (median)' is appended to
        the label of any quantile equal to 0.5.
    """
    q1 = weighted_quantile(v1, q, w1)
    q2 = weighted_quantile(v2, q, w2)
    df = pd.DataFrame([q1, q2])
    # NOTE(review): `ordinal_label` is not defined or imported in the visible
    # part of this file - confirm it is in scope at runtime.
    labels = []
    for quantile in q:
        label = ordinal_label(quantile * 100)
        # Tag the median wherever it appears; a tolerance is used because
        # quantiles built with np.arange carry floating-point error.
        if np.isclose(quantile, 0.5):
            label += ' (median)'
        labels.append(label)
    df.columns = labels
    return df