diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib index 816d71e5e..4518996bc 100644 --- a/lectures/_static/quant-econ.bib +++ b/lectures/_static/quant-econ.bib @@ -6,11 +6,11 @@ @book{Chadhuri_Mukerjee_88, -title = {Randomized Response: Theory and Technique}, -author = {A Chadhuri and R Mukerjee}, -year = {1988}, -publisher = {Marcel Dekker}, -address = {New York} + title = {Randomized Response: Theory and Technique}, + author = {A Chadhuri and R Mukerjee}, + year = {1988}, + publisher = {Marcel Dekker}, + address = {New York} } @article{warner1965randomized, @@ -164,7 +164,7 @@ @article{Groves_73 title = {Incentives in teams}, journal = {Econometrica}, volume = {41}, - pages = {617–631} + pages = {617--631} } @article{Clarke_71, @@ -173,7 +173,7 @@ @article{Clarke_71 title = {Multipart pricing of public goods}, journal = {Public Choice}, volume = {8}, - pages = {19–33} + pages = {19--33} } @article{Vickrey_61, @@ -182,7 +182,7 @@ @article{Vickrey_61 title = {Counterspeculation, auctions, and competitive sealed tenders}, journal = {Journal of Finance}, volume = {16}, - pages = {8–37} + pages = {8--37} } @@ -315,7 +315,7 @@ @article{HST_1999 month = {}, keywords = {}, doi = {}, - abstract = {\"… I suppose there exists an extremely powerful, and, if I may so speak, malignant being, whose whole endeavours are directed toward deceiving me.\" Rene Descartes, Meditations, II.1}, + abstract = {``\ldots{} I suppose there exists an extremely powerful, and, if I may so speak, malignant being, whose whole endeavours are directed toward deceiving me.'' Rene Descartes, Meditations, II.1}, url = {https://ideas.repec.org/a/oup/restud/v66y1999i4p873-907..html} } @@ -431,7 +431,7 @@ @book{Holt_Modigliani_Muth_Simon } @article{Leeper_Walker_Yang, - author = {Eric M. Leeper and Todd B. Walker and Shu‐Chun Susan Yang}, + author = {Eric M. Leeper and Todd B. 
Walker and Shu-Chun Susan Yang}, title = {Fiscal Foresight and Information Flows}, journal = {Econometrica}, year = 2013, @@ -662,7 +662,7 @@ @article{benhabib2018skewed } @article{pareto1896cours, - title = {Cours d’{\'e}conomie politique}, + title = {Cours d'{\'e}conomie politique}, author = {Pareto, Vilfredo}, journal = {Rouge, Lausanne}, volume = {2}, @@ -861,7 +861,7 @@ @incollection{Koopmans booktitle = {The Economic Approach to Development Planning}, address = { Chicago}, publisher = {Rand McNally}, - pages = {225–287} + pages = {225--287} } @article{Cass, @@ -871,7 +871,7 @@ @article{Cass journal = {Review of Economic Studies}, volume = {32}, number = {3}, - pages = {233–240} + pages = {233--240} } @incollection{Cagan, diff --git a/lectures/rand_resp.md b/lectures/rand_resp.md index e29c8501f..b63c54ae4 100644 --- a/lectures/rand_resp.md +++ b/lectures/rand_resp.md @@ -3,10 +3,8 @@ jupytext: text_representation: extension: .md format_name: myst - format_version: 0.13 - jupytext_version: 1.13.4 kernelspec: - display_name: Python 3 (ipykernel) + display_name: Python 3 language: python name: python3 --- @@ -16,7 +14,7 @@ kernelspec: ## Overview -Social stigmas can inhibit people from confessing potentially embarrassing activities or opinions. +Social stigmas can inhibit people from confessing potentially embarrassing activities or opinions. When people are reluctant to participate in a sample survey about personally sensitive issues, they might decline to participate, and even if they do participate, they might choose to provide incorrect answers to sensitive questions. @@ -39,8 +37,6 @@ Related ideas underlie modern **differential privacy** systems. ## Warner's Strategy - - As usual, let's bring in the Python modules we'll be using. 
@@ -61,216 +57,141 @@ Warner {cite}`warner1965randomized` proposed and analyzed the following procedur - Prepare a **random spinner** that with $p$ probability points to the Letter A and with $(1-p)$ probability points to the Letter B. - Each subject spins a random spinner and sees an outcome (A or B) that the interviewer does **not observe**. - The subject states whether he belongs to the group to which the spinner points. -- If the spinner points to the group that the spinner belongs, the subject reports “yes”; otherwise he reports “no”. +- If the spinner points to the group to which the subject belongs, the subject reports "yes"; otherwise he reports "no". - The subject answers the question truthfully. - Warner constructed a maximum likelihood estimator of the proportion of the population in set A. - Let -+++ - - $\pi$ : True probability of A in the population - -+++ - - $p$ : Probability that the spinner points to A - -+++ - - $X_{i}=\begin{cases}1,\text{ if the } i\text{th} \ \text{ subject says yes}\\0,\text{ if the } i\text{th} \ \text{ subject says no}\end{cases}$ -+++ Index the sample set so that the first $n_1$ report "yes", while the second $n-n_1$ report "no". 
The likelihood function of a sample set is -+++ - $$ -\begin{equation} L=\left[\pi p + (1-\pi)(1-p)\right]^{n_{1}}\left[(1-\pi) p +\pi (1-p)\right]^{n-n_{1}} - \tag{1} -\end{equation} -$$ - -+++ +$$ (eq:one) The log of the likelihood function is: -+++ - -$$ -\begin{equation} -\log(L)= n_1 \log \left[\pi p + (1-\pi)(1-p)\right] + (n-n_{1}) \log \left[(1-\pi) p +\pi (1-p)\right] \tag{2} -\end{equation} $$ - -+++ +\log(L)= n_1 \log \left[\pi p + (1-\pi)(1-p)\right] + (n-n_{1}) \log \left[(1-\pi) p +\pi (1-p)\right] +$$ (eq:two) The first-order necessary condition for maximizing the log likelihood function with respect to $\pi$ is: -+++ - $$ \frac{(n-n_1)(2p-1)}{(1-\pi) p +\pi (1-p)}=\frac{n_1 (2p-1)}{\pi p + (1-\pi)(1-p)} $$ -+++ - or -+++ - -$$ -\begin{equation} -\pi p + (1-\pi)(1-p)=\frac{n_1}{n} \tag{3} -\end{equation} $$ - -+++ +\pi p + (1-\pi)(1-p)=\frac{n_1}{n} +$$ (eq:3) If $p \neq \frac{1}{2}$, then the maximum likelihood estimator (MLE) of $\pi$ is: -+++ - -$$ -\begin{equation} -\hat{\pi}=\frac{p-1}{2p-1}+\frac{n_1}{(2p-1)n} \tag{4} -\end{equation} $$ - -+++ +\hat{\pi}=\frac{p-1}{2p-1}+\frac{n_1}{(2p-1)n} +$$ (eq:four) We compute the mean and variance of the MLE estimator $\hat \pi$ to be: -+++ - $$ -\begin{align} +\begin{aligned} \mathbb{E}(\hat{\pi})&= \frac{1}{2 p-1}\left[p-1+\frac{1}{n} \sum_{i=1}^{n} \mathbb{E} X_i \right] \\ &=\frac{1}{2 p-1} \left[ p -1 + \pi p + (1-\pi)(1-p)\right] \\ -&=\pi \tag{5} -\end{align} -$$ - -+++ +&=\pi +\end{aligned} +$$ (eq:five) and -+++ - $$ -\begin{align} +\begin{aligned} Var(\hat{\pi})&=\frac{n Var(X_i)}{(2p - 1 )^2 n^2} \\ &= \frac{\left[\pi p + (1-\pi)(1-p)\right]\left[(1-\pi) p +\pi (1-p)\right]}{(2p - 1 )^2 n}\\ &=\frac{\frac{1}{4}+(2 p^2 - 2 p +\frac{1}{2})(- 2 \pi^2 + 2 \pi -\frac{1}{2})}{(2p - 1 )^2 n}\\ -&=\frac{1}{n}\left[\frac{1}{16(p-\frac{1}{2})^2}-(\pi-\frac{1}{2})^2 \right] \tag{6} -\end{align} -$$ - -+++ +&=\frac{1}{n}\left[\frac{1}{16(p-\frac{1}{2})^2}-(\pi-\frac{1}{2})^2 \right] +\end{aligned} +$$ 
(eq:six) -Equation (5) indicates that $\hat{\pi}$ is an **unbiased estimator** of $\pi$ while equation (6) tell us the variance of the estimator. +Equation {eq}`eq:five` indicates that $\hat{\pi}$ is an **unbiased estimator** of $\pi$ while equation {eq}`eq:six` tells us the variance of the estimator. -To compute a confidence interval, first rewrite (6) as: +To compute a confidence interval, first rewrite {eq}`eq:six` as: -+++ - -$$ -\begin{equation} -Var(\hat{\pi})=\frac{\frac{1}{4}-(\pi-\frac{1}{2})^2}{n}+\frac{\frac{1}{16(p-\frac{1}{2})^2}-\frac{1}{4}}{n} \tag{7} -\end{equation} $$ - -+++ +Var(\hat{\pi})=\frac{\frac{1}{4}-(\pi-\frac{1}{2})^2}{n}+\frac{\frac{1}{16(p-\frac{1}{2})^2}-\frac{1}{4}}{n} +$$ (eq:seven) This equation indicates that the variance of $\hat{\pi}$ can be represented as a sum of the variance due to sampling plus the variance due to the random device. - - From the expressions above we can find that: -- When $p$ is $\frac{1}{2}$, expression (1) degenerates to a constant. +- When $p$ is $\frac{1}{2}$, expression {eq}`eq:one` degenerates to a constant. - When $p$ is $1$ or $0$, the randomized estimate degenerates to an estimator without randomized sampling. -+++ - We shall discuss only the situation in which $p \in (\frac{1}{2},1)$ (the situation in which $p \in (0,\frac{1}{2})$ is symmetric). -From expressions (5) and (7) we can deduce that: +From expressions {eq}`eq:five` and {eq}`eq:seven` we can deduce that: - The MSE of $\hat{\pi}$ decreases as $p$ increases. -+++ ## Comparing Two Survey Designs Let's compare the preceding randomized-response method with a stylized non-randomized response method. -+++ - In our non-randomized response method, we suppose that: -+++ - - Members of Group A tell the truth with probability of $T_a$ while the members of Group B tell the truth with probability of $T_b$ - -+++ - - $Y_i$ is $1$ or $0$ according to whether the sample's $i\text{th}$ member's report is in Group A or not. 
-+++ - Then we can estimate $\pi$ as: -+++ - $$ -\begin{equation} -\hat{\pi}=\frac{\sum_{i=1}^{n}Y_i}{n} \tag{8} -\end{equation} -$$ - -+++ +\hat{\pi}=\frac{\sum_{i=1}^{n}Y_i}{n} +$$ (eq:eight) We calculate the expectation, bias, and variance of the estimator to be: -+++ +$$ +\begin{aligned} +\mathbb{E}(\hat{\pi})&=\pi T_a + \left[ (1-\pi)(1-T_b)\right]\\ +\end{aligned} +$$ (eq:nine) $$ -\begin{align} -\mathbb{E}(\hat{\pi})&=\pi T_a + \left[ (1-\pi)(1-T_b)\right] \tag{9}\\ -\\ +\begin{aligned} Bias(\hat{\pi})&=\mathbb{E}(\hat{\pi}-\pi)\\ -&=\pi [T_a + T_b -2 ] + [1- T_b] \tag{10}\\ -\\ -Var(\hat{\pi})&=\frac{ \left[ \pi T_a + (1-\pi)(1-T_b)\right] \left[1- \pi T_a -(1-\pi)(1-T_b)\right] }{n} \tag{11} -\end{align} -$$ +&=\pi [T_a + T_b -2 ] + [1- T_b] \\ +\end{aligned} +$$ (eq:ten) +$$ +\begin{aligned} +Var(\hat{\pi})&=\frac{ \left[ \pi T_a + (1-\pi)(1-T_b)\right] \left[1- \pi T_a -(1-\pi)(1-T_b)\right] }{n} +\end{aligned} +$$ (eq:eleven) It is useful to define a - - $$ \text{MSE Ratio}=\frac{\text{Mean Square Error Randomized}}{\text{Mean Square Error Regular}} $$ -+++ - We can compute MSE Ratios for different surveys and survey designs associated with different parameter values. -+++ - The following Python code computes the objects we want to stare at in order to make comparisons under different values of $\pi_A$ and $n$: @@ -323,13 +244,9 @@ class Comparison: Let's put the code to work for parameter values -+++ - - $\pi_A=0.6$ - $n=1000$ -+++ - We can generate MSE Ratios theoretically using the above formulas. We can also perform a Monte-Carlo simulation of the MSE Ratio. @@ -351,19 +268,13 @@ We see that in many situations, especially when the bias is not small, the MSE o These differences become larger as $p$ increases. -+++ - By adjusting parameters $\pi_A$ and $n$, we can study outcomes in different situations. 
For example, for another situation described in Warner {cite}`warner1965randomized`: -+++ - - $\pi_A=0.5$ - $n=1000$ -+++ - we can use the code ```{code-cell} ipython3 @@ -379,13 +290,9 @@ df2_mc We can also revisit a calculation in the concluding section of Warner {cite}`warner1965randomized` in which -+++ - - $\pi_A=0.6$ - $n=2000$ -+++ - We use the code ```{code-cell} ipython3 @@ -401,8 +308,6 @@ df3_mc Evidently, as $n$ increases, the randomized response method does better performance in more situations. -+++ - ## Concluding Remarks {doc}`This quantecon lecture ` describes some alternative randomized response surveys. diff --git a/lectures/util_rand_resp.md b/lectures/util_rand_resp.md index c9b708b7b..0afeca4d3 100644 --- a/lectures/util_rand_resp.md +++ b/lectures/util_rand_resp.md @@ -3,10 +3,8 @@ jupytext: text_representation: extension: .md format_name: myst - format_version: 0.13 - jupytext_version: 1.13.4 kernelspec: - display_name: Python 3 (ipykernel) + display_name: Python 3 language: python name: python3 --- @@ -40,239 +38,153 @@ proposed, for example, by {cite}`lanke1975choice`, {cite}`lanke1976degree`, {cit ## Privacy Measures -+++ - -We consider randomized response models with only two possible answers, "yes" and "no." - -The design determines probabilities - +We consider randomized response models with only two possible answers, "yes" and "no." 
+The design determines probabilities $$ -\begin{align} +\begin{aligned} \text{Pr}(\text{yes}|A)&=1-\text{Pr}(\text{no}|A)\\ \text{Pr}(\text{yes}|A^{'})&=1-\text{Pr}(\text{no}|A^{'}) -\end{align} +\end{aligned} $$ These design probabilities in turn can be used to compute the conditional probability of belonging to the sensitive group $A$ for a given response, say $r$: - - -$$ -\begin{equation} -\text{Pr}(A|r)=\frac{\pi_A \text{Pr}(r|A)}{\pi_A \text{Pr}(r|A)+ (1-\pi_A) \text{Pr}(r|A^{'})} \tag{1} -\end{equation} $$ +\text{Pr}(A|r)=\frac{\pi_A \text{Pr}(r|A)}{\pi_A \text{Pr}(r|A)+ (1-\pi_A) \text{Pr}(r|A^{'})} +$$ (eq:one) ## Zoo of Concepts At this point we describe some concepts proposed by various researchers -+++ - -### Leysieffer and Warner(1976) {cite}`leysieffer1976respondent` - -+++ +### Leysieffer and Warner(1976) The response $r$ is regarded as jeopardizing with respect to $A$ or $A^{'}$ if -+++ - $$ -\begin{align} +\begin{aligned} \text{Pr}(A|r)&>\pi_A\\ \text{or}&\\ -\text{Pr}(A^{'}|r)&>1-\pi_A \tag{2} -\end{align} -$$ - -+++ +\text{Pr}(A^{'}|r)&>1-\pi_A +\end{aligned} +$$ (eq:two) From Bayes's rule: -+++ - -$$ -\begin{equation} -\frac{\text{Pr}(A|r)}{\text{Pr}(A^{'}|r)}\times \frac{(1-\pi_A)}{\pi_A} = \frac{\text{Pr}(r|A)}{\text{Pr}(r|A^{'})} \tag{3} -\end{equation} $$ - -+++ +\frac{\text{Pr}(A|r)}{\text{Pr}(A^{'}|r)}\times \frac{(1-\pi_A)}{\pi_A} = \frac{\text{Pr}(r|A)}{\text{Pr}(r|A^{'})} +$$ (eq:three) If this expression is greater (less) than unity, it follows that r is jeopardizing with respect to $A$($A^{'}$). 
Then, the natural measure of jeopardy will be: -+++ - $$ -\begin{align} +\begin{aligned} g(r|A)&=\frac{\text{Pr}(r|A)}{\text{Pr}(r|A^{'})}\\ &\text{and}\\ -g(r|A^{'})&=\frac{\text{Pr}(r|A^{'})}{\text{Pr}(r|A)} \tag{4} -\end{align} -$$ +g(r|A^{'})&=\frac{\text{Pr}(r|A^{'})}{\text{Pr}(r|A)} +\end{aligned} +$$ (eq:four) -+++ Suppose, without loss of generality, that $\text{Pr}(\text{yes}|A)>\text{Pr}(\text{yes}|A^{'})$, then a yes (no) answer is jeopardizing with respect $A$($A^{'}$), that is, -+++ - $$ -\begin{align} +\begin{aligned} g(\text{yes}|A)&>1\\ \text{and}&\\ g(\text{no}|A^{'})&>1 -\end{align} +\end{aligned} $$ -+++ - Leysieffer and Warner proved that the variance of the estimate can only be decreased through an increase in one or both of these two measures of jeopardy. -+++ - An efficient randomized response model is, therefore, any model that attains the maximum acceptable levels of jeopardy that are consistent with cooperation of the respondents. As a special example, Leysieffer and Warner considered "a problem in which there is no jeopardy in a no answer"; that is, $g(\text{no}|A^{'})$ can be of unlimited magnitude. Evidently, an optimal design must have -+++ - - $$\text{Pr}(\text{yes}|A)=1$$ - -+++ +$$ +\text{Pr}(\text{yes}|A)=1 +$$ which implies that -+++ - -$$\text{Pr}(A|\text{no})=0$$ - -+++ - -### Lanke(1976) {cite}`lanke1976degree` +$$ +\text{Pr}(A|\text{no})=0 +$$ -+++ +### Lanke(1976) Lanke (1975) {cite}`lanke1975choice` argued that "it is membership in Group A that people may want to hide, not membership in the complementary Group A'." 
For that reason, Lanke (1976) {cite}`lanke1976degree` argued that ah appropriate measure of protection is to minimize -+++ - -$$ -\begin{equation} -\max \left\{ \text{Pr}(A|\text{yes}) , \text{Pr}(A|\text{no}) \right\} \tag{5} -\end{equation} $$ - -+++ +\max \left\{ \text{Pr}(A|\text{yes}) , \text{Pr}(A|\text{no}) \right\} +$$ (eq:five) Holding this measure constant, he explained under what conditions the smallest variance of the estimate was achieved with the unrelated question model or Warner's (1965) original model. -+++ - -### 2.3 Fligner, Policello, and Singh {cite}`fligner1977comparison` - -+++ +### 2.3 Fligner, Policello, and Singh -Fligner, Policello, and Singh reached similar conclusion as Lanke (1976). +Fligner, Policello, and Singh reached similar conclusion as Lanke (1976). {cite}`fligner1977comparison` They measured "private protection" as -+++ - -$$ -\begin{equation} -\frac{1-\max \left\{ \text{Pr}(A|\text{yes}) , \text{Pr}(A|\text{no}) \right\}}{1-\pi_A} \tag{6} -\end{equation} $$ +\frac{1-\max \left\{ \text{Pr}(A|\text{yes}) , \text{Pr}(A|\text{no}) \right\}}{1-\pi_A} +$$ (eq:six) -+++ -### 2.4 Greenberg, Kuebler, Abernathy, and Horvitz (1977) {cite}`greenberg1977respondent` +### 2.4 Greenberg, Kuebler, Abernathy, and Horvitz (1977) -+++ +{cite}`greenberg1977respondent` Greenberg, Kuebler, Abernathy, and Horvitz (1977) stressed the importance of examining the risk to respondents who do not belong to $A$ as well as the risk to those who do belong to the sensitive group. 
They defined the hazard for an individual in $A$ as the probability that he or she is perceived as belonging to $A$: -+++ - $$ -\begin{equation} -\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A)\times \text{Pr}(A|\text{no}) \tag{7.a} -\end{equation} -$$ - -+++ +\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A)\times \text{Pr}(A|\text{no}) +$$ (eq:seven-a) Similarly, the hazard for an individual who does not belong to $A$ would be -+++ - $$ -\begin{equation} -\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A^{'}) \times \text{Pr}(A|\text{no}) \tag{7.b} -\end{equation} -$$ - -+++ +\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A^{'}) \times \text{Pr}(A|\text{no}) +$$ (eq:seven-b) Greenberg et al. (1977) also considered an alternative related measure of hazard that "is likely to be closer to the actual concern felt by a respondent." The "limited hazard" for an individual in $A$ and $A^{'}$ is -+++ - $$ -\begin{equation} -\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes}) \tag{8.a} -\end{equation} -$$ - -+++ +\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes}) +$$ (eq:eight-a) and -+++ - -$$ -\begin{equation} -\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes}) \tag{8.b} -\end{equation} $$ - -+++ +\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes}) +$$ (eq:eight-b) This measure is just the first term in $(7)$, i.e., the probability that an individual answers "yes" and is perceived to belong to A. -+++ - ## Respondent's Expected Utility -+++ - ### Truth Border -+++ - Key assumptions that underlie a randomized response technique for estimating the fraction of a population that belongs to A are: -+++ - - **Assumption 1**: Respondents feel discomfort from being thought of as belonging to $A$. - **Assumption 2**: Respondents prefer to answer questions truthfully than to lie, so long as the cost of doing so is not too high. 
The cost is taken to be the discomfort in 1. -+++ - Let $r_i$ denote individual $i$'s response to the randomized question $r_i$ can only take values "yes" or "no". @@ -282,85 +194,50 @@ that belongs to $A$, the respondent's answer is associated with a conditional p Given $r_i$ and complete privacy, the individual's utility is higher if $r_i$ represents a truthful answer rather than a lie. -+++ - In terms of a respondent's expected utility as a function of $ \text{Pr}(A|r_i)$ and $r_i$ -+++ - - The higher is $ \text{Pr}(A|r_i)$, the lower isindividual $i$'s expected utility. - expected utility is higher if $r_i$ represents a truthful answer rather than a lie -+++ - Define: -+++ - - $\phi_i \in \left\{\text{truth},\text{lie}\right\}$, a dichotomous variable that indicates whether or not $r_i$ is a truthful statement. - $U_i\left(\text{Pr}(A|r_i),\phi_i\right)$, a utility function that is differentiable in its first argument, summarizes individual $i$'s expected utility. -+++ - Then there is an $r_i$ such that -+++ - $$ -\begin{equation} -\frac{\partial U_i\left(\text{Pr}(A|r_i),\phi_i\right) }{\partial \text{Pr}(A|r_i)} <0, \text{ for } \phi_i \in \left\{\text{truth},\text{lie}\right\} \tag{9.a} -\end{equation} -$$ - -+++ +\frac{\partial U_i\left(\text{Pr}(A|r_i),\phi_i\right) }{\partial \text{Pr}(A|r_i)} <0, \text{ for } \phi_i \in \left\{\text{truth},\text{lie}\right\} +$$ (eq:nine-a) and -+++ - $$ -\begin{equation} -U_i\left(\text{Pr}(A|r_i),\text{truth}\right)>U_i\left(\text{Pr}(A|r_i),\text{lie}\right) , \text{ for } \text{Pr}(A|r_i) \in [0,1] \tag{9.b} -\end{equation} -$$ - -+++ +U_i\left(\text{Pr}(A|r_i),\text{truth}\right)>U_i\left(\text{Pr}(A|r_i),\text{lie}\right) , \text{ for } \text{Pr}(A|r_i) \in [0,1] +$$ (eq:nine-b) Suppose now that correct answer for individual $i$ is "yes". 
Individual $i$ would choose to answer truthfully if -+++ - -$$ -\begin{equation} -U_i\left(\text{Pr}(A|\text{yes}),\text{truth}\right)\geq U_i\left(\text{Pr}(A|\text{no}),\text{lie}\right) \tag{10.a} -\end{equation} $$ +U_i\left(\text{Pr}(A|\text{yes}),\text{truth}\right)\geq U_i\left(\text{Pr}(A|\text{no}),\text{lie}\right) +$$ (eq:ten-a) -+++ If the correct answer is "no," individual $i$ would volunteer the correct answer only if -+++ - $$ -\begin{equation} -U_i\left(\text{Pr}(A|\text{no}),\text{truth}\right)\geq U_i\left(\text{Pr}(A|\text{yes}),\text{lie}\right) \tag{10.b} -\end{equation} -$$ - -+++ +U_i\left(\text{Pr}(A|\text{no}),\text{truth}\right)\geq U_i\left(\text{Pr}(A|\text{yes}),\text{lie}\right) +$$ (eq:ten-b) Assume that -+++ - -$$\text{Pr}(A|\text{yes})>\pi_A>\text{Pr}(A|\text{no})$$ - -+++ +$$ +\text{Pr}(A|\text{yes})>\pi_A>\text{Pr}(A|\text{no}) +$$ so that a "yes" answer increases the odds that an individual belongs to $A$. @@ -368,58 +245,34 @@ Constraint $(10.\text{b})$ holds for sure. Consequently, constraint $(10.\text{a})$ becomes the single necessary condition for individual $i$ always to answer truthfully. -+++ +At equality, constraint $(10.\text{a})$ determines conditional probabilities that make the individual indifferent between telling the truth and lying when the correct answer is "yes": -At equality, constraint $(10.\text{a})$ determines conditional probabilities that make the individual indifferent between telling the truth and lying when the correct answer is "yes”: - -+++ - -$$ -\begin{equation} -U_i\left(\text{Pr}(A|\text{yes}),\text{truth}\right)= U_i\left(\text{Pr}(A|\text{no}),\text{lie}\right) \tag{11} -\end{equation} $$ - -+++ +U_i\left(\text{Pr}(A|\text{yes}),\text{truth}\right)= U_i\left(\text{Pr}(A|\text{no}),\text{lie}\right) +$$ (eq:eleven) Equation $(11)$ defines a "truth border". 
Differentiating $(11)$ with respect to the conditional probabilities shows that the truth border has a positive slope in the space of conditional probabilities: -+++ - $$ -\begin{equation} -\frac{\partial \text{Pr}(A|\text{no})}{\partial \text{Pr}(A|\text{yes})}=\frac{\frac{\partial U_i\left(\text{Pr}(A|\text{yes}),\text{truth}\right) }{\partial \text{Pr}(A|\text{yes})}}{\frac{\partial U_i\left(\text{Pr}(A|\text{no}),\text{lie}\right) }{\partial \text{Pr}(A|\text{no})}}>0 \tag{12} -\end{equation} -$$ - -+++ +\frac{\partial \text{Pr}(A|\text{no})}{\partial \text{Pr}(A|\text{yes})}=\frac{\frac{\partial U_i\left(\text{Pr}(A|\text{yes}),\text{truth}\right) }{\partial \text{Pr}(A|\text{yes})}}{\frac{\partial U_i\left(\text{Pr}(A|\text{no}),\text{lie}\right) }{\partial \text{Pr}(A|\text{no})}}>0 +$$ (eq:twelve) The source of the positive relationship is: -+++ - - The individual is willing to volunteer a truthful "yes" answer so long as the utility from doing so (i.e., the left side of $(11)$) is at least as high as the utility of lying on the right side of $(11)$. - Suppose now that $\text{Pr}(A|\text{yes})$ increases. That reduces the utility of telling the truth. To preserve indifference between a truthful answer and a lie, $\text{Pr}(A|\text{no})$ must increase to reduce the utility of lying. -+++ - ### Drawing a Truth Border -+++ - We can deduce two things about the truth border: -+++ - - The truth border divides the space of conditional probabilities into two subsets: "truth telling" and "lying". Thus, sufficient privacy elicits a truthful answer, whereas insufficient privacy results in a lie. The truth border depends on a respondent's utility function. - Assumptions in $(9)$ are sufficient only to guarantee a positive slope of the truth border. The truth border can have either a concave or a convex shape. -+++ - We can draw some truth borders with the following Python code: ```{code-cell} ipython3 @@ -454,11 +307,9 @@ Figure 1.1 three types of truth border. 
Without loss of generality, we consider the truth border: -+++ - -$$U_i(\text{Pr}(A|r_i),\phi_i)=-\text{Pr}(A|r_i)+f(\phi_i)$$ - -+++ +$$ +U_i(\text{Pr}(A|r_i),\phi_i)=-\text{Pr}(A|r_i)+f(\phi_i) +$$ and plot the "truth telling" and "lying area" of individual $i$ in Figure 1.2: @@ -486,85 +337,49 @@ plt.title('Figure 1.2') ## Utilitarian View of Survey Design -+++ - ### Iso-variance Curves -+++ - A statistician's objective is -+++ - - to find a randomized response survey design that minimizes the bias and the variance of the estimator. -+++ - Given a design that ensures truthful answers by all respondents, Anderson (1976, Theorem 1) {cite}`anderson1976estimation` showed that the minimum variance estimate in the two-response model has variance -+++ - $$ -\begin{align} +\begin{aligned} V(\text{Pr}(A|\text{yes}) , \text{Pr}(A|\text{no})) -= &\frac{{\pi_A}^2 (1-\pi_A)^2}{n}\times \frac{1}{\text{Pr}(A|\text{yes})-\pi_A}\times \frac{1}{\pi_A-\text{Pr}(A|\text{no})} \tag{13} -\end{align} -$$ - -+++ += &\frac{{\pi_A}^2 (1-\pi_A)^2}{n}\times \frac{1}{\text{Pr}(A|\text{yes})-\pi_A}\times \frac{1}{\pi_A-\text{Pr}(A|\text{no})} +\end{aligned} +$$ (eq:thirteen) where the random sample with replacement consists of $n$ individuals. -+++ - We can use Expression $(13)$ to draw iso-variance curves. 
The following inequalities restrict the shapes of iso-variance curves: -+++ - $$ -\begin{equation} \frac{d \text{ Pr}(A|\text{no})}{d\text{ Pr}(A|\text{yes})}\bigg|_{\text{constant variance}}=\frac{\pi_A-\text{Pr}(A|\text{no})}{\text{Pr}(A|\text{yes})-\pi_A}>0 -\tag{14.a} -\end{equation} -$$ - -+++ +$$ (eq:fourteen-a) $$ -\begin{equation} \frac{d^2 \text{ Pr}(A|\text{no})}{d\text{ Pr}(A|\text{yes})^2}\bigg|_{\text{constant variance}}=- \frac{2 \left[\pi_A-\text{Pr}(A|\text{no})\right]}{\left[\text{Pr}(A|\text{yes})-\pi_A \right]^2}<0 -\tag{14.b} -\end{equation} -$$ - -+++ +$$ (eq:fourteen-b) From expression $(13)$ and $(14)$ we can see that: -+++ - - Variance can be reduced only by increasing the distance of $\text{Pr}(A|\text{yes})$ and/or $\text{Pr}(A|\text{no})$ from $r_A$. -+++ - - Iso-variance curves are always upward-sloping and concave. -+++ - ### Drawing Iso-variance Curves -+++ - We use Python code to draw iso-variance curves. The pairs of conditional probabilities can be attained using Warner's (1965) model. Note that: -+++ - - Any point on the iso-variance curves can be attained with the unrelated question model as long as the statistician can completely control the model design. - Warner's (1965) original randomized response model is less flexible than the unrelated question model. @@ -603,26 +418,16 @@ class Iso_Variance: Properties of iso-variance curves are: -+++ - - All points on one iso-variance curve share the same variance -+++ - - From $V_1$ to $V_9$, the variance of the iso-variance curve increase monotonically, as colors brighten monotonically -+++ - Suppose the parameters of the iso-variance model follow those in article XXXX Ljungqvist, which are: -+++ - - $\pi=0.3$ - $n=100$ -+++ - Then we can plot the iso-variance curve in Figure 2: ```{code-cell} ipython3 @@ -632,88 +437,50 @@ var.plotting_iso_variance_curve() ### Optimal Survey -+++ - A point on an iso-variance curves can be attained with the unrelated question design. 
We now focus on finding an "optimal survey design" that -+++ - - Minimizes the variance of the estimator subject to privacy restrictions. -+++ - To obtain an optimal design, we first superimpose all individuals' truth borders on the iso-variance mapping. To construct an optimal design -+++ - - The statistician should find the intersection of areas above all truth borders; that is, the set of conditional probabilities ensuring truthful answers from all respondents. -+++ - - The point where this set touches the lowest possible iso-variance curve determines an optimal survey design. -+++ - Consquently, a minimum variance unbiased estimator is pinned down by an individual who is the least willing to volunteer a truthful answer. -+++ - Here are some comments about the model design: -+++ - - An individual's decision of whether or not to answer truthfully depends on his or her belief about other respondents' behavior, because this determines the individual's calculation of $\text{ Pr}(A|\text{yes})$ and $\text{ Pr}(A|\text{no})$. - An equilibrium of the optimal design model is a Nash equilibrium of a noncooperative game. -+++ - - Assumption $(9.\text{b})$ is sufficient to guarantee existence of an optimal model design. By choosing $\text{ Pr}(A|\text{yes})$ and $\text{ Pr}(A|\text{no})$ sufficiently close to each other, all respondents will find it optimal to answer truthfully. The closer are these probabilities, the higher the variance of the estimator becomes. -+++ - - If respondents experience a large enough increase in expected utility from telling the truth, then there is no need to use a randomized response model. The smallest possible variance of the estimate is then obtained at $\text{ Pr}(A|\text{yes})=1$ and $\text{ Pr}(A|\text{no})=0$ ; that is, when respondents answer truthfully to direct questioning. -+++ - - A more general design problem would be to minimize some weighted sum of the estimator's variance and bias. 
It would be optimal to accept some lies from the most "reluctant" respondents. -+++ - ## Criticisms of Proposed Privacy Measures -+++ - We can use a utilitarian approach to analyze some privacy measures. We'll enlist Python Code to help us. -+++ - ### Analysis of Method of Lanke's (1976) -+++ - Lanke (1976) recommends a privacy protection criterion that minimizes: -+++ - -$$ -\begin{equation} -\max \left\{ \text{Pr}(A|\text{yes}) , \text{Pr}(A|\text{no}) \right\} \tag{5} -\end{equation} $$ - -+++ +\max \left\{ \text{Pr}(A|\text{yes}) , \text{Pr}(A|\text{no}) \right\} +$$ (eq:five) Following Lanke's suggestion, the statistician should find the highest possible $\text{ Pr}(A|\text{yes})$ consistent with truth telling while $\text{ Pr}(A|\text{no})$ is fixed at 0. The variance is then minimized at point $X$ in Figure 3. -+++ - However, we can see that in Figure 3, point $Z$ offers a smaller variance that still allows cooperation of the respondents, and it is achievable following our discussion of the truth border in Part III: ```{code-cell} ipython3 @@ -754,67 +521,48 @@ plt.title('Figure 3') ### Method of Leysieffer and Warner (1976) -+++ - Leysieffer and Warner (1976) recommend a two-dimensional measure of jeopardy that reduces to a single dimension when there is no jeopardy in a 'no' answer", which means that -+++ - $$\text{Pr}(\text{yes}|A)=1$$ - -+++ +$$ +\text{Pr}(\text{yes}|A)=1 +$$ and -+++ - -$$\text{Pr}(A|\text{no})=0$$ - -+++ +$$ +\text{Pr}(A|\text{no})=0 +$$ This is not an optimal choice under a utilitarian approach. -+++ - -### Analysis on the Method of Chaudhuri and Mukerjee's (1988) {cite}`Chadhuri_Mukerjee_88` +### Analysis on the Method of Chaudhuri and Mukerjee's (1988) -+++ +{cite}`Chadhuri_Mukerjee_88` Chaudhuri and Mukerjee (1988) argued that the individual may find that since "yes" may sometimes relate to the sensitive group A, a clever respondent may falsely but safely always be inclined to respond "no". 
In this situation, the truth border is such that individuals choose to lie whenever the truthful answer is "yes" and -+++ - -$$\text{Pr}(A|\text{no})=0$$ - -+++ +$$ +\text{Pr}(A|\text{no})=0 +$$ Here the gain from lying is too high for someone to volunteer a "yes" answer. This means that -+++ - $$ -\begin{equation} U_i\left(\text{Pr}(A|\text{yes}),\text{truth}\right)< U_i\left(\text{Pr}(A|\text{no}),\text{lie}\right) -\end{equation} $$ -+++ - in any situation always. As a result, there is no attainable model design. -+++ - However, under a utilitarian approach there should exist other survey designs that are consistent with truthful answers. In particular, respondents will choose to answer truthfully if the relative advantage from lying is eliminated. -+++ - - We can use Python to show that the optimal model design corresponds to point Q in Figure 4: +We can use Python to show that the optimal model design corresponds to point Q in Figure 4: ```{code-cell} ipython3 def f(x): @@ -859,124 +607,77 @@ plt.legend(loc=0,fontsize='large') plt.title('Figure 4') ``` -### Method of Greenberg et al. (1977) {cite}`greenberg1977respondent` +### Method of Greenberg et al. (1977) -+++ + {cite}`greenberg1977respondent` Greenberg et al. 
(1977) defined the hazard for an individual in $A$ as the probability that he or she is perceived as belonging to $A$: -+++ - $$ -\begin{equation} -\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A)\times \text{Pr}(A|\text{no}) \tag{7.a} -\end{equation} -$$ - -+++ +\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A)\times \text{Pr}(A|\text{no}) +$$ (eq:seven-a) The hazard for an individual who does not belong to $A$ is -+++ - $$ -\begin{equation} -\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A^{'}) \times \text{Pr}(A|\text{no}) \tag{7.b} -\end{equation} -$$ - -+++ +\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes})+\text{Pr}(\text{no}|A^{'}) \times \text{Pr}(A|\text{no}) +$$ (eq:seven-b) They also considered an alternative related measure of hazard that they said "is likely to be closer to the actual concern felt by a respondent." Their "limited hazard" for an individual in $A$ and $A^{'}$ is -+++ - -$$ -\begin{equation} -\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes}) \tag{8.a} -\end{equation} $$ - -+++ +\text{Pr}(\text{yes}|A)\times \text{Pr}(A|\text{yes}) +$$ (eq:eight-a) and -+++ - -$$ -\begin{equation} -\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes}) \tag{8.b} -\end{equation} $$ - -+++ +\text{Pr}(\text{yes}|A^{'})\times \text{Pr}(A|\text{yes}) +$$ (eq:eight-b) According to Greenberg et al. (1977), a respondent commits himself or herself to answer truthfully on the basis of a probability in $(7)$ or $(8)$ **before** randomly selecting the question to be answered. -+++ - Suppose that the appropriate privacy measure is captured by the notion of "limited hazard" in $(8)$. Consider an unrelated question model where the unrelated question is replaced by the instruction "Say the word 'no,' " which implies that -+++ - -$$ \text{Pr}(A|\text{yes})=1$$ - -+++ +$$ +\text{Pr}(A|\text{yes})=1 +$$ and it follows that: -+++ - - Hazard for an individual in $A^{'}$ is 0. 
- Hazard for an individual in A can also be made arbitrarily small by choosing a sufficiently small $\text{Pr}(\text{yes}|A)$. -+++ - Even though this hazard can be set arbitrarily close to 0, an individual in $A$ will completely reveal his or her identity whenever truthfully answering the sensitive question. -+++ - However, under utilitarian framework, it is obviously contradictory. If the individuals are willing to volunteer this information, it seems that the randomized response design was not necessary in the first place. It ignores the fact that respondents retain the option of lying until they have seen the question to be answered. -+++ - ## Concluding Remarks The justifications for a randomized response procedure are that -+++ - - Respondents are thought to feel discomfort from being perceived as belonging to the sensitive group. -+++ - - Respondents prefer to answer questions truthfully than to lie, unless it is too revealing. -+++ - If a privacy measure is not completely consistent with the rational behavior of the respondents, all efforts to derive an optimal model design are futile. A utilitarian approach provides a systematic way to model respondents' behavior under the assumption that they maximize their expected utilities. - In a utilitarian analysis: -+++ - - A truth border divides the space of conditional probabilities of being perceived as belonging to the sensitive group, $\text{Pr}(A|\text{yes})$ and $\text{Pr}(A|\text{no})$, into the truth-telling region and the lying region. - The optimal model design is obtained at the point where the truth border touches the lowest possible iso-variance curve. -+++ - A practical implication of the analysis of {cite}`ljungqvist1993unified` is that uncertainty about respondents' demands for privacy can be acknowledged by **choosing $\text{Pr}(A|\text{yes})$ and $\text{Pr}(A|\text{no})$ sufficiently close to each other**.