In [71]:
import numpy as np
import scipy as sp
import scipy.stats
import pandas as pd
import bokeh.plotting as plt

In [58]:
# Empirical densities of Binomial(n=10, p=0.5) estimated from 10, 100 and 1000 draws.
# The support is {0, ..., 10}: 11 outcomes need 11 unit-width bins, i.e. 12 edges.
# np.histogram closes the last bin on the right, so with edges 0..10 the final
# bin would be [9, 10] and the outcomes 9 and 10 would be counted together.
binom_10_density, binom_bins = np.histogram(np.random.binomial(10, .5, size=10), bins=np.arange(12), density=True)
binom_100_density, binom_bins = np.histogram(np.random.binomial(10, .5, size=100), bins=np.arange(12), density=True)
binom_1000_density, binom_bins = np.histogram(np.random.binomial(10, .5, size=1000), bins=np.arange(12), density=True)

In [105]:
plt.output_notebook(hide_banner=True)

p1 = plt.figure(height=400, width=960, title="E[bin[n=10, p=0.5](•)] → N[μ=5, σ=1.58](•)")
p1.xaxis.axis_label = "number of successes"
p1.yaxis.axis_label = "density"

# np.histogram returns one more bin edge than density values, so each density is
# paired with the left edge of its bin (x and y must have the same length).
# NOTE: assumes binom_bins still holds the edges produced together with the
# densities plotted here -- later cells reassign binom_bins.
bin_left_edges = binom_bins[:-1]

# Empirical densities for 10, 100 and 1000 draws, from light to saturated red.
for density, color in (
    (binom_10_density, "#ff9999"),
    (binom_100_density, "#ff4d4d"),
    (binom_1000_density, "#ff0000"),
):
    p1.line(bin_left_edges, density, line_color=color, line_width=2)

# Normal approximation with μ = n·p = 5 and σ = sqrt(n·p·(1-p)) ≈ 1.58,
# evaluated on a fine grid over the binomial's support.
quantiles = np.linspace(0, 10, num=100)
normal_rv = scipy.stats.norm(loc=5, scale=1.58)
pdfs = normal_rv.pdf(quantiles)

p1.line(quantiles, pdfs, line_width=3, line_color="black")

plt.show(p1)

<bokeh.io._CommsHandle at 0xc9839b0>

In [107]:
# Empirical densities of Binomial(n=10, p=0.1) estimated from 10, 100 and 1000 draws.
# The support is {0, ..., 10}: 11 outcomes need 11 unit-width bins, i.e. 12 edges.
# np.histogram closes the last bin on the right, so with edges 0..10 the final
# bin would be [9, 10] and the outcomes 9 and 10 would be counted together.
binom_10_density_2, binom_bins = np.histogram(np.random.binomial(10, .1, size=10), bins=np.arange(12), density=True)
binom_100_density_2, binom_bins = np.histogram(np.random.binomial(10, .1, size=100), bins=np.arange(12), density=True)
binom_1000_density_2, binom_bins = np.histogram(np.random.binomial(10, .1, size=1000), bins=np.arange(12), density=True)

In [109]:
plt.output_notebook(hide_banner=True)

p1 = plt.figure(height=400, width=960, title="E[bin[n=10, p=0.1](•)] → N[μ=1, σ=.95](•)")
p1.xaxis.axis_label = "number of successes"
p1.yaxis.axis_label = "density"

# np.histogram returns one more bin edge than density values, so each density is
# paired with the left edge of its bin (x and y must have the same length).
# NOTE: assumes binom_bins still holds the edges produced together with the
# densities plotted here -- other cells reassign binom_bins.
bin_left_edges = binom_bins[:-1]

# Empirical densities for 10, 100 and 1000 draws, from light to saturated red.
for density, color in (
    (binom_10_density_2, "#ff9999"),
    (binom_100_density_2, "#ff4d4d"),
    (binom_1000_density_2, "#ff0000"),
):
    p1.line(bin_left_edges, density, line_color=color, line_width=2)

# Normal approximation with μ = n·p = 1 and σ = sqrt(n·p·(1-p)) ≈ 0.948,
# evaluated on a fine grid over the binomial's support.
quantiles = np.linspace(0, 10, num=100)
normal_rv = scipy.stats.norm(loc=1, scale=.948)
pdfs = normal_rv.pdf(quantiles)

p1.line(quantiles, pdfs, line_width=3, line_color="black")

plt.show(p1)

<bokeh.io._CommsHandle at 0xdc86358>

The error arises because we are approximating a *discrete* distribution with a *continuous* one. It shrinks as we increase the number of trials $n$ of the binomial, and with it the number of bins.

In [112]:
# Empirical densities of Binomial(n=100, p=0.4) estimated from 10, 100 and 1000 draws.
# The support is {0, ..., 100}: 101 outcomes need 101 unit-width bins, i.e. 102 edges.
# np.histogram closes the last bin on the right, so with edges 0..100 the final
# bin would be [99, 100] and the outcomes 99 and 100 would be counted together.
binom_10_density_3, binom_bins = np.histogram(np.random.binomial(100, .4, size=10), bins=np.arange(102), density=True)
binom_100_density_3, binom_bins = np.histogram(np.random.binomial(100, .4, size=100), bins=np.arange(102), density=True)
binom_1000_density_3, binom_bins = np.histogram(np.random.binomial(100, .4, size=1000), bins=np.arange(102), density=True)

In [114]:
plt.output_notebook(hide_banner=True)

p1 = plt.figure(height=400, width=960, title="E[bin[n=100, p=0.4](•)] → N[μ=40, σ=4.9](•)")
p1.xaxis.axis_label = "number of successes"
p1.yaxis.axis_label = "density"

# np.histogram returns one more bin edge than density values, so each density is
# paired with the left edge of its bin (x and y must have the same length).
# NOTE: assumes binom_bins still holds the edges produced together with the
# densities plotted here -- other cells reassign binom_bins.
bin_left_edges = binom_bins[:-1]

# Empirical densities for 10, 100 and 1000 draws, from light to saturated red.
for density, color in (
    (binom_10_density_3, "#ff9999"),
    (binom_100_density_3, "#ff4d4d"),
    (binom_1000_density_3, "#ff0000"),
):
    p1.line(bin_left_edges, density, line_color=color, line_width=2)

# Normal approximation with μ = n·p = 40 and σ = sqrt(n·p·(1-p)) ≈ 4.9,
# evaluated on a grid over the binomial's support.
quantiles = np.linspace(0, 100, num=100)
normal_rv = scipy.stats.norm(loc=40, scale=4.9)
pdfs = normal_rv.pdf(quantiles)

p1.line(quantiles, pdfs, line_width=3, line_color="black")

plt.show(p1)

<bokeh.io._CommsHandle at 0xe002588>

Any estimator of a statistic $\theta$ that is a sample mean, that is, any $\bar{\theta} = \frac{1}{n}\sum_{i=1}^{n} \hat{\theta}_i$, becomes approximately normal as $n$ grows, by the classical central limit theorem:

$$\sqrt{n}\left(\left(\frac{1}{n}\sum_{i=1}^{n} \hat{\theta}_i\right) - \mu_{\theta}\right) \to N(0, \sigma^2_{\theta})$$
$$\cdots$$
$$\frac{1}{n}\sum_{i=1}^{n} \hat{\theta}_i \;\approx\; N\left(\mu_{\theta}, \frac{\sigma^2_{\theta}}{n}\right) \quad \text{for large } n$$

Using an estimator for $p$ will work better, because its values lie on a grid in $[0, 1]$ whose spacing $1/n$ shrinks as $n$ grows, so the data behaves like truly continuous data.