## Mean, median, mode: Finding the middle ground. 

In [1]:
import plotly.express as px
import numpy as np
from scipy import stats
import plotly.graph_objects as go
# Placeholder figure; later cells rebind `fig` to a fresh figure before showing it.
fig = go.Figure()

In [26]:
# Shape parameter of the gamma distribution used by the cells below.
a = 1.9
# Evaluation grid with a step of 0.01, starting just above zero.
x = np.arange(0.01, 10.0, 0.01)
# Analytic density and cumulative distribution evaluated on the grid.
pdf = stats.gamma.pdf(x, a=a)
cdf = stats.gamma.cdf(x, a=a)
# 5000 unseeded random draws from a gamma distribution with the same shape.
data = np.random.gamma(shape=a, size=5000)

In [27]:
def plot_gamma(x, pdf, data):
    """Build a figure overlaying a pdf curve on a histogram of samples.

    Args:
        x: Grid values for the horizontal axis of the pdf curve.
        pdf: Density values plotted against ``x``.
        data: Samples drawn as a density-normalised histogram.

    Returns:
        The ``go.Figure`` holding the pdf trace followed by the histogram
        trace.
    """
    density_curve = go.Scatter(x=x, y=pdf, name="pdf")
    sample_hist = go.Histogram(
        x=data,
        histnorm="probability density",
        opacity=0.5,
        name="Sample histogram",
        nbinsx=100,
    )

    figure = go.Figure()
    figure.update_layout(title="Gamma distribution")
    # The curve is added before the histogram, as in the trace list order.
    figure.add_trace(density_curve)
    figure.add_trace(sample_hist)
    return figure


plot_gamma(x, pdf, data).show()

In [10]:
def make_line(x, trace_name, Y_MAX):
    """Return a scatter trace for a vertical line at ``x`` from 0 to ``Y_MAX``.

    Args:
        x: Horizontal position of the line.
        trace_name: Name given to the trace.
        Y_MAX: Vertical coordinate of the line's upper end.
    """
    # Two points sharing the same x coordinate form the vertical segment.
    xs = [x, x]
    ys = [0, Y_MAX]
    return go.Scatter(x=xs, y=ys, name=trace_name)

In [11]:
# Location statistics of the gamma distribution with shape parameter `a`.
mean = stats.gamma.mean(a)
median = stats.gamma.median(a)
# The mode is approximated by the grid point where the sampled pdf peaks.
mode = x[np.argmax(pdf)]

# plot_gamma takes the samples as its third argument; calling it with only
# (x, pdf) raises a TypeError.
fig = plot_gamma(x, pdf, data)

# Draw each statistic as a vertical line reaching up to the peak of the pdf.
y_max = np.max(pdf)
fig.add_trace(make_line(mean, "mean", y_max))
fig.add_trace(make_line(median, "median", y_max))
fig.add_trace(make_line(mode, "mode", y_max))
fig.show()

In [59]:
# Unnormalised gapped-uniform shape: 21 ones, a gap of 20 zeros, then 20 ones.
pdf = np.repeat([1.0, 0.0, 1.0], [21, 20, 20])
# One grid point per pdf sample, spread evenly over [0, 10].
x = np.linspace(0, 10, pdf.size)
# Normalise to unit sum, then divide by the bin width 10 / N to get a density.
pdf = pdf / pdf.sum() / (10 / pdf.size)

In [60]:
# Cumulative distribution: running sum of the density times the bin width 10 / N.
cdf = pdf.cumsum() * (10 / pdf.size)
# Median: the grid point whose cdf value lies closest to 0.5.
median = x[np.abs(cdf - 0.5).argmin()]
# Mean: the sum of x * pdf, scaled by the same bin width.
mean = x.dot(pdf) * (10 / pdf.size)

In [61]:
# Height of the marker lines: the peak of the pdf.
y_max = np.max(pdf)

fig = go.Figure()
fig.update_layout(title="Gapped uniform")
# The pdf trace uses the "hv" line shape.
fig.add_trace(go.Scatter(x=x, y=pdf, name="pdf", line_shape="hv"))
# Mean first, then median, as separate vertical-line traces.
fig.add_traces(
    [
        make_line(mean, "mean", y_max),
        make_line(median, "median", y_max),
    ]
)
fig.show()

# Mode
## Pro:
- Resilient against outliers
- Might do a better job of describing the most interesting point of a histogram
- Value that is observed by the largest group


## Con:
- The mode is not necessarily unique (a distribution can have several peaks)
- Estimating the mode of continuous data requires binning
- Not in Spark 

# Mean
## Pro:
- Minimum squared error constant approximation! Solving
$$ \min_{a} \sum_{i=0}^{N-1} \left( x[i] - a \right)^2 $$
 leads to 
$$ a^* = \dfrac{1}{N} \sum_{i=0}^{N-1} x[i] = mean(x)$$
 




# Mean
## Pro:
- Expected value of pdf! 
$$ \int_{-\infty}^{\infty} x\,p(x)~dx = \text{E}[x] = mean(x)$$

 




# Mean
## Pro:
- Minimum squared error constant approximation!
- Expected value of the pdf!
- Easy to compute: divide the sum by the count



## Con:
- Outliers can have a big influence 
- The average value might not actually exist in the data


# Median
## Pro:
- Usually close to average
- Less influence of outliers 

## Con:
- Doesn't have all the nice properties of the mean 
- Can be trickier to compute


In [6]:
# Median of the gamma distribution with shape parameter `a`.
median = stats.gamma.median(a)

# Recompute the gamma grid and cdf here: earlier cells rebind `x` and `cdf`
# to the gapped-uniform example, which would not match the gamma median.
gamma_x = np.arange(0.01, 10.0, 0.01)
gamma_cdf = stats.gamma.cdf(gamma_x, a)

fig = go.Figure()
fig.add_trace(go.Scatter(x=gamma_x, y=gamma_cdf, name="cdf"))
# make_line requires the line height as its third argument; omitting it
# raises a TypeError. A cdf tops out at 1, so the line spans 0 to 1.
fig.add_trace(make_line(median, "median", 1.0))

fig.show()