In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
import math
# Apply matplotlib's 'fivethirtyeight' style sheet to the plots drawn in this notebook.
plots.style.use('fivethirtyeight')

# Lecture 25 #

## Measures of Center ##

### Average (Mean) ###

The mean is the sum of all of the data values divided by the number of data points. 
Its value will be between the lowest and highest values (but not necessarily the middle value).

There are several ways to calculate the mean. 

In [None]:
# Small four-element example array; the mean and SD cells below all read `values`.
values = make_array(2, 3, 3, 9)
values

In [None]:
# Brute Force Average Calculation:
# add up every entry with the built-in sum() and divide by the number of entries.
sum(values)/len(values)

In [None]:
# Numpy Method for Average: np.average called with no weights on the same array.
np.average(values)

In [None]:
# Numpy Method for Mean: np.mean on the same array, for comparison with np.average.
np.mean(values)

In [None]:
# The same average written out by hand: the four entries of `values` added up, divided by 4.
(2 + 3 + 3 + 9)/4

In [None]:
# Proportional Calculation:
# each distinct value (2, 3, 9) times the proportion of the four entries equal to it.
2*(1/4) + 3*(2/4) + 9*(1/4)

In [None]:
# The same weighted sum with the proportions written as decimals.
2*0.25 + 3*0.5 + 9*0.25

In [None]:
# Wrap the array in a one-column Table (column label 'value') so it can be drawn with .hist().
values_table = Table().with_columns('value', values)
values_table

In [None]:
# Bin edges 0.5, 1.5, ..., 10.5: unit-width bins, each centered on a whole number from 1 to 10.
# bins_for_display is reused by the later histograms of new_vals.
bins_for_display = np.arange(0.5, 10.6, 1)
values_table.hist('value', bins = bins_for_display)

In [None]:
# Make array of 10 2s, 20 3s, and 10 9s.
# np.repeat builds the array from the values and their counts, so the counts are
# stated once instead of being implied by 40 hand-typed literals.
# The proportions (1/4, 2/4, 1/4) are the same as in `values`.
new_vals = np.repeat([2, 3, 9], [10, 20, 10])

In [None]:
# Histogram of new_vals on the same bins as before.
# No column label is passed to .hist(); the table has just the one column, 'value'.
Table().with_column('value', new_vals).hist(bins = bins_for_display)

In [None]:
# Original Array Average (the four-element `values` array).
np.average(values)

In [None]:
# New Array Average (the 40-element `new_vals` array), for comparison with the original array's average.
np.average(new_vals)

In [None]:
# Redraw the histogram of new_vals and mark its average as the balance point.
Table().with_column('value', new_vals).hist(bins = bins_for_display)
plots.ylim(-0.04, 0.5)  # extend the y-axis below 0 to leave room for the marker
# Grey horizontal line at height 0 from x=0 to x=10.
plots.plot([0, 10], [0, 0], color='grey', lw=2)
# Red triangle at the average, computed from the data (4.25 for new_vals) rather
# than hard-coded, so the marker stays correct if new_vals changes.
plots.scatter(np.average(new_vals), -0.015, marker='^', color='red', s=100)
plots.title('Average as a Center of Gravity');

The average gives the value that would balance the distribution based on the weight of each data point, instead of finding the middle value (median).

Symmetric Distributions:    the mean   the median
Skewed Left Distributions:  the mean   the median
Skewed Right Distributions: the mean   the median

## Data Variability ##

How does the data vary from the mean?

Deviation from the mean = 
    Negative values imply
    Positive values imply

### Standard Deviation ###
A standard measure to help describe the distance a value is from the mean. 

The majority of the data will lie within 4 standard deviations of the mean. 

In [None]:
# Start a fresh one-column table of the example values; the deviation columns are added to it below.
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# Average of the example values, kept in a variable so the deviations can be measured from it.
average_value = np.mean(values)
average_value


In [None]:
# Deviation = value - average (elementwise), stored as a new 'Deviation' column.
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

In [None]:
# Add up the deviations: the negative and positive deviations from the average cancel out.
sum(deviations)

### Variance ###

Variance measures the size of the deviations: it is the mean of the squared deviations from the average. Equivalently, it is calculated by squaring the standard deviation. 


In [None]:
# Square each deviation so that negative and positive deviations no longer cancel.
sd_table = sd_table.with_column('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Variance of the data:
# mean squared deviation from average
# (the squared deviations are averaged with np.mean).

variance = np.mean(deviations ** 2)
variance

In [None]:
# Standard Deviation (SD):
# root mean squared deviation from average
# = square root of the variance
# The same square root is taken three ways below; each result is assigned to
# `sd` and printed, so `sd` ends up holding the np.sqrt result.

# Brute Force: raise the variance to the power 0.5
sd = variance ** 0.5
print(f"Brute: {sd}")

# Math Module: math.sqrt from the standard library
sd = math.sqrt(variance)
print(f" Math: {sd}")

# Numpy Module: np.sqrt
sd = np.sqrt(variance)
print(f"Numpy: {sd}")

In [None]:
# np.std computes the SD directly from the data, for comparison with the step-by-step value of sd.
np.std(values)

### Chebyshev's Bounds ###

No matter what the shape of the distribution,
the bulk of the data are in the range “average ± a few SDs”


In [None]:
# Read baby.csv into a Table and drop the 'Maternal Smoker' column;
# the cell displays the number of rows that were loaded.
births = Table.read_table('baby.csv').drop('Maternal Smoker')
births.num_rows

In [None]:
# Labels of the columns that remain in births.
births.labels

In [None]:
# Draw histograms of the columns of births; overlay = False requests a separate
# plot for each column instead of one shared plot.
births.hist(overlay = False)

In [None]:
# Mean and SD of the 'Maternal Pregnancy Weight' column.
# `mean` and `sd` are notebook-level names that the next cell uses to build the 3-SD range.
mpw = births.column('Maternal Pregnancy Weight')
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd

In [None]:
# Keep the rows whose 'Maternal Pregnancy Weight' lies in the range mean ± 3 SDs.
# NOTE(review): per the datascience docs, are.between(x, y) includes x but excludes y —
# confirm if values exactly at the upper end matter.
within_3_SDs = births.where(
    'Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))
within_3_SDs

In [None]:
# Proportion within 3 SDs of the mean:
# rows kept by the filter divided by all rows in births.

within_3_SDs.num_rows / births.num_rows

In [None]:
# Chebyshev's bound:
# This proportion should be at least 1 - 1/z**2, evaluated here for z = 3 SDs.

1 - 1/3**2

In [None]:
# Column labels again, as a reminder of the features the loop below iterates over.
births.labels

In [None]:
# See if Chebyshev's bounds work for distributions with various shapes.
# For every column of births, print the percent of rows lying within
# 2, 3, 4 and 5 SDs of that column's average.
# The loop uses its own names (feature_values, feature_mean, feature_sd) so it
# does not overwrite the notebook-level `values`, `mean` and `sd` set by earlier
# cells; cells that read those names still behave the same if re-run afterwards.

for feature in births.labels:
    feature_values = births.column(feature)
    feature_mean = np.mean(feature_values)
    feature_sd = np.std(feature_values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        chosen = births.where(
            feature, are.between(feature_mean - z*feature_sd, feature_mean + z*feature_sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')

## Standard Units ##

Converting a value to standard units tells how many standard deviations it lies from the mean, and in which direction. 
This is called the z-score and is based on a Normal (symmetric) curve. 
In the standard Normal distribution, the center is 0 and the Standard Deviation is 1. 

To standardize, subtract the mean from the data value and divide by the standard deviation.

In [None]:
def standard_units(x):
    """Convert array x to standard units.

    Each entry is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of x.

    Parameters
    ----------
    x : array-like of numbers
        A NumPy array, or anything np.asarray can convert to one (for example
        a plain list or tuple).

    Returns
    -------
    numpy.ndarray
        (x - mean of x) / SD of x, where the SD is the one np.std computes
        with its default arguments.

    Notes
    -----
    The division is not guarded: when every entry of x is equal the SD is 0
    and the entries of the result are nan.
    """
    data = np.asarray(x)  # lets lists/tuples through; an ndarray is used as-is
    return (data - np.mean(data)) / np.std(data)

In [None]:
# Array of the 'Maternal Age' column of births.
ages = births.column('Maternal Age')
ages

In [None]:
# The same ages converted to standard units with standard_units().
ages_standard_units = standard_units(ages)
ages_standard_units

In [None]:
# Check the mean and SD of the ages once they are in standard units
# (standard_units subtracts the mean and divides by the SD).
np.mean(ages_standard_units), np.std(ages_standard_units)


## Discussion Question 

a. Find the ages that are close to the mean age. 
b. Find the values that are close to the standard deviation of the data set. 


In [None]:
# Show the actual age and the standardized value for the age for each record,
# side by side in a two-column table.

both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both

In [None]:
# Determine the mean and the standard deviation of the data (ages in years).

np.mean(ages), np.std(ages)

In [None]:
# Histogram of ages in years; bin edges run from 15 to 45 in steps of 2.
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
# Histogram of the same ages in standard units, with bin edges every 0.35 starting at -2.2.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
# Set the x-axis limits to -2 and 3.1; the trailing semicolon suppresses the cell's text output.
plots.xlim(-2, 3.1);

## The SD and Bell Shaped Curves 

In [None]:
# Histogram of maternal heights; edges 56.5, 57.5, ..., 72.5 give unit-width bins
# centered on whole numbers.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))
# Tick marks at the odd values 57, 59, ..., 71.
plots.xticks(np.arange(57, 72, 2));

Estimates by eye

The average is approximately: 

Locate the point of inflection on the right. The SD is approximately:

In [None]:
# Compute the actual mean and SD of 'Maternal Height' to compare with the estimates made by eye.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

In [None]:
# Array of the 'Maternal Height' column.
# NOTE(review): this is the same column already stored in `heights` by the previous cell;
# the cells below use `height`.
height = births.column('Maternal Height')
height

In [None]:
# The heights converted to standard units with standard_units().
height_standard_units = standard_units(height)
height_standard_units

In [None]:
# Check the mean and SD of the heights once they are in standard units.
np.mean(height_standard_units), np.std(height_standard_units)


In [None]:
# Two-column table pairing each height with the same height in standard units.
both_height = Table().with_columns(
    'Height in Inches', height,
    'Height in Standard Units', height_standard_units
)
both_height

In [None]:
# Histogram of heights in standard units; bin edges step by 0.4 starting at -3.
both_height.hist('Height in Standard Units', bins = np.arange(-3, 3.4, 0.4))
# Trailing semicolon keeps the tuple returned by xlim out of the cell output,
# matching the other plotting cells in this notebook.
plots.xlim(-3.1, 3.1);