In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

### Regression line vs other lines

In [None]:
def standard_units(arr):
    """Convert an array of values to standard units.

    Each value is re-expressed as its distance from the average of `arr`,
    measured in multiples of the standard deviation of `arr` (as computed
    by np.std). Returns an array with the same length as `arr`.
    """
    center = np.average(arr)
    spread = np.std(arr)
    return (arr - center) / spread

def correlation(t, x, y):
    """Compute the correlation coefficient r between two columns of a table.

    t is a table, and x and y are the labels of the columns to compare.
    r is the average of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)

def slope(t, x, y):
    """Compute the slope of the regression line for predicting y from x.

    t is a table, and x and y are column labels, as in correlation.
    The slope is r * (SD of y) / (SD of x).
    """
    r = correlation(t, x, y)
    sd_y, sd_x = np.std(t.column(y)), np.std(t.column(x))
    # Keep the left-to-right evaluation (r * sd_y) / sd_x.
    return r * sd_y / sd_x

def intercept(t, x, y):
    """Compute the intercept of the regression line for predicting y from x.

    t is a table, and x and y are column labels, as in slope.
    The intercept is (mean of y) - slope * (mean of x), which makes the line
    pass through the point of averages.
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    rise_to_mean_x = slope(t, x, y) * mean_x
    return mean_y - rise_to_mean_x

def fitted_values(t, x, y):
    """Return an array of the regression estimates (predictions) at all the x values.

    t is a table, and x and y are column labels. The result has one entry per
    row of t: slope * (that row's x value) + intercept.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return line_slope * x_values + line_intercept

## Residuals

A residual is the error in the regression estimate for a data point: how far the observed y value is from the value the regression line predicts at that point's x.

To calculate the residual: <br>
= observed y - regression estimate of y<br>
= observed y - height of regression line at x<br>
= vertical distance between the point and the regression (best-fit) line

In [None]:
# 2016 data about different voting districts throughout the US.
# We will focus on the median income of each district and the percent of voters
# who attended college, so the other columns are dropped right after the file is read.
demographics = Table.read_table('district_demographics2016.csv').drop(
    'State', 'District', 'Percent voting for Clinton'
)
demographics.show(5)

In [None]:
# Create a function that finds the residual for each data point.
def residuals(t, x, y):
    """Return an array of residuals: observed y minus the regression estimate of y.

    t is a table, and x and y are column labels. The result has one entry per
    row of t.
    """
    observed = t.column(y)
    return observed - fitted_values(t, x, y)

In [None]:
# Create a table that adds the fitted value (from the regression line) and the
# residual (observed Median Income minus that fitted value) for each row.
# Both new columns are computed from the table as it was before this assignment.
demographics = demographics.with_columns(
    'Fitted Value', fitted_values(demographics, 'College%', 'Median Income'),
    'Residual', residuals(demographics, 'College%', 'Median Income')
)
demographics

# Why are some values negative and others positive?


In [None]:
# View the residuals as a scatter compared to the data scatter.
# What are the residuals centered around? Why?

demographics.scatter('College%')

In [None]:
def plot_residuals(t, x, y):
    """Draw two scatter plots for the regression of y on x.

    t is a table, and x and y are column labels. The first plot shows the
    observed y values and the fitted values against x; the second shows the
    residuals against x. The input table t is not modified.
    """
    fitted_column = fitted_values(t, x, y)
    residual_column = residuals(t, x, y)
    augmented = t.with_columns(
        'Fitted', fitted_column,
        'Residual', residual_column
    )
    # Plot 1: data and regression estimates, both against the first selected column (x).
    augmented.select(x, y, 'Fitted').scatter(0)
    # Plot 2: residuals against x.
    augmented.scatter(x, 'Residual')

In [None]:
# Plot the residuals on a separate scatter, below the data with its fitted values.
# What are some characteristics you notice about the residuals?

plot_residuals(demographics, 'College%', 'Median Income')

In [None]:
# View the data and residuals for the relationship between the parents' average
# height ('MidParent') and their child's height ('Child').
galton = Table.read_table('heights.csv')

# Keep only the two columns needed, under shorter labels.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )
plot_residuals(heights, 'MidParent', 'Child')

## Dugongs ##

Dugongs are marine animals related to the manatee. They are declining in numbers, so ecologists are monitoring them. 
Since ecologists cannot precisely measure how old a dugong is, they want to know whether they can predict its age by measuring its length. 


In [None]:
# Load the dugong data and preview it with show(5).
dugong = Table.read_table('dugong.csv')
dugong.show(5)

In [None]:
#Generate a scatter comparing Length and Age



In [None]:
#Find r for the Length and Age.



In [None]:
# Plot Age and its fitted values against Length, then the residuals against Length.
plot_residuals(dugong, 'Length', 'Age')

What do you notice about the data compared to the Regression Line?

What do you notice about the residual plot?

What might this indicate?



# US Women

In [None]:
# Import the table on US women. It lists the average weight for women in each height category.
us_women = Table.read_table('us_women.csv')
us_women.show(5)

In [None]:
# Why is this so linear? What type of correlation is this?

us_women.scatter('height')

In [None]:
#Find r for height and ave weight.



In [None]:
# Plot 'ave weight' and its fitted values against 'height', then the residuals against 'height'.
plot_residuals(us_women, 'height', 'ave weight')

What do you notice about the data compared to the Regression Line?

What do you notice about the residual plot?

What might this indicate?

## Average of Residuals ##

Since the residuals are centered around zero, with some above and some below and basically no correlation with x, we expect the average of all of the residuals to equal zero. 

Residuals from a linear regression always have
 * Zero mean (so rmse = sd of residuals)
 * Zero correlation with x
 * Zero correlation with the fitted values


These are all true no matter what the data look like.
 * Just like deviations from the mean are zero on average


In [None]:
# Check the residual average for the Dugong data (rounded to 6 decimal places).

round(np.average(residuals(dugong, 'Length', 'Age')), 6)

In [None]:
# Check the residual average for the Child Height data (rounded to 6 decimal places).

round(np.average(residuals(heights, 'MidParent', 'Child')), 6)

In [None]:
# Check the residual average for the Voting District data (rounded to 6 decimal places).

round(np.average(residuals(demographics, 'College%', 'Median Income')), 6)

In [None]:
# Recall the parents' average height and the child's adult height, now with the
# fitted value and its residual added as columns.
heights = heights.with_columns(
    'Residual', residuals(heights, 'MidParent', 'Child'),
    'Fitted Value', fitted_values(heights, 'MidParent', 'Child')
)
heights

In [None]:
# Check the correlation between the residuals and the parents' average height (the x variable).
# Note: the column compared with 'Residual' here is 'MidParent', not 'Child'.

correlation(heights, 'MidParent', 'Residual')


In [None]:
# Check the correlation between the residuals and the fitted values (rounded to 6 decimal places).

round(correlation(heights, 'Fitted Value', 'Residual'), 6)

In [None]:
# Recall the correlation between the parents' average height and the child's height.
r_heights = correlation(heights, 'MidParent', 'Child')
r_heights

In [None]:
# Compare the SD of the residuals with sqrt(1 - r^2) * (SD of the child heights),
# i.e. the rmse of the regression for child heights.

(
    np.std(heights.column('Residual')),
    np.sqrt(1 - r_heights**2) * np.std(heights.column('Child')),
)

In [None]:
# Recall the Voting District data: each district's median income and percent who
# went to college, with the fitted value of the median income and its residual.
demographics

In [None]:
# Recall r, the correlation between College% and Median Income.
r = correlation(demographics, 'College%', 'Median Income')
r

In [None]:
# Check the correlation between the fitted values and the residuals.

correlation(demographics, 'Fitted Value', 'Residual')

In [None]:
# Compare the SD of the residuals with sqrt(1 - r^2) * (SD of Median Income),
# i.e. the rmse of the regression for the demographics Median Income.

(
    np.std(demographics.column('Residual')),
    np.sqrt(1 - r**2) * np.std(demographics.column('Median Income')),
)

### Discussion Question ###

How would we adjust our regression line...
 * if the average residual were 10?
 * if the residuals were positively correlated with x?
 * if the residuals were above 0 in the middle and below 0 on the left and right?
 
 Average residual of 10:  
 
 Positively correlated: 
 
 Above and below 0 at the ends: 