In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

In [None]:
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in DSCI 100.


def resize_window(lim=3.5):
    """Set the x- and y-axis limits of the current plot to (-lim, lim)."""
    for set_limits in (plots.xlim, plots.ylim):
        set_limits(-lim, lim)
    
def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='#1e90ff'):
    """Plot the line y = slope*x + intercept through the given x values.

    x defaults to the two endpoints -4 and 4. The line is drawn in `color`
    with a line width of 3.
    """
    plots.plot(x, x*slope + intercept, color=color, lw=3)
    
def draw_vertical_line(x_position, color='black'):
    """Plot a vertical segment at x = x_position running from y = -4 to y = 4."""
    plots.plot(
        make_array(x_position, x_position),  # same x at both endpoints
        make_array(-4, 4),
        color=color,
        lw=3,
    )
    
def make_correlated_data(r):
    """Make up 1000 (x, y) points whose correlation is approximately r.

    x and an independent noise term z are each drawn from a normal
    distribution with mean 0 and SD 1 using NumPy's global random state,
    and y is built as r*x + sqrt(1 - r**2)*z.

    Args:
        r: target correlation; must be between -1 and 1 inclusive.

    Returns:
        A tuple (x, y) of two arrays, each of length 1000.

    Raises:
        ValueError: if r is outside [-1, 1]. Without this check
            sqrt(1 - r**2) is NaN and every y value silently becomes NaN.
    """
    if not -1 <= r <= 1:
        raise ValueError("r must be between -1 and 1, got {!r}".format(r))
    # Draw x before z so results for a given seed match earlier runs.
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    y = r*x + np.sqrt(1 - r**2)*z
    return x, y

def r_scatter(r):
    """Draw a scatter plot of made-up data whose correlation is approximately r.

    A new figure with figsize (5, 5) is created and both axes are limited
    to the range -4 to 4.
    """
    axis_limit = 4
    plots.figure(figsize=(5,5))
    xs, ys = make_correlated_data(r)
    plots.scatter(xs, ys, color='darkblue', s=20)
    for set_limits in (plots.xlim, plots.ylim):
        set_limits(-axis_limit, axis_limit)
    
def r_table(r):
    """
    Build a table with columns 'x' and 'y' holding 1000 made-up data points
    whose correlation is approximately r.

    The global NumPy random state is re-seeded with 8 on every call, so the
    same r always produces the same table.
    """
    np.random.seed(8)
    points = make_correlated_data(r)
    return Table().with_columns('x', points[0], 'y', points[1])

# Lecture 30 Linear Regression

### Interpreting Relationships
Functions from lecture 29.

In [None]:
def standard_units(x):
    """Return the array of numbers x converted to standard units.

    The average of x is subtracted from each value and the result is
    divided by the standard deviation of x (np.std).
    """
    center = np.average(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
def correlation(t, x, y):
    """Return the correlation between two columns of a table.

    t is a table; x and y are column labels. Both columns are converted to
    standard units and the average of their elementwise product is returned.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)

### Nonlinearity

Some relationships are not linear. Because r only measures *linear* association, it is not a helpful measure of the strength of such a relationship.

In [None]:
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
        'x', new_x,
        'y', new_x**2
    )
nonlinear.scatter('x', 'y', s=30, color='r')

In [None]:
#Since the relationship is not linear, r makes it appear as if there is no relationship. 
#There is obviously a pattern so r is not helpful.

correlation(nonlinear, 'x', 'y')

### Outliers

Outliers are individual data points that lie far outside the pattern of the rest of the data.<br>
They can cause r to show no relationship when there is one, or a strong relationship when it is actually weak.


In [None]:
#Example scatter with a very linear relationship. 
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4),
        'y', make_array(1, 2, 3, 4)
    )
line.scatter('x', 'y', s=30, color='r')

In [None]:
correlation(line, 'x', 'y')

In [None]:
#How is r affected when we insert an outlier far off the trend of the graph?
outlier = Table().with_columns(
        'x', make_array(1, 2, 3, 4, 5),
        'y', make_array(1, 2, 3, 4, 0)
    )
outlier.scatter('x', 'y', s=30, color='r')

In [None]:
#Only one outlier can greatly affect r

correlation(outlier, 'x', 'y')

### Ecological Correlations

Load the 2014 SAT data for 51 regions: the 50 states and DC.
                                   
Describe the table.
                                   
Note: The scores are the averages across each state.

In [None]:
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014

In [None]:
#Plot the scatter for Reading and Math correlation.
#What would you estimate the correlation coefficient, r, to be?



In [None]:
#Use the correlation(t, x, y) function to quantify the relationship between Critical Reading and Math. 



### Be Careful
What does each point on the scatter plot represent?

Is this helpful in predicting an individual's expected score on the SAT? Why or why not?

Would r be the same if we used individuals?

In [None]:
#Each region has a different participation rate because of that state's requirements regarding the SAT.
#If we input the participation rate of a region, what will this function return?

def rate_code(x):
    """Classify a participation rate x as 'low', 'medium', or 'high'.

    Rates up to and including 25 are 'low', rates above 25 up to and
    including 75 are 'medium', and everything else is 'high'.
    """
    if x <= 25:
        return 'low'
    if x <= 75:
        return 'medium'
    return 'high'

In [None]:
#Run this function on the Participation Rate column.

rate_codes = 

In [None]:
#Create a new table adding the Rate Code column with those values. 

sat2014_rate = 
sat2014_rate

In [None]:
#View the scatter with the rate code grouping. Show the scatter command specs to see how to group. 



What is happening with the grouping?

Low Participation: 

Medium Participation: 

High Participation: 


In [None]:
#Display a table of only the regions with low rate code. Do not reassign. 



In [None]:
#How many regions had a low rate code?



An ecological correlation is a correlation computed from the averages of a measurement for groups, instead of from the measurements of each individual in those groups.
It is not a true correlation between individuals; the individual-level correlation is typically weaker.

## Prediction Lines

The correlation r can also help us identify the straight line that the points are clustered around.
Averaging the y-values of the nearest neighbors lets us predict the y-value for a particular x-value.

### When r = 0.99.

We will create a table of fictitious data that has a correlation of approximately a particular r.


In [None]:

example = r_table(0.99)
example.show(3)

In [None]:
example.scatter('x', 'y')
resize_window()

In [None]:
#Nearest neighbor prediction.

def nn_prediction_example(x_val):
    """Nearest-neighbor prediction of y at x_val from the global `example` table.

    Selects the rows of `example` whose 'x' value is matched by
    are.between(x_val - .25, x_val + .25) and returns the mean of their
    'y' values. `example` is looked up when the function is called, so the
    prediction uses whatever table that name currently refers to.
    """
    lower, upper = x_val - .25, x_val + .25
    nearby_y = example.where('x', are.between(lower, upper)).column('y')
    return np.mean(nearby_y)

In [None]:
#Use the function to predict the y-value from an x-value of -2.25.



In [None]:
#Add the column of the predicted y-values to the example table. 
example = 

In [None]:
#Show the plotted line of predicted values.

example.scatter('x')
resize_window()

In [None]:
#Same graph overlaid with a line of slope=1.
example.scatter('x')
draw_line(slope=1)
resize_window()

### Let's run through the same process with different r values. 

### When r = 0.

In [None]:
example = r_table(0)
example.scatter('x', 'y')
resize_window()

In [None]:
example = example.with_columns(
    'Predicted y', 
    example.apply(nn_prediction_example, 'x'))

In [None]:
example.scatter('x')
draw_line(slope = 0)
resize_window()

### When r = 0.5.

In [None]:
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()

In [None]:
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()
draw_vertical_line(1.5)
draw_line(slope=1, intercept=0, color='red')

In [None]:
example = example.with_column('Predicted y', example.apply(nn_prediction_example, 'x'))
example.scatter('x')
draw_line(slope=1, color='red')
draw_vertical_line(1.5)
resize_window()

In [None]:
example.scatter('x')
draw_line(slope=1, intercept=0, color='red')
draw_line(slope=0.5, intercept=0)
resize_window()

### When r = 0.7.

In [None]:
example = r_table(0.7)
example = example.with_column('Predicted y', example.apply(nn_prediction_example, 'x'))
example.scatter('x')
draw_line(slope=1, intercept=0, color='red')
draw_line(slope=0.7, intercept=0, color='dodgerblue')
resize_window()

## Linear regression: defining the line

Linear regression creates a model that defines the relationship between two variables. 

When both variables are in standard units, the line that defines the model is $y=mx$,<br>
where $m$, the slope, is equal to $r$ and the y-intercept, $b$, is equal to zero.

In [None]:
# Copy-pasted from above
def standard_units(x):
    """ Converts an array x to standard units: (x - mean of x) / SD of x """
    deviations = x - np.mean(x)
    return deviations / np.std(x)

def correlation(t, x, y):
    """ Computes correlation: t is a table, and x and y are column names.

    Both columns are converted to standard units and the mean of their
    elementwise product is returned.
    """
    columns_su = [standard_units(t.column(name)) for name in (x, y)]
    return np.mean(columns_su[0] * columns_su[1])


In [None]:
#Slope of data. 
#What is the standard deviation of normalized data?
def slope(t, x, y):
    """ Computes the slope of the regression line for predicting y from x.

    t is a table, and x and y are column names. The slope is
    r * (SD of y) / (SD of x), where r is correlation(t, x, y).
    """
    r = correlation(t, x, y)
    return r * np.std(t.column(y)) / np.std(t.column(x))


In [None]:
#Intercept of data
#What is the mean of normalized data?

def intercept(t, x, y):
    """ Computes the intercept of the regression line for predicting y from x.

    t is a table, and x and y are column names. The intercept is
    (mean of y) - slope(t, x, y) * (mean of x).
    """
    line_slope = slope(t, x, y)
    return np.mean(t.column(y)) - line_slope*np.mean(t.column(x))

In [None]:
example = r_table(0.5)
slope(example, 'x', 'y')

## Heights Data and Regression Line

When the data are not standardized, the model follows the definition of a line, $y=mx+b$,<br>
where $m$, the slope, is equal to $\dfrac{r \cdot y_{SD}}{x_{SD}}$<br>
and $b$, the y-intercept, is equal to $y_{mean} - m \cdot x_{mean}$.

In [None]:
# Note: Child heights are the **adult** heights of children in a family
families = Table.read_table('family_heights.csv')
parent_avgs = (families.column('father') + families.column('mother'))/2
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
)
heights.show(5)

In [None]:
def nn_prediction_height(p_avg):
    """Predict the height of a child whose parents have a parent average height of p_avg.

    Looks up the rows of the global `heights` table whose 'Parent Average'
    is matched by are.between(p_avg - 0.5, p_avg + 0.5) and returns the
    average of their 'Child' values.
    """
    window = are.between(p_avg-0.5, p_avg + 0.5)
    child_heights = heights.where('Parent Average', window).column('Child')
    return np.average(child_heights)

In [None]:
heights_with_predictions = heights.with_column(
    'Nearest neighbor prediction', 
    heights.apply(nn_prediction_height, 'Parent Average'))
heights_with_predictions.show(5)

In [None]:
# Show a scatter of the Parent Averages.



In [None]:
predicted_heights_slope = slope(heights, 'Parent Average', 'Child')
predicted_heights_intercept = intercept(heights, 'Parent Average', 'Child')
[predicted_heights_slope, predicted_heights_intercept]

Write the information found in the format of a linear regression model.



In [None]:
# Use the model to create a Regression Prediction column. 
heights_with_predictions = heights_with_predictions.with_column(
    'Regression Prediction', 
    
)
heights_with_predictions

In [None]:
heights_with_predictions.scatter('Parent Average')


In [None]:
# Create a table that shows the heights in standard units.
Standard_Heights = Table().with_columns(
    'Parent Average',  standard_units(heights.column('Parent Average')), 
    'Child', standard_units(heights.column('Child'))
    )
Standard_Heights

In [None]:
Standard_Heights.scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

In [None]:
predicted_Sheights_slope = slope(Standard_Heights, 'Parent Average', 'Child')
predicted_Sheights_intercept = intercept(Standard_Heights, 'Parent Average', 'Child')
[predicted_Sheights_slope, predicted_Sheights_intercept]

Write the information found in the format of a linear regression model.



In [None]:
correlation(Standard_Heights, 'Parent Average', 'Child')

In [None]:
Standard_Heights.scatter(0, 1)
draw_line(slope=predicted_Sheights_slope, intercept=predicted_Sheights_intercept, color='red')
plots.xlim(-3, 3)
plots.ylim(-3, 3);