In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action='ignore',category=DeprecationWarning)

## Lecture 10 ##

## Prediction ##

Data scientists often use existing data to help make predictions about an individual who is not in the data set.

Describe the incoming data set

What could we try to predict with this data?

In [None]:
# Read the family heights CSV into a Table and display it.
families = Table.read_table('family_heights.csv')
families

In [None]:
# Element-wise mean of the 'father' and 'mother' columns: one value per row of `families`.
parent_avgs = (families.column('father') + families.column('mother'))/2

In [None]:
# Build the working table one column at a time: the parents' average height
# plus the 'child' and 'sex' columns copied over from `families`.
heights = (
    Table()
    .with_column('Parent Average', parent_avgs)
    .with_column('Child', families.column('child'))
    .with_column('Sex', families.column('sex'))
)
heights

In [None]:
# Scatter plot with 'Parent Average' on the x-axis and 'Child' on the y-axis.
heights.scatter('Parent Average', 'Child')

How would you classify this scatter?

What questions or observations could be made about these variables?

What are some things a Data Scientist could predict using this data?

In [None]:
heights.scatter('Parent Average', 'Child')
# Overlay two vertical red lines at x = 67.5 and x = 68.5, each drawn from y = 50 to y = 85.
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
# The trailing semicolon keeps the cell from echoing the returned line objects.
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);

What do the red vertical lines represent and what are they helping with?

In [None]:
# Keep only the rows whose 'Parent Average' passes are.between(67.5, 68.5)
# (per the datascience docs: >= 67.5 and < 68.5).
nearby = heights.where('Parent Average', are.between(67.5, 68.5))
# Average of the 'Child' column over just those rows.
nearby_mean = np.average(nearby.column('Child'))
nearby_mean

What does the nearby mean value represent?

What could this help predict?


The code below will plot the nearby mean on our scatterplot.

In [None]:
heights.scatter('Parent Average', 'Child')
# Same two vertical red lines as before, at x = 67.5 and x = 68.5.
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
# Mark the point (68, nearby_mean) in red; 68 is the midpoint of 67.5 and 68.5.
plots.scatter(68, nearby_mean, color='red', s=50);

What is the function defined below designed to do?


In [None]:
def predict(h):
    """Predict a child's height for parents whose average height is h.

    The prediction is the mean 'Child' value over the rows of the global
    `heights` table whose 'Parent Average' passes
    are.between(h - 1/2, h + 1/2), i.e. a window one unit wide centred on h.
    """
    half_window = 1/2
    in_window = are.between(h - half_window, h + half_window)
    neighbours = heights.where('Parent Average', in_window)
    return np.average(neighbours.column('Child'))

Use the new function to predict a child's height from the parents' average height.

In [None]:
# Use the function to make a prediction for parents whose average height is 68"


In [None]:
# Use the function to make a prediction for parents whose average height is 70"


In [None]:
# Use the function to make a prediction for parents whose average height is 73"


What does the code below do?

In [None]:
# Table.apply calls predict once per row, passing that row's 'Parent Average' value,
# and returns the results as an array.
predicted_heights = heights.apply(predict, 'Parent Average')

In [None]:
# Rebind `heights` to a table that also has a 'Prediction' column.
# predict() reads the global `heights`, so later calls will see this new table.
heights = heights.with_column('Prediction', predicted_heights)

Create a new scatter that shows all the predicted heights.

In [None]:
# With only the x column named, scatter plots every other selected column
# ('Child' and 'Prediction') against 'Parent Average' on the same axes.
heights.select('Parent Average', 'Child', 'Prediction').scatter('Parent Average')

## Prediction Accuracy ##

Prediction is rarely perfect. We may overestimate or underestimate the correct value.
Notice that in the scatter plot above, there are data points above and below every predicted value. This is because each prediction is an average of the nearby children's heights.


In [None]:
# Helper used to compute prediction errors: the signed gap between two values.
def difference(x, y):
    """Return x minus y; the result is positive when x is the larger value."""
    gap = x - y
    return gap

What will the code below produce?

In [None]:
# For every row compute difference(Prediction, Child), i.e. Prediction - Child.
pred_errs = heights.apply(difference, 'Prediction', 'Child')
# Store the result in a new 'errors' column and display the table.
heights = heights.with_column('errors',pred_errs)
heights

In [None]:
# Histogram of the 'errors' column for all rows together.
heights.hist('errors')

The histogram above shows the prediction errors. 
What is the shape? 

What are the values centered around? Why?

In [None]:
# Same histogram, but with one overlaid distribution per distinct value of 'Sex'.
heights.hist('errors', group='Sex')

What does the histogram above show?

How would you classify these histograms?

What did disaggregating these values reveal?

# Discussion Questions

1. How could we take sex into account when making predictions?
2. Do we make smaller errors on average when we do this?

Let's start by creating a new function. How is this function different from the original prediction function?


In [None]:
#Original Function:
#def predict(h):
#    nearby = heights.where('Parent Average', are.between(h - 1/2, h + 1/2))
#    return np.average(nearby.column('Child'))

def predict_smarter(h, s):
    """Predict a child's height from the parents' average height h and sex s.

    Looks at the rows of the global `heights` table whose 'Parent Average'
    passes are.between(h - 1/2, h + 1/2), keeps only those whose 'Sex'
    equals s, and returns the mean of their 'Child' values.
    """
    height_window = are.between(h - 1/2, h + 1/2)
    matches = (
        heights
        .where('Parent Average', height_window)
        .where('Sex', s)
    )
    return np.average(matches.column('Child'))

In [None]:
# Use the new function to predict the height of a female child born to parents with an average height of 68".


In [None]:
# Use the new function to predict the height of a male child born to parents with an average height of 68".


In [None]:
# Call predict_smarter once per row with that row's 'Parent Average' and 'Sex' values.
smarter_predicted_heights = heights.apply(predict_smarter, 'Parent Average', 'Sex')
# Store the results in a new 'Smarter Prediction' column.
heights = heights.with_column('Smarter Prediction', smarter_predicted_heights)

In [None]:
# Error = Smarter Prediction - Child: prediction first, actual second, the same
# argument order used for the 'errors' column. Keeping one sign convention lets
# the 'errors' and 'Smarter Errors' histograms be compared directly.
smarter_pred_errs = heights.apply(difference, 'Smarter Prediction', 'Child')
# Store the result in a new 'Smarter Errors' column and display the table.
heights = heights.with_column('Smarter Errors', smarter_pred_errs)
heights

In [None]:
# Histogram of 'Smarter Errors', with one overlaid distribution per distinct value of 'Sex'.
heights.hist('Smarter Errors', group='Sex')

Did the function change improve the prediction results? How do you know?


## Grouping by One Column ##

In [None]:
# Read the cones CSV into a Table, drop its 'Color' column, and display the result.
cones = Table.read_table('cones.csv').drop('Color')
cones

In [None]:
# With no aggregation function, group counts the rows for each distinct 'Flavor'.
cones.group('Flavor')

In [None]:
# Passing a function aggregates the other columns within each flavor; here, their average.
cones.group('Flavor', np.average)

In [None]:
# Same grouping, but taking the minimum of the other columns within each flavor.
cones.group('Flavor', np.min)

## Grouping By One Column: Welcome Survey ##

In [None]:
# Read the welcome survey CSV into a Table and display its first 3 rows.
survey = Table.read_table('welcome_survey_sp22.csv')
survey.show(3)

In [None]:
# Histogram of the 'Extroversion' column.
survey.hist('Extroversion')

In [None]:
# One row per distinct 'Extroversion' value, with np.average applied to the other columns.
by_extra = survey.group('Extroversion', np.average)
by_extra

In [None]:
# group(..., np.average) labels each aggregated column '<original label> average',
# which is why the y column here is 'Number of textees average'.
by_extra.plot('Extroversion', 'Number of textees average')

In [None]:
# Count the rows for each distinct 'Year' value.
survey.group("Year")

In [None]:
# Keep only 'Year' and 'Hours of sleep', then group by column index 0 ('Year')
# and average the remaining column within each year.
# take(1, 7, 8, 3) then picks those row indices, in that order.
# NOTE(review): the indices depend on the contents of the CSV; presumably they
# select and order specific years. Confirm against the grouped table.
(survey
 .select("Year", "Hours of sleep")
 .group(0, np.average)
 .take(1, 7, 8, 3))

## Lists

In [None]:
# A list literal can mix value types: two ints, a string, and a float.
[1, 5, 'hello', 5.0]

In [None]:
# A list can also hold an array (built here with make_array) as one of its elements.
[1, 5, 'hello', 5.0, make_array(1,2,3)]

## Grouping by Two Columns ##

![Do right-handed people tend to sleep on their left side and left-handed people sleep on their right?](handed.png)

In [None]:
# Passing a list of labels groups by each distinct ('Handedness', 'Sleep position')
# combination; with no aggregation function the result counts rows per combination.
# show() with no argument displays every row of the result.
survey.group(['Handedness', 'Sleep position']).show()