In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 8 ##

## Histogram Review

In [None]:
# Load the top-movies dataset from its CSV file and display the table
top_movies = Table.read_table('top_movies_2017.csv')
top_movies

In [None]:
# Age of each movie, measured from 2019, computed from its 'Year' column
# NOTE(review): the name `ages` is rebound to a Table in the Apply section below
ages = 2019 - top_movies.column('Year')
# Rebind top_movies to a table that also carries the new 'Age' column
top_movies = top_movies.with_column('Age', ages)

In [None]:
# Bin edges for movie age; the widths are not all equal (5, 5, 5, 10, 15, 25, 35)
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 100)

In [None]:
# Bin the 'Age' column using the edges in my_bins, then display the result
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

In [None]:
# Total number of binned movies: add up the per-bin counts
num_movies = sum(binned_data.column('Age count'))
num_movies

In [None]:
# Histogram of movie ages using the custom bin edges; the unit of 'Age' is given as 'Year'
top_movies.hist('Age', bins = my_bins, unit = 'Year')

In [None]:
# Add each bin's share of all binned movies, expressed as a percentage
binned_data = binned_data.with_column(
    'Percent', binned_data.column('Age count')/num_movies * 100)

In [None]:
# Display the binned table again
binned_data

## Height

### Question: What is the height of the [40, 65) bin?

In [None]:
# Step 1: Calculate % of movies in the [40, 65) bin
# Select the row whose 'bin' value is 40 and read its 'Percent' entry
percent = binned_data.where('bin', 40).column('Percent').item(0)

In [None]:
# Step 2: Calculate the width of the 40-65 bin
# (right edge minus left edge)
width = 65 - 40

In [None]:
# Step 3: Area of rectangle = height * width
#         --> height = percent / width
# Here the bar's area is the bin's percent, so dividing by the width gives the height
height = percent / width
height

### What are the heights of the rest of the bins?

In [None]:
# Get the bin lefts
# Keep every row except the last one
# (presumably the last row only marks the right end of the final bin -- TODO confirm)
bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))

In [None]:
# Get the bin widths
# Differences between consecutive 'bin' values: one fewer entry than binned_data has rows
bin_widths = np.diff(binned_data.column('bin'))
bin_lefts = bin_lefts.with_column('Width', bin_widths)

In [None]:
# Get the bin heights
# Height = percent in the bin divided by the bin's width, computed element-wise
bin_heights = bin_lefts.column('Percent') / bin_widths
bin_lefts = bin_lefts.with_column('Height', bin_heights)

In [None]:
# Display the table of bins with their computed columns
bin_lefts

In [None]:
# Draw the same age histogram again; its bar heights can be checked against the computed heights
top_movies.hist('Age', bins = my_bins, unit = 'Year')

## Visualization Example: Welcome Survey ##

In [None]:
# Load the welcome-survey responses from their CSV file and display the table
survey = Table.read_table('welcome_survey_sp21.csv')
survey

### Categorical Data: Bar Charts

In [None]:
# Group the survey rows by their 'intro_extra' value, then display the grouped table
intro_extra = survey.group('intro_extra')
intro_extra

In [None]:
# Horizontal bar chart of the grouped table, labelled by 'intro_extra'
intro_extra.barh('intro_extra')

### Numerical Data: Histograms

In [None]:
# Histogram of the 'countries' column with the default bins
survey.hist('countries')

In [None]:
# Histogram of the 'sleep' column with the default bins
survey.hist('sleep')

In [None]:
# Smallest and largest 'sleep' values, shown together as a pair
min(survey.column('sleep')), max(survey.column('sleep'))

In [None]:
# Bin edges for the sleep histogram: the integers 5 through 11
sleep_bins = np.arange(5, 12)

In [None]:
# Histogram of 'sleep' using the bin edges in sleep_bins
survey.hist('sleep', bins=sleep_bins)

In [None]:
# Bin 'sleep' using just three edges: 0, 8 and 15
survey.bin('sleep', bins=make_array(0,8,15))

In [None]:
# Percentage that 19 makes up of 19 + 17
# NOTE(review): 19 and 17 are presumably the two counts shown by the previous cell --
# confirm; the literals go stale if the survey data changes
19 / (19 + 17) * 100

## Functions ##

In [None]:
def triple(x):
    """Return three times x.

    x can be any value for which `3 * x` is defined, e.g. a number,
    a string (repeated three times) or a NumPy array (scaled element-wise).
    """
    tripled = 3 * x
    return tripled

In [None]:
# 3 * 3 -> 9
triple(3)

In [None]:
# Bind a name to a number so it can be passed to a function
num = 4

In [None]:
# The argument can be a name; the function receives the value bound to it
triple(num)

In [None]:
# The argument expression is evaluated first, then its value is tripled
triple(num * 5)

### Type Agnostic

In [None]:
# 3 * 'ha' repeats the string -> 'hahaha'
triple('ha')

In [None]:
# Multiplying an array by 3 scales every element: 0, 3, 6, 9
triple(np.arange(4))

### Discussion Question

In [None]:
# What does this function do? 
# What kind of input does it take? 
# What output will it give? 
# What would be a reasonable name for this function?
def f(s):
    # No docstring on purpose: working out what this does is the exercise
    # posed in the comments directly above the definition.
    total = sum(s)
    scaled = s / total * 100
    return np.round(scaled, 2)

### Multiple Arguments

$ h^2 = x^2 + y^2 \hspace{20 pt} \Rightarrow \hspace{20 pt} h = \sqrt{ x^2 + y^2 } $

In [None]:
def hypotenuse(x, y):
    """Return the square root of x squared plus y squared.

    For a right triangle with legs x and y this is the length of
    the hypotenuse.
    """
    x_squared = x ** 2
    y_squared = y ** 2
    return (x_squared + y_squared) ** 0.5

In [None]:
# 81 + 144 = 225, whose square root is 15
hypotenuse(9, 12)

In [None]:
# 4 + 4 = 8, whose square root is not a whole number (about 2.83)
hypotenuse(2, 2)

## Apply ##

In [None]:
# Small example table: one row per person, with that person's birth year
ages = (
    Table()
    .with_column('Person', make_array('Jim', 'Pam', 'Michael', 'Creed'))
    .with_column('Birth Year', make_array(1985, 1988, 1967, 1904))
)
ages

In [None]:
def cap_at_1980(x):
    """Return x, except that values greater than 1980 are replaced by 1980."""
    if 1980 < x:
        return 1980
    return x

In [None]:
# 1975 is below the cap, so it comes back unchanged
cap_at_1980(1975)

In [None]:
# 1991 is above the cap, so the result is 1980
cap_at_1980(1991)

In [None]:
# Call cap_at_1980 on every value in the 'Birth Year' column
ages.apply(cap_at_1980, 'Birth Year')

In [None]:
def name_and_age(name, year, current_year=2021):
    """Return a sentence such as 'Jim is 36' stating a person's age.

    name: the person's name, a string.
    year: the person's birth year.
    current_year: the year the age is measured in. Defaults to 2021,
        the value that used to be hard-coded, so existing two-argument
        calls give the same result as before.
    """
    age = current_year - year
    # str() is needed because a string and a number cannot be joined with +
    return name + ' is ' + str(age)

In [None]:
# Call name_and_age once per row, passing that row's 'Person' and 'Birth Year' values
ages.apply(name_and_age, 'Person', 'Birth Year')