In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Base URL of the course data folder; each data file name used below is appended to it.
path = 'https://github.com/oregon-data-science/DSCI101/raw/main/data/'

# Shown as this cell's output once the setup lines above have run without error.
'imports OK'

## Table review with census data

In [None]:
## Read in census data

In [None]:
# Load the census CSV from the course data folder into a Table and display it
full = Table.read_table(path + 'nc-est2014-agesex-res.csv')
full

In [None]:
# Keep only the columns we care about: the SEX code, AGE, and the
# 2010 and 2014 population estimates
partial = full.select('SEX', 'AGE', 'POPESTIMATE2010', 'POPESTIMATE2014')
partial

In [None]:
# Make things easier to read: shorten the two estimate labels to just the year.
# The columns are addressed by label; they sit at positions 2 and 3 of `partial`.
simple = (
    partial
    .relabeled('POPESTIMATE2010', '2010')
    .relabeled('POPESTIMATE2014', '2014')
)
simple

In [None]:
# Sort by age. The sorted table is only displayed here; `simple` is not reassigned.
simple.sort('AGE')

In [None]:
# Remove the age totals (the 999 category) by keeping only rows whose AGE is below 999
no_999 = simple.where('AGE', are.below(999))
no_999 

In [None]:
# Remove the male and female rows (keep only the combined category, SEX equal to 0),
# then drop the SEX column
everyone = no_999.where('SEX', 0).drop('SEX')
everyone

In [None]:
## Males and Females in 2014 ##

In [None]:
# Let's compare male and female counts per age.
# SEX code 1 is kept as `males` and SEX code 2 as `females`; the SEX column is
# dropped from each after filtering.
males = no_999.where('SEX', 1).drop('SEX')
females = no_999.where('SEX', 2).drop('SEX')

In [None]:
# Row count of the females table
females.num_rows

In [None]:
# Row count of the males table (compare with the females count)
males.num_rows

In [None]:
## Put them back together in one table.
## Pairing the columns row-by-row is only valid when males and females list the
## same ages in the same order (which also means the same number of rows),
## so check that before combining.
assert np.array_equal(males.column('AGE'), females.column('AGE')), \
    'males and females must have matching AGE rows'
pop_2014 = Table().with_columns(
    'Age', males.column('AGE'),
    'Males', males.column('2014'),
    'Females', females.column('2014')
)
pop_2014

In [None]:
# Line plot with Age on the x-axis
pop_2014.plot('Age')

In [None]:
# Calculate the percent female for each age.
# Step 1: total per age = Males + Females, added element by element
total = pop_2014.column('Males') + pop_2014.column('Females')
total

In [None]:
# Step 2: females as a share of the total, scaled by 100 to give a percentage
pct_female = (pop_2014.column('Females') / total) * 100
pct_female

In [None]:
# Round to 2 decimal places so that it's easier to read
pct_female = np.round(pct_female, 2)
pct_female

In [None]:
# Add female percent to our table as a new 'Percent female' column
pop_2014 = pop_2014.with_column('Percent female', pct_female)
pop_2014

In [None]:
# Plot percent female against age
pop_2014.plot('Age', 'Percent female')

In [None]:
# ^^ Look at the y-axis! The trend is not as dramatic as you might think.
# Redraw the same plot with the y-axis limits set to 0 and 100.
pop_2014.plot('Age', 'Percent female')
plots.ylim(0, 100);  # Optional for Data 101

## More ways to manipulate rows 

In [None]:
## Selecting data in a column 

In [None]:
# Load the movies-by-year CSV from the course data folder and display it
movies = Table.read_table(path + 'movies_by_year_with_ticket_price.csv')
movies.show()

In [None]:
## How much each movie made in dollars; use .column so we can use array functions.
## Multiplying by 1e6 assumes 'Total Gross' is recorded in millions of dollars -- TODO confirm.
gross_in_dollars = movies.column('Total Gross') * 1e6
gross_in_dollars

In [None]:
## How many tickets were sold of each movie?
## tickets sold = dollars each movie made / average price of a ticket
tix_sold = gross_in_dollars / movies.column('Average Ticket Price')
tix_sold

In [None]:
## Add our new column onto the original table, then preview it with show(4)
movies = movies.with_column('Tickets sold', tix_sold)
movies.show(4)

In [None]:
## A way to make the tickets column more readable: give it a NumberFormatter
movies = movies.set_format('Tickets sold', NumberFormatter)
movies.show(4)

In [None]:
## Let's investigate the number of tickets sold in each year using a line plot
## (Year on the x-axis, Tickets sold on the y-axis)
movies.plot('Year', 'Tickets sold')

In [None]:
## New flavors of where: subsetting to a range using the are.between "predicate".
## NOTE: per the datascience docs, are.between(x, y) keeps values >= x and < y,
## so this should show 2000 through 2004; are.between_or_equal_to would include 2005.
movies.where('Year', are.between(2000, 2005))

In [None]:
## compare that to what we have been previously using, an exact match 
movies.where('Year', 2002)

In [None]:
## The default predicate that is called "under the hood" when where is given a
## plain value such as 2002
movies.where('Year', are.equal_to(2002))

In [None]:
## Another where flavor, matching to a pattern
movies.where('#1 Movie', are.containing('Harry Potter'))

In [None]:
## The same pattern-matching flavor of where, this time for titles containing 'Lord'
movies.where('#1 Movie', are.containing('Lord'))

In [None]:
## what if we wanted to be systematic about this, imagine there were many rows to count up 

In [None]:
# Save the two pattern-matched subsets under names so they can be counted and compared
hp = movies.where('#1 Movie', are.containing('Harry Potter'))
lotr = movies.where('#1 Movie', are.containing('Lord'))

In [None]:
# Row counts of the two subsets, shown as a pair
hp.num_rows, lotr.num_rows

In [None]:
# Print the two row counts with labels ("HP" = the 'Harry Potter' matches,
# "LOTR" = the 'Lord' matches)
print("HP top movies=", hp.num_rows, "vs. LOTR top movies=", lotr.num_rows)

In [None]:
## What other patterns would be interesting to look at??
## Exercise: replace the ... placeholder with a search string before running this cell.
movies.where('#1 Movie', are.containing(...))

In [None]:
## Comparing the number of tickets sold in the years HP vs. LOTR were top

In [None]:
# Tickets sold values for the 'Harry Potter' rows, as an array
hp.column("Tickets sold")

In [None]:
# Tickets sold values for the 'Lord' rows, as an array
lotr.column("Tickets sold")

In [None]:
## How can we compare the tickets sold? 

In [None]:
...

## Visualizing numerical data: Scatter plots vs. line plots

In [None]:
# Actors and their highest grossing movies
actors = Table.read_table(path + 'actors.csv')
actors

In [None]:
## Make a scatter plot
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
## What about a line plot (AH!) 
actors.plot('Number of Movies', 'Total Gross')

In [None]:
## our x variable is not sequential, leads to mayhem, scatter is best 

In [None]:
## Are there any actors that are one hit wonders?
## Exercise: replace both '...' placeholders with column labels from the actors table.
actors.scatter('...', '...')

In [None]:
## to Help decide which columns to use
actors.labels

In [None]:
## or depending on preference... 
actors.show(5)

In [None]:
## Who is that crazy point with < 10 movies but a crazy high average gross???? 

In [None]:
# Exercise: replace the ... placeholder with a value or an are.* predicate
# that picks out the rows of interest.
actors.where('Average per Movie', ...)

## Visualizing categorical data: barcharts

In [None]:
# Highest grossing movies as of 2017
top_movies = Table.read_table(path + 'top_movies_2017.csv')
top_movies

In [None]:
## Subset to the top 10, how?
## Exercise: replace the ... placeholder with the row indices to keep.
top10_adjusted = top_movies.take(...)
top10_adjusted

In [None]:
# Convert 'Gross (Adjusted)' to millions of dollars for readability (divide by 1e6),
# round to 2 decimal places, and add the result as a 'Millions' column
millions = np.round(top10_adjusted.column('Gross (Adjusted)') / 1e6, 2)
top10_adjusted = top10_adjusted.with_column('Millions', millions)
top10_adjusted

In [None]:
# A line plot doesn't make sense here: don't do this! AH! 
top10_adjusted.plot('Year', 'Millions')

In [None]:
## Use a bar chart instead: one horizontal bar per Title, with length given by Millions
top10_adjusted.barh('Title', 'Millions')

In [None]:
## When did we start keeping track of movies?
## Answered here as the smallest value in the Year column of top_movies.
np.min(top_movies.column("Year"))