In [173]:
from pydataset import data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [49]:
# 1. Load the mpg dataset. Read the documentation for it, and use the data to answer
# these questions:

# Print the dataset's documentation. Called with show_doc=True, this call is used
# only for the printed docs; its result is not assigned to anything.
data('mpg', show_doc=True)

# Load the dataset itself and keep it in a variable for the questions below.
mpg = data('mpg')

# Preview the first rows rather than rendering the entire frame
# (the documentation above lists 234 rows).
mpg.head()

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




In [61]:
# On average, which manufacturer has the best miles per gallon?

# Combined mpg for each car: the midpoint of its city and highway figures.
city_plus_highway = mpg['cty'] + mpg['hwy']
mpg['avg_mpg'] = city_plus_highway / 2

# Mean combined mpg for every manufacturer, listed from best to worst.
avg_by_manufacturer = mpg.groupby('manufacturer')['avg_mpg'].mean()
avg_by_manufacturer.sort_values(ascending=False)


manufacturer
honda         28.500000
volkswagen    25.074074
hyundai       22.750000
subaru        22.428571
audi          22.027778
toyota        21.720588
pontiac       21.700000
nissan        21.346154
chevrolet     18.447368
ford          16.680000
mercury       15.625000
jeep          15.562500
dodge         15.540541
lincoln       14.166667
land rover    14.000000
Name: avg_mpg, dtype: float64

In [74]:
# How many different manufacturers are there?

# Count the distinct values directly. dropna=False counts a missing value as one
# entry, which matches taking len() of the array returned by .unique().
mpg['manufacturer'].nunique(dropna=False)

15

In [75]:
# How many different models are there?
# Count the distinct values directly; dropna=False matches len() of .unique().
mpg['model'].nunique(dropna=False)

38

In [89]:
# Do automatic or manual cars have better miles per gallon?

# Boolean mask over the trans column: True where the label contains 'man'
# (treated as manual), False otherwise (treated as automatic).
is_manual = mpg.trans.str.contains('man')

print("Cars with a manual transmission = True, Automatic Transmission = False")
print("Cars with a manual transmission have a higher average mpg")

# Mean of the avg_mpg column (added to mpg in an earlier cell) for each group,
# highest first.
mpg.groupby(is_manual).avg_mpg.mean().sort_values(ascending=False)

Cars with a manual transmission = True, Automatic Transmission = False
Cars with a manual transmission have a higher average mpg


trans
True     22.227273
False    19.130573
Name: avg_mpg, dtype: float64

In [93]:
# 2. Joining and Merging
# Copy the users and roles dataframes from the examples above.

# Six users; the last two have a missing role_id.
users = pd.DataFrame(dict(
    id=[1, 2, 3, 4, 5, 6],
    name=['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    role_id=[1, 2, 3, 3, np.nan, np.nan],
))

# Four roles; none of the users above points at role 4.
roles = pd.DataFrame(dict(
    id=[1, 2, 3, 4],
    name=['admin', 'author', 'reviewer', 'commenter'],
))


# What do you think a right join would look like?

# Every role is kept, even one that no user holds; users without a role drop out.
users.merge(roles, how='right', left_on='role_id', right_on='id')


Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1,admin
1,2.0,joe,2.0,2,author
2,3.0,sally,3.0,3,reviewer
3,4.0,adam,3.0,3,reviewer
4,,,,4,commenter


In [94]:
#  An outer join?

# Keeps every user and every role, filling in nulls wherever one side has no match.
users.merge(roles, how='outer', left_on='role_id', right_on='id')

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1.0,admin
1,2.0,joe,2.0,2.0,author
2,3.0,sally,3.0,3.0,reviewer
3,4.0,adam,3.0,3.0,reviewer
4,5.0,jane,,,
5,6.0,mike,,,
6,,,,4.0,commenter


In [95]:
# What happens if you drop the foreign keys from the dataframes and try to merge them?

# With no key arguments, merge joins on the columns the two frames share
# (here id and name). No user row has the same id and name as a role row, so the
# roles end up as extra rows beneath the users with role_id left empty.
users.merge(roles, how='outer')

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,
6,1,admin,
7,2,author,
8,3,reviewer,
9,4,commenter,


In [96]:
# 3. Getting data from SQL databases
# Create a function named get_db_url. It should accept a username, hostname, 
# password, and database name and return a url formatted like in the examples 
# in this lesson.

# from env import host, user, password
# url = f'mysql+pymysql://{user}:{password}@{host}/employees'
# pd.read_sql('SELECT * FROM employees LIMIT 5 OFFSET 50', url)


# for getting from Codeup SQL databases
def get_db_url(database):
    """Return the ``mysql+pymysql`` connection URL for ``database``.

    The user name, password and host are read from the local ``env`` module
    each time the function is called.

    The user name and password are percent-encoded before being placed in the
    URL, so credentials containing reserved characters such as ``@``, ``:`` or
    ``/`` no longer produce a malformed URL. The values in ``env`` are expected
    to be the raw (not already encoded) credentials.

    Parameters
    ----------
    database : str
        Name of the database to put in the URL path.

    Returns
    -------
    str
        A URL of the form ``mysql+pymysql://user:password@host/database``.

    Raises
    ------
    ImportError
        If ``env`` cannot be imported or lacks ``host``, ``user`` or ``password``.
    """
    from urllib.parse import quote_plus

    # Function-scope import: the credentials stay local to this function.
    from env import host, user, password

    safe_user = quote_plus(user)
    safe_password = quote_plus(password)
    url = f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{database}'
    return url

# note: this would require typing the user and password into the jupyter notebook,
# which would defeat the purpose of anonymity!
# def general_get_db_url(user, password, host, database):
#     url = f'mysql+pymysql://{user}:{password}@{host}/{database}'
#     return url


In [21]:
# Ryan's challenge = show all available databases

# Import the credentials in this cell: the import inside get_db_url is local to
# that function, so it does not define these names at notebook level.
from env import host, user, password

# No database name in the URL, so the query runs against the server as a whole.
url = f'mysql+pymysql://{user}:{password}@{host}'
query = '''show databases'''
pd.read_sql(query, url)

Unnamed: 0,Database
0,information_schema
1,albums_db
2,chipotle
3,darden_1030
4,elo_db
5,employees
6,fruits_db
7,iris_db
8,join_example_db
9,mall_customers


In [25]:
# Use your function to obtain a connection to the employees database.
# (and run a query)

# This cell queries the chipotle database: up to 100 rows of its orders table.
chipotle_url = get_db_url('chipotle')
query = """select * from orders limit 100"""
pd.read_sql(query, chipotle_url)


Unnamed: 0,id,order_id,quantity,item_name,choice_description,item_price
0,1,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,2,1,1,Izze,[Clementine],$3.39
2,3,1,1,Nantucket Nectar,[Apple],$3.39
3,4,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,5,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans, Rice, Cheese, Sour Cream]]",$16.98
5,6,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sour Cream, Guacamole, Lettuce]]",$10.98
6,7,3,1,Side of Chips,,$1.69
7,8,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables, Black Beans, Pinto Beans, Cheese, Sour...",$11.75
8,9,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Cheese, Sour Cream, Lettuce]]",$9.25
9,10,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto Beans, Cheese, Sour Cream, Lettuce]]",$9.25


In [26]:
# Once you have successfully run a query:
# Intentionally make a typo in the database url. What kind of error message do you see?

# Variant 1 (left disabled): misspell the database name passed to get_db_url
#query = """select * from orders limit 100"""
#pd.read_sql(query, get_db_url('chipotel'))

# Variant 2 (active): misspell the driver in the URL -- 'pysql' should be 'pymysql'.
# This cell is expected to fail; the error recorded below it is the result being
# demonstrated.
# NOTE(review): user, password and host must already exist at notebook level when
# this cell runs -- confirm an earlier cell imports them.
url = f'mysql+pysql://{user}:{password}@{host}'
query = '''show databases'''
pd.read_sql(query, url)


NoSuchModuleError: Can't load plugin: sqlalchemy.dialects:mysql.pysql

In [27]:
# Intentionally make an error in your SQL query. What does the error message look like?

# The table name is deliberately misspelled ('oders' instead of 'orders'), so this
# cell is expected to fail; the error recorded below it is the result being
# demonstrated.
query = """select * from oders limit 100"""
pd.read_sql(query, get_db_url('chipotle'))


ProgrammingError: (pymysql.err.ProgrammingError) (1146, "Table 'chipotle.oders' doesn't exist")
[SQL: select * from oders limit 100]
(Background on this error at: http://sqlalche.me/e/f405)

In [97]:
# Read the employees and titles tables into two separate dataframes

# One connection URL serves both reads.
url = get_db_url('employees')

# Full-table queries, one per dataframe.
query_employees = '''select * from employees'''
query_titles = '''select * from titles'''

# Run the queries in order (employees first, then titles) and bind each result.
employeesdf, titlesdf = (
    pd.read_sql(sql, url) for sql in (query_employees, query_titles)
)


In [132]:
# Visualize the number of employees with each title.

# A title row is treated as current when its to_date is later than today.
today = pd.to_datetime('today')
is_current = titlesdf.to_date > today

# Rows per title among the current ones; count() reports the non-null count of
# every remaining column, so the same number appears once per column.
titlesdf[is_current].groupby('title').count()

Unnamed: 0_level_0,emp_no,from_date,to_date
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Assistant Engineer,15128,15128,15128
Engineer,115003,115003,115003
Manager,24,24,24
Senior Engineer,97750,97750,97750
Senior Staff,92853,92853,92853
Staff,107391,107391,107391
Technique Leader,15159,15159,15159


In [None]:
# Same count over every row of titlesdf, with no filter on to_date.
titles_by_name = titlesdf.groupby('title')
titles_by_name.count()

In [None]:
# Join the employees and titles dataframes together.

In [None]:
# Visualize how frequently employees change titles.

In [None]:
# For each title, find the hire date of the employee that was hired most recently 
# with that title.

In [None]:
# Write the code necessary to create a cross tabulation of the number of titles
# by department. (Hint: this will involve a combination of SQL and python/pandas code)

In [None]:
# 4. Use your get_db_url function to help you explore the data from the chipotle database. 
# Use the data to answer the following questions:






In [None]:
# What is the total price for each order?

In [None]:
# What are the most popular 3 items?

In [None]:
# Which item has produced the most revenue?