In [8]:
import pandas as pd
import numpy as np

In [9]:
from pydataset import data

In [7]:
data('BOD', show_doc=True)
#df = data(BOD)

BOD

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

##  Biochemical Oxygen Demand

### Description

The `BOD` data frame has 6 rows and 2 columns giving the biochemical oxygen
demand versus time in an evaluation of water quality.

### Usage

    BOD

### Format

This data frame contains the following columns:

Time

A numeric vector giving the time of the measurement (days).

demand

A numeric vector giving the biochemical oxygen demand (mg/l).

### Source

Bates, D.M. and Watts, D.G. (1988), _Nonlinear Regression Analysis and Its
Applications_, Wiley, Appendix A1.4.

Originally from Marske (1967), _Biochemical Oxygen Demand Data Interpretation
Using Sum of Squares Surface_ M.Sc. Thesis, University of Wisconsin – Madison.

### Examples

    require(stats)
    # simplest form of fitting a first-order model to these data
    fm1 <- nls(demand ~ A*(1-exp(-exp(lrc)*Time)), data = BOD,
       start = c(A = 20, lrc = log(.35)))
    coef(fm1)
    fm1


In [4]:
data()

Unnamed: 0,dataset_id,title
0,AirPassengers,Monthly Airline Passenger Numbers 1949-1960
1,BJsales,Sales Data with Leading Indicator
2,BOD,Biochemical Oxygen Demand
3,Formaldehyde,Determination of Formaldehyde
4,HairEyeColor,Hair and Eye Color of Statistics Students
...,...,...
752,VerbAgg,Verbal Aggression item responses
753,cake,Breakage Angle of Chocolate Cakes
754,cbpp,Contagious bovine pleuropneumonia
755,grouseticks,Data on red grouse ticks from Elston et al. 2001


In [None]:
#from env import host, user, password

#url = f'mysql+pymysql://{user}:{password}@{host}/employees'

In [11]:
def get_db_url(db_name):
    from env import host, user, password
    return f'mysql+pymysql://{user}:{password}@{host}/{db_name}'

In [19]:
sql = """
    SELECT * FROM employees
    """

In [20]:
# Connection String has all the info to connect

url = get_db_url("employees")

df  = pd.read_sql(sql, url)
df.head()

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12


# Exercises Part 1

In [None]:
#Run python -m pip install pymysql from your terminal to install the mysql client (any folder is fine)
#cd into your exercises folder for this module and run echo env.py >> .gitignore
#Create a function named get_db_url. It should accept a username, hostname, password, and database name and return a url connection string formatted like in the example at the start of this lesson.

#Use your function to obtain a connection to the employees database.

#Once you have successfully run a query:

#a. Intentionally make a typo in the database url. What kind of error message do you see?

#b. Intentionally make an error in your SQL query. What does the error message look like?

#Read the employees and titles tables into two separate DataFrames.

#How many rows and columns do you have in each DataFrame? Is that what you expected?

#Display the summary statistics for each DataFrame.

#How many unique titles are in the titles DataFrame?

#What is the oldest date in the to_date column?

#What is the most recent date in the to_date column?

In [None]:
#Read the employees and titles tables into two separate DataFrames.

In [21]:
# Employees Data Frame

sql = """
    SELECT * FROM employees
    """

url = get_db_url("employees")

Employees_DF  = pd.read_sql(sql, url)
Employees_DF.head()

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12


In [22]:
# Titles Data Frame

sql = """
    SELECT * FROM titles
    """

url = get_db_url("employees")

Titles_DF  = pd.read_sql(sql, url)
Titles_DF.head()

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10002,Staff,1996-08-03,9999-01-01
2,10003,Senior Engineer,1995-12-03,9999-01-01
3,10004,Engineer,1986-12-01,1995-12-01
4,10004,Senior Engineer,1995-12-01,9999-01-01


In [24]:
#How many rows and columns do you have in each DataFrame? Is that what you expected?

Employees_DF.shape

(300024, 6)

In [25]:
Titles_DF.shape

(443308, 4)

In [None]:
#Display the summary statistics for each DataFrame.

In [27]:
Employees_DF.describe()

Unnamed: 0,emp_no
count,300024.0
mean,253321.763392
std,161828.23554
min,10001.0
25%,85006.75
50%,249987.5
75%,424993.25
max,499999.0


In [28]:
Titles_DF.describe()

Unnamed: 0,emp_no
count,443308.0
mean,253075.03443
std,161853.292613
min,10001.0
25%,84855.75
50%,249847.5
75%,424891.25
max,499999.0


In [None]:
#How many unique titles are in the titles DataFrame?

In [44]:
sql = """
    SELECT COUNT(DISTINCT title) AS "Unique Titles" FROM titles
    """

url = get_db_url("employees")

Titles_DF  = pd.read_sql(sql, url)
Titles_DF

Unnamed: 0,Unique Titles
0,7


In [49]:
url = get_db_url("employees")

Titles_DF = pd.read_sql("SELECT COUNT(DISTINCT title) FROM titles",url)
Titles_DF

Unnamed: 0,COUNT(DISTINCT title)
0,7


In [43]:
sql = """
    SELECT DISTINCT title FROM titles
    """

url = get_db_url("employees")

Titles_DF  = pd.read_sql(sql, url)
len(Titles_DF)
Titles_DF.

7

AttributeError: 'DataFrame' object has no attribute 'title'

In [36]:
#What is the oldest date in the to_date column?

In [51]:
sql = """
    SELECT to_date 
    FROM titles 
    ORDER BY to_date ASC
    """

url = get_db_url("employees")

Titles_DF  = pd.read_sql(sql, url)
#Titles_DF.head(1)
Titles_DF.min()

to_date    1985-03-01
dtype: object

In [39]:
#What is the most recent date in the to_date column?

In [95]:
sql = """
    SELECT to_date 
    FROM titles 
    WHERE to_date NOT LIKE '9999%%'
    """ 

url = get_db_url("employees")

Titles_DF  = pd.read_sql(sql, url)
Titles_DF.max()

to_date    2002-08-01
dtype: object

# Exercises Part 2

In [None]:






# .)Load the mpg dataset from PyDataset.

# .)Output and read the documentation for the mpg dataset.

# .)How many rows and columns are in the dataset?

# .)Check out your column names and perform any cleanup you may want on them.

# .)Display the summary statistics for the dataset.

# .)How many different manufacturers are there?

# .)How many different models are there?

# .)Create a column named mileage_difference like you did in the DataFrames exercises; this column should contain the difference between highway and city mileage for each car.

# .)Create a column named average_mileage like you did in the DataFrames exercises; this is the mean of the city and highway mileage.

# .)Create a new column on the mpg dataset named is_automatic that holds boolean values denoting whether the car has an automatic transmission.

# .)Using the mpg dataset, find out which which manufacturer has the best miles per gallon on average?

# .)Do automatic or manual cars have better miles per gallon?

In [None]:
# .)Copy the users and roles DataFrames from the examples above.

In [52]:
users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})
users

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [53]:
roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})
roles

Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


In [None]:
# .)What is the result of using a right join on the DataFrames?

In [54]:
users.merge(roles, how = 'right')

Unnamed: 0,id,name,role_id
0,1,admin,
1,2,author,
2,3,reviewer,
3,4,commenter,


In [55]:
roles.merge(users, how = 'right')

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [56]:
# .)What is the result of using an outer join on the DataFrames?

In [57]:
roles.merge(users, how = 'outer')

Unnamed: 0,id,name,role_id
0,1,admin,
1,2,author,
2,3,reviewer,
3,4,commenter,
4,1,bob,1.0
5,2,joe,2.0
6,3,sally,3.0
7,4,adam,3.0
8,5,jane,
9,6,mike,


In [58]:
users.merge(roles, how = 'outer')

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,
6,1,admin,
7,2,author,
8,3,reviewer,
9,4,commenter,


In [59]:
# .)What happens if you drop the foreign keys from the DataFrames and try to merge them?