# SQL Joins

In [21]:
from sqlalchemy import create_engine, Table, MetaData, select
from sqlalchemy import and_, or_, not_, between, desc, func
from sqlalchemy import case, cast, Float

# Open the census SQLite database and hold one connection for the queries below
engine = create_engine('sqlite:///../data/sqlalchemy/census.sqlite')
connection = engine.connect()
print('Tables:', engine.table_names())

# Load both table definitions from the database (each on its own MetaData)
census = Table('census', MetaData(), autoload_with=engine, autoload=True)
state_fact = Table('state_fact', MetaData(), autoload_with=engine, autoload=True)

# Show the column names that were loaded for each table
print('Census:', census.c.keys())
print('State fact:', state_fact.c.keys())

print('Setup complete!')

Tables: ['census', 'state_fact']
Census: ['state', 'sex', 'age', 'pop2000', 'pop2008']
State fact: ['id', 'name', 'abbreviation', 'country', 'type', 'sort', 'status', 'occupied', 'notes', 'fips_state', 'assoc_press', 'standard_federal_region', 'census_region', 'census_region_name', 'census_division', 'census_division_name', 'circuit_court']
Setup complete!


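Where we need columns from two related tables, we can list the columns we want from each table in a single `select` statement. Note that `select` on its own does not add a join condition: both tables are simply placed in the `FROM` clause, so every row of one table is paired with every row of the other. To return only the rows that actually match, add an explicit join (shown further below).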

In this case, the `census` and `state_fact` tables have a pre-defined relationship: the `state` column of the former corresponds to the `name` column of the latter.

In [28]:
# Choose one column from each table; this statement has no select_from()/join()
wanted_columns = [census.c.pop2008, state_fact.c.abbreviation]
stmt = select(wanted_columns)

# Run the query and keep only its first row
result = connection.execute(stmt).first()

# Show each column name next to the value it holds in that row
for key in result.keys():
    print(key, getattr(result, key))

pop2008 95012
abbreviation IL


In [27]:
# Pair the 2000 population column with the state abbreviation column
stmt = select([census.c.pop2000, state_fact.c.abbreviation])

# Only the first row of the result is inspected
first_row_result = connection.execute(stmt)
result = first_row_result.first()

# Print every column name together with its value in that row
for key in result.keys():
    print(key, getattr(result, key))

pop2000 89600
abbreviation IL


Where we aren't selecting columns from both tables or the two tables don't have a defined relationship, we can still use the `.join()` method on a table to join it with another table and get extra data related to our query. The `join()` method takes the table object you want to join in as the first argument and a condition that indicates how the tables are related as the second argument. Finally, we use the `.select_from()` method on the select statement to wrap the join clause.

* `select_from` needs to come immediately after the `select` function, and before any `where`, `group_by` or `order_by` clauses.
* use `select_from`, which is passed `join()`, to 'tell' sqlalchemy which tables to join.

**Determine the total population in 2000 within the 10th Circuit Court's jurisdiction**

In [25]:
# Join condition: a census row matches the state_fact row with the same state name
on_state_name = census.c.state == state_fact.c.name

# Sum the 2000 population over the two joined tables
stmt = select([func.sum(census.c.pop2000)]).select_from(
    census.join(state_fact, on_state_name)
)

# Restrict the sum to rows whose circuit_court value is '10'
query = stmt.where(state_fact.c.circuit_court == '10')

connection.execute(query).scalar()

14945252

**Determine population size in East South Central in 2000**

In [26]:
# Apply a census-division filter to the existing `stmt`
in_east_south_central = state_fact.c.census_division_name == 'East South Central'
query = stmt.where(in_east_south_central)

connection.execute(query).scalar()

16982311

In [29]:
# Join condition: census.state equals state_fact.name
state_matches_name = census.c.state == state_fact.c.name

# Select every column of both tables from the joined result
stmt = select([census, state_fact]).select_from(
    census.join(state_fact, state_matches_name)
)

# Only the first joined row is needed
result = connection.execute(stmt).first()

# Print each column name with the value it holds in that row
for key in result.keys():
    print(key, getattr(result, key))


state Illinois
sex M
age 0
pop2000 89600
pop2008 95012
id 13
name Illinois
abbreviation IL
country USA
type state
sort 10
status current
occupied occupied
notes 
fips_state 17
assoc_press Ill.
standard_federal_region V
census_region 2
census_region_name Midwest
census_division 3
census_division_name East North Central
circuit_court 7


In [32]:
# Report, per state: the state name, its abbreviation, the summed 2008
# population and the name of the census division it belongs to
stmt = select([
    census.columns.state,
    state_fact.columns.abbreviation,
    func.sum(census.columns.pop2008),
    state_fact.columns.census_division_name
])

# Join census to state_fact where census.state equals state_fact.name
stmt = stmt.select_from(
    census.join(state_fact, census.columns.state == state_fact.columns.name)
)

# Group by the state_fact name column and by every other non-aggregated
# column in the select list. SQLite accepts "bare" ungrouped columns next to
# an aggregate, but most other databases reject such a query, so the grouping
# columns are spelled out explicitly to keep the statement portable.
stmt = stmt.group_by(
    state_fact.columns.name,
    census.columns.state,
    state_fact.columns.abbreviation,
    state_fact.columns.census_division_name
)

# Execute the statement and fetch every grouped row: results
results = connection.execute(stmt).fetchall()

# Loop over the results and print each record
for record in results:
    print(record)

('Alabama', 'AL', 4649367, 'East South Central')
('Alaska', 'AK', 664546, 'Pacific')
('Arizona', 'AZ', 6480767, 'Mountain')
('Arkansas', 'AR', 2848432, 'West South Central')
('California', 'CA', 36609002, 'Pacific')
('Colorado', 'CO', 4912947, 'Mountain')
('Connecticut', 'CT', 3493783, 'New England')
('Delaware', 'DE', 869221, 'South Atlantic')
('Florida', 'FL', 18257662, 'South Atlantic')
('Georgia', 'GA', 9622508, 'South Atlantic')
('Hawaii', 'HI', 1250676, 'Pacific')
('Idaho', 'ID', 1518914, 'Mountain')
('Illinois', 'IL', 12867077, 'East North Central')
('Indiana', 'IN', 6373299, 'East North Central')
('Iowa', 'IA', 3000490, 'West North Central')
('Kansas', 'KS', 2782245, 'West North Central')
('Kentucky', 'KY', 4254964, 'East South Central')
('Louisiana', 'LA', 4395797, 'West South Central')
('Maine', 'ME', 1312972, 'New England')
('Maryland', 'MD', 5604174, 'South Atlantic')
('Massachusetts', 'MA', 6492024, 'New England')
('Michigan', 'MI', 9998854, 'East North Central')
('Minneso

## Self Joins

In [36]:
# Point `engine` and `connection` at the employees SQLite database.
# NOTE(review): this rebinds both names; any connection previously bound to
# `connection` is not closed by this cell.
engine = create_engine('sqlite:///../data/sqlalchemy/employees.sqlite')
connection = engine.connect()
print('Tables:', engine.table_names())

# Load the employees table definition from the database
employees = Table('employees', MetaData(), autoload_with=engine, autoload=True)
print('Columns:', employees.c.keys())

print('Setup complete!')

Tables: ['employees']
Columns: ['id', 'name', 'job', 'mgr', 'hiredate', 'sal', 'comm', 'dept']
Setup complete!


We'll create **self** joins when dealing with **hierarchical** tables, where two columns of the same table are related to each other even though that relationship is not formally defined, e.g. the manager referenced in the `mgr` column also appears in the `id` column.

In order to use this relationship in a query, we use the `alias` method, which allows us to refer to the table by another name - this way the same table can be referenced under two distinct names.

**Create a list of managers and the employees that report to them**

In [40]:
# A second name for the same table, so it can play the "manager" side of the join
managers = employees.alias()

# An employee row pairs with the manager row whose id equals the employee's mgr
reports_to = managers.c.id == employees.c.mgr

# Select both names under distinct labels, join employees to the alias,
# and sort the rows by manager name
stmt = (
    select([
        managers.c.name.label('manager'),
        employees.c.name.label('employee'),
    ])
    .select_from(employees.join(managers, reports_to))
    .order_by(managers.c.name)
)

connection.execute(stmt).fetchall()

[('FILLMORE', 'GRANT'),
 ('FILLMORE', 'ADAMS'),
 ('FILLMORE', 'MONROE'),
 ('GARFIELD', 'JOHNSON'),
 ('GARFIELD', 'LINCOLN'),
 ('GARFIELD', 'POLK'),
 ('GARFIELD', 'WASHINGTON'),
 ('HARDING', 'TAFT'),
 ('HARDING', 'HOOVER'),
 ('JACKSON', 'HARDING'),
 ('JACKSON', 'GARFIELD'),
 ('JACKSON', 'FILLMORE'),
 ('JACKSON', 'ROOSEVELT')]

If you need to use `group_by` or `func` in a self join, treat the table and its alias as two separate tables. Use the table in the `group_by` and the `alias` in the `func`, or vice versa.

**Determine the sum of the salaries of each manager's employees**

In this example we apply the function to the employees table, and the group by to the managers alias.

In [42]:
# Alias of employees used as the "manager" side of the self join
managers = employees.alias()

# Building blocks: the join condition and the aggregate over employee salaries
manager_of_employee = managers.c.id == employees.c.mgr
total_salary = func.sum(employees.c.sal)

# The aggregate reads from the employees table; the grouping uses the alias
stmt = select([managers.c.name, total_salary])
stmt = stmt.select_from(employees.join(managers, manager_of_employee))
stmt = stmt.group_by(managers.c.name)

connection.execute(stmt).fetchall()

[('FILLMORE', Decimal('96000.00')),
 ('GARFIELD', Decimal('83500.00')),
 ('HARDING', Decimal('52000.00')),
 ('JACKSON', Decimal('197000.00'))]

**Count number of employees assigned to each particular manager**

In [43]:
# Alias of employees that stands in for the managers: managers
managers = employees.alias()

# Count employee ids per manager name; the WHERE clause ties each employee's
# mgr value to the id of the aliased (manager) row: stmt
stmt = (
    select([managers.c.name, func.count(employees.c.id)])
    .where(managers.c.id == employees.c.mgr)
    .group_by(managers.c.name)
)

# Run the statement and collect every row: results
results = connection.execute(stmt).fetchall()

# Print one (manager, count) record per line
for record in results:
    print(record)

('FILLMORE', 3)
('GARFIELD', 4)
('HARDING', 2)
('JACKSON', 4)
