# Advanced SQLAlchemy Queries

## Calculating values in a query

In [204]:
from sqlalchemy import MetaData, Table, create_engine, desc, select

# Open the local SQLite census database and reflect the existing `census`
# table instead of declaring its columns by hand.
engine = create_engine("sqlite:///census.sqlite")
connection = engine.connect()
metadata = MetaData()
census = Table("census", metadata, autoload=True, autoload_with=engine)

# Difference between the two population columns, labelled "pop_change" so the
# ORDER BY below can refer to it by name.
pop_change = (census.c.pop2008 - census.c.pop2000).label("pop_change")

# NOTE(review): pop_change is not aggregated although the query groups by age,
# so which row represents each age group is left to the database -- confirm
# whether func.sum() was intended.
stmt = (
    select([census.c.age, pop_change])
    .group_by(census.c.age)
    .order_by(desc("pop_change"))
    .limit(5)
)
results = connection.execute(stmt).fetchall()
print(results)

from sqlalchemy import case, func

# CASE yields pop2008 for rows whose state is New York and 0 for every other
# row, so summing it over the whole table totals New York only.
ny_pop2008 = case(
    [(census.c.state == "New York", census.c.pop2008)],
    else_=0,
)
stmt = select([func.sum(ny_pop2008)])
results = connection.execute(stmt).fetchall()
print(results)

# The same New York total obtained with a plain WHERE filter instead of a CASE
# expression; this variant also selects the state name.
is_new_york = census.c.state == "New York"
stmt = select([census.c.state, func.sum(census.c.pop2008)])
stmt = stmt.where(is_new_york)
results = connection.execute(stmt).fetchall()
print(results)

from sqlalchemy import Float, cast

# cast() converts an expression from one SQL type to another; here the summed
# population is cast to Float before it is used as the divisor (presumably to
# keep the division from being done on integers).
# NOTE(review): the numerator sums pop2000 while the denominator sums pop2008
# -- confirm that mixing the two years is intended.
ny_pop2000 = func.sum(
    case([(census.c.state == "New York", census.c.pop2000)], else_=0)
)
total_pop2008 = cast(func.sum(census.c.pop2008), Float)
ny_percent = (ny_pop2000 / total_pop2008 * 100).label("ny_percent")

stmt = select([ny_percent])
results = connection.execute(stmt).fetchall()
print(results)

[(61, 25201), (54, 23503), (55, 21716), (60, 19677), (58, 19526)]
[(19465159,)]
[('New York', 19465159)]
[(6.26613848194347,)]


### Connecting to a MySQL database

In [205]:
import pymysql  # the "mysql+pymysql" URL below names this driver

# Switch from the local SQLite file to the remote MySQL census database.
mysql_url = "mysql+pymysql://student:datacamp@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:3306/census"
engine = create_engine(mysql_url)

# List the tables available before reflecting one of them.
print(engine.table_names())

connection = engine.connect()
metadata = MetaData()
census = Table("census", metadata, autoload=True, autoload_with=engine)

['census', 'state_fact']


### Calculating a difference between two columns

In [122]:
# Point back at the local SQLite database and reflect `census` again.
engine = create_engine("sqlite:///census.sqlite")
connection = engine.connect()
metadata = MetaData()
census = Table("census", metadata, autoload=True, autoload_with=engine)

# Build the query one clause at a time, keeping every intermediate statement
# under its own name.
# NOTE(review): pop_change is not aggregated although the query groups by
# state, so the value reported per state comes from a single row chosen by the
# database -- confirm whether func.sum() was intended.
pop_change = (census.c.pop2008 - census.c.pop2000).label("pop_change")
stmt = select([census.c.state, pop_change])
stmt_grouped = stmt.group_by(census.c.state)
stmt_ordered = stmt_grouped.order_by(desc("pop_change"))
stmt_top5 = stmt_ordered.limit(5)

results = connection.execute(stmt_top5).fetchall()
for result in results:
    state, change = result.state, result.pop_change
    print('{}:{}'.format(state, change))

Texas:40137
California:35406
Florida:21954
Arizona:14377
Georgia:13357


### Determining the overall percentage of women

Use the case() expression to operate on data that meets specific criteria while not affecting the query as a whole. The case() expression accepts a list of conditions to match, each paired with the column to return when that condition matches, followed by an else_ value to return if none of the conditions match.

Use the cast() function to convert an expression to a particular type.

In [96]:
# pandas is used below for the raw-SQL variant; import it here so the cell
# does not depend on an import made somewhere else.
import pandas as pd

# Sum pop2000 over female rows only: CASE yields pop2000 when sex is "F" and 0
# otherwise.
female_pop2000 = func.sum(
    case([(census.columns.sex == "F", census.columns.pop2000)], else_=0)
)

# Cast the overall total to Float before dividing by it.
total_pop2000 = cast(func.sum(census.columns.pop2000), Float)
stmt = select([female_pop2000 / total_pop2000 * 100])

# scalar() returns the first column of the first row, i.e. the percentage.
percent_female = connection.execute(stmt).scalar()
print(percent_female)

# The same percentage written as raw SQL and loaded into a DataFrame.
# Uses the standard "=" comparison operator ("==" is not portable SQL), and the
# scalar subquery needs no GROUP BY because WHERE already restricts it to one
# sex.
q = """SELECT CAST((SELECT SUM(pop2000) FROM census WHERE sex = 'F') AS float) / SUM(pop2000) * 100 AS percent_female
FROM census"""
pd.read_sql(q, engine)

51.09467432293413


Unnamed: 0,percent_female
0,51.094674


## SQL relationships

SQLAlchemy joins the two tables after selecting them when the relationship between them is predefined in the database.

A join can add a relationship that is not necessarily predefined, directly in a query.
When we **don't** select a column from the joined table, we can use `select_from()` to add the join.

To join tables we can give the join clause a boolean expression that explains how the tables are related.

### Automatic joins with an established relationship

If you have two tables that already have an established relationship, you can automatically use that relationship by just adding the columns we want from each table to the select statement.

In [131]:
# Reflect the second table and list its column names.
state_fact = Table("state_fact", metadata, autoload=True, autoload_with=engine)
column_names = state_fact.c.keys()
print(column_names, "\n")

# Select one column from each table without an explicit join clause.
# NOTE(review): no join condition is given here; confirm the generated SQL
# actually relates the two tables rather than pairing every census row with
# every state_fact row.
stmt = select([census.c.pop2000, state_fact.c.abbreviation])
result = connection.execute(stmt).first()

# Print the first row as "column value" pairs.
for key in result.keys():
    value = getattr(result, key)
    print(key, value)

['id', 'name', 'abbreviation', 'country', 'type', 'sort', 'status', 'occupied', 'notes', 'fips_state', 'assoc_press', 'standard_federal_region', 'census_region', 'census_region_name', 'census_division', 'census_division_name', 'circuit_court'] 

pop2000 89600
abbreviation IL


### Joins

In [206]:
# Select every column of both tables.
stmt = select([census, state_fact])

# Spell the join condition out explicitly (state name on both sides) for the
# sake of practice, then use the join as the statement's FROM clause.
states_match = census.c.state == state_fact.c.name
census_with_facts = census.join(state_fact, states_match)
stmt_join = stmt.select_from(census_with_facts)

# Print the first joined row as "column value" pairs.
result = connection.execute(stmt_join).first()
for key in result.keys():
    value = getattr(result, key)
    print(key, value)

state Illinois
sex M
age 0
pop2000 89600
pop2008 95012
id 13
name Illinois
abbreviation IL
country USA
type state
sort 10
status current
occupied occupied
notes 
fips_state 17
assoc_press Ill.
standard_federal_region V
census_region 2
census_region_name Midwest
census_division 3
census_division_name East North Central
circuit_court 7


### More practice with joins


In [207]:
# Total 2008 population per state, together with the state's census division.
stmt = select([
    census.columns.state,
    func.sum(census.columns.pop2008),
    state_fact.columns.census_division_name,
])

# Join census rows to their state_fact row by state name.
stmt_joined = stmt.select_from(
    census.join(state_fact, census.columns.state == state_fact.columns.name)
)

# Group by exactly the non-aggregated columns that are selected. Grouping by
# state_fact.name alone leaves census.state and census_division_name outside
# the GROUP BY clause, which stricter SQL modes and backends reject. The join
# condition makes census.state equal to state_fact.name, so the groups are the
# same as long as each name appears once in state_fact.
stmt_grouped = stmt_joined.group_by(
    census.columns.state,
    state_fact.columns.census_division_name,
)
results = connection.execute(stmt_grouped).fetchall()
for record in results:
    print(record)

('Alabama', Decimal('4681422'), 'East South Central')
('Alaska', Decimal('664546'), 'Pacific')
('Arizona', Decimal('10698743'), 'Mountain')
('Arkansas', Decimal('4343608'), 'West South Central')
('California', Decimal('56952946'), 'Pacific')
('Colorado', Decimal('7474086'), 'Mountain')
('Connecticut', Decimal('3727540'), 'New England')
('Delaware', Decimal('869221'), 'South Atlantic')
('Florida', Decimal('20339477'), 'South Atlantic')
('Georgia', Decimal('9622508'), 'South Atlantic')
('Hawaii', Decimal('1250676'), 'Pacific')
('Idaho', Decimal('1518914'), 'Mountain')
('Illinois', Decimal('16274391'), 'East North Central')
('Indiana', Decimal('7378168'), 'East North Central')
('Iowa', Decimal('3000490'), 'West North Central')
('Kansas', Decimal('4045759'), 'West North Central')
('Kentucky', Decimal('4525061'), 'East South Central')
('Louisiana', Decimal('5183486'), 'West South Central')
('Maine', Decimal('2018932'), 'New England')
('Maryland', Decimal('7246747'), 'South Atlantic')
('Mass

## Working with hierarchical tables

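A hierarchical table holds a relationship to itself that is not defined in the schema (for example, an employee row whose `mgr` column refers to another row of the same table).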

### Using alias to handle same table joined queries

In [161]:
# Open the employees database and reflect its single `employees` table.
engine = create_engine("sqlite:///employees.sqlite")
connection = engine.connect()
metadata = MetaData()
employees = Table("employees", metadata, autoload=True, autoload_with=engine)

# A second reference to the same table, so one statement can use `employees`
# in two roles: the manager rows and the employee rows.
menagers = employees.alias()

# Pair each manager's name with the name of every employee whose `mgr` column
# holds that manager's id, ordered by manager name.
manager_name = menagers.c.name.label("menager")
employee_name = employees.c.name.label("employee")
reports_to_manager = menagers.c.id == employees.c.mgr

stmt = select([manager_name, employee_name])
stmt_matched = stmt.where(reports_to_manager)
stmt_ordered = stmt_matched.order_by(menagers.c.name)

results = connection.execute(stmt_ordered).fetchall()
for record in results:
    print(record)

('FILLMORE', 'GRANT')
('FILLMORE', 'ADAMS')
('FILLMORE', 'MONROE')
('GARFIELD', 'JOHNSON')
('GARFIELD', 'LINCOLN')
('GARFIELD', 'POLK')
('GARFIELD', 'WASHINGTON')
('HARDING', 'TAFT')
('HARDING', 'HOOVER')
('JACKSON', 'HARDING')
('JACKSON', 'GARFIELD')
('JACKSON', 'FILLMORE')
('JACKSON', 'ROOSEVELT')


### Leveraging functions and group_bys with hierarchical data

In [170]:
# Alias the table so it can be matched against itself (manager rows on one
# side, employee rows on the other).
menagers = employees.alias()

# Count the employees whose `mgr` column holds each manager's id, one result
# row per manager name.
employee_count = func.count(employees.c.name)
reports_to_manager = menagers.c.id == employees.c.mgr

stmt = select([menagers.c.name, employee_count])
stmt_matched = stmt.where(reports_to_manager)
stmt_grouped = stmt_matched.group_by(menagers.c.name)

results = connection.execute(stmt_grouped).fetchall()
for record in results:
    print(record)

('FILLMORE', 3)
('GARFIELD', 4)
('HARDING', 2)
('JACKSON', 4)


## Handling large ResultSets

Sometimes you need to work on a large ResultProxy and may not have enough memory to load all of the results at once. In that case, fetch the rows in batches with `fetchmany()`.

In [209]:
# Reconnect to the census database and reflect both tables used by the join.
engine = create_engine("sqlite:///census.sqlite")
connection = engine.connect()
metadata = MetaData()
census = Table("census", metadata, autoload=True, autoload_with=engine)
state_fact = Table("state_fact", metadata, autoload=True, autoload_with=engine)

# One row per census record that has a matching state_fact row.
stmt = select([census.columns.state])
stmt = stmt.select_from(
    census.join(state_fact, census.columns.state == state_fact.columns.name)
)

# Count rows per state while holding at most 50 rows in memory at a time.
state_count = {}

# Execute on the connection opened above, as the other cells do, instead of
# going through the engine and leaving that connection unused.
results_proxy = connection.execute(stmt)
try:
    while True:
        batch = results_proxy.fetchmany(50)
        if not batch:
            # An empty batch means the result set is exhausted.
            break
        for row in batch:
            state_count[row.state] = state_count.get(row.state, 0) + 1
finally:
    # Release the cursor even if processing a batch raises.
    results_proxy.close()
print(state_count)

{'Illinois': 172, 'New Jersey': 172, 'North Dakota': 172, 'Florida': 172, 'Maryland': 172, 'Idaho': 172, 'Massachusetts': 172, 'Oregon': 172, 'Nevada': 172, 'Michigan': 172, 'Wisconsin': 172, 'Missouri': 172, 'Washington': 172, 'North Carolina': 172, 'Arizona': 172, 'Arkansas': 172, 'Colorado': 172, 'Indiana': 172, 'Pennsylvania': 172, 'Hawaii': 172, 'Kansas': 172, 'Louisiana': 172, 'Alabama': 172, 'Minnesota': 172, 'South Dakota': 172, 'New York': 172, 'California': 172, 'Connecticut': 172, 'Ohio': 172, 'Rhode Island': 172, 'Georgia': 172, 'South Carolina': 172, 'Alaska': 172, 'Delaware': 172, 'Tennessee': 172, 'Vermont': 172, 'Montana': 172, 'Kentucky': 172, 'Utah': 172, 'Nebraska': 172, 'West Virginia': 172, 'Iowa': 172, 'Wyoming': 172, 'Maine': 172, 'New Hampshire': 172, 'Mississippi': 172, 'Oklahoma': 172, 'New Mexico': 172, 'Virginia': 172, 'Texas': 172}
