# Putting it all together

## Census case study

* Preparing SQLAlchemy and the database.
* Loading data into the database
* Solving data science problems with queries.


### Set up the engine and metadata


In [2]:
from sqlalchemy import create_engine, MetaData
# Registry that the table definitions in this notebook attach themselves to.
metadata = MetaData()

# Engine bound to the SQLite database URL used throughout this case study.
engine = create_engine("sqlite:///chapter5.sqlite")

### Create the table in the database

In [4]:
from sqlalchemy import Table, Column, String, Integer
# Describe the "census" table and register it on the shared `metadata`.
census = Table(
    "census",
    metadata,
    Column("state", String(30)),   # state name, up to 30 characters
    Column("sex", String(1)),      # single-character sex code
    Column("age", Integer()),      # age for this row
    Column("pop2000", Integer()),  # population figure for 2000
    Column("pop2008", Integer()),  # population figure for 2008
)

# Create every table registered on `metadata` through the engine.
metadata.create_all(engine)

## Populating the database

### Reading the data from the CSV

In [25]:
import csv
# Parse census.csv into `values_list`: one dict per data row, keyed by the
# column names of the "census" table so the list can be bulk-inserted as-is.
#
# The `with` block guarantees the file handle is closed even if parsing
# raises; newline="" is the mode the csv module documents for reader input.
with open("census.csv", "r", newline="") as f:
    csv_reader = csv.reader(f)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    # CSV column 0 is not loaded. Values are kept as the strings read from
    # the file (NOTE(review): presumably the database coerces them for the
    # Integer columns — confirm).
    values_list = [
        {
            "state": row[1],
            "sex": row[2],
            "age": row[3],
            "pop2000": row[4],
            "pop2008": row[5],
        }
        for row in csv_reader
    ]

### Load data from a list into the Table

In [29]:
from sqlalchemy import insert
# Bulk-load the rows parsed from the CSV into the "census" table and report
# how many rows were written.
connection = engine.connect()

# The SQLite file outlives the kernel, so re-running this cell (or the whole
# notebook) would append another full copy of the data and inflate every
# population sum computed below. Empty the table first so the load is
# idempotent.
connection.execute(census.delete())

# Passing a list of dicts makes this a single multi-row insert.
stmt = insert(census)
results = connection.execute(stmt, values_list)
print(results.rowcount)

8772


## Querying the database

In [43]:
from sqlalchemy import select, func

# Population-weighted average age per sex, weighted by the 2000 population:
#     sum(age * pop2000) / sum(pop2000)
# NOTE(review): both sums are over Integer columns and the printed averages
# are whole numbers, so the division is presumably integer division — confirm
# whether a Float cast is wanted here.
weighted_age_total = func.sum(census.columns.age * census.columns.pop2000)
population_total = func.sum(census.columns.pop2000)
average_age = (weighted_age_total / population_total).label("average_age")

# One output row per sex: (average_age, sex).
stmt = select([average_age, census.columns.sex]).group_by(census.columns.sex)

results = connection.execute(stmt).fetchall()
for record in results:
    print(record.sex, record.average_age)

F 37
M 34


### Determine the percentage of population by gender and state


In [50]:
from sqlalchemy import case, cast, Float, desc

# Share of each state's 2000 population that is female, as a percentage.

# Numerator: pop2000 summed over rows where sex == "F" (other rows count as 0).
female_pop = func.sum(
    case([(census.columns.sex == "F", census.columns.pop2000)], else_=0)
)
# Denominator: total pop2000, cast to Float before dividing.
total_pop = cast(func.sum(census.columns.pop2000), Float)
percent_female = (female_pop / total_pop * 100).label("percent_female")

# One row per state, highest female percentage first.
stmt = (
    select([census.columns.state, percent_female])
    .group_by(census.columns.state)
    .order_by(desc("percent_female"))
)

results = connection.execute(stmt).fetchall()
for result in results:
    print(result.state, result.percent_female)

District of Columbia 53.129626141738385
Rhode Island 52.07343391902215
Maryland 51.93575549972231
Mississippi 51.92229481794672
Massachusetts 51.843023571316785
New York 51.83453865150073
Alabama 51.832407770179465
Louisiana 51.75351596554121
Pennsylvania 51.74043473051053
South Carolina 51.73072129765755
Connecticut 51.66816507130644
Virginia 51.657252447241795
Delaware 51.61109733558627
New Jersey 51.51713956125773
Maine 51.50570813418951
North Carolina 51.482262322084594
Missouri 51.46888602639692
Ohio 51.46550350015544
Tennessee 51.430689699449275
West Virginia 51.40042318092286
Florida 51.36488001165242
Kentucky 51.32687036927168
Arkansas 51.26992846221834
Hawaii 51.118011836915514
Georgia 51.11408350339436
Oklahoma 51.11362457075227
Illinois 51.11224234802867
New Mexico 51.0471720798335
Vermont 51.018573209949466
Michigan 50.97246518318712
Indiana 50.95480313297678
Iowa 50.950398342534264
Nebraska 50.8584549336086
New Hampshire 50.858019844961746
Kansas 50.821864107754735
Wiscons

### Determine the difference by state from the 2000 and 2008 censuses

In [133]:
# Population change per state: the state's total pop2008 minus its total
# pop2000 (each column is summed over the state's rows before subtracting).
pop_change = (
    func.sum(census.columns.pop2008) - func.sum(census.columns.pop2000)
).label("pop_change")

# Keep only the ten states with the largest change, biggest first.
stmt = (
    select([census.columns.state, pop_change])
    .group_by(census.columns.state)
    .order_by(desc("pop_change"))
    .limit(10)
)

results = connection.execute(stmt).fetchall()
for result in results:
    print('{}: {}'.format(result.state, result.pop_change))

Texas: 3383317
California: 2779560
Florida: 2281569
Georgia: 1460732
Arizona: 1336836
North Carolina: 1143025
Virginia: 693112
Washington: 638917
Colorado: 612070
Nevada: 577131
