# Extracting data from files and selecting columns

In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Load the school records, keep only the ranking, display-name and enrollment
# fields, and give them the shorter column names used by the rest of the notebook.
json = pd.read_json("schoolInfo.json")[['overallRank', 'displayName', 'enrollment']]
json = json.rename(
    columns={
        'overallRank': 'rank',
        'displayName': 'name',
        'enrollment': 'enrollment',
    }
)
json.head()

Unnamed: 0,rank,name,enrollment
0,1,Princeton University,5400.0
1,2,Harvard University,6710.0
2,3,University of Chicago,5941.0
3,3,Yale University,5472.0
4,5,Columbia University,6113.0


In [3]:
# Read the salary table and retain only the school name plus the two pay columns.
salary_potential = pd.read_csv("salary_potential.csv")[
    ['name', 'early_career_pay', 'mid_career_pay']
]
salary_potential.head()

Unnamed: 0,name,early_career_pay,mid_career_pay
0,Auburn University,54400,104500
1,University of Alabama in Huntsville,57500,103900
2,The University of Alabama,52300,97400
3,Tuskegee University,54500,93500
4,Samford University,48400,90500


In [4]:
# Read the tuition table; only the school name and its state / state code are
# carried forward.
tuition_columns = ['name', 'state', 'state_code']
tuition_cost = pd.read_csv("tuition_cost.csv")
tuition_cost = tuition_cost[tuition_columns]
tuition_cost.head()

Unnamed: 0,name,state,state_code
0,Aaniiih Nakoda College,Montana,MT
1,Abilene Christian University,Texas,TX
2,Abraham Baldwin Agricultural College,Georgia,GA
3,Academy College,Minnesota,MN
4,Academy of Art University,California,CA


In [5]:
# Median household income (MHI) by state or territory for 2014-2018.
# The file is read with latin1 encoding; the year columns are prefixed with
# "MHI" so they stay self-describing once combined with other tables.
US_Income = pd.read_csv("US_Income14_18.csv", encoding='latin1')[
    ['State or territory', '2014', '2015', '2016', '2017', '2018']
]
US_Income = US_Income.rename(
    columns={
        'State or territory': 'state',
        '2014': 'MHI 2014',
        '2015': 'MHI 2015',
        '2016': 'MHI 2016',
        '2017': 'MHI 2017',
        '2018': 'MHI 2018',
    }
)
US_Income

Unnamed: 0,state,MHI 2014,MHI 2015,MHI 2016,MHI 2017,MHI 2018
0,"Washington, D.C.","$71,648","$75,628","$75,506","$82,372","$85,203"
1,Maryland,"$73,971","$75,847","$78,945","$80,776","$83,242"
2,New Jersey,"$71,919","$72,222","$76,126","$80,088","$81,740"
3,Hawaii,"$69,592","$73,486","$74,511","$77,765","$80,212"
4,Massachusetts,"$69,160","$70,628","$75,297","$77,385","$79,835"
5,Connecticut,"$70,048","$71,346","$73,433","$74,168","$76,348"
6,California,"$61,933","$64,500","$67,739","$71,805","$75,277"
7,New Hampshire,"$66,532","$70,303","$70,936","$73,381","$74,991"
8,Alaska,"$71,583","$73,355","$76,440","$73,181","$74,346"
9,Washington (state) Washington,"$61,366","$64,129","$67,106","$70,979","$74,073"


# Merging Files

In [6]:
# Outer-join the three per-school tables on the shared 'name' column, so a
# school that is missing from one source is still kept (with NaN in the gaps).
first_merge = json.merge(salary_potential, on='name', how='outer')
second_merge = first_merge.merge(tuition_cost, on='name', how='outer')  # df with selected columns

# Arrange the combined columns in the order used for the exported table.
raw_df = second_merge[
    ['name', 'rank', 'enrollment', 'state', 'state_code', 'early_career_pay', 'mid_career_pay']
]

In [7]:
# Preview the first five rows of the merged table.
raw_df.head(n=5)

Unnamed: 0,name,rank,enrollment,state,state_code,early_career_pay,mid_career_pay
0,Princeton University,1.0,5400.0,New Jersey,NJ,75200.0,139400.0
1,Harvard University,2.0,6710.0,,,74800.0,146800.0
2,University of Chicago,3.0,5941.0,Illinois,IL,64000.0,114200.0
3,Yale University,3.0,5472.0,Connecticut,CT,70300.0,138300.0
4,Columbia University,5.0,6113.0,New York,NY,,


In [8]:
# Write both tables to the shared CSV folder. to_csv is called with its default
# options, so each file also contains the DataFrame index as its first column.
for frame, destination in (
    (raw_df, '../csv tables/raw_df.csv'),
    (US_Income, '../csv tables/US_Income.csv'),
):
    frame.to_csv(destination)

# Connection to local Database

In [9]:
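# NOTE: disabled cell. Before enabling it, read the database password from an
# environment variable (or another secret store) rather than hardcoding it here.
# rds_connection_string = "postgres:password@localhost:5432/ETL_project"
# engine = create_engine(f'postgresql://{rds_connection_string}')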

In [10]:
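# NOTE: Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# on newer versions use sqlalchemy.inspect(engine).get_table_names() instead.
# engine.table_names()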

# Load raw_df into the database and confirm it was added by querying


In [11]:
# raw_df.to_sql(name='raw_df', con=engine, if_exists='append', index=False)

In [12]:
# pd.read_sql_query('select * from raw_df', con=engine).head()

# Load index_df into the database and confirm it was added by querying

In [13]:
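# NOTE(review): `index_df` is not defined in any cell shown in this notebook;
# create it before enabling this cell.
# index_df.to_sql(name='index_df', con=engine, if_exists='append', index=False)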

In [14]:
# pd.read_sql_query('select * from index_df', con=engine).head()