# Extracting data from files and selecting columns

In [30]:
import pandas as pd
from sqlalchemy import create_engine

In [31]:
json = pd.read_json("schoolInfo.json")
json = json[['overallRank', 'displayName', 'enrollment']]
json.columns = ['rank', 'name', 'enrollment']
json.head()

Unnamed: 0,rank,name,enrollment
0,1,Princeton University,5400.0
1,2,Harvard University,6710.0
2,3,University of Chicago,5941.0
3,3,Yale University,5472.0
4,5,Columbia University,6113.0


In [32]:
salary_potential = pd.read_csv("salary_potential.csv")
salary_potential = salary_potential[['name', 'early_career_pay', 'mid_career_pay']]
salary_potential.head()

Unnamed: 0,name,early_career_pay,mid_career_pay
0,Auburn University,54400,104500
1,University of Alabama in Huntsville,57500,103900
2,The University of Alabama,52300,97400
3,Tuskegee University,54500,93500
4,Samford University,48400,90500


In [33]:
tuition_cost = pd.read_csv("tuition_cost.csv")
tuition_cost = tuition_cost[['name', 'state', 'state_code']]
tuition_cost.head()

Unnamed: 0,name,state,state_code
0,Aaniiih Nakoda College,Montana,MT
1,Abilene Christian University,Texas,TX
2,Abraham Baldwin Agricultural College,Georgia,GA
3,Academy College,Minnesota,MN
4,Academy of Art University,California,CA


In [34]:
US_Income = pd.read_csv("US_Income14_18.csv") # median household income
US_Income = US_Income[['State or territory', '2014', '2015', '2016', '2017', '2018']]
US_Income.columns = ['state', 'MHI 2014', 'MHI 2015', 'MHI 2016', 'MHI 2017', 'MHI 2018']
US_Income

Unnamed: 0,state,MHI 2014,MHI 2015,MHI 2016,MHI 2017,MHI 2018
0,Washington DC,71648,75628,75506,82372,85203
1,Maryland,73971,75847,78945,80776,83242
2,New Yersey,71919,72222,76126,80088,81740
3,Hawaii,69592,73486,74511,77765,80212
4,Massachusetts,69160,70628,75297,77385,79835
5,Conneticut,70048,71346,73433,74168,76348
6,California,61933,64500,67739,71805,75277
7,New Hampshire,66532,70303,70936,73381,74991
8,Alaska,71583,73355,76440,73181,74346
9,Washington state,61366,64129,67106,70979,74073


In [35]:
US_Income['state']= US_Income['state'].astype(str)

In [36]:
US_Income.dtypes

state       object
MHI 2014     int64
MHI 2015     int64
MHI 2016     int64
MHI 2017     int64
MHI 2018     int64
dtype: object

# Merging Files

In [37]:
first_merge = pd.merge(json, salary_potential, on='name', how='outer')
second_merge = pd.merge(first_merge, tuition_cost, on='name', how='outer') # df with selected columns

raw_df = second_merge[['name', 'rank', 'enrollment', 'state', 'state_code', 'early_career_pay', 'mid_career_pay']]

In [38]:
raw_df.head()

Unnamed: 0,name,rank,enrollment,state,state_code,early_career_pay,mid_career_pay
0,Princeton University,1.0,5400.0,New Jersey,NJ,75200.0,139400.0
1,Harvard University,2.0,6710.0,,,74800.0,146800.0
2,University of Chicago,3.0,5941.0,Illinois,IL,64000.0,114200.0
3,Yale University,3.0,5472.0,Connecticut,CT,70300.0,138300.0
4,Columbia University,5.0,6113.0,New York,NY,,


In [40]:
US_Income.head()

Unnamed: 0,state,MHI 2014,MHI 2015,MHI 2016,MHI 2017,MHI 2018
0,Washington DC,71648,75628,75506,82372,85203
1,Maryland,73971,75847,78945,80776,83242
2,New Yersey,71919,72222,76126,80088,81740
3,Hawaii,69592,73486,74511,77765,80212
4,Massachusetts,69160,70628,75297,77385,79835


In [39]:
raw_df.to_csv('../csv tables/raw_df.csv')
US_Income.to_csv('../csv tables/US_Income.csv')

# Connection to local Database

In [41]:
rds_connection_string = "postgres:password@localhost:5432/ETL_DATABASES"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [42]:
engine.table_names()

['us_income', 'raw_df']

# Confirm data has been added by querying


In [43]:
raw_df.to_sql(name='raw_df', con=engine, if_exists='append', index=False)

In [44]:
pd.read_sql_query('select * from raw_df', con=engine).head()

Unnamed: 0,rank,name,enrollment,early_career_pay,mid_career_pay,state,state_code
0,1.0,Princeton University,5400.0,75200.0,139400.0,New Jersey,NJ
1,2.0,Harvard University,6710.0,74800.0,146800.0,,
2,3.0,University of Chicago,5941.0,64000.0,114200.0,Illinois,IL
3,3.0,Yale University,5472.0,70300.0,138300.0,Connecticut,CT
4,5.0,Columbia University,6113.0,,,New York,NY


# Confirm data has been added by querying

In [47]:
US_Income.to_sql(name='us_income', con=engine, if_exists='append', index=False)

In [48]:
pd.read_sql_query('select * from us_income', con=engine).head()

Unnamed: 0,state,MHI 2014,MHI 2015,MHI 2016,MHI 2017,MHI 2018
0,Washington DC,71648,75628,75506,82372,85203
1,Maryland,73971,75847,78945,80776,83242
2,New Yersey,71919,72222,76126,80088,81740
3,Hawaii,69592,73486,74511,77765,80212
4,Massachusetts,69160,70628,75297,77385,79835
