In [10]:
import pandas as pd
import shortuuid
from sqlalchemy import create_engine
import plotly.express as px
import psycopg2 as ps

In [2]:
df = pd.read_csv('dataset/cleaned_nur_data.csv', index_col=0)
df.head(2)

Unnamed: 0_level_0,name,location,rank,description,tuition_and_fees,in_state,undergrad_enrollment,state,year,state_full,region
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Princeton University,"Princeton, NJ",1,"Princeton, the fourth-oldest college in the Un...",45320,0,5402,NJ,0,New Jersey,Northeast
1,Harvard University,"Cambridge, MA",2,"Harvard is located in Cambridge, Massachusetts...",47074,0,6699,MA,0,Massachusetts,Northeast


In [3]:
user_state = 'NJ'
state_reg = df[df['state'] == user_state]['region'][0]
print(f'{user_state} is in the {state_reg}')    

NJ is in the Northeast


In [4]:
def generate_rank_id(row):
    concat_str = str(row['rank']) + str(row['name'])
    unique_id = shortuuid.uuid(name=concat_str)
    return unique_id

def generate_school_id(row):
    concat_str = str(row['name']) + str(row['location'])
    unique_id = shortuuid.uuid(name=concat_str)
    return unique_id

In [5]:
df_nur = df.copy()
df_nur = df_nur.rename(columns={'state': 'state_id'})
df_nur['rank_id'] = df_nur.apply(generate_rank_id, axis=1)
df_nur['school_id'] = df_nur.apply(generate_school_id, axis=1)
df_nur.head(3)

Unnamed: 0_level_0,name,location,rank,description,tuition_and_fees,in_state,undergrad_enrollment,state_id,year,state_full,region,rank_id,school_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Princeton University,"Princeton, NJ",1,"Princeton, the fourth-oldest college in the Un...",45320,0,5402,NJ,0,New Jersey,Northeast,Eq5cQV4hKkFR6z2MfWxEqc,Lr2R5kZceTU4pdjUnocLDg
1,Harvard University,"Cambridge, MA",2,"Harvard is located in Cambridge, Massachusetts...",47074,0,6699,MA,0,Massachusetts,Northeast,EkYNgT2BHJsCyyswYys6Vw,UdJi2dnSZPoorgUFCaSU5h
2,University of Chicago,"Chicago, IL",3,"The University of Chicago, situated in Chicago...",52491,0,5844,IL,0,Illinois,Midwest,XVqUGZZRK7ERHTrVRBrRTL,BGe6nZUAjMJF8Wf9e5ETiv


In [8]:
def region_df(reg):
    region = df_nur[df_nur['region'] == reg][['school_id', 'name', 'location', 'description', 'year', 'state_id', 'rank_id']]
    return region

In [20]:
northeast = region_df('Northeast')
midwest = region_df('Midwest')
west = region_df('West')
south = region_df('South')
rank_df = df_nur[['rank_id', 'rank', 'tuition_and_fees', 'in_state', 'undergrad_enrollment']]
state_df = df_nur[['state_id', 'state_full', 'region']]

In [11]:
engine = create_engine('postgresql://XXX:xxx@xxxx/nur_db')
conn = ps.connect('postgresql://XXX:1234@xxxx/nur_db')


In [27]:
def insertion(data, table_name):
    data.to_sql(name=table_name, con=engine, schema='nur_app', if_exists='replace', index=False)

In [19]:
insertion(northeast, 'northeast')
insertion(midwest, 'midwest')
insertion(south, 'south')
insertion(west, 'west')
insertion(rank_df, 'rank')
insertion(state_df, 'state')

In [33]:
query = """
SELECT n.name, tuition_and_fees
FROM nur_app.northeast n
JOIN nur_app.rank r
ON r.id = n.rank_id
ORDER BY tuition_and_fees DESC
LIMIT 3
"""
df = pd.read_sql_query(query, conn)
df


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



Unnamed: 0,name,tuition_and_fees
0,Columbia University,55056
1,Tufts University,52430
2,Carnegie Mellon University,52040


In [31]:
fig = px.bar(df, y="rank", x="name", text_auto=True,height = 300, width= 550, labels={'name':'', 'rank':''})
fig.update_layout(xaxis={"categoryorder": "total ascending"}, title_text="Top Universities by Rank")
fig.update_yaxes(showticklabels=False)
fig.show()

In [24]:
fig = px.bar(df, x='name', y='rank', text_auto=True)
# fig.update_yaxes(range=[0, max(df['rank'])])
fig