To read from a database into a pandas dataframe, we use another library called SQLAlchemy. This is already available on datasciencehub.

First, let's open our CSV (`Calgary_Public_Library_Locations_and_Hours.csv`) in pandas. 

This CSV is provided courtesy of the City of Calgary, under the Open Government License, and can be found here: https://data.calgary.ca/Recreation-and-Culture/Calgary-Public-Library-Locations-and-Hours/m9y7-ui7j

First, let's review the data we will be using, and read it into a DataFrame. 

In [1]:
import pandas as pd

college = pd.read_csv("HFA_39_EN.csv")
college.head()
#cpl_locations = pd.read_csv("Calgary_Public_Library_Locations_and_Hours.csv")
#cpl_locations.head()

Unnamed: 0,COUNTRY,COUNTRY_GRP,SEX,YEAR,VALUE
0,ALB,,ALL,2001.0,7.43
1,ALB,,ALL,2008.0,9.82
2,ALB,,ALL,2011.0,12.0
3,AND,,ALL,2003.0,27.81
4,AND,,ALL,2004.0,30.51


In [2]:
import sqlalchemy as sq
sq.__version__

'2.0.22'

To create a connection, we create an _engine_ object

In [3]:
# fill in the details for your username and password below 

engine = sq.create_engine('mysql+mysqlconnector://sean_anselmo:4i1tawVQFvTUd@datasciencedb.ucalgary.ca/sean_anselmo')

In [4]:
college.to_sql('college', engine )

ValueError: Table 'college' already exists.

In [10]:
# What is the SQL equivalent of this method call?

college_df = pd.read_sql_table("college", engine)
college_df.head()

Unnamed: 0,index,COUNTRY,COUNTRY_GRP,SEX,YEAR,VALUE
0,0,ALB,,ALL,2001.0,7.43
1,1,ALB,,ALL,2008.0,9.82
2,2,ALB,,ALL,2011.0,12.0
3,3,AND,,ALL,2003.0,27.81
4,4,AND,,ALL,2004.0,30.51


In [11]:
# Now try a query

query_table = pd.read_sql_query('select value from college;', engine)
print (query_table)

     value
0     7.43
1     9.82
2    12.00
3    27.81
4    30.51
..     ...
540    NaN
541    NaN
542    NaN
543    NaN
544    NaN

[545 rows x 1 columns]


In [12]:
college_df = college_df[:-7]  
print(college_df)

     index COUNTRY      COUNTRY_GRP  SEX    YEAR  VALUE
0        0     ALB             None  ALL  2001.0   7.43
1        1     ALB             None  ALL  2008.0   9.82
2        2     ALB             None  ALL  2011.0  12.00
3        3     AND             None  ALL  2003.0  27.81
4        4     AND             None  ALL  2004.0  30.51
..     ...     ...              ...  ...     ...    ...
533    533     TUR             None  ALL  2010.0  11.53
534    534     TUR             None  ALL  2011.0  12.76
535    535     UKR             None  ALL  1970.0   6.50
536    536     UKR             None  ALL  2001.0  38.14
537    537    None  WESTERN_BALKANS  ALL  2011.0  15.50

[538 rows x 6 columns]


In [13]:
college_df.to_sql('college', engine, if_exists='replace', index=False)

538

In [6]:
query_table = pd.read_sql_query('select country from college;', engine)
print (query_table)

    country
0       ALB
1       ALB
2       ALB
3       AND
4       AND
..      ...
533     TUR
534     TUR
535     UKR
536     UKR
537    None

[538 rows x 1 columns]


In [8]:
#Search for highest post secondary rate for age 25+

query_table = pd.read_sql_query('SELECT * FROM college ORDER BY value DESC LIMIT 1;', engine)
print(query_table)


   index COUNTRY COUNTRY_GRP  SEX    YEAR  VALUE
0     80     BLR        None  ALL  2019.0  73.77


In [9]:
#Search for lowest post secondary rate for age 25+

query_table = pd.read_sql_query('SELECT year FROM college ORDER BY value ASC LIMIT 1;', engine)
print(query_table)

   index COUNTRY COUNTRY_GRP  SEX    YEAR  VALUE
0    424     PRT        None  ALL  1970.0    1.6


In [10]:
#avg grad rate by year

query_table = pd.read_sql_query('SELECT year, AVG(value) AS avg_value FROM college GROUP BY year ORDER BY year;', engine)
print(query_table)


      year  avg_value
0   1970.0   5.033333
1   1971.0   3.840000
2   1972.0  14.800000
3   1975.0   6.975000
4   1976.0   6.700000
5   1977.0   4.600000
6   1978.0   5.700000
7   1979.0  15.400000
8   1980.0   8.256667
9   1981.0   6.009000
10  1982.0  15.580000
11  1983.0  10.060000
12  1984.0   8.755000
13  1985.0   9.327500
14  1986.0  14.366667
15  1987.0   8.940000
16  1988.0  14.210000
17  1989.0  14.218889
18  1990.0  15.028571
19  1991.0  11.988824
20  1992.0  15.405000
21  1993.0  16.345000
22  1994.0  19.724286
23  1995.0  21.333333
24  1998.0   7.900000
25  1999.0  30.410000
26  2000.0  16.462500
27  2001.0  19.453333
28  2002.0  19.590000
29  2003.0  21.167500
30  2004.0  24.545385
31  2005.0  23.505652
32  2006.0  26.078000
33  2007.0  26.161600
34  2008.0  25.138519
35  2009.0  27.538667
36  2010.0  26.674242
37  2011.0  26.468947
38  2012.0  28.875200
39  2013.0  28.796364
40  2014.0  29.341379
41  2015.0  31.605333
42  2016.0  30.744444
43  2017.0  34.972778
44  2018.0

In [12]:
#highest grad rate since 2000

query_table = pd.read_sql_query('SELECT country, AVG(value) AS avg_value FROM college WHERE year >= 2000 GROUP BY country ORDER BY avg_value DESC LIMIT 1;', engine)
print(query_table)


  country  avg_value
0     BLR     68.575


In [13]:
# Highest average value by year
highest_avg_by_year = pd.read_sql_query('SELECT year, AVG(value) AS avg_value FROM college GROUP BY year ORDER BY avg_value DESC LIMIT 1;', engine)
print("Highest Average Value by Year:")
print(highest_avg_by_year)

# Highest average value by country
highest_avg_by_country = pd.read_sql_query('SELECT country, AVG(value) AS avg_value FROM college GROUP BY country ORDER BY avg_value DESC LIMIT 1;', engine)
print("\nHighest Average Value by Country:")
print(highest_avg_by_country)


Highest Average Value by Year:
     year  avg_value
0  2019.0    37.6775

Highest Average Value by Country:
  country  avg_value
0     GEO  52.448333


In [16]:
engine.dispose()