In [None]:
#%pip install SQLAlchemy pymysql python-dotenv pandas seaborn

# 1. Connect to SQL Database

- It is bad practice to store your database password in a Python script or Jupyter Notebook.
- Instead, we store the password in a separate `.env` file. 
- This is a text file that contains environment variables in the form of name=value pairs. 
- You can then load these variables into environment variables using the dotenv package.
- Such a `.env` file should never be shared with others or checked into version control (this will be covered in our lecture on Git and GitHub).

In [None]:
from dotenv import load_dotenv                        # Load passwords etc from .env file 
# Read the name=value pairs from the .env file into the process environment.
load_dotenv(dotenv_path='.env')                       # Absolute or relative path to the .env file

- The package SQLAlchemy allows you to connect to different variants of SQL databases (e.g. MySQL, PostgreSQL, SQLite, ...).
- To connect to a MySQL database, you need to specify both the database dialect (mysql) and choose an appropriate driver (pymysql or mysqlconnector).

In [None]:
import os                                             # Needed to access environment variables
from urllib.parse import quote                        # URL-encode credentials in connection strings

from sqlalchemy import create_engine                  # Connection to database

In [None]:
# Connection settings for the MySQL server; credentials come from the environment
# (populated from the .env file) so that they never appear in the notebook.
DIALECT = 'mysql'
DRIVER = 'pymysql'
USER = os.getenv('MYSQL_USER')
PASSWORD = os.getenv('MYSQL_PASSWORD')
HOST = 'localhost'
PORT = '3306'
DB = 'music'

# os.getenv returns None for unset variables; without this check the f-string
# below would silently embed the literal text "None" as user name / password.
_missing = [name for name, value in (('MYSQL_USER', USER), ('MYSQL_PASSWORD', PASSWORD))
            if value is None]
if _missing:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(_missing)}. "
        "Check that the .env file exists and has been loaded."
    )

# Percent-encode the credentials: characters such as '@', ':', '/' or '%' in a
# password would otherwise be parsed as URL delimiters and break the connection.
connection_string = (
    f"{DIALECT}+{DRIVER}://{quote(USER, safe='')}:{quote(PASSWORD, safe='')}"
    f"@{HOST}:{PORT}/{DB}"
)
engine = create_engine(connection_string)

# 2. Read from SQL Database to Pandas DataFrame

## 2.1 Inspecting the MySQL database

First, we inspect the available databases in our MySQL Server, and specifically the available tables in our "music" database.

In [None]:
import pandas as pd
from sqlalchemy import inspect

In [None]:
# The inspector gives access to metadata (schemas, tables) of the server behind `engine`.
inspector = inspect(engine)

print('Available Databases:')
schema_names = inspector.get_schema_names()
schema_names

In [None]:
# No schema is passed, so the tables of the engine's default database are listed.
table_names = inspector.get_table_names()
table_names

## 2.2 Read full table

We can read an entire table into a Pandas DataFrame.

In [None]:
# Passing a bare table name (instead of a query) loads the complete table.
# NOTE(review): the variable is called `tracks` although it holds the `charts` table.
tracks = pd.read_sql(sql='charts', con=engine)
tracks.head()

## 2.3 SQL Select Query

We can send arbitrary SQL queries to read data into a Pandas DataFrame

### Select all columns

In [None]:
# All columns of `charts`, restricted to three rows by the LIMIT clause.
query = "select * from charts limit 3"
pd.read_sql(sql=query, con=engine)

### Where conditions

In [None]:
# Keep only the chart rows whose stream count exceeds 17 million.
query = "select * from charts where streams >  17000000"
pd.read_sql(sql=query, con=engine)

### Aggregations

In [None]:
# Row count and mean number of streams over the whole `charts` table.
query = "select count(*), avg(streams) from charts "
pd.read_sql(sql=query, con=engine)

### Grouped aggregations

In [None]:
# Mean number of streams per region, regions with the highest average first.
query = "select region, avg(streams) from charts group by region order by avg(streams) desc"
pd.read_sql(sql=query, con=engine)

### Joining tables

In [None]:
# Left join: every chart row is kept and enriched with the matching track
# (matched on charts.track_id = tracks.id).
query = (
    "select * from charts\n"
    "           left join tracks\n"
    "           on charts.track_id = tracks.id"
)
pd.read_sql(sql=query, con=engine)

# 3. Write Pandas DataFrame to SQL Database

For this demo, we create a new DataFrame with aggregated data and then write it back to the database.

In [None]:
# Average streams per region (highest first); the result is written back to
# the database in the next step.
query = (
    "select region, avg(streams) as avg_streams from charts\n"
    "           left join tracks\n"
    "           on charts.track_id = tracks.id \n"
    "           group by region\n"
    "           order by avg_streams desc"
)
aggregated_data = pd.read_sql(sql=query, con=engine)
aggregated_data

In [None]:
# Store the aggregate as table `avg_streams_by_region`.
# if_exists accepts 'fail', 'replace' or 'append'; 'replace' lets this cell be re-run.
# index=False keeps the DataFrame index out of the table.
aggregated_data.to_sql(
    "avg_streams_by_region",
    engine,
    if_exists="replace",
    index=False,
)

In [None]:
# List the tables again to confirm that the new table was written.
# NOTE(review): a fresh inspector is created here, presumably so the listing is not served from a cache — confirm.
fresh_inspector = inspect(engine)
fresh_inspector.get_table_names()

# 4. Execute Arbitrary SQL

So far, we have only used the Pandas `read_sql` and `to_sql` functions to interact with our database. **However, we can run arbitrary SQL from our Python session**. 

In the following we:

1. Connect to our MySQL Server
2. Create a new test database
3. Create a new table from a Pandas DataFrame
4. Clean up by dropping the table and the database


In [None]:
from sqlalchemy import text
import seaborn as sns

### Connect to MySQL Server

Here we do not connect to a specific database, but to the MySQL Server (one level higher in the hierarchy). This will allow us to manage (create, alter, delete ...)  entire databases

In [None]:
# No database name at the end of the URL: the engine connects to the server itself.
# Credentials are percent-encoded so that characters such as '@', ':' or '/'
# in the password are not mistaken for URL delimiters.
connection_string = (
    f"{DIALECT}+{DRIVER}://{quote(USER, safe='')}:{quote(PASSWORD, safe='')}"
    f"@{HOST}:{PORT}"
)
server_engine = create_engine(connection_string)

### Create a new database

In [None]:
# Engine.begin() commits the transaction when the block exits without error,
# so the statements no longer rely on MySQL implicitly committing DDL.
# Dropping first makes the cell safe to re-run.
with server_engine.begin() as connection:
    connection.execute(text('DROP DATABASE IF EXISTS test'))
    connection.execute(text('CREATE DATABASE test'))

In [None]:
# Engine bound to the freshly created `test` database.
# Credentials are percent-encoded so that characters such as '@', ':' or '/'
# in the password are not mistaken for URL delimiters.
connection_string = (
    f"{DIALECT}+{DRIVER}://{quote(USER, safe='')}:{quote(PASSWORD, safe='')}"
    f"@{HOST}:{PORT}/test"
)
test_engine = create_engine(connection_string)

In [None]:
# Load seaborn's example "titanic" dataset and write it to the `test` database.
titanic = sns.load_dataset('titanic')

# if_exists='replace' keeps the cell re-runnable (the default 'fail' raises once
# the table exists); index=False leaves the DataFrame index out of the table,
# matching the earlier to_sql call in this notebook.
titanic.to_sql(name='titanic', con=test_engine, if_exists='replace', index=False)

### Clean up: drop table and database

In [None]:
# Engine.begin() commits the transaction when the block exits without error,
# so the statements no longer rely on MySQL implicitly committing DDL.
# IF EXISTS makes both statements safe to re-run.
with server_engine.begin() as connection:
    connection.execute(text("Drop table if exists test.titanic"))
    connection.execute(text("Drop database if exists test"))

Close all open connections, if present.

In [None]:
# Dispose of every engine created in this notebook, in creation-independent order:
# music database, server level, test database.
for open_engine in (engine, server_engine, test_engine):
    open_engine.dispose()