# MySQL Project
---
```bash
Author: Witchakorn Wanasanwongkot
```
## Sakila database

The Sakila database is a nicely normalised schema modelling a DVD rental store, featuring things like films, actors, film-actor relationships, and a central inventory table that connects films, stores, and rentals.

<p align="center">
    <img src="https://www.jooq.org/img/sakila.png" width="100%"/>
</p>

*Image source: https://www.jooq.org/img/sakila.png*

In [268]:
import pandas as pd
import sqlalchemy as db

In [269]:
# Specify the MySQL database configurations.
config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'root',
    'database': 'sakila'
}

db_user = config.get('user')
db_pwd = config.get('password')
db_host = config.get('host')
db_port = config.get('port')
db_name = config.get('database')

# Specify connection string
connection_str = f'mysql+pymysql://{db_user}:{db_pwd}@{db_host}:{db_port}/{db_name}'

# Connect to `sakila` database
engine = db.create_engine(connection_str)
conn = engine.connect()
print(conn)

<sqlalchemy.engine.base.Connection object at 0x000001CFA2166A90>


In [270]:
# Display the names of all tables in the `sakila` database.
query = """
    SELECT TABLE_NAME 
    FROM INFORMATION_SCHEMA.TABLES
    WHERE TABLE_TYPE = 'BASE TABLE' AND TABLE_SCHEMA='sakila' ;
"""

pd.read_sql(query, con=conn).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
TABLE_NAME,actor,address,category,city,country,customer,film,film_actor,film_category,film_text,inventory,language,payment,rental,staff,store


In [271]:
# Display the count of records in both the `actor` and `film` tables.
query = """
    SELECT
        (SELECT COUNT(*) FROM actor) AS actor,
        (SELECT COUNT(*) FROM film) AS film;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,actor,film
0,200,1000


In [272]:
# Display the actors whose names start with 'P' or 'A'.
query = """
    SELECT 
        * 
    FROM actor
    WHERE 
        first_name REGEXP '^[PA]';
"""

pd.read_sql(query, con=conn)

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1,PENELOPE,GUINESS,2006-02-15 04:34:33
1,29,ALEC,WAYNE,2006-02-15 04:34:33
2,34,AUDREY,OLIVIER,2006-02-15 04:34:33
3,46,PARKER,GOLDBERG,2006-02-15 04:34:33
4,49,ANNE,CRONYN,2006-02-15 04:34:33
5,54,PENELOPE,PINKETT,2006-02-15 04:34:33
6,65,ANGELA,HUDSON,2006-02-15 04:34:33
7,71,ADAM,GRANT,2006-02-15 04:34:33
8,76,ANGELINA,ASTAIRE,2006-02-15 04:34:33
9,104,PENELOPE,CRONYN,2006-02-15 04:34:33


In [273]:
# Display the top 10 actors who have acted in the most movies, ranked in descending order.
query = """
    SELECT
        first_name,
        last_name,
        COUNT(*) films
    FROM actor A
        JOIN film_actor B
            ON A.actor_id = B.actor_id
    GROUP BY 1, 2
    HAVING films >= 35
    ORDER BY 3 DESC;
"""

print("# It necessary to return more rows with values that match the last row in the limited result set")
pd.read_sql(query, con=conn)

# It necessary to return more rows with values that match the last row in the limited result set


Unnamed: 0,first_name,last_name,films
0,SUSAN,DAVIS,54
1,GINA,DEGENERES,42
2,WALTER,TORN,41
3,MARY,KEITEL,40
4,MATTHEW,CARREY,39
5,SANDRA,KILMER,37
6,SCARLETT,DAMON,36
7,UMA,WOOD,35
8,VAL,BOLGER,35
9,HENRY,BERRY,35


In [274]:
# Display the number of films in each rating.
query = """
    SELECT
        rating,
        COUNT(*) n
    FROM film
    GROUP BY 1;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,rating,n
0,PG,194
1,G,178
2,NC-17,210
3,PG-13,223
4,R,195


In [275]:
# Display the percentage of films in each rating.
query = """
    WITH num_rating AS (
    SELECT
        rating,
        COUNT(*) n,
        (SELECT COUNT(*) FROM film) AS totaln
    FROM film
    GROUP BY 1
    )

    SELECT
        rating,
        n,
        n / CAST(totaln AS REAL) AS percent
    FROM num_rating
    ORDER BY 2 DESC;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,rating,n,percent
0,PG-13,223,0.223
1,NC-17,210,0.21
2,R,195,0.195
3,PG,194,0.194
4,G,178,0.178


In [276]:
# Generate a new email column using the firstname and lastname.
query = """
    SELECT
        first_name,
        last_name,
        CONCAT(LOWER(first_name), '.', LOWER(LEFT(last_name, 1)), '@sakila.com')  AS email
    FROM actor;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,first_name,last_name,email
0,PENELOPE,GUINESS,penelope.g@sakila.com
1,NICK,WAHLBERG,nick.w@sakila.com
2,ED,CHASE,ed.c@sakila.com
3,JENNIFER,DAVIS,jennifer.d@sakila.com
4,JOHNNY,LOLLOBRIGIDA,johnny.l@sakila.com
...,...,...,...
195,BELA,WALKEN,bela.w@sakila.com
196,REESE,WEST,reese.w@sakila.com
197,MARY,KEITEL,mary.k@sakila.com
198,JULIA,FAWCETT,julia.f@sakila.com


In [277]:
# Retrieve the customer data for those who live in Japan.
query = """
    SELECT
        A.customer_id,
        A.first_name,
        A.last_name,
        B.address,
        C.city,
        D.country
    FROM customer A
        JOIN address B 
            ON A.address_id = B.address_id
        JOIN city C 
            ON B.city_id = C.city_id
        JOIN country D 
            ON C.country_id = D.country_id
    WHERE D.country = 'Japan';
"""

pd.read_sql(query, con=conn)

Unnamed: 0,customer_id,first_name,last_name,address,city,country
0,574,JULIAN,VEST,923 Tangail Boulevard,Akishima,Japan
1,141,DEBBIE,REYES,928 Jaffna Loop,Fukuyama,Japan
2,264,GWENDOLYN,MAY,446 Kirovo-Tepetsk Lane,Higashiosaka,Japan
3,194,KRISTEN,CHAVEZ,345 Oshawa Boulevard,Hino,Japan
4,299,JAMES,GANNON,1635 Kuwana Boulevard,Hiroshima,Japan
5,519,RON,DELUCA,1949 Sanya Street,Isesaki,Japan
6,240,MARLENE,WELCH,1148 Saarbrcken Parkway,Iwaki,Japan
7,54,TERESA,ROGERS,1964 Allappuzha (Alleppey) Street,Iwakuni,Japan
8,396,EARL,SHANKS,168 Cianjur Manor,Iwatsuki,Japan
9,391,CLARENCE,GAMEZ,767 Pyongyang Drive,Izumisano,Japan


In [278]:
# Show the list of comedy films with a 'PG' rating.
query = """
    SELECT
        C.name,
        A.film_id,
        A.title,
        A.release_year,
        A.rating
    FROM film A
        JOIN film_category B 
            ON A.film_id = B.film_id
        JOIN category C 
            ON B.category_id = C.category_id
            AND A.rating = 'PG'
            AND C.name = 'Comedy';
"""

pd.read_sql(query, con=conn)

Unnamed: 0,name,film_id,title,release_year,rating
0,Comedy,99,BRINGING HYSTERICAL,2006,PG
1,Comedy,188,CRAZY HOME,2006,PG
2,Comedy,265,DYING MAKER,2006,PG
3,Comedy,308,FERRIS MOTHER,2006,PG
4,Comedy,317,FIREBALL PHILADELPHIA,2006,PG
5,Comedy,404,HATE HANDICAP,2006,PG
6,Comedy,410,HEAVEN FREEDOM,2006,PG
7,Comedy,443,HURRICANE AFFAIR,2006,PG
8,Comedy,524,LION UNCUT,2006,PG
9,Comedy,555,MALLRATS UNITED,2006,PG


In [279]:
# Display the total number of sales and revenue for each branch.
query = """
    SELECT
        A.store_id,
        B.address,
        COUNT(*) AS totalN,
        ROUND(SUM(D.amount), 2) AS totalSales
    FROM store A
        JOIN address B 
            ON A.address_id = B.address_id
        JOIN staff C 
            ON A.store_id = C.store_id
        JOIN payment D 
            ON C.staff_id = D.staff_id
    GROUP BY 1, 2
    ORDER BY 4 DESC;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,store_id,address,totalN,totalSales
0,2,28 MySQL Boulevard,7990,33924.06
1,1,47 MySakila Drive,8054,33482.5


In [280]:
# Display the overall sales and revenue generated in 2005.
query = """
    SELECT
        A.store_id,
        address,
        DATE_FORMAT(payment_date, '%%Y%%m') AS monthId,
        COUNT(*) AS totalN,
        ROUND(SUM(D.amount), 2) AS totalSales
    FROM store A
        JOIN address B 
            ON A.address_id = B.address_id
        JOIN staff C 
            ON A.store_id = C.store_id
        JOIN payment D 
            ON C.staff_id = D.staff_id
    GROUP BY 1, 2, 3
    HAVING 
        monthId LIKE '2005%%'
    ORDER BY 1, 3;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,store_id,address,monthId,totalN,totalSales
0,1,47 MySakila Drive,200505,617,2621.83
1,1,47 MySakila Drive,200506,1163,4774.37
2,1,47 MySakila Drive,200507,3344,13998.56
3,1,47 MySakila Drive,200508,2835,11853.65
4,2,28 MySQL Boulevard,200505,539,2201.61
5,2,28 MySQL Boulevard,200506,1148,4855.52
6,2,28 MySQL Boulevard,200507,3365,14370.35
7,2,28 MySQL Boulevard,200508,2851,12216.49


In [281]:
# Display the list of names for customers who haven't returned their DVDs yet.
query = """
    SELECT 
        DISTINCT A.first_name,
        A.last_name,
        B.address,
        B.postal_code
    FROM customer A
        JOIN address B 
            ON A.address_id = B.address_id
        JOIN rental C 
            ON A.customer_id = C.customer_id
    WHERE 
        C.return_date IS NULL
    ORDER BY 1;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,first_name,last_name,address,postal_code
0,ADRIAN,CLARY,1986 Sivas Place,95775
1,ALBERT,CROUSE,1641 Changhwa Place,37636
2,ALBERTO,HENNING,502 Mandi Bahauddin Parkway,15992
3,ALICIA,MILLS,1963 Moscow Place,64863
4,ALLAN,CORNISH,947 Trshavn Place,841
...,...,...,...,...
154,WENDY,HARRISON,1107 Nakhon Sawan Avenue,75149
155,WILLIE,HOWELL,1244 Allappuzha (Alleppey) Place,20657
156,WILLIE,MARKHAM,1623 Kingstown Drive,91299
157,YOLANDA,WEAVER,605 Rio Claro Parkway,49348


In [282]:
# Display the list of DVDs available for rental at either store 1 or 2 only.
query = """
    SELECT 
        DISTINCT release_year,
        title,
        rating,
        store_id
    FROM film A
        JOIN inventory B 
            ON A.film_id = B.film_id
    WHERE 
        A.film_id IN (
            SELECT film_id
            FROM inventory
            GROUP BY 1
            HAVING COUNT(DISTINCT store_id) = 1
        )
    ORDER BY title;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,release_year,title,rating,store_id
0,2006,ACE GOLDFINGER,G,2
1,2006,ADAPTATION HOLES,NC-17,2
2,2006,AFRICAN EGG,G,2
3,2006,AIRPORT POLLOCK,R,2
4,2006,ALI FOREVER,PG,2
...,...,...,...,...
390,2006,WRATH MILE,NC-17,2
391,2006,WYOMING STORM,PG-13,1
392,2006,YOUNG LANGUAGE,G,1
393,2006,YOUTH KICK,NC-17,1


In [283]:
# Display the list of the top 10 best-selling movies.
query = """
    SELECT
        film.title,
        category.name,
        SUM(amount) AS totalSales
    FROM film
        JOIN inventory      USING(film_id)
        JOIN rental         USING(inventory_id)
        JOIN payment        USING(rental_id)
        JOIN film_category  USING(film_id)
        JOIN category       USING(category_id)
    GROUP BY 1, 2
    ORDER BY 3 DESC
    LIMIT 10;
"""

pd.read_sql(query, con=conn)    

Unnamed: 0,title,name,totalSales
0,TELEGRAPH VOYAGE,Music,231.73
1,WIFE TURN,Documentary,223.69
2,ZORRO ARK,Comedy,214.69
3,GOODFELLAS SALUTE,Sci-Fi,209.69
4,SATURDAY LAMBS,Sports,204.72
5,TITANS JERK,Sci-Fi,201.71
6,TORQUE BOUND,Drama,198.72
7,HARRY IDAHO,Drama,195.7
8,INNOCENT USUAL,Foreign,191.74
9,HUSTLER PARTY,Comedy,190.78


In [284]:
# Display the list of the 10 actors with the lowest income from rental sales.
query = """
    SELECT
        CONCAT(actor.first_name, ' ', actor.last_name) AS full_name,
        ROUND(SUM(amount), 2) AS totalSales
    FROM actor
        JOIN film_actor     USING(actor_id)
        JOIN film           USING(film_id)
        JOIN inventory      USING(film_id)
        JOIN rental         USING(inventory_id)
        JOIN payment        USING(rental_id)
    GROUP BY 1
    ORDER BY 2
    LIMIT 10;
"""

pd.read_sql(query, con=conn)    

Unnamed: 0,full_name,totalSales
0,EMILY DEE,883.85
1,SISSY SOBIESKI,902.65
2,ADAM GRANT,974.19
3,JUDY DEAN,976.47
4,SANDRA PECK,1040.12
5,JENNIFER DAVIS,1052.27
6,JULIA ZELLWEGER,1064.79
7,JULIA FAWCETT,1189.42
8,PENELOPE GUINESS,1230.94
9,RUSSELL CLOSE,1251.05


In [285]:
# Display the rental date/time range for each movie.
query = """
    SELECT
        rental_id,
        rental_date,
        return_date,
        DATEDIFF(return_date, rental_date) AS rent_days,
        TIMESTAMPDIFF(HOUR, rental_date, return_date) AS rent_hours
    FROM rental
    WHERE 
        return_date IS NOT NULL
    ORDER BY rent_days DESC, rent_hours DESC;
"""

pd.read_sql(query, con=conn)

Unnamed: 0,rental_id,rental_date,return_date,rent_days,rent_hours
0,12048,2005-08-17 20:49:24,2005-08-27 02:01:24,10,221
1,12091,2005-08-17 22:22:50,2005-08-27 03:55:50,10,221
2,12765,2005-08-18 23:21:50,2005-08-28 04:42:50,10,221
3,12774,2005-08-18 23:34:22,2005-08-28 05:00:22,10,221
4,14622,2005-08-21 18:25:59,2005-08-31 00:11:59,10,221
...,...,...,...,...,...
15856,5004,2005-07-09 01:20:50,2005-07-09 19:56:50,0,18
15857,5070,2005-07-09 04:58:26,2005-07-09 23:00:26,0,18
15858,5071,2005-07-09 05:00:39,2005-07-09 23:08:39,0,18
15859,5589,2005-07-10 04:22:58,2005-07-10 23:13:58,0,18


In [286]:
# Display the total sales and running total for each month.
query = """
    WITH payment_amt AS (
        SELECT
            DATE_FORMAT(payment_date, '%%Y-%%m') AS payment_date,
            SUM(amount) AS amount
        FROM payment
        GROUP BY 1
    )

    SELECT
        payment_date,
        ROUND(amount, 2) AS amount,
        ROUND(SUM(amount) OVER(ORDER BY payment_date), 2) AS rollAmt
    FROM payment_amt;
"""

pd.read_sql(query, con=conn)    

Unnamed: 0,payment_date,amount,rollAmt
0,2005-05,4823.44,4823.44
1,2005-06,9629.89,14453.33
2,2005-07,28368.91,42822.24
3,2005-08,24070.14,66892.38
4,2006-02,514.18,67406.56
