### Test notebook -- Sparkify

The queries below test the final star-schema tables that are created and loaded by `create_tables.py`, `etl.py` and `sql_queries.py`, using the connection parameters specified in `dwh.cfg`. 
The star schema is hosted on a Redshift cluster and is designed to let **Sparkify** analyse the behaviour of its users.

In [None]:
from time import time
import configparser
import psycopg2

In [None]:
# Read the connection parameters from the [CLUSTER] section of dwh.cfg.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open, which would only
# surface further down as a confusing NoSectionError. Opening the file
# explicitly makes a missing or unreadable dwh.cfg fail right here instead.
with open('dwh.cfg') as cfg_file:
    config.read_file(cfg_file)

# config.get() returns every value as a string, including the port.
DB_NAME = config.get("CLUSTER", "DB_NAME")
DB_USER = config.get("CLUSTER", "DB_USER")
DB_PASSWORD = config.get("CLUSTER", "DB_PASSWORD")
DB_PORT = config.get("CLUSTER", "DB_PORT")
HOST = config.get("CLUSTER", "HOST")

In [None]:
# Load the `sql` IPython extension; the %sql and %%sql magics in the cells below rely on it.
%load_ext sql

In [None]:
#Create connection
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

### Row count of each table: `songs`, `users`, `songplays`, `artists`, `time`

In [None]:
%%time
%%sql
/* Number of rows currently in the songs table */
SELECT
    COUNT(*)
FROM songs;

In [None]:
%%time
%%sql
/* Number of rows currently in the users table */
SELECT
    COUNT(*)
FROM users;

In [None]:
%%time
%%sql
/* Number of rows currently in the songplays table */
SELECT
    COUNT(*)
FROM songplays;

In [None]:
%%time
%%sql
/* Number of rows currently in the artists table */
SELECT
    COUNT(*)
FROM artists;

In [None]:
%%time
%%sql
/* Number of rows currently in the time table */
SELECT
    COUNT(*)
FROM time;

### Show five sample rows from each table

In [None]:
%%time
%%sql
/* Five sample rows from users. There is no ORDER BY, so which five rows come back is up to the engine. */
SELECT *
FROM users
LIMIT 5;

In [None]:
%%time
%%sql
/* Five sample rows from songs. There is no ORDER BY, so which five rows come back is up to the engine. */
SELECT *
FROM songs
LIMIT 5;

In [None]:
%%time
%%sql
/* Five sample rows from artists. There is no ORDER BY, so which five rows come back is up to the engine. */
SELECT *
FROM artists
LIMIT 5;

In [None]:
%%time
%%sql
/* Five sample rows from songplays. There is no ORDER BY, so which five rows come back is up to the engine. */
SELECT *
FROM songplays
LIMIT 5;

In [None]:
%%time
%%sql
/* Five sample rows from time. There is no ORDER BY, so which five rows come back is up to the engine. */
SELECT *
FROM time
LIMIT 5;

### For each week, show the artist name and song play count of every artist with more than 3 plays in that week, sorted by week and then by play count (most played first)

In [None]:
%%time
%%sql
/* Play count per week and artist, keeping only the week and artist pairs with more than 3 plays.
   Rows are sorted by week, then by play count from highest to lowest.
   NOTE(review) rows are grouped on t.week alone, so if week is a week-of-year number and the
   data spans several years, equal week numbers would be merged. Confirm against the time table. */
SELECT
    t.week,
    a.name,
    COUNT(sp.songplay_id)
FROM songplays AS sp
INNER JOIN time AS t
    ON sp.start_time = t.start_time
INNER JOIN artists AS a
    ON sp.artist_id = a.artist_id
GROUP BY
    t.week,
    a.name
HAVING COUNT(sp.songplay_id) > 3
ORDER BY
    t.week,
    COUNT(sp.songplay_id) DESC;

### Show song title, artist name and play count for the most played songs (more than 5 plays) over the entire observation period

In [None]:
%%time
%%sql
/* Play count per song title and artist name over all of songplays,
   keeping only combinations played more than 5 times, most played first. */
SELECT
    s.title,
    a.name,
    COUNT(sp.songplay_id)
FROM songplays AS sp
INNER JOIN songs AS s
    ON s.song_id = sp.song_id
INNER JOIN artists AS a
    ON a.artist_id = sp.artist_id
GROUP BY
    s.title,
    a.name
HAVING COUNT(sp.songplay_id) > 5
ORDER BY COUNT(sp.songplay_id) DESC;

### Show user_id, number of songs played, first and last name and gender for the 5 users with the most songs played

In [None]:
%%time
%%sql
/* The five users with the most song plays, together with their name and gender. */
WITH vip_user AS (
    -- Top five users by number of plays. user_id acts as a tie-breaker so
    -- that the same five users are picked on every run when counts are equal.
    SELECT
        user_id,
        COUNT(songplay_id) AS play_count
    FROM songplays
    GROUP BY user_id
    ORDER BY
        COUNT(songplay_id) DESC,
        user_id
    LIMIT 5
)

-- DISTINCT removes identical output rows, which guards against repeated
-- users rows for one user_id.
-- NOTE(review) confirm whether users can actually hold such duplicates.
SELECT DISTINCT
    vipu.user_id,
    vipu.play_count,
    u.first_name,
    u.last_name,
    u.gender
FROM vip_user AS vipu
INNER JOIN users AS u
    ON u.user_id = vipu.user_id
ORDER BY
    vipu.play_count DESC,
    vipu.user_id;

### For the 5 users with the most songs played, show user_id, level, the number of songs played and total duration played per user and level, the same totals per user, and the user's first name, last name and gender

In [None]:
%%time
%%sql
/* Per-level breakdown for the five users with the most song plays.
   One row per user and level, carrying the play count and summed duration
   for that level as well as the same two totals for the whole user. */
WITH vip_user AS (
    -- Top five users by number of plays. user_id acts as a tie-breaker so
    -- that the same five users are picked on every run when counts are equal.
    SELECT
        user_id,
        COUNT(songplay_id) AS play_count
    FROM songplays
    GROUP BY user_id
    ORDER BY
        COUNT(songplay_id) DESC,
        user_id
    LIMIT 5
),

vip_songplay_count AS (
    -- Plays and summed song duration per user and level, for those users only.
    -- LEFT JOIN keeps plays that have no matching songs row, so these counts
    -- agree with the totals used to rank users in vip_user and no ranked user
    -- can drop out. Such plays add nothing to the duration, as SUM skips NULL.
    -- level and duration are deliberately left unqualified.
    -- NOTE(review) presumably songplays.level and songs.duration - confirm
    -- against the table definitions before qualifying them.
    SELECT
        sp.user_id,
        level,
        COUNT(sp.songplay_id) AS play_count_by_level,
        SUM(duration) AS total_duration_by_level
    FROM songplays AS sp
    LEFT JOIN songs AS s
        ON sp.song_id = s.song_id
    WHERE sp.user_id IN (
        SELECT user_id
        FROM vip_user
    )
    GROUP BY
        sp.user_id,
        level
),

vip_full_list AS (
    -- Attach the user attributes. DISTINCT removes identical rows, which
    -- guards the window sums below against repeated users rows.
    -- NOTE(review) confirm whether users can actually hold such duplicates.
    SELECT DISTINCT
        vip.user_id,
        vip.level,
        vip.play_count_by_level,
        vip.total_duration_by_level,
        u.first_name,
        u.last_name,
        u.gender
    FROM vip_songplay_count AS vip
    INNER JOIN users AS u
        ON vip.user_id = u.user_id
)

-- Ordering happens only here. An ORDER BY inside the CTEs above would not
-- carry over to the final result. user_id sits between the two sort keys so
-- that the rows of one user stay together even when two users tie on total.
SELECT
    user_id,
    level,
    play_count_by_level,
    SUM(play_count_by_level) OVER (PARTITION BY user_id) AS play_count_by_user,
    ROUND(total_duration_by_level, 2) AS total_duration_by_level_,
    ROUND(SUM(total_duration_by_level) OVER (PARTITION BY user_id), 2) AS total_duration_by_user,
    first_name,
    last_name,
    gender
FROM vip_full_list
ORDER BY
    play_count_by_user DESC,
    user_id,
    play_count_by_level DESC;
