### Database Set-up

In [1]:
import os
from urllib.parse import quote_plus

# Connection settings for the PostgreSQL database queried in this notebook.
# Every value can be overridden through the standard libpq environment
# variables, so credentials for other servers never need to be typed into
# (or committed with) the notebook.
server = os.environ.get('PGHOST', 'postgres')
user = os.environ.get('PGUSER', 'postgres')
# NOTE(review): the fallback below is presumably a local development default;
# set PGPASSWORD instead of editing this cell when connecting anywhere else.
password = os.environ.get('PGPASSWORD', 'BigData1')
database = os.environ.get('PGDATABASE', 'discogs')

# Percent-encode the user name and password so that characters such as
# '@', ':' or '/' cannot break the structure of the URL.
connection_string = (
    f'postgresql://{quote_plus(user)}:{quote_plus(password)}'
    f'@{server}:5432/{database}'
)

In [2]:
# Load the `sql` IPython extension (reloading it if it is already loaded) so
# that the %sql / %%sql magics used below are available.
%reload_ext sql
# Open the database connection; IPython expands $connection_string to the
# value of the Python variable before the magic runs.
%sql $connection_string

In [3]:
%%sql
/* Connectivity check that asks the server to report its version string. */
SELECT version() AS version;

 * postgresql://postgres:***@postgres:5432/discogs
1 rows affected.


version
"PostgreSQL 13.4 (Debian 13.4-1.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit"


### Questions
#### Question 1

In [4]:
%%sql
/* Mean of tracks.duration over all rows, with the value cast to floating point first. */
SELECT
    AVG(CAST(duration AS DOUBLE PRECISION)) AS avg
FROM
    tracks;

 * postgresql://postgres:***@postgres:5432/discogs
1 rows affected.


avg
324.23738827711026


#### Question 2
How many artists have at least 10000 seconds of released music (i.e., total track duration >= 10000) and at least one release with the genre 'Classical'?

In [39]:
%%sql
/*
   Question 2. Count the artists whose tracks add up to a total duration of
   at least 10000 and who are credited on at least one release whose genre
   is Classical.
*/
WITH long_playing_artists AS (
    -- Total track duration per artist over every release linked to the
    -- artist through released_by. There is deliberately no ORDER BY here,
    -- because the outer COUNT does not depend on row order and a sort
    -- would be wasted work.
    SELECT artists.artist_id
    FROM artists
    JOIN released_by ON artists.artist_id = released_by.artist_id
    JOIN tracks ON released_by.release_id = tracks.release_id
    GROUP BY artists.artist_id
    HAVING SUM(tracks.duration) >= 10000
)
SELECT COUNT(*)
FROM long_playing_artists
WHERE EXISTS (
    -- EXISTS acts as a semi-join, so an artist with several Classical
    -- releases is still counted exactly once.
    SELECT 1
    FROM released_by
    JOIN releases ON released_by.release_id = releases.release_id
    WHERE released_by.artist_id = long_playing_artists.artist_id
      AND releases.genre = 'Classical'
);

 * postgresql://postgres:***@postgres:5432/discogs
1 rows affected.


count
149


#### Question 3
Write a query that returns the 10 genres with the most releases, in descending order of frequency. For each of these top 10 genres, also return the ID of the artist who has the most releases in that genre.

You may assume that the top artist for each genre is unique (i.e., there is exactly one artist who has released the highest number of releases for each genre).

Make sure that you write the genre names exactly as they appear in the dataset.

In [50]:
%%sql
/* Question 3, part 1. The ten genres with the most releases, most frequent first. */
WITH top10_genres AS (
    SELECT releases.genre, COUNT(*) AS num_genre
    FROM releases
    GROUP BY releases.genre
    -- The genre name is a tie-breaker, so the LIMIT cut-off is deterministic
    -- even if two genres have the same number of releases.
    ORDER BY num_genre DESC, genre
    LIMIT 10
)
SELECT genre, num_genre
FROM top10_genres
-- Sort again in the outer query, because the order of rows read from a CTE
-- is not guaranteed unless the outer SELECT has its own ORDER BY.
ORDER BY num_genre DESC, genre;




 * postgresql://postgres:***@postgres:5432/discogs
10 rows affected.


genre,num_genre
Electronic,439213
Rock,48292
Hip Hop,25759
Funk / Soul,9069
Jazz,8251
Reggae,3527
Pop,1800
Non-Music,1247
Classical,592
Latin,414


In [61]:
%%sql
/*
   Question 3, part 2. For each of the ten genres with the most releases,
   report the artist credited on the most releases of that genre.
   Output columns are genre, num_genre, artist_id and artist_name.
*/
WITH top10_genres AS (
    -- Ten most frequent genres. The genre name breaks ties at the LIMIT cut-off.
    SELECT releases.genre, COUNT(*) AS num_genre
    FROM releases
    GROUP BY releases.genre
    ORDER BY num_genre DESC, genre
    LIMIT 10
),
artist_genre_counts AS (
    -- Number of released_by rows per artist and genre, restricted to the ten
    -- genres of interest so that other genres are not aggregated needlessly.
    -- NOTE(review): assumes released_by has one row per artist and release;
    -- confirm against the schema.
    SELECT releases.genre,
           artists.artist_id,
           artists.name AS artist_name,
           COUNT(*) AS artist_releases_cnt_per_genre
    FROM artists
    JOIN released_by ON artists.artist_id = released_by.artist_id
    JOIN releases ON released_by.release_id = releases.release_id
    WHERE releases.genre IN (SELECT genre FROM top10_genres)
    GROUP BY releases.genre, artists.artist_id
),
ranked_artists AS (
    -- Rank artists inside each genre, highest count first. artist_id is a
    -- tie-breaker so the rank 1 row is deterministic.
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY genre
               ORDER BY artist_releases_cnt_per_genre DESC, artist_id
           ) AS genre_rank
    FROM artist_genre_counts
)
SELECT tg.genre, tg.num_genre, ra.artist_id, ra.artist_name
FROM top10_genres AS tg
JOIN ranked_artists AS ra ON tg.genre = ra.genre
WHERE ra.genre_rank = 1
ORDER BY tg.num_genre DESC, tg.genre;


 * postgresql://postgres:***@postgres:5432/discogs
10 rows affected.


genre,num_genre,artist_id,artist_name
Electronic,439213,2725,Depeche Mode
Rock,48292,82730,The Beatles
Hip Hop,25759,10783,Beastie Boys
Funk / Soul,9069,12596,James Brown
Jazz,8251,23755,Miles Davis
Reggae,3527,21764,Bob Marley & The Wailers
Pop,1800,69866,ABBA
Non-Music,1247,451987,Michael Koser
Classical,592,999914,Pyotr Ilyich Tchaikovsky
Latin,414,99729,Celia Cruz
