In [81]:
from typing import LiteralString
import os
import duckdb

def process(sql: LiteralString, file: str) -> None:
    table = duckdb.sql(sql)
    table.to_parquet(f"./tmp/{file}.parquet")
    table.to_csv(f"./tmp/{file}.csv")
    print(f"{file} - {int(os.path.getsize(f"./tmp/{file}.parquet") / (1024 * 1024))}")
    table.show()

In [82]:
process("""
SELECT
    region,
    date,
    url[length('https://open.spotify.com/track/')+1:] as track_id,
    streams
FROM '../../data/charts_2017.csv'
WHERE chart = 'top200'
""", "charts_fmt")

charts_fmt - 53
┌───────────┬────────────┬────────────────────────┬─────────┐
│  region   │    date    │        track_id        │ streams │
│  varchar  │    date    │        varchar         │  int64  │
├───────────┼────────────┼────────────────────────┼─────────┤
│ Argentina │ 2017-01-01 │ 6mICuAdrwEjh6Y6lroV2Kg │  253019 │
│ Argentina │ 2017-01-01 │ 7DM4BPaS7uofFul3ywMe46 │  223988 │
│ Argentina │ 2017-01-01 │ 3AEZUABDXNtecAOSC1qTfo │  210943 │
│ Argentina │ 2017-01-01 │ 6rQSrBHf7HlZjtcMZ4S4bO │  173865 │
│ Argentina │ 2017-01-01 │ 58IL315gMSTD37DOZPJ2hf │  153956 │
│ Argentina │ 2017-01-01 │ 5J1c3M4EldCfNxXwrwt8mT │  151140 │
│ Argentina │ 2017-01-01 │ 1MpKZi1zTXpERKwxmOu1PH │  148369 │
│ Argentina │ 2017-01-01 │ 3QwBODjSEzelZyVjxPOHdq │  143004 │
│ Argentina │ 2017-01-01 │ 0sXvAOmXgjR2QUqLK1MltU │  126389 │
│ Argentina │ 2017-01-01 │ 20ZAJdsKB5IGbGj4ilRt2o │  112012 │
│   ·       │     ·      │           ·            │     ·   │
│   ·       │     ·      │           ·            │   

In [83]:
process("""
SELECT
    region,
    date,
    url[length('https://open.spotify.com/track/')+1:] as track_id,
    streams
FROM '../../data/charts_small.csv'
WHERE chart = 'top200'
""", "charts_fmt_small")

charts_fmt_small - 0
┌───────────┬────────────┬────────────────────────┬─────────┐
│  region   │    date    │        track_id        │ streams │
│  varchar  │    date    │        varchar         │  int64  │
├───────────┼────────────┼────────────────────────┼─────────┤
│ Argentina │ 2017-01-01 │ 6mICuAdrwEjh6Y6lroV2Kg │  253019 │
│ Argentina │ 2017-01-01 │ 7DM4BPaS7uofFul3ywMe46 │  223988 │
│ Argentina │ 2017-01-01 │ 3AEZUABDXNtecAOSC1qTfo │  210943 │
│ Argentina │ 2017-01-01 │ 6rQSrBHf7HlZjtcMZ4S4bO │  173865 │
│ Argentina │ 2017-01-01 │ 58IL315gMSTD37DOZPJ2hf │  153956 │
│ Poland    │ 2017-01-01 │ 4pdPtRcBmOSQDlJ3Fk945m │   26290 │
│ Poland    │ 2017-01-01 │ 5aAx2yezTd8zXrkmtKl66Z │   25198 │
│ Poland    │ 2017-01-01 │ 7BKLCZ1jbUBVqRi2FVlTVw │   24642 │
│ Poland    │ 2017-01-01 │ 7abpmGpF7PGep2rDU68GBR │   24630 │
│ Poland    │ 2017-01-01 │ 5knuzwU65gJK7IF5yJsuaW │   23163 │
├───────────┴────────────┴────────────────────────┴─────────┤
│ 10 rows                                        

In [84]:
process("""
SELECT
    region,
    date,
    SUM(streams) as streams
FROM './tmp/charts_fmt.parquet'
GROUP BY region, date
ORDER BY region, date
""", "charts_daily_sum")

charts_daily_sum - 0
┌───────────┬────────────┬─────────┐
│  region   │    date    │ streams │
│  varchar  │    date    │ int128  │
├───────────┼────────────┼─────────┤
│ Argentina │ 2017-01-01 │ 7888872 │
│ Argentina │ 2017-01-02 │ 6010041 │
│ Argentina │ 2017-01-03 │ 5921907 │
│ Argentina │ 2017-01-04 │ 6019573 │
│ Argentina │ 2017-01-05 │ 6223646 │
│ Argentina │ 2017-01-06 │ 6485179 │
│ Argentina │ 2017-01-07 │ 6840811 │
│ Argentina │ 2017-01-08 │ 5691182 │
│ Argentina │ 2017-01-09 │ 5880184 │
│ Argentina │ 2017-01-10 │ 5896533 │
│    ·      │     ·      │    ·    │
│    ·      │     ·      │    ·    │
│    ·      │     ·      │    ·    │
│ Ireland   │ 2017-04-12 │ 1787759 │
│ Ireland   │ 2017-04-13 │ 1811616 │
│ Ireland   │ 2017-04-14 │ 2074141 │
│ Ireland   │ 2017-04-15 │ 1941649 │
│ Ireland   │ 2017-04-16 │ 1644404 │
│ Ireland   │ 2017-04-17 │ 1591220 │
│ Ireland   │ 2017-04-18 │ 1805038 │
│ Ireland   │ 2017-04-19 │ 1760535 │
│ Ireland   │ 2017-04-20 │ 1776933 │
│ Ireland   │ 201

In [85]:
process("""
SELECT
    region,
    date,
    SUM(streams) as streams
FROM './tmp/charts_fmt_small.parquet'
GROUP BY region, date
ORDER BY region, date
""", "charts_daily_sum_small")

charts_daily_sum_small - 0
┌───────────┬────────────┬─────────┐
│  region   │    date    │ streams │
│  varchar  │    date    │ int128  │
├───────────┼────────────┼─────────┤
│ Argentina │ 2017-01-01 │ 1015771 │
│ Poland    │ 2017-01-01 │  123923 │
└───────────┴────────────┴─────────┘



In [86]:
process("""
SELECT
    region,
    YEAR(date) as year,
    AVG(streams) as stream_avg,
    STDDEV(streams) as stream_dev
FROM './tmp/charts_daily_sum.parquet'
GROUP BY region, year
ORDER BY region, year
""", "charts_yearly_stats")

charts_yearly_stats - 0
┌────────────────┬───────┬────────────────────┬────────────────────┐
│     region     │ year  │     stream_avg     │     stream_dev     │
│    varchar     │ int64 │       double       │       double       │
├────────────────┼───────┼────────────────────┼────────────────────┤
│ Argentina      │  2017 │  7424229.113259668 │ 1034041.7027895374 │
│ Australia      │  2017 │  9070440.314917127 │ 1217620.9157923514 │
│ Austria        │  2017 │  922355.2016574586 │ 150425.76480612514 │
│ Belgium        │  2017 │  1503151.359116022 │ 198449.60396451122 │
│ Bolivia        │  2017 │ 260551.65745856354 │  59982.49361972868 │
│ Brazil         │  2017 │ 18447148.599447515 │ 3181632.0441991403 │
│ Bulgaria       │  2017 │ 31598.537037037036 │ 10098.273587588357 │
│ Canada         │  2017 │  8635064.616022099 │  940246.5927111489 │
│ Chile          │  2017 │  5865933.679558011 │  884993.0267524644 │
│ Colombia       │  2017 │   2248082.73480663 │  196956.0484257076 │
│    ·    

In [87]:
process("""
SELECT
    region,
    YEAR(date) as year,
    AVG(streams) as stream_avg,
    STDDEV(streams) as stream_dev
FROM './tmp/charts_daily_sum_small.parquet'
GROUP BY region, year
ORDER BY region, year
""", "charts_yearly_stats_small")

charts_yearly_stats_small - 0


┌───────────┬───────┬────────────┬────────────┐
│  region   │ year  │ stream_avg │ stream_dev │
│  varchar  │ int64 │   double   │   double   │
├───────────┼───────┼────────────┼────────────┤
│ Argentina │  2017 │  1015771.0 │       NULL │
│ Poland    │  2017 │   123923.0 │       NULL │
└───────────┴───────┴────────────┴────────────┘



In [88]:
process("""
SELECT
    region,
    date,
    CASE
        WHEN stream_std < -1.5 THEN 'VERY LOW'
        WHEN stream_std < -0.5 THEN 'LOW'
        WHEN stream_std > 1.5 THEN 'VERY HIGH'
        WHEN stream_std > 0.5 THEN 'HIGH'
        ELSE 'AVERAGE'
    END AS popularity
FROM (
    SELECT
        d.region,
        d.date,
        (d.streams - y.stream_avg) / y.stream_dev as stream_std
    FROM './tmp/charts_daily_sum.parquet' AS d
    JOIN './tmp/charts_yearly_stats.parquet' AS y
        ON YEAR(d.date) = y.year AND d.region = y.region
)
""", "charts_daily_popularity")

charts_daily_popularity - 0
┌───────────┬────────────┬────────────┐
│  region   │    date    │ popularity │
│  varchar  │    date    │  varchar   │
├───────────┼────────────┼────────────┤
│ Argentina │ 2017-01-01 │ AVERAGE    │
│ Argentina │ 2017-01-02 │ LOW        │
│ Argentina │ 2017-01-03 │ LOW        │
│ Argentina │ 2017-01-04 │ LOW        │
│ Argentina │ 2017-01-05 │ LOW        │
│ Argentina │ 2017-01-06 │ LOW        │
│ Argentina │ 2017-01-07 │ LOW        │
│ Argentina │ 2017-01-08 │ VERY LOW   │
│ Argentina │ 2017-01-09 │ LOW        │
│ Argentina │ 2017-01-10 │ LOW        │
│    ·      │     ·      │  ·         │
│    ·      │     ·      │  ·         │
│    ·      │     ·      │  ·         │
│ Ireland   │ 2017-04-12 │ AVERAGE    │
│ Ireland   │ 2017-04-13 │ AVERAGE    │
│ Ireland   │ 2017-04-14 │ VERY HIGH  │
│ Ireland   │ 2017-04-15 │ HIGH       │
│ Ireland   │ 2017-04-16 │ LOW        │
│ Ireland   │ 2017-04-17 │ LOW        │
│ Ireland   │ 2017-04-18 │ AVERAGE    │
│ Ireland   

In [89]:
process("""
SELECT
    region,
    date,
    CASE
        WHEN stream_std < -1.5 THEN 'VERY LOW'
        WHEN stream_std < -0.5 THEN 'LOW'
        WHEN stream_std > 1.5 THEN 'VERY HIGH'
        WHEN stream_std > 0.5 THEN 'HIGH'
        ELSE 'AVERAGE'
    END AS popularity
FROM (
    SELECT
        d.region,
        d.date,
        (d.streams - y.stream_avg) / y.stream_dev as stream_std
    FROM './tmp/charts_daily_sum_small.parquet' AS d
    JOIN './tmp/charts_yearly_stats_small.parquet' AS y
        ON YEAR(d.date) = y.year AND d.region = y.region
)
""", "charts_daily_popularity_small")

charts_daily_popularity_small - 0
┌───────────┬────────────┬────────────┐
│  region   │    date    │ popularity │
│  varchar  │    date    │  varchar   │
├───────────┼────────────┼────────────┤
│ Argentina │ 2017-01-01 │ AVERAGE    │
│ Poland    │ 2017-01-01 │ AVERAGE    │
└───────────┴────────────┴────────────┘



In [90]:
process("""
SELECT
    c.country,
    w.date,
    AVG(w.temperature_c) as temperature_c,
    COALESCE(AVG(w.precipitation_mm), 0) as precipitation_mm
FROM (
    SELECT
        station_id,
        date::DATE as date,
        avg_temp_c as temperature_c,
        precipitation_mm,
    FROM '../../data/daily_weather_2017.csv'
    WHERE date BETWEEN '2017-01-01' AND '2021-12-31'
) AS w
JOIN '../../data/cities.csv' AS c
    ON w.station_id = c.station_id
GROUP BY c.country, w.date
HAVING temperature_c IS NOT NULL
ORDER BY c.country, w.date
""", "daily_country_weather")

daily_country_weather - 0
┌──────────────┬────────────┬──────────────────────┬─────────────────────┐
│   country    │    date    │    temperature_c     │  precipitation_mm   │
│   varchar    │    date    │        double        │       double        │
├──────────────┼────────────┼──────────────────────┼─────────────────────┤
│ Afghanistan  │ 2017-01-01 │   5.3166666666666655 │                 0.0 │
│ Afghanistan  │ 2017-01-02 │    5.016666666666667 │                 2.0 │
│ Afghanistan  │ 2017-01-03 │   3.0666666666666664 │              10.725 │
│ Afghanistan  │ 2017-01-04 │                 2.65 │               109.0 │
│ Afghanistan  │ 2017-01-05 │   1.9333333333333333 │               29.95 │
│ Afghanistan  │ 2017-01-06 │   0.9833333333333331 │                7.25 │
│ Afghanistan  │ 2017-01-07 │   0.2833333333333334 │                33.0 │
│ Afghanistan  │ 2017-01-08 │ -0.07999999999999999 │                18.0 │
│ Afghanistan  │ 2017-01-09 │ -0.21666666666666665 │                 0.0 │

In [91]:
process("""
SELECT
    c.country,
    w.date,
    AVG(w.temperature_c) as temperature_c,
    COALESCE(AVG(w.precipitation_mm), 0) as precipitation_mm
FROM (
    SELECT
        station_id,
        date::DATE as date,
        avg_temp_c as temperature_c,
        precipitation_mm,
    FROM '../../data/daily_weather_2017.csv'
) AS w
JOIN '../../data/cities.csv' AS c
    ON w.station_id = c.station_id
WHERE w.date = '2017-01-01' AND c.country IN ('Argentina', 'Poland')
GROUP BY c.country, w.date
HAVING temperature_c IS NOT NULL
ORDER BY c.country, w.date
""", "daily_country_weather_small")

daily_country_weather_small - 0
┌───────────┬────────────┬────────────────────┬──────────────────┐
│  country  │    date    │   temperature_c    │ precipitation_mm │
│  varchar  │    date    │       double       │      double      │
├───────────┼────────────┼────────────────────┼──────────────────┤
│ Argentina │ 2017-01-01 │ 26.084999999999997 │             11.0 │
│ Poland    │ 2017-01-01 │ 0.5777777777777778 │           0.0375 │
└───────────┴────────────┴────────────────────┴──────────────────┘



In [92]:
process("""
WITH wdi AS (
    SELECT
        "Country Name" as country,
        "Indicator Code" as code,
        "2016", "2017", "2018", "2019", "2020", "2021"
    FROM read_csv('../../data/WDIData_2017.csv', header=True)
    WHERE code IN ('SP.RUR.TOTL.ZS', 'SP.DYN.TFRT.IN', 'NY.GDP.PCAP.CD', 'IT.CEL.SETS.P2', 'SM.POP.REFG', 'SP.POP.TOTL')
)
SELECT
    country,
    year,
    rural_population_percent,
    fertility_rate,
    gdp_per_capita_usd,
    mobile_subscriptions_per_100,
    refugee_population / total_population * 1000 AS refugee_population_promille,
FROM (
    SELECT
        country, year,
        MAX(CASE WHEN code = 'SP.RUR.TOTL.ZS' THEN value END) AS rural_population_percent,
        MAX(CASE WHEN code = 'SP.DYN.TFRT.IN' THEN value END) AS fertility_rate,
        MAX(CASE WHEN code = 'NY.GDP.PCAP.CD' THEN value END) AS gdp_per_capita_usd,
        MAX(CASE WHEN code = 'IT.CEL.SETS.P2' THEN value END) AS mobile_subscriptions_per_100,
        MAX(CASE WHEN code = 'SM.POP.REFG' THEN value END) AS refugee_population,
        MAX(CASE WHEN code = 'SP.POP.TOTL' THEN value END) AS total_population
    FROM (
        SELECT country, code, 2016 AS year, "2016" AS value FROM wdi
        UNION ALL
        SELECT country, code, 2017 AS year, "2017" AS value FROM wdi
        UNION ALL
        SELECT country, code, 2018 AS year, "2018" AS value FROM wdi
        UNION ALL
        SELECT country, code, 2019 AS year, "2019" AS value FROM wdi
        UNION ALL
        SELECT country, code, 2020 AS year, "2020" AS value FROM wdi
        UNION ALL
        SELECT country, code, 2021 AS year, "2021" AS value FROM wdi
    )
    GROUP BY country, year
)
WHERE rural_population_percent IS NOT NULL
    AND fertility_rate IS NOT NULL
    AND gdp_per_capita_usd IS NOT NULL
    AND mobile_subscriptions_per_100 IS NOT NULL
    AND refugee_population IS NOT NULL
    AND total_population IS NOT NULL
ORDER BY country, year
""", "wdi_normalized")

wdi_normalized - 0
┌─────────────────────────────┬───────┬──────────────────────────┬──────────────────┬────────────────────┬──────────────────────────────┬─────────────────────────────┐
│           country           │ year  │ rural_population_percent │  fertility_rate  │ gdp_per_capita_usd │ mobile_subscriptions_per_100 │ refugee_population_promille │
│           varchar           │ int32 │          double          │      double      │       double       │            double            │           double            │
├─────────────────────────────┼───────┼──────────────────────────┼──────────────────┼────────────────────┼──────────────────────────────┼─────────────────────────────┤
│ Afghanistan                 │  2016 │                    74.98 │            5.262 │   520.252064031151 │                  62.37109623 │          1.7256508485470132 │
│ Afghanistan                 │  2017 │                    74.75 │            5.129 │   530.149830802984 │                  67.13641492 │    

In [93]:
process("""
SELECT
    prv.country,
    d.date,
    prv.rural_population_percent * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.rural_population_percent * (EXTRACT('dayofyear' FROM d.date) / 365) AS rural_population_percent,
    prv.fertility_rate * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.fertility_rate * (EXTRACT('dayofyear' FROM d.date) / 365) AS fertility_rate,
    prv.gdp_per_capita_usd * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.gdp_per_capita_usd * (EXTRACT('dayofyear' FROM d.date) / 365) AS gdp_per_capita_usd,
    prv.mobile_subscriptions_per_100 * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.mobile_subscriptions_per_100 * (EXTRACT('dayofyear' FROM d.date) / 365) AS mobile_subscriptions_per_100,
    prv.refugee_population_promille * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.refugee_population_promille * (EXTRACT('dayofyear' FROM d.date) / 365) AS refugee_population_promille
FROM (
    SELECT generate_series::DATE AS date
    FROM generate_series(DATE '2017-01-01', DATE '2021-12-31', INTERVAL '1' DAY)
) AS d
JOIN './tmp/wdi_normalized.parquet' AS nxt
    ON YEAR(d.date) = nxt.year
JOIN './tmp/wdi_normalized.parquet' AS prv
    ON YEAR(d.date) - 1 = prv.year AND prv.country = nxt.country
ORDER BY prv.country, d.date
""", "wdi_interpolated")

wdi_interpolated - 14
┌─────────────┬────────────┬──────────────────────────┬────────────────────┬────────────────────┬──────────────────────────────┬─────────────────────────────┐
│   country   │    date    │ rural_population_percent │   fertility_rate   │ gdp_per_capita_usd │ mobile_subscriptions_per_100 │ refugee_population_promille │
│   varchar   │    date    │          double          │       double       │       double       │            double            │           double            │
├─────────────┼────────────┼──────────────────────────┼────────────────────┼────────────────────┼──────────────────────────────┼─────────────────────────────┤
│ Afghanistan │ 2017-01-01 │         74.9793698630137 │ 5.2616356164383555 │  520.2791812003888 │            62.38415189764383 │           1.726759155574102 │
│ Afghanistan │ 2017-01-02 │         74.9787397260274 │ 5.2612712328767115 │  520.3062983696269 │            62.39720756528767 │          1.7278674626011907 │
│ Afghanistan │ 2017-01-

In [94]:
process("""
SELECT
    prv.country,
    d.date,
    prv.rural_population_percent * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.rural_population_percent * (EXTRACT('dayofyear' FROM d.date) / 365) AS rural_population_percent,
    prv.fertility_rate * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.fertility_rate * (EXTRACT('dayofyear' FROM d.date) / 365) AS fertility_rate,
    prv.gdp_per_capita_usd * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.gdp_per_capita_usd * (EXTRACT('dayofyear' FROM d.date) / 365) AS gdp_per_capita_usd,
    prv.mobile_subscriptions_per_100 * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.mobile_subscriptions_per_100 * (EXTRACT('dayofyear' FROM d.date) / 365) AS mobile_subscriptions_per_100,
    prv.refugee_population_promille * (1 - (EXTRACT('dayofyear' FROM d.date) / 365)) + nxt.refugee_population_promille * (EXTRACT('dayofyear' FROM d.date) / 365) AS refugee_population_promille
FROM (
    SELECT generate_series::DATE AS date
    FROM generate_series(DATE '2017-01-01', DATE '2021-12-31', INTERVAL '1' DAY)
) AS d
JOIN './tmp/wdi_normalized.parquet' AS nxt
    ON YEAR(d.date) = nxt.year
JOIN './tmp/wdi_normalized.parquet' AS prv
    ON YEAR(d.date) - 1 = prv.year AND prv.country = nxt.country
WHERE prv.country IN ('Argentina', 'Poland') AND d.date = '2017-01-01'
ORDER BY prv.country, d.date
""", "wdi_interpolated_small")

wdi_interpolated_small - 0
┌───────────┬────────────┬──────────────────────────┬────────────────────┬────────────────────┬──────────────────────────────┬─────────────────────────────┐
│  country  │    date    │ rural_population_percent │   fertility_rate   │ gdp_per_capita_usd │ mobile_subscriptions_per_100 │ refugee_population_promille │
│  varchar  │    date    │          double          │       double       │       double       │            double            │           double            │
├───────────┼────────────┼──────────────────────────┼────────────────────┼────────────────────┼──────────────────────────────┼─────────────────────────────┤
│ Argentina │ 2017-01-01 │        8.372665753424657 │             2.2408 │  12795.23644407049 │            145.9031169849315 │         0.07494967452275388 │
│ Poland    │ 2017-01-01 │                  39.8222 │ 1.3902465753424658 │  12382.69604662878 │            137.5341305038356 │          0.3091731147958112 │
└───────────┴────────────┴─────

In [95]:
process("""
SELECT track_id, UNNEST(artists) AS artist FROM (
    SELECT
        track_id,
        string_split(MAX(artists)[3:-3], ''', ''') AS artists
    FROM '../../data/api.csv'
    WHERE track_id IN (
        SELECT DISTINCT track_id FROM './tmp/charts_fmt.parquet'
    )
    GROUP BY track_id
)
""", "api_track_to_artist")

api_track_to_artist - 0
┌────────────────────────┬────────────────────────────────────────┐
│        track_id        │                 artist                 │
│        varchar         │                varchar                 │
├────────────────────────┼────────────────────────────────────────┤
│ 0bzssi2eITqFX81J0fKCun │ KSI                                    │
│ 5dRN3UToofUByy58fJF2N6 │ Deniz Tekin                            │
│ 2SYzK3AXKXPk4bh3hWrmQb │ Hayce Lemsi', "Jok'air                 │
│ 7jn2j3IRq37mEmUI6UaoDR │ Hooss', "Rim'K                         │
│ 2ohQN4UOfgXGwtF2ISUkFb │ Melendi                                │
│ 2ohQN4UOfgXGwtF2ISUkFb │ ChocQuibTown                           │
│ 2svutFGaCycRmYc1mWbfQh │ N.E.R.D                                │
│ 2svutFGaCycRmYc1mWbfQh │ Kendrick Lamar                         │
│ 2svutFGaCycRmYc1mWbfQh │ Pharrell Williams                      │
│ 0XMCmQYUzuPa8spjSW62zm │ Orquesta Internacional Hermanos Flores │
│           ·           

In [96]:
process("""
SELECT track_id, UNNEST(artists) AS artist FROM (
    SELECT
        track_id,
        string_split(MAX(artists)[3:-3], ''', ''') AS artists
    FROM '../../data/api.csv'
    WHERE track_id IN (
        SELECT DISTINCT track_id FROM './tmp/charts_fmt_small.parquet'
    )
    GROUP BY track_id
)
""", "api_track_to_artist_small")

api_track_to_artist_small - 0
┌────────────────────────┬───────────────────┐
│        track_id        │      artist       │
│        varchar         │      varchar      │
├────────────────────────┼───────────────────┤
│ 58IL315gMSTD37DOZPJ2hf │ Daddy Yankee      │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ The Chainsmokers  │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ Halsey            │
│ 6rQSrBHf7HlZjtcMZ4S4bO │ J Balvin          │
│ 6rQSrBHf7HlZjtcMZ4S4bO │ Pharrell Williams │
│ 6rQSrBHf7HlZjtcMZ4S4bO │ BIA               │
│ 6rQSrBHf7HlZjtcMZ4S4bO │ Sky Rompiendo     │
│ 7abpmGpF7PGep2rDU68GBR │ Burak Yeter       │
│ 7abpmGpF7PGep2rDU68GBR │ Danelle Sandoval  │
│ 3AEZUABDXNtecAOSC1qTfo │ CNCO              │
│ 4pdPtRcBmOSQDlJ3Fk945m │ DJ Snake          │
│ 4pdPtRcBmOSQDlJ3Fk945m │ Justin Bieber     │
│ 5aAx2yezTd8zXrkmtKl66Z │ The Weeknd        │
│ 5aAx2yezTd8zXrkmtKl66Z │ Daft Punk         │
│ 5knuzwU65gJK7IF5yJsuaW │ Clean Bandit      │
│ 5knuzwU65gJK7IF5yJsuaW │ Sean Paul         │
│ 5knuzwU65gJK7IF5yJsuaW │ Ann

In [97]:
process("""
SELECT track_id, UNNEST(genres) AS genre FROM (
    SELECT
        track_id,
        string_split(MAX(artist_genres)[3:-3], ''', ''') AS genres
    FROM '../../data/api.csv'
    WHERE track_id IN (
        SELECT DISTINCT track_id FROM './tmp/charts_fmt.parquet'
    )
    GROUP BY track_id
)
""", "api_track_to_genre")

api_track_to_genre - 0
┌────────────────────────┬─────────────────────┐
│        track_id        │        genre        │
│        varchar         │       varchar       │
├────────────────────────┼─────────────────────┤
│ 3Fe0fahTqOILtmPOtcNam0 │ suomi rock          │
│ 3Fe0fahTqOILtmPOtcNam0 │ finnish dance pop   │
│ 3Fe0fahTqOILtmPOtcNam0 │ finnish pop         │
│ 4aiLpwwZw5Ek9KkBMnt6Zu │ traphall            │
│ 4aiLpwwZw5Ek9KkBMnt6Zu │ uk pop              │
│ 4aiLpwwZw5Ek9KkBMnt6Zu │ dancehall           │
│ 4aiLpwwZw5Ek9KkBMnt6Zu │ uk funky            │
│ 4aiLpwwZw5Ek9KkBMnt6Zu │ uk contemporary r&b │
│ 4aiLpwwZw5Ek9KkBMnt6Zu │ dance pop           │
│ 4aiLpwwZw5Ek9KkBMnt6Zu │ jamaican hip hop    │
│           ·            │    ·                │
│           ·            │    ·                │
│           ·            │    ·                │
│ 3gYHMSi0QXprEALlsYDgF5 │ iskelma             │
│ 3gYHMSi0QXprEALlsYDgF5 │ suomi rock          │
│ 5eyPMRoFmMtNO9af4b1yEJ │ dutch hip hop      

In [98]:
process("""
SELECT track_id, UNNEST(genres) AS genre FROM (
    SELECT
        track_id,
        string_split(MAX(artist_genres)[3:-3], ''', ''') AS genres
    FROM '../../data/api.csv'
    WHERE track_id IN (
        SELECT DISTINCT track_id FROM './tmp/charts_fmt_small.parquet'
    )
    GROUP BY track_id
)
""", "api_track_to_genre_small")

api_track_to_genre_small - 0
┌────────────────────────┬──────────────────────┐
│        track_id        │        genre         │
│        varchar         │       varchar        │
├────────────────────────┼──────────────────────┤
│ 7BKLCZ1jbUBVqRi2FVlTVw │ pop                  │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ indie poptimism      │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ etherpop             │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ pop dance            │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ electropop           │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ dance pop            │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ tropical house       │
│ 7BKLCZ1jbUBVqRi2FVlTVw │ edm                  │
│ 5aAx2yezTd8zXrkmtKl66Z │ pop                  │
│ 5aAx2yezTd8zXrkmtKl66Z │ filter house         │
│           ·            │  ·                   │
│           ·            │  ·                   │
│           ·            │  ·                   │
│ 5knuzwU65gJK7IF5yJsuaW │ edm                  │
│ 5knuzwU65gJK7IF5yJsuaW │ tropical house       │
│ 5knuzwU65gJK7IF5yJs

In [99]:
process("""
SELECT
    c.region,
    c.date,
    MD5(a.artist) AS artist_id,
    c.streams
FROM './tmp/charts_fmt.parquet' AS c
JOIN './tmp/api_track_to_artist.parquet' AS a
    ON c.track_id = a.track_id
""", "charts_artists")

charts_artists - 67
┌──────────┬────────────┬──────────────────────────────────┬─────────┐
│  region  │    date    │            artist_id             │ streams │
│ varchar  │    date    │             varchar              │  int64  │
├──────────┼────────────┼──────────────────────────────────┼─────────┤
│ Paraguay │ 2017-02-22 │ cc1e3a4a210d2b2d92fa83830e6d3af0 │    1235 │
│ Paraguay │ 2017-02-22 │ 7b1f9ae869f3817cf63fef5740433b48 │    1214 │
│ Paraguay │ 2017-02-22 │ c19162e3de8a97b9e8110ade142cd7c4 │    1208 │
│ Paraguay │ 2017-02-22 │ 9a7f503e143392edde7836acda016feb │    1204 │
│ Paraguay │ 2017-02-22 │ 69f2b1ceddc0ec0bde80faa79b36dcb8 │    1170 │
│ Paraguay │ 2017-02-22 │ 00c6985bf0231acf64a3645a5785d785 │    1147 │
│ Paraguay │ 2017-02-22 │ bdb634422ce2d4e7669be6c74933b3d5 │    1140 │
│ Paraguay │ 2017-02-22 │ aa968850a9d255494612acd0552f8fcd │    1122 │
│ Paraguay │ 2017-02-22 │ e7a32095be1f9eae17dd3f3362b39689 │    1116 │
│ Paraguay │ 2017-02-22 │ 387eeeaebd227162dddb1c3a6357ff3

In [100]:
process("""
SELECT
    c.region,
    c.date,
    t.genre,
    c.streams
FROM './tmp/charts_fmt.parquet' AS c
JOIN './tmp/api_track_to_genre.parquet' AS t
    ON c.track_id = t.track_id
""", "charts_genres")

charts_genres - 50
┌──────────┬────────────┬──────────────────────┬─────────┐
│  region  │    date    │        genre         │ streams │
│ varchar  │    date    │       varchar        │  int64  │
├──────────┼────────────┼──────────────────────┼─────────┤
│ Paraguay │ 2017-02-22 │ pop                  │    1235 │
│ Paraguay │ 2017-02-22 │ post-teen pop        │    1214 │
│ Paraguay │ 2017-02-22 │ uk pop               │    1208 │
│ Paraguay │ 2017-02-22 │ reggaeton            │    1204 │
│ Paraguay │ 2017-02-22 │ trap latino          │    1170 │
│ Paraguay │ 2017-02-22 │ trap latino          │    1147 │
│ Paraguay │ 2017-02-22 │ reggaeton            │    1140 │
│ Paraguay │ 2017-02-22 │ pop                  │    1122 │
│ Paraguay │ 2017-02-22 │ australian pop       │    1116 │
│ Paraguay │ 2017-02-22 │ uk pop               │    1107 │
│    ·     │     ·      │   ·                  │      ·  │
│    ·     │     ·      │   ·                  │      ·  │
│    ·     │     ·      │   ·        

In [101]:
process("""
SELECT
    region,
    date,
    COALESCE(MAX(CASE WHEN genre = 'pop' THEN total_streams END), 0) AS pop,
    COALESCE(MAX(CASE WHEN genre = 'rap' THEN total_streams END), 0) AS rap,
    COALESCE(MAX(CASE WHEN genre = 'rock' THEN total_streams END), 0) AS rock,
    COALESCE(MAX(CASE WHEN genre = 'edm' THEN total_streams END), 0) AS edm,
    COALESCE(MAX(CASE WHEN genre = 'hip hop' THEN total_streams END), 0) AS hip_hop,
    COALESCE(MAX(CASE WHEN genre = 'trap latino' THEN total_streams END), 0) AS trap_latino,
    COALESCE(MAX(CASE WHEN genre = 'reggaeton' THEN total_streams END), 0) AS reggaeton,
    COALESCE(MAX(CASE WHEN genre = 'electropop' THEN total_streams END), 0) AS electropop,
    COALESCE(MAX(CASE WHEN genre = 'dance pop' THEN total_streams END), 0) AS dance_pop,
    COALESCE(MAX(CASE WHEN genre = 'pop rap' THEN total_streams END), 0) AS pop_rap,
    COALESCE(MAX(CASE WHEN genre = 'musica mexicana' THEN total_streams END), 0) AS musica_mexicana,
    COALESCE(MAX(CASE WHEN genre = 'trap' THEN total_streams END), 0) AS trap,
    COALESCE(MAX(CASE WHEN genre = 'modern rock' THEN total_streams END), 0) AS modern_rock,
    COALESCE(MAX(CASE WHEN genre = 'classic rock' THEN total_streams END), 0) AS classic_rock,
    COALESCE(MAX(CASE WHEN genre = 'uk pop' THEN total_streams END), 0) AS uk_pop,
    COALESCE(MAX(CASE WHEN genre = 'k-pop' THEN total_streams END), 0) AS k_pop,
    COALESCE(MAX(CASE WHEN genre = 'tropical house' THEN total_streams END), 0) AS tropical_house,
    COALESCE(MAX(CASE WHEN genre = 'melodic rap' THEN total_streams END), 0) AS melodic_rap,
    COALESCE(MAX(CASE WHEN genre = 'canadian pop' THEN total_streams END), 0) AS canadian_pop,
    COALESCE(MAX(CASE WHEN genre = 'modern bollywood' THEN total_streams END), 0) AS modern_bollywood
FROM (
    SELECT
        region,
        date,
        genre,
        SUM(streams) AS total_streams
    FROM './tmp/charts_genres.parquet'
    WHERE genre IN ('pop', 'rap', 'rock', 'edm', 'hip hop', 'trap latino', 'reggaeton', 'electropop', 'dance pop', 'pop rap', 'musica mexicana', 'trap', 'modern rock', 'classic rock', 'uk pop', 'k-pop', 'tropical house', 'melodic rap', 'canadian pop', 'modern bollywood')
    GROUP BY region, date, genre
)
GROUP BY region, date
ORDER BY region, date
""", "charts_daily_genres")

charts_daily_genres - 1
┌───────────┬────────────┬─────────┬────────┬────────┬────────┬─────────┬─────────────┬───────────┬────────────┬───────────┬─────────┬─────────────────┬────────┬─────────────┬──────────────┬────────┬────────┬────────────────┬─────────────┬──────────────┬──────────────────┐
│  region   │    date    │   pop   │  rap   │  rock  │  edm   │ hip_hop │ trap_latino │ reggaeton │ electropop │ dance_pop │ pop_rap │ musica_mexicana │  trap  │ modern_rock │ classic_rock │ uk_pop │ k_pop  │ tropical_house │ melodic_rap │ canadian_pop │ modern_bollywood │
│  varchar  │    date    │ int128  │ int128 │ int128 │ int128 │ int128  │   int128    │  int128   │   int128   │  int128   │ int128  │     int128      │ int128 │   int128    │    int128    │ int128 │ int128 │     int128     │   int128    │    int128    │      int128      │
├───────────┼────────────┼─────────┼────────┼────────┼────────┼─────────┼─────────────┼───────────┼────────────┼───────────┼─────────┼─────────────────┼──

In [102]:
process("""
SELECT
    g.region,
    g.date,
    g.pop / s.streams AS pop,
    g.rap / s.streams AS rap,
    g.rock / s.streams AS rock,
    g.edm / s.streams AS edm,
    g.hip_hop / s.streams AS hip_hop,
    g.trap_latino / s.streams AS trap_latino,
    g.reggaeton / s.streams AS reggaeton,
    g.electropop / s.streams AS electropop,
    g.dance_pop / s.streams AS dance_pop,
    g.pop_rap / s.streams AS pop_rap,
    g.musica_mexicana / s.streams AS musica_mexicana,
    g.trap / s.streams AS trap,
    g.modern_rock / s.streams AS modern_rock,
    g.classic_rock / s.streams AS classic_rock,
    g.uk_pop / s.streams AS uk_pop,
    g.k_pop / s.streams AS k_pop,
    g.tropical_house / s.streams AS tropical_house,
    g.melodic_rap / s.streams AS melodic_rap,
    g.canadian_pop / s.streams AS canadian_pop,
    g.modern_bollywood / s.streams AS modern_bollywood
FROM './tmp/charts_daily_genres.parquet' AS g
JOIN './tmp/charts_daily_sum.parquet' AS s
    ON g.region = s.region AND g.date = s.date
""", "charts_genre_popularity")

charts_genre_popularity - 2
┌───────────┬────────────┬─────────────────────┬──────────────────────┬──────────────────────┬─────────────────────┬──────────────────────┬───────────────────────┬───────────────────────┬──────────────────────┬─────────────────────┬─────────────────────┬─────────────────┬──────────────────────┬───────────────────────┬───────────────────────┬──────────────────────┬────────┬──────────────────────┬───────────────────────┬──────────────────────┬──────────────────┐
│  region   │    date    │         pop         │         rap          │         rock         │         edm         │       hip_hop        │      trap_latino      │       reggaeton       │      electropop      │      dance_pop      │       pop_rap       │ musica_mexicana │         trap         │      modern_rock      │     classic_rock      │        uk_pop        │ k_pop  │    tropical_house    │      melodic_rap      │     canadian_pop     │ modern_bollywood │
│  varchar  │    date    │       double   

In [103]:
process("""
SELECT
    cgp.region AS country,
    cgp.date,
    cds.streams AS daily_streams,
    cdp.popularity AS daily_popularity,
    dcw.temperature_c AS temperature_c,
    dcw.precipitation_mm AS precipitation_mm,
    wdi.rural_population_percent AS "wdi:rural_population_percent",
    wdi.fertility_rate AS "wdi:fertility_rate",
    wdi.gdp_per_capita_usd AS "wdi:gdp_per_capita_usd",
    wdi.mobile_subscriptions_per_100 AS "wdi:mobile_subscriptions_per_100",
    wdi.refugee_population_promille AS "wdi:refugee_population_promille",
    cgp.pop AS "genre:pop",
    cgp.rap AS "genre:rap",
    cgp.rock AS "genre:rock",
    cgp.edm AS "genre:edm",
    cgp.hip_hop AS "genre:hip_hop",
    cgp.trap_latino AS "genre:trap_latino",
    cgp.reggaeton AS "genre:reggaeton",
    cgp.electropop AS "genre:electropop",
    cgp.dance_pop AS "genre:dance_pop",
    cgp.pop_rap AS "genre:pop_rap",
    cgp.musica_mexicana AS "genre:musica_mexicana",
    cgp.trap AS "genre:trap",
    cgp.modern_rock AS "genre:modern_rock",
    cgp.classic_rock AS "genre:classic_rock",
    cgp.uk_pop AS "genre:uk_pop",
    cgp.k_pop AS "genre:k_pop",
    cgp.tropical_house AS "genre:tropical_house",
    cgp.melodic_rap AS "genre:melodic_rap",
    cgp.canadian_pop AS "genre:canadian_pop",
    cgp.modern_bollywood AS "genre:modern_bollywood",
FROM './tmp/charts_genre_popularity.parquet' AS cgp
JOIN './tmp/charts_daily_popularity.parquet' AS cdp
    ON cgp.region = cdp.region AND cgp.date = cdp.date
JOIN './tmp/charts_daily_sum.parquet' AS cds
    ON cgp.region = cds.region AND cgp.date = cds.date
JOIN './tmp/daily_country_weather.parquet' AS dcw
    ON cgp.region = dcw.country AND cgp.date = dcw.date
JOIN './tmp/wdi_interpolated.parquet' AS wdi
    ON cgp.region = wdi.country AND cgp.date = wdi.date
ORDER BY cgp.region, cgp.date
""", "output")

output - 2
┌───────────┬────────────┬───────────────┬──────────────────┬────────────────────┬────────────────────┬──────────────────────────────┬────────────────────┬────────────────────────┬──────────────────────────────────┬─────────────────────────────────┬─────────────────────┬──────────────────────┬──────────────────────┬─────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬─────────────────────┬─────────────────────┬───────────────────────┬──────────────────────┬──────────────────────┬────────────────────┬──────────────────────┬─────────────┬──────────────────────┬───────────────────────┬──────────────────────┬────────────────────────┐
│  country  │    date    │ daily_streams │ daily_popularity │   temperature_c    │  precipitation_mm  │ wdi:rural_population_percent │ wdi:fertility_rate │ wdi:gdp_per_capita_usd │ wdi:mobile_subscriptions_per_100 │ wdi:refugee_population_promille │      genre:pop      │      genre:rap    