# PySpark + CockroachDB Analytical Notebook

## 1. Import Required Libraries
We begin by importing PySpark libraries and loading the `.env` variables containing CockroachDB and MongoDB credentials.

In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, desc, countDistinct, max as spark_max, min as spark_min, explode

# Load environment variables from .env
load_dotenv()

print(" COCKROACH_USER:", os.getenv("COCKROACH_USER"))
print(" COCKROACH_PASS:", os.getenv("COCKROACH_PASS"))
print(" COCKROACH_HOST:", os.getenv("COCKROACH_HOST"))
print(" COCKROACH_PORT:", os.getenv("COCKROACH_PORT"))
print(" MONGO URI:", os.getenv("MONGO_ATLAS_URI"))

 COCKROACH_USER: shubh
 COCKROACH_PASS: nrTPuxNNk9Oggf1lCXDkYw
 COCKROACH_HOST: bowing-slime-10451.j77.aws-ap-south-1.cockroachlabs.cloud
 COCKROACH_PORT: 26257
 MONGO URI: mongodb+srv://shbpndr:CrVz9nzipaLOZFVk@sounds-similar.8sd8tnl.mongodb.net/


## 2. Initialize Spark Session
We initialize the Spark session and configure the required JDBC driver for CockroachDB.

In [2]:
spark = SparkSession.builder \
    .appName("CockroachDB_PySpark_Project") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.18") \
    .getOrCreate()

In [3]:
spark

## 3. CockroachDB Connection Properties
We read the CockroachDB credentials from environment variables and build the JDBC connection string.

In [7]:
COCKROACH_USER = os.getenv("COCKROACH_USER")
COCKROACH_PASS = os.getenv("COCKROACH_PASS")
COCKROACH_HOST = os.getenv("COCKROACH_HOST")
COCKROACH_PORT = os.getenv("COCKROACH_PORT")
DATABASE_NAME = "music"

jdbc_url = f"jdbc:postgresql://{COCKROACH_HOST}:{COCKROACH_PORT}/{DATABASE_NAME}?sslmode=require"

connection_properties = {
    "user": COCKROACH_USER,
    "password": COCKROACH_PASS,
    "driver": "org.postgresql.Driver"
}

## 4. Load Tables into DataFrames
We load all four tables into Spark DataFrames for processing: `track_link`, `audio_features`, `track_reference`, and `lyrics`.

In [8]:
def load_table(table_name):
    return spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=connection_properties
    )

track_link_df = load_table("track_links")
audio_features_df = load_table("audio_features")
track_reference_df = load_table("track_reference")
lyrics_df = load_table("lyrics")

In [14]:
track_link_df.show(5, truncate=False)

+------------------------------------+--------------------------------------------------+---------------------------+-------------------------------------------+
|musicbrainz_id                      |track_title                                       |channel                    |webpage_url                                |
+------------------------------------+--------------------------------------------------+---------------------------+-------------------------------------------+
|00b1397d-7f3e-4c59-bb42-ccd7fa17ee10|Ariana Grande - raindrops (an angel cried) (Audio)|Ariana Grande              |https://www.youtube.com/watch?v=-ZoJSLB2N18|
|00c9dcab-4abf-47f5-9755-c5c805b779c7|Through The Wire                                  |Kanye West                 |https://www.youtube.com/watch?v=AE8y25CcE6s|
|012e3459-b54d-49e9-b48d-d0922d295c5a|I'll Cry Instead (Remastered 2009)                |The Beatles                |https://www.youtube.com/watch?v=zfnkMBOSIUQ|
|013a7fe3-0113-4604-a295-f74

In [15]:
audio_features_df.show(5, truncate=False)

+------------------------------------+------------------+-----------+------------------+------------------+------------+-------------------+--------------------+------------------+------------------+--------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+-------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+----

In [16]:
track_reference_df.show(5, truncate=False)

+------------------------------------+--------------------------+---------------+------------------------------------+---------------------------------------------------------+------------------------------------+------------+-------+------+
|musicbrainz_id                      |title                     |artist         |artist_id                           |album                                                    |album_id                            |release_date|country|length|
+------------------------------------+--------------------------+---------------+------------------------------------+---------------------------------------------------------+------------------------------------+------------+-------+------+
|00b1397d-7f3e-4c59-bb42-ccd7fa17ee10|raindrops (an angel cried)|Ariana Grande  |f4fdbb4c-e4b7-47a0-b83b-d91bbfcfa387|sweetener / thank u, next tour - live at Coachella 2019  |6cd36f2a-0c90-45ea-b63b-0e922f1df4ba|2019-04-19  |XW     |36000 |
|00c9dcab-4abf-47f5-9755-c5c805b

In [17]:
lyrics_df.show(5, truncate=False)

+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 5. Show Sample Records
Let's display a few records from each table to get an understanding of their structure.

In [18]:
track_link_df.join(track_reference_df, on="musicbrainz_id").select("track_title", "artist", "album").show(5, truncate=False)

+------------------------------------------------------------------+---------------------+---------------------+
|track_title                                                       |artist               |album                |
+------------------------------------------------------------------+---------------------+---------------------+
|Madison Beer - King of Everything (Official Lyric Video)          |Madison Beer         |Silence Between Songs|
|Ariana Grande - the light is coming ft. Nicki Minaj               |Ariana Grande        |Sweetener            |
|Bitch Niggaz                                                      |Dr. Dre              |2001: Instrumentals  |
|Harry Styles - As It Was (Official Video)                         |Harry Styles         |As It Was            |
|Piotr Ilich Tchaikovsky - The Nutcracker, Dance of the Reed Flutes|Пётр Ильич Чайковский|100 Hits: Classical  |
+------------------------------------------------------------------+---------------------+------

## Join track metadata with artist and album
Join `track_link` with `track_reference` to retrieve track title, artist, and album info.

In [19]:
audio_features_df.orderBy(col("energy").desc()).select("musicbrainz_id", "energy").show(5, truncate=False)

+------------------------------------+-------------------+
|musicbrainz_id                      |energy             |
+------------------------------------+-------------------+
|635ca7c5-1ef5-4acf-894f-aa6d6eda4e6b|0.38299816846847534|
|a3b54a75-8fc8-4b18-916a-8ac951c0c7ed|0.37580251693725586|
|2e8f9c02-ebc6-4098-84cb-224d46e46ef8|0.3742404580116272 |
|196a9bc5-6468-455e-a602-00f923b34198|0.369456022977829  |
|d8ccf243-a78b-4397-b331-18da5ecc5759|0.3678611218929291 |
+------------------------------------+-------------------+
only showing top 5 rows



## Combine track metadata with energy and valence scores
Join track reference, audio features, and lyrics to get combined metadata with energy and valence.

In [20]:
track_reference_df \
    .join(audio_features_df, on="musicbrainz_id") \
    .join(lyrics_df, on="musicbrainz_id") \
    .select("title", "artist", "energy", "valence") \
    .show(5, truncate=False)

+------------------------+---------------------+--------------------+-------------------+
|title                   |artist               |energy              |valence            |
+------------------------+---------------------+--------------------+-------------------+
|King of Everything      |Madison Beer         |0.18618640303611755 |0.38116011023521423|
|the light is coming     |Ariana Grande        |0.29869845509529114 |0.5234681963920593 |
|Bitch Niggaz            |Dr. Dre              |0.26235342025756836 |0.4938736855983734 |
|As It Was               |Harry Styles         |0.34368205070495605 |0.397524356842041  |
|Dance of the Reed Flutes|Пётр Ильич Чайковский|0.018364010378718376|0.3955545723438263 |
+------------------------+---------------------+--------------------+-------------------+
only showing top 5 rows



## Average tempo grouped by country
Compute the average tempo of tracks for each country.

In [21]:
track_reference_df \
    .join(audio_features_df, on="musicbrainz_id") \
    .groupBy("country") \
    .agg(avg("tempo") \
    .alias("avg_tempo")) \
    .orderBy(desc("avg_tempo")) \
    .show(5, truncate=False)

+-------+------------------+
|country|avg_tempo         |
+-------+------------------+
|BH     |139.6748310810811 |
|CZ     |139.02354958751792|
|NL     |132.51201923076923|
|CA     |131.91143939994564|
|TW     |129.19921875      |
+-------+------------------+
only showing top 5 rows



## Happy and danceable tracks
Filter tracks with valence > 0.7 and danceability > 0.7.

In [31]:
audio_features_df \
    .filter((col("valence") > 0.5) & (col("danceability") >= 0.5)) \
    .select("musicbrainz_id", "valence", "danceability") \
    .orderBy("valence", ascending=False) \
    .show(10, truncate=False)

+------------------------------------+------------------+------------+
|musicbrainz_id                      |valence           |danceability|
+------------------------------------+------------------+------------+
|bdb0435f-9d43-4d62-a1c7-35ef01b82ba0|0.663307249546051 |0.5         |
|edcf70c1-2d55-4430-a46c-c3b113bd221b|0.6343459486961365|0.5         |
|7362fccc-b012-4541-9792-9f359c1881a0|0.6107845902442932|0.5         |
|1a1519e4-8184-4015-91de-28c767b7f523|0.6073976159095764|0.5         |
|814f3f62-db19-469d-a96b-12f4a1b0adc3|0.6005522012710571|0.5         |
|516bf889-cdd2-4d55-b9ab-8f5c9cdb7de9|0.5983216166496277|0.5         |
|cfc5f79b-b25b-4309-8ac4-816463323472|0.5980272889137268|0.5         |
|13c5ba92-c6dd-4c8c-88ae-5810626edd6a|0.5961501598358154|0.5         |
|2bf5444e-9d88-4b34-aec0-5a423b4d348a|0.5958881378173828|0.5         |
|6cf20e9f-a788-4291-90a0-ffd8ffc9bf60|0.5939623117446899|0.5         |
+------------------------------------+------------------+------------+
only s

## Count of Tracks per Artist
Display artists and their respective track counts in descending order

In [32]:
track_reference_df \
    .groupBy("artist") \
    .count() \
    .orderBy(desc("count")) \
    .show(5, truncate=False)

+-------------+-----+
|artist       |count|
+-------------+-----+
|Taylor Swift |72   |
|Ariana Grande|72   |
|Beyoncé      |54   |
|Justin Bieber|48   |
|The Beatles  |44   |
+-------------+-----+
only showing top 5 rows



## Tracks with available Genius lyrics
Filter tracks that have available lyrics from Genius platform.

In [33]:
lyrics_df \
    .filter(col("genius_lyrics").isNotNull()) \
    .select("musicbrainz_id", "genius_url") \
    .show(5, truncate=False)

+------------------------------------+----------------------------------------------------------------+
|musicbrainz_id                      |genius_url                                                      |
+------------------------------------+----------------------------------------------------------------+
|00b1397d-7f3e-4c59-bb42-ccd7fa17ee10|https://genius.com/Ariana-grande-raindrops-an-angel-cried-lyrics|
|00c9dcab-4abf-47f5-9755-c5c805b779c7|https://genius.com/Kanye-west-through-the-wire-lyrics           |
|012e3459-b54d-49e9-b48d-d0922d295c5a|https://genius.com/The-beatles-ill-cry-instead-lyrics           |
|013a7fe3-0113-4604-a295-f74a0b88bf05|https://genius.com/Billy-joel-shes-always-a-woman-lyrics        |
|01564f1c-99b2-466a-a60d-4e22a5008525|https://genius.com/Kacey-musgraves-angel-lyrics                 |
+------------------------------------+----------------------------------------------------------------+
only showing top 5 rows



## Count of Unique Artists per Country
Get the count of unique artists available per country

In [34]:
track_reference_df \
    .groupBy("country") \
    .agg(countDistinct("artist").alias("unique_artists")) \
    .orderBy(desc("unique_artists")) \
    .show(5, truncate=False)

+-------+--------------+
|country|unique_artists|
+-------+--------------+
|NULL   |33            |
|XW     |31            |
|US     |27            |
|GB     |18            |
|XE     |12            |
+-------+--------------+
only showing top 5 rows



## Count of Tracks per Release Date
Count the tracks for each release date and order them in descending order

In [35]:
track_reference_df \
    .groupBy("release_date") \
    .count() \
    .orderBy(desc("count")) \
    .show(5, truncate=False)

+------------+-----+
|release_date|count|
+------------+-----+
|null        |80   |
|2024-03-29  |25   |
|2025-03-14  |24   |
|2022-03-16  |18   |
|2005-01-01  |17   |
+------------+-----+
only showing top 5 rows



## Top 5 most danceable tracks
Retrieve the most danceable tracks ordered by danceability.

In [37]:
audio_features_df \
    .orderBy(desc("danceability")) \
    .select("musicbrainz_id", "danceability") \
    .show(5, truncate=False)

+------------------------------------+------------+
|musicbrainz_id                      |danceability|
+------------------------------------+------------+
|00b1397d-7f3e-4c59-bb42-ccd7fa17ee10|0.5         |
|00c9dcab-4abf-47f5-9755-c5c805b779c7|0.5         |
|012e3459-b54d-49e9-b48d-d0922d295c5a|0.5         |
|013a7fe3-0113-4604-a295-f74a0b88bf05|0.5         |
|01564f1c-99b2-466a-a60d-4e22a5008525|0.5         |
+------------------------------------+------------+
only showing top 5 rows



## Count of tracks grouped by sample rate
Group tracks by their sample rate and count occurrences.

In [38]:
audio_features_df \
    .groupBy("sample_rate") \
    .count() \
    .orderBy(desc("count")) \
    .show(5, truncate=False)

+-----------+-----+
|sample_rate|count|
+-----------+-----+
|44100      |1093 |
+-----------+-----+



## Tracks released in the US
Filter tracks that are marked with country as US.

In [41]:
track_reference_df \
    .filter(col("country") == "US") \
    .select("title", "artist") \
    .show(10, truncate=False)

+------------------------+------------+
|title                   |artist      |
+------------------------+------------+
|Through the Wire        |Ye          |
|She’s Always a Woman    |Billy Joel  |
|SWEET ★ HONEY ★ BUCKIIN’|Beyoncé     |
|Touch the Sky           |Ye          |
|STOP TRYING TO BE GOD   |Travis Scott|
|Let Down                |Radiohead   |
|The National Anthem     |Radiohead   |
|Three Empty Words       |Shawn Mendes|
|Fuck You                |Dr. Dre     |
|Electioneering          |Radiohead   |
+------------------------+------------+
only showing top 10 rows



## Average MFCC 1 feature per track
Compute the average MFCC 1 value grouped by track.

In [42]:
audio_features_df \
    .groupBy("musicbrainz_id") \
    .agg(avg("mfcc_1").alias("avg_mfcc_1")) \
    .orderBy(desc("avg_mfcc_1")) \
    .show(5, truncate=False)

+------------------------------------+-------------------+
|musicbrainz_id                      |avg_mfcc_1         |
+------------------------------------+-------------------+
|4807e1a1-8766-44e3-9884-e586d7a45ef8|-18.950777053833008|
|ea2915f0-b307-448f-9081-b5051fd820f6|-37.45595169067383 |
|27659585-8b8a-4720-abb8-1b59826552e6|-38.347564697265625|
|2bbd29fa-b951-4e78-9e24-85918142f1f1|-39.8458137512207  |
|2e8f9c02-ebc6-4098-84cb-224d46e46ef8|-42.14936828613281 |
+------------------------------------+-------------------+
only showing top 5 rows



## Max and min tempo per track
Get max and min tempo for each track.

In [44]:
audio_features_df \
    .groupBy("musicbrainz_id") \
    .agg(spark_max("tempo").alias("max_tempo"), spark_min("tempo").alias("min_tempo")) \
    .show(5)

+--------------------+------------------+------------------+
|      musicbrainz_id|         max_tempo|         min_tempo|
+--------------------+------------------+------------------+
|6ade8d7b-0206-483...|         147.65625|         147.65625|
|775290ba-0c56-480...| 99.38401442307692| 99.38401442307692|
|9d2d9942-d66b-40c...|166.70866935483872|166.70866935483872|
|e0bf757c-e18c-47a...|         114.84375|         114.84375|
|16c9e040-c6d3-458...| 139.6748310810811| 139.6748310810811|
+--------------------+------------------+------------------+
only showing top 5 rows



## Tracks released after 2015
Filter tracks released after Jan 1, 2015.

In [45]:
track_reference_df \
    .filter(col("release_date") > "2015-01-01") \
    .select("title", "artist") \
    .show(5, truncate=False)

+--------------------------+-----------------------+
|title                     |artist                 |
+--------------------------+-----------------------+
|raindrops (an angel cried)|Ariana Grande          |
|angel                     |Kacey Musgraves        |
|Where the Money Flows     |Peter Cat Recording Co.|
|SWEET ★ HONEY ★ BUCKIIN’  |Beyoncé                |
|King Kunta                |Kendrick Lamar         |
+--------------------------+-----------------------+
only showing top 5 rows



## Long tracks with duration > 300 seconds
Filter tracks longer than 300 seconds.

In [46]:
track_reference_df \
    .filter(col("length") > 300) \
    .select("title", "length") \
    .show(5, truncate=False)

+--------------------------+------+
|title                     |length|
+--------------------------+------+
|raindrops (an angel cried)|36000 |
|Through the Wire          |270386|
|I'll Cry Instead          |107000|
|She’s Always a Woman      |120560|
|angel                     |140000|
+--------------------------+------+
only showing top 5 rows



## Tracks with high speechiness (> 0.2)
Filter tracks where speechiness is greater than 0.2.

In [48]:
audio_features_df \
    .filter(col("speechiness") > 0.2) \
    .select("musicbrainz_id", "speechiness") \
    .show(5, truncate=False)

+------------------------------------+------------------+
|musicbrainz_id                      |speechiness       |
+------------------------------------+------------------+
|2421ac91-d401-4c69-9a4c-feca38b95d71|0.2122621617854542|
|b4f22410-ef06-4d44-b789-f32c185acbad|0.4694865198610758|
+------------------------------------+------------------+



## Instrumental tracks (instrumentalness > 0.8)
Filter tracks where instrumentalness is greater than 0.8.

In [50]:
audio_features_df \
    .filter(col("instrumentalness") > 0.8) \
    .select("musicbrainz_id", "instrumentalness") \
    .show(5, truncate=False)

+------------------------------------+------------------+
|musicbrainz_id                      |instrumentalness  |
+------------------------------------+------------------+
|013a7fe3-0113-4604-a295-f74a0b88bf05|0.8456417181947923|
|01564f1c-99b2-466a-a60d-4e22a5008525|0.8156021586000006|
|02159dd4-018a-4264-acca-19d2edc233d2|0.843158409831264 |
|039326dc-e853-4de8-84c5-dd91c31d4256|0.8024577167645605|
|0501d394-60d9-4d55-8415-ce420dad9b54|0.8600274110418155|
+------------------------------------+------------------+
only showing top 5 rows



## Tracks with available Last.fm wiki summary
Filter tracks with a non-null wiki summary from Last.fm.

In [53]:
lyrics_df \
    .filter(col("lastfm_wiki_summary").isNotNull()) \
    .select("musicbrainz_id", "lastfm_wiki_summary") \
    .show(5, truncate=False)

+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|musicbrainz_id                      |lastfm_wiki_summary                                                                                                                                                                                                                                                                                                     

## Musicbrainz ids with Zero Crossing Rate > 0.1
Filter musicbrainz ids with zero crossing rate > 0.1

In [54]:
audio_features_df \
    .filter(col("zero_crossing_rate") > 0.1) \
    .select("musicbrainz_id", "zero_crossing_rate") \
    .show(5, truncate=False)

+------------------------------------+-------------------+
|musicbrainz_id                      |zero_crossing_rate |
+------------------------------------+-------------------+
|04484bf3-6617-4a99-ae67-f15a440fcd6c|0.1010217366085619 |
|0a62410e-efe8-4370-a4e5-e325a54fae16|0.13758016650732952|
|1e79f696-1018-4cc5-b89d-2e9a03652ab0|0.11445952868852459|
|2421ac91-d401-4c69-9a4c-feca38b95d71|0.2122621617854542 |
|27659585-8b8a-4720-abb8-1b59826552e6|0.10341092069309507|
+------------------------------------+-------------------+
only showing top 5 rows

