In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with everything from the first ``@`` onwards removed.
    """

    login = getpass.getuser()
    domain_suffix = re.compile('@.*')

    return domain_suffix.sub('', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with ``str()`` and HTML-escaped, so entries that
    contain ``<``, ``>``, ``&`` or quotes are shown literally instead of being
    interpreted as markup.

    Args:
        d (dict): mapping whose items become the table rows, in iteration order

    Returns:
        str: the HTML markup of the table
    """

    # Imported locally because the name `html` is reused for lists elsewhere in this notebook
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe as html by going through pandas.

    Args:
        df: spark dataframe to preview (must provide ``limit(n).toPandas()``)
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an html panel telling whether a Spark session is currently active.

    The session counts as active when both the ``spark`` and ``sc`` globals exist. In that
    case the panel links to the Spark UIs and lists the full configuration of ``sc``;
    otherwise it only reports that the session is stopped.
    """

    session_running = 'spark' in globals() and 'sc' in globals()

    if not session_running:
        app_name = username() + " (jupyter)"
        fragments = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{app_name}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        ]
        display(HTML(''.join(fragments)))
        return

    name = sc.getConf().get("spark.app.name")
    application_ui = sc.uiWebUrl
    config_table = dict_to_html(dict(sc.getConf().getAll()))

    fragments = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
        f'<li><a href="{application_ui}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        config_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(fragments)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Create (or reuse) a Spark session and publish it as the globals ``spark`` and ``sc``.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory: executor memory, passed to Spark as ``"{worker_memory}g"`` (default: 1)
        master_memory: driver memory, passed to Spark as ``"{master_memory}g"`` (default: 1)
    """

    global spark, sc

    user = username()

    total_cores = executor_instances * executor_cores
    shuffle_partitions = total_cores * 4  # four shuffle partitions per core
    ui_port = 4000 + random.randint(1, 999)  # random port in 4001..4999 for the application UI

    # Applied to the builder in exactly this order
    settings = [
        ("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/"),
        ("spark.dynamicAllocation.enabled", "false"),
        ("spark.executor.instances", str(executor_instances)),
        ("spark.executor.cores", str(executor_cores)),
        ("spark.cores.max", str(total_cores)),
        ("spark.executor.memory", f"{worker_memory}g"),
        ("spark.driver.memory", f"{master_memory}g"),
        ("spark.driver.maxResultSize", "0"),
        ("spark.sql.shuffle.partitions", str(shuffle_partitions)),
        ("spark.ui.port", str(ui_port)),
    ]

    builder = SparkSession.builder.master("spark://masternode2:7077")
    for key, value in settings:
        builder = builder.config(key, value)

    spark = builder.appName(user + " (jupyter)").getOrCreate()
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the running Spark session, if any, and remove the ``spark`` and ``sc`` globals.

    The status panel is displayed afterwards in either case.
    """

    global spark, sc

    session_defined = 'spark' in globals() and 'sc' in globals()
    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Make css changes to improve spark output readability

html = ['<style>']
# keep preformatted text on a single line instead of wrapping it
html.append('pre { white-space: pre !important; }')
# never wrap the contents of dataframe table cells
html.append('table.dataframe td { white-space: nowrap !important; }')
# hide the first header cell and the row header cells of dataframe tables
html.append('table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }')
html.append('</style>')
display(HTML(''.join(html)))

In [46]:
# Determine ideal number of partitions
#
# NOTE(review): this cell reads `sc`, which is only defined by start_spark(), yet it sits
# above the cell that calls start_spark() (it was executed as In [46]). On a fresh
# "Restart & Run All" it would presumably fail with a NameError - consider moving it below
# the start_spark() cell.

conf = sc.getConf()

# N executors with M cores each, and 4 partitions per core
N = int(conf.get("spark.executor.instances"))
M = int(conf.get("spark.executor.cores"))
partitions = 4 * N * M

print(f'ideal # partitions = {partitions}')

ideal # partitions = 32


In [2]:
# Run this cell to start a spark session in this notebook
# (4 executors with 2 cores each; 4g of memory per executor and 4g for the driver)

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.sql.warehouse.dir,file:/users/home/uwi14/Uditha/Assignment2/spark-warehouse
spark.driver.port,40651
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.app.startTime,1717366645773
spark.executor.id,driver
spark.executor.cores,2


In [3]:
# Write your imports and code here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

# Data processing 


# Q1-A

In [4]:
# List the top-level directories of the MSD dataset in HDFS
! hdfs dfs -ls /data/msd/

Found 4 items
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/audio
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/genre
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:28 /data/msd/main
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/tasteprofile


In [5]:
# Size of each top-level directory in human readable units
# (first column: data size, second column: space used including all replicas)
!hdfs dfs -du -h /data/msd/

12.3 G   98.1 G   /data/msd/audio
30.1 M   241.0 M  /data/msd/genre
174.4 M  1.4 G    /data/msd/main
490.4 M  3.8 G    /data/msd/tasteprofile


# Audio

In [6]:
# Size of each audio sub-directory in bytes
! hdfs dfs -du /data/msd/audio/

105513       844104        /data/msd/audio/attributes
13125542239  105004337912  /data/msd/audio/features
42224669     337797352     /data/msd/audio/statistics


In [7]:
# Same sizes as above, in human readable units
! hdfs dfs -du -h /data/msd/audio/

103.0 K  824.3 K  /data/msd/audio/attributes
12.2 G   97.8 G   /data/msd/audio/features
40.3 M   322.1 M  /data/msd/audio/statistics


In [8]:
# Size in bytes of each attribute file
! hdfs dfs -du /data/msd/audio/attributes

1051   8408    /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv
671    5368    /data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv
484    3872    /data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv
898    7184    /data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv
777    6216    /data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv
777    6216    /data/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all-v1.0.attributes.csv
12317  98536   /data/msd/audio/attributes/msd-marsyas-timbral-v1.0.attributes.csv
9990   79920   /data/msd/audio/attributes/msd-mvd-v1.0.attributes.csv
1390   11120   /data/msd/audio/attributes/msd-rh-v1.0.attributes.csv
34913  279304  /data/msd/audio/attributes/msd-rp-v1.0.attributes.csv
3942   31536   /data/msd/audio/attributes/msd-ssd-v1.0.attributes.csv
9990   79920   /data/msd/audio/attributes/msd-trh-v1.0.attributes.csv
28313  226504  /data/msd/a


## Attributes

In [9]:
# Recursively list the attribute files
!hdfs dfs -ls -R /data/msd/audio/attributes

# Per-file size in human readable units
!hdfs dfs -du -h /data/msd/audio/attributes

# Report the files and their blocks to see how the data is stored in HDFS
!hdfs fsck /data/msd/audio/attributes -files -blocks

-rwxr-xr-x   8 jsw93 supergroup       1051 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        671 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        484 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        898 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        777 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup        777 2021-09-29 10:35 /data/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup      12317 2021-09-29 10:35 /data/msd/audio/attributes/msd-marsyas-timbral-v1.0.attributes.csv
-rwxr-xr-x   8 jsw93 supergroup       9990 2021-09-29 10:35 /data/msd/a

## Features

In [10]:
# Recursively list the feature datasets; each dataset is a directory of gzipped csv parts
!hdfs dfs -ls -R /data/msd/audio/features

# Per-dataset size in human readable units
!hdfs dfs -du -h /data/msd/audio/features

# Report the files and their blocks to see how the data is stored in HDFS
!hdfs fsck /data/msd/audio/features -files -blocks

drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv
-rwxr-xr-x   8 jsw93 supergroup    8635110 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00000.csv.gz
-rwxr-xr-x   8 jsw93 supergroup    8636689 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00001.csv.gz
-rwxr-xr-x   8 jsw93 supergroup    8632696 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00002.csv.gz
-rwxr-xr-x   8 jsw93 supergroup    8635186 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00003.csv.gz
-rwxr-xr-x   8 jsw93 supergroup    8635805 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00004.csv.gz
-rwxr-xr-x   8 jsw93 supergroup    8632126 2021-09-29 10:31 /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv/part-00005.csv.gz
-rwxr-xr-x   8 jsw93 supergroup

65.5 M   524.2 M  /data/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv
53.1 M   424.6 M  /data/msd/audio/features/msd-jmir-lpc-all-v1.0.csv
35.8 M   286.5 M  /data/msd/audio/features/msd-jmir-methods-of-moments-all-v1.0.csv
70.8 M   566.1 M  /data/msd/audio/features/msd-jmir-mfcc-all-v1.0.csv
51.1 M   408.9 M  /data/msd/audio/features/msd-jmir-spectral-all-all-v1.0.csv
51.1 M   408.9 M  /data/msd/audio/features/msd-jmir-spectral-derivatives-all-all-v1.0.csv
412.2 M  3.2 G    /data/msd/audio/features/msd-marsyas-timbral-v1.0.csv
1.3 G    10.3 G   /data/msd/audio/features/msd-mvd-v1.0.csv
240.3 M  1.9 G    /data/msd/audio/features/msd-rh-v1.0.csv
4.0 G    32.3 G   /data/msd/audio/features/msd-rp-v1.0.csv
640.6 M  5.0 G    /data/msd/audio/features/msd-ssd-v1.0.csv
1.4 G    11.5 G   /data/msd/audio/features/msd-trh-v1.0.csv
3.9 G    31.0 G   /data/msd/audio/features/msd-tssd-v1.0.csv
Connecting to namenode via http://masternode2:9870/fsck?ugi=uwi14&files=1&blocks=1&path=%2Fdata%2

## Statistics

In [11]:
# Recursively list the statistics files
!hdfs dfs -ls -R /data/msd/audio/statistics

# Per-file size in human readable units
!hdfs dfs -du -h /data/msd/audio/statistics

# Report the files and their blocks to see how the data is stored in HDFS
!hdfs fsck /data/msd/audio/statistics -files -blocks

-rwxr-xr-x   8 jsw93 supergroup   42224669 2021-09-29 10:28 /data/msd/audio/statistics/sample_properties.csv.gz
40.3 M  322.1 M  /data/msd/audio/statistics/sample_properties.csv.gz
Connecting to namenode via http://masternode2:9870/fsck?ugi=uwi14&files=1&blocks=1&path=%2Fdata%2Fmsd%2Faudio%2Fstatistics
FSCK started by uwi14 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/audio/statistics at Mon Jun 03 10:18:20 NZST 2024

/data/msd/audio/statistics <dir>
/data/msd/audio/statistics/sample_properties.csv.gz 42224669 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761717_20895 len=42224669 Live_repl=8


Status: HEALTHY
 Number of data-nodes:	32
 Number of racks:		1
 Total dirs:			1
 Total symlinks:		0

Replicated Blocks:
 Total size:	42224669 B
 Total files:	1
 Total blocks (validated):	1 (avg. block size 42224669 B)
 Minimally replicated blocks:	1 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 M

# Genre

In [12]:
# Size in bytes of each genre assignment file
! hdfs dfs -du /data/msd/genre/

11625230  93001840  /data/msd/genre/msd-MAGD-genreAssignment.tsv
8820054   70560432  /data/msd/genre/msd-MASD-styleAssignment.tsv
11140605  89124840  /data/msd/genre/msd-topMAGD-genreAssignment.tsv


In [13]:
# Recursively list the genre files
!hdfs dfs -ls -R /data/msd/genre/

# Per-file size in human readable units
!hdfs dfs -du -h /data/msd/genre/

# Report the files and their blocks to see how the data is stored in HDFS
!hdfs fsck /data/msd/genre -files -blocks


-rwxr-xr-x   8 jsw93 supergroup   11625230 2021-09-29 10:35 /data/msd/genre/msd-MAGD-genreAssignment.tsv
-rwxr-xr-x   8 jsw93 supergroup    8820054 2021-09-29 10:35 /data/msd/genre/msd-MASD-styleAssignment.tsv
-rwxr-xr-x   8 jsw93 supergroup   11140605 2021-09-29 10:35 /data/msd/genre/msd-topMAGD-genreAssignment.tsv
11.1 M  88.7 M  /data/msd/genre/msd-MAGD-genreAssignment.tsv
8.4 M   67.3 M  /data/msd/genre/msd-MASD-styleAssignment.tsv
10.6 M  85.0 M  /data/msd/genre/msd-topMAGD-genreAssignment.tsv
Connecting to namenode via http://masternode2:9870/fsck?ugi=uwi14&files=1&blocks=1&path=%2Fdata%2Fmsd%2Fgenre
FSCK started by uwi14 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/genre at Mon Jun 03 10:18:29 NZST 2024

/data/msd/genre <dir>
/data/msd/genre/msd-MAGD-genreAssignment.tsv 11625230 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761918_21096 len=11625230 Live_repl=8

/data/msd/genre/msd-MASD-styleAssignment.tsv 8820054 

# Main

In [14]:
# Size in bytes of the two main summary files (analysis and metadata)
! hdfs dfs -du /data/msd/main/summary

58658141   469265128  /data/msd/main/summary/analysis.csv.gz
124211304  993690432  /data/msd/main/summary/metadata.csv.gz


In [15]:
# Recursively list the main summary files
!hdfs dfs -ls -R /data/msd/main/summary

# Per-file size in human readable units
!hdfs dfs -du -h /data/msd/main/summary

# Report the files and their blocks to see how the data is stored in HDFS
!hdfs fsck /data/msd/main/summary -files -blocks

-rwxr-xr-x   8 jsw93 supergroup   58658141 2021-09-29 10:28 /data/msd/main/summary/analysis.csv.gz
-rwxr-xr-x   8 jsw93 supergroup  124211304 2021-09-29 10:28 /data/msd/main/summary/metadata.csv.gz
55.9 M   447.5 M  /data/msd/main/summary/analysis.csv.gz
118.5 M  947.7 M  /data/msd/main/summary/metadata.csv.gz
Connecting to namenode via http://masternode2:9870/fsck?ugi=uwi14&files=1&blocks=1&path=%2Fdata%2Fmsd%2Fmain%2Fsummary
FSCK started by uwi14 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/main/summary at Mon Jun 03 10:18:38 NZST 2024

/data/msd/main/summary <dir>
/data/msd/main/summary/analysis.csv.gz 58658141 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761716_20894 len=58658141 Live_repl=8

/data/msd/main/summary/metadata.csv.gz 124211304 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761715_20893 len=124211304 Live_repl=8


Status: HEALTHY
 Number of data-nod

# Taste Profile

In [16]:
# Size in bytes of the taste profile sub-directories
! hdfs dfs -du /data/msd/tasteprofile

2117524    16940192    /data/msd/tasteprofile/mismatches
512139195  4097113560  /data/msd/tasteprofile/triplets.tsv


In [17]:
# Recursively list the taste profile files (mismatch text files and gzipped triplet parts)
!hdfs dfs -ls -R /data/msd/tasteprofile

# Per-directory size in human readable units
!hdfs dfs -du -h /data/msd/tasteprofile

# Report the files and their blocks to see how the data is stored in HDFS
!hdfs fsck /data/msd/tasteprofile -files -blocks

drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/tasteprofile/mismatches
-rwxr-xr-x   8 jsw93 supergroup      91342 2021-09-29 10:35 /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt
-rwxr-xr-x   8 jsw93 supergroup    2026182 2021-09-29 10:35 /data/msd/tasteprofile/mismatches/sid_mismatches.txt
drwxr-xr-x   - jsw93 supergroup          0 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv
-rwxr-xr-x   8 jsw93 supergroup   64020759 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00000.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64038083 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00001.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64077499 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00002.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   64102442 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00003.tsv.gz
-rwxr-xr-x   8 jsw93 supergroup   63998697 2021-09-29 10:35 /data/msd/tasteprofile/triplets.tsv/part-00004.tsv.gz

## Mismatches

In [18]:
# Size in bytes of the two mismatch files
! hdfs dfs -du /data/msd/tasteprofile/mismatches

91342    730736    /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt
2026182  16209456  /data/msd/tasteprofile/mismatches/sid_mismatches.txt


In [19]:
# Recursively list the mismatch files
!hdfs dfs -ls -R /data/msd/tasteprofile/mismatches

# Per-file size in human readable units
!hdfs dfs -du -h /data/msd/tasteprofile/mismatches

# Report the files and their blocks to see how the data is stored in HDFS
!hdfs fsck /data/msd/tasteprofile/mismatches -files -blocks

-rwxr-xr-x   8 jsw93 supergroup      91342 2021-09-29 10:35 /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt
-rwxr-xr-x   8 jsw93 supergroup    2026182 2021-09-29 10:35 /data/msd/tasteprofile/mismatches/sid_mismatches.txt
89.2 K  713.6 K  /data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt
1.9 M   15.5 M   /data/msd/tasteprofile/mismatches/sid_mismatches.txt
Connecting to namenode via http://masternode2:9870/fsck?ugi=uwi14&files=1&blocks=1&path=%2Fdata%2Fmsd%2Ftasteprofile%2Fmismatches
FSCK started by uwi14 (auth:SIMPLE) from /192.168.40.11 for path /data/msd/tasteprofile/mismatches at Mon Jun 03 10:18:56 NZST 2024

/data/msd/tasteprofile/mismatches <dir>
/data/msd/tasteprofile/mismatches/sid_matches_manually_accepted.txt 91342 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1073761915_21093 len=91342 Live_repl=8

/data/msd/tasteprofile/mismatches/sid_mismatches.txt 2026182 bytes, replicated: replicat

# Q1-B

# Audio

## Attributes Count

In [20]:
# Read every audio attribute file into one dataframe. The files have no header row;
# judging by the preview below each row is an (attribute name, attribute type) pair.
# The built-in csv reader is used, as in the genre cells further down, instead of the
# legacy "com.databricks.spark.csv" source name.
Audio_Attributes = (
    spark.read
    .option("header", "false")
    .option("inferSchema", "true")
    .csv("hdfs:///data/msd/audio/attributes/*.csv")
)
Audio_Attributes.show(10)

# Count the number of rows (summed over all attribute files)
count = Audio_Attributes.count()

print(f"Number of rows in Audio_Attributes: {count}")


+------------+-------+
|         _c0|    _c1|
+------------+-------+
| component_1|NUMERIC|
| component_2|NUMERIC|
| component_3|NUMERIC|
| component_4|NUMERIC|
| component_5|NUMERIC|
| component_6|NUMERIC|
| component_7|NUMERIC|
| component_8|NUMERIC|
| component_9|NUMERIC|
|component_10|NUMERIC|
+------------+-------+
only showing top 10 rows

Number of rows in Audio_Attributes: 3929


## Feature Count

In [21]:
# Read all audio feature datasets into one dataframe with the built-in csv reader
# (as in the genre cells further down) instead of the legacy "com.databricks.spark.csv"
# source name.
#
# NOTE(review): the glob matches every feature dataset, and they presumably do not all
# have the same number of columns. The row count is still the total over all datasets,
# but individual columns of this combined frame should be used with care - TODO confirm.
Audio_Features = (
    spark.read
    .option("header", "false")
    .option("inferSchema", "true")
    .csv("hdfs:///data/msd/audio/features/*.csv")
)
Audio_Features.show(10)

# Count the number of rows (summed over all feature datasets)
count = Audio_Features.count()

print(f"Number of rows in Audio_Features: {count}")

+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--

Number of rows in Audio_Features: 12927867


In [22]:
# Calculate the number of unique songs
# NOTE(review): `_c1440` is presumably the identifier column of the widest feature dataset
# only; rows from narrower datasets would have no value there, and distinct() would count
# that missing value as one extra entry - TODO confirm before relying on this number.
num_track_unique_audio_features = Audio_Features.select(F.col('_c1440')).distinct().count()

# Print the conclusion
print(f"Unique songs in 'Audio Features' is {num_track_unique_audio_features }.")

Unique songs in 'Audio Features' is 994176.


## Statistics Count

In [23]:
# Read the sample properties file (it has a header row) with the built-in csv reader,
# as in the genre cells further down, instead of the legacy "com.databricks.spark.csv"
# source name.
Audio_Statistics = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("hdfs:///data/msd/audio/statistics/sample_properties.csv.gz")
)
Audio_Statistics.show(10)

# A bare `Audio_Statistics.schema` expression in the middle of a cell displays nothing,
# so print the schema explicitly.
Audio_Statistics.printSchema()

# Count the number of rows
count = Audio_Statistics.count()

print(f"Number of rows in Audio_Statistics: {count}")

+------------------+--------------------+--------------------+---------+----------+--------------+-------------+-----------+-----------+--------------+--------+
|          track_id|               title|         artist_name| duration|7digita_Id|sample_bitrate|sample_length|sample_rate|sample_mode|sample_version|filesize|
+------------------+--------------------+--------------------+---------+----------+--------------+-------------+-----------+-----------+--------------+--------+
|TRMMMYQ128F932D901|        Silent Night|    Faster Pussy cat|252.05506|   7032331|           128|60.1935770567|      22050|          1|             2|  960887|
|TRMMMKD128F425225D|         Tanssi vaan|    Karkkiautomaatti|156.55138|   1514808|            64|30.2244270016|      22050|          1|             2|  242038|
|TRMMMRX128F93187D9|   No One Could Ever|      Hudson Mohawke|138.97098|   6945353|           128|60.1935770567|      22050|          1|             2|  960887|
|TRMMMCH128F425532C|       Si Vos 

In [24]:
# Calculate the number of unique songs (distinct track_id values) in the audio statistics
num_track_unique_audio_statistics = Audio_Statistics.select(F.col('track_id')).distinct().count()

# Print the conclusion; the label names Audio_Statistics because that is the table counted
# here (it previously said 'Main_Summary', copied from another cell)
print(f"Unique songs in 'Audio_Statistics' is {num_track_unique_audio_statistics}.")

Unique songs in 'Main_Summary' is 992865.


In [25]:
# Size in bytes of each genre assignment file (same listing as in Q1-A)
! hdfs dfs -du /data/msd/genre/

11625230  93001840  /data/msd/genre/msd-MAGD-genreAssignment.tsv
8820054   70560432  /data/msd/genre/msd-MASD-styleAssignment.tsv
11140605  89124840  /data/msd/genre/msd-topMAGD-genreAssignment.tsv


# Genre Count

## Genre Assignment Count

In [26]:
# Count the lines of the MAGD genre assignment file directly in HDFS
!hdfs dfs -cat /data/msd/genre/msd-MAGD-genreAssignment.tsv* | wc -l

422714


In [55]:
# MAGD genre assignments: tab separated, no header row
Genre_Assignment = (
    spark.read
    .options(header="false", inferSchema="true", delimiter="\t")
    .csv("hdfs:///data/msd/genre/msd-MAGD-genreAssignment.tsv")
)

Genre_Assignment.show()

# Number of distinct identifiers in the first column
num_song_unique_Genre_Assignment = Genre_Assignment.select('_c0').distinct().count()

# Report the result
print(f"Unique songs in 'Genre_Assignment' is {num_song_unique_Genre_Assignment}.")

+------------------+--------------+
|               _c0|           _c1|
+------------------+--------------+
|TRAAAAK128F9318786|      Pop_Rock|
|TRAAAAV128F421A322|      Pop_Rock|
|TRAAAAW128F429D538|           Rap|
|TRAAABD128F429CF47|      Pop_Rock|
|TRAAACV128F423E09E|      Pop_Rock|
|TRAAADT12903CCC339|Easy_Listening|
|TRAAAED128E0783FAB|         Vocal|
|TRAAAEF128F4273421|      Pop_Rock|
|TRAAAEM128F93347B9|    Electronic|
|TRAAAFD128F92F423A|      Pop_Rock|
|TRAAAFP128F931B4E3|           Rap|
|TRAAAGR128F425B14B|      Pop_Rock|
|TRAAAGW12903CC1049|         Blues|
|TRAAAHD128F42635A5|      Pop_Rock|
|TRAAAHE12903C9669C|      Pop_Rock|
|TRAAAHJ128F931194C|      Pop_Rock|
|TRAAAHZ128E0799171|           Rap|
|TRAAAIR128F1480971|           RnB|
|TRAAAJG128F9308A25|          Folk|
|TRAAAMO128F1481E7F|     Religious|
+------------------+--------------+
only showing top 20 rows

Unique songs in 'Genre_Assignment' is 422714.


## Style Assignment Count

In [27]:
# Count the lines of the MASD style assignment file directly in HDFS
!hdfs dfs -cat /data/msd/genre/msd-MASD-styleAssignment.tsv*| wc -l

273936


In [53]:
# MASD style assignments: tab separated, no header row
Style_Assignment = (
    spark.read
    .options(header="false", inferSchema="true", delimiter="\t")
    .csv("hdfs:///data/msd/genre/msd-MASD-styleAssignment.tsv")
)

Style_Assignment.show()

# Number of distinct identifiers in the first column
num_song_unique_Style_Assignment = Style_Assignment.select('_c0').distinct().count()

# Report the result
print(f"Unique songs in 'Style_Assignment' is {num_song_unique_Style_Assignment}.")

+------------------+--------------------+
|               _c0|                 _c1|
+------------------+--------------------+
|TRAAAAK128F9318786|   Metal_Alternative|
|TRAAAAV128F421A322|                Punk|
|TRAAAAW128F429D538|         Hip_Hop_Rap|
|TRAAACV128F423E09E|Rock_Neo_Psychedelia|
|TRAAAEF128F4273421|           Pop_Indie|
|TRAAAFP128F931B4E3|         Hip_Hop_Rap|
|TRAAAGR128F425B14B|    Pop_Contemporary|
|TRAAAHD128F42635A5|           Rock_Hard|
|TRAAAHJ128F931194C|           Pop_Indie|
|TRAAAHZ128E0799171|         Hip_Hop_Rap|
|TRAAAIR128F1480971|    Pop_Contemporary|
|TRAAAJG128F9308A25| Country_Traditional|
|TRAAAMO128F1481E7F|              Gospel|
|TRAAAMQ128F1460CD3|         Hip_Hop_Rap|
|TRAAANK128F428B515|Rock_Neo_Psychedelia|
|TRAAARJ128F9320760|    Pop_Contemporary|
|TRAAAVO128F93133D4|Rock_Neo_Psychedelia|
|TRAAAZU128F4226F7A|    Rock_Alternative|
|TRAABAH128F423B788|           Pop_Indie|
|TRAABBY128F930C3B5|Rock_Neo_Psychedelia|
+------------------+--------------

## TopMAGD-genreAssignment Count

In [28]:
# Count the lines of the top-MAGD genre assignment file directly in HDFS
!hdfs dfs -cat /data/msd/genre/msd-topMAGD-genreAssignment.tsv*| wc -l

406427


In [56]:
# Top-MAGD genre assignments: tab separated, no header row
Top_Genre_Assignment = (
    spark.read
    .options(header="false", inferSchema="true", delimiter="\t")
    .csv("hdfs:///data/msd/genre/msd-topMAGD-genreAssignment.tsv")
)

Top_Genre_Assignment.show()

# Number of distinct identifiers in the first column
num_song_unique_Top_Genre_Assignment = Top_Genre_Assignment.select('_c0').distinct().count()

# Report the result
print(f"Unique songs in 'Top_Genre_Assignment' is {num_song_unique_Top_Genre_Assignment}.")

+------------------+----------+
|               _c0|       _c1|
+------------------+----------+
|TRAAAAK128F9318786|  Pop_Rock|
|TRAAAAV128F421A322|  Pop_Rock|
|TRAAAAW128F429D538|       Rap|
|TRAAABD128F429CF47|  Pop_Rock|
|TRAAACV128F423E09E|  Pop_Rock|
|TRAAAED128E0783FAB|     Vocal|
|TRAAAEF128F4273421|  Pop_Rock|
|TRAAAEM128F93347B9|Electronic|
|TRAAAFD128F92F423A|  Pop_Rock|
|TRAAAFP128F931B4E3|       Rap|
|TRAAAGR128F425B14B|  Pop_Rock|
|TRAAAGW12903CC1049|     Blues|
|TRAAAHD128F42635A5|  Pop_Rock|
|TRAAAHE12903C9669C|  Pop_Rock|
|TRAAAHJ128F931194C|  Pop_Rock|
|TRAAAHZ128E0799171|       Rap|
|TRAAAIR128F1480971|       RnB|
|TRAAAJG128F9308A25|      Folk|
|TRAAAMQ128F1460CD3|       Rap|
|TRAAANK128F428B515|  Pop_Rock|
+------------------+----------+
only showing top 20 rows

Unique songs in 'Top_Genre_Assignment' is 406427.


In [29]:
# Size in bytes of the two main summary files (same listing as in Q1-A)
! hdfs dfs -du /data/msd/main/summary

58658141   469265128  /data/msd/main/summary/analysis.csv.gz
124211304  993690432  /data/msd/main/summary/metadata.csv.gz


# Main Summary Count

In [30]:
 Main_Summary = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///data/msd/main/summary/*.csv.gz")
)
Main_Summary.show(10)

# Count the number of rows
count = Main_Summary.count()

print(f"Number of rows in Main_Summary: {count}")

+----------------+-----------------+-------------------+-------------------+------------------+---------------+--------------------+----------------+--------------------+--------------------+---------------+-----+----------------+-------------------+--------------------+------------------+------------------+------------------+--------------------+----------------+
|analyzer_version|artist_7digitalid| artist_familiarity|  artist_hotttnesss|         artist_id|artist_latitude|     artist_location|artist_longitude|         artist_mbid|         artist_name|artist_playmeid|genre|idx_artist_terms|idx_similar_artists|             release|release_7digitalid|   song_hotttnesss|           song_id|               title|track_7digitalid|
+----------------+-----------------+-------------------+-------------------+------------------+---------------+--------------------+----------------+--------------------+--------------------+---------------+-----+----------------+-------------------+----------------

In [31]:
# Calculate the number of unique songs (distinct values of the `song_id` column)
num_song_unique_main_summary = Main_Summary.select(F.col('song_id')).distinct().count()

# Print the conclusion
print(f"Unique songs in 'Main_Summary' is {num_song_unique_main_summary}.")

Unique songs in 'Main_Summary' is 998964.


In [42]:
# Size of the taste profile sub-directories in human readable units
! hdfs dfs -du -h /data/msd/tasteprofile

2.0 M    16.2 M  /data/msd/tasteprofile/mismatches
488.4 M  3.8 G   /data/msd/tasteprofile/triplets.tsv


# Taste Profile Count

## Triplets Count

In [33]:
# Count the lines of the triplets dataset: concatenate the gzipped parts, decompress, count
!hdfs dfs -cat /data/msd/tasteprofile/triplets.tsv/* | gunzip | wc -l

48373586


In [34]:
# Explicit schema for the taste profile triplets: (user_id, song_id, play_count).
# play_count holds a number of plays, so it is typed as an integer rather than a string;
# this lets it be aggregated and compared numerically without an extra cast.
schema_routes = StructType([
    StructField("user_id", StringType(), True),
    StructField("song_id", StringType(), True),
    StructField("play_count", IntegerType(), True),
])

In [35]:
# Triplets dataset: tab separated, no header row, read with the explicit schema
taste_profile = (
    spark.read
    .options(header="false", inferSchema="false", delimiter="\t")
    .schema(schema_routes)
    .csv("hdfs:///data/msd/tasteprofile/triplets.tsv")
)

taste_profile.show()

+--------------------+------------------+----------+
|             user_id|           song_id|play_count|
+--------------------+------------------+----------+
|f1bfc2a4597a3642f...|SOQEFDN12AB017C52B|         1|
|f1bfc2a4597a3642f...|SOQOIUJ12A6701DAA7|         2|
|f1bfc2a4597a3642f...|SOQOKKD12A6701F92E|         4|
|f1bfc2a4597a3642f...|SOSDVHO12AB01882C7|         1|
|f1bfc2a4597a3642f...|SOSKICX12A6701F932|         1|
|f1bfc2a4597a3642f...|SOSNUPV12A8C13939B|         1|
|f1bfc2a4597a3642f...|SOSVMII12A6701F92D|         1|
|f1bfc2a4597a3642f...|SOTUNHI12B0B80AFE2|         1|
|f1bfc2a4597a3642f...|SOTXLTZ12AB017C535|         1|
|f1bfc2a4597a3642f...|SOTZDDX12A6701F935|         1|
|f1bfc2a4597a3642f...|SOTZTVF12A58A79B9F|         1|
|f1bfc2a4597a3642f...|SOUGTZZ12A8C13B8CC|         1|
|f1bfc2a4597a3642f...|SOVDLVW12A6701F92F|         1|
|f1bfc2a4597a3642f...|SOVKHBC12AF72A5DE7|         1|
|f1bfc2a4597a3642f...|SOVKJMM12AF72AAF3C|         1|
|f1bfc2a4597a3642f...|SOVMWUC12A8C13750B|     

In [43]:
# Count the rows of the triplets dataframe (comparable with the `wc -l` line count above)
rawcount = taste_profile.count()


print(f"rawcount 'triplets.tsv' is {rawcount}")


rawcount 'triplets.tsv' is 48373586


In [44]:
# Calculate the number of unique songs (distinct `song_id` values) in the triplets
raw_num_song_unique = taste_profile.select(F.col('song_id')).distinct().count()

# Print the conclusion
print(f"Unique songs in 'triplets.tsv' is {raw_num_song_unique}.")


Unique songs in 'triplets.tsv' is 384546.


## Mismatches row count

In [47]:
# Read the mismatches file as plain text lines and cut the two identifiers out of each
# line by position: 18 characters from position 9 and 18 characters from position 28.
sid_mismatches = (
    spark.read.text("hdfs:///data/msd/tasteprofile/mismatches/sid_mismatches.txt")
    .repartition(partitions)
    .select(
        F.col('value').substr(9, 18).alias('song_id'),
        F.col('value').substr(28, 18).alias('track_id'),
    )
)


sid_mismatches.printSchema()
show_as_html(sid_mismatches, 5)

print('\n')
print(f'The mismatches table has {sid_mismatches.count()} rows.')

root
 |-- song_id: string (nullable = true)
 |-- track_id: string (nullable = true)



Unnamed: 0,song_id,track_id
0,SOSHAHT12A8AE498DF,TRGHCMN128F42422EB
1,SOTVIGC12CF5F87EFA,TRGLNBL128F9324F09
2,SOMIVVH12A58A7A4F3,TRBAYLZ128F933841B
3,SOLPMEO12A8C136EC2,TRBWLFD128F4261A6A
4,SOCVMRY12A8AE486A6,TRGEDPP128F428AE71




The mismatches table has 19094 rows.


## Mismatches unique songs count

In [48]:
# Calculate the number of unique songs
# NOTE(review): `raw_num_song_unique` was already used for the triplets count in an earlier
# cell; this assignment overwrites that value - consider a dataset-specific name.
raw_num_song_unique = sid_mismatches.select(F.col('song_id')).distinct().count()

# Print the conclusion
print(f"Unique songs in 'sid_mismatches' is {raw_num_song_unique}.")

Unique songs in 'sid_mismatches' is 18913.


In [57]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI
# (stop_spark() stops the session and removes the `spark` and `sc` globals)

stop_spark()