In [None]:
#Setting up the environment

!java -version

#Install Spark
#download file
!wget -q http://apache.osuosl.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
#extract the file
!tar xf spark-3.3.0-bin-hadoop3.tgz
#install findspark package
!pip install -q findspark

openjdk version "11.0.15" 2022-04-19
OpenJDK Runtime Environment (build 11.0.15+10-Ubuntu-0ubuntu0.20.04.1)
OpenJDK 64-Bit Server VM (build 11.0.15+10-Ubuntu-0ubuntu0.20.04.1, mixed mode, sharing)


In [1]:
# 

import os
pathToSpark = "/mnt/c/Users/walid/Desktop/Big Data/Final/spark-3.3.0-bin-hadoop3"
os.environ["SPARK_HOME"] = pathToSpark

In [2]:
import findspark
findspark.init()

# create entry points to spark
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("FinalProject").setMaster("local[*]")
sc=SparkContext(conf = conf)
spark = SparkSession(sparkContext=sc)

In [3]:
# Reading in the directory containing all the JSON files and storing them into
# Spark's RDD format

import time

#Filepath = "/mnt/c/Users/walid/Desktop/Big Data/Final/mpdata/*.json"
Filepath = "/mnt/c/Users/walid/Desktop/Big Data/Final/mpdata/mpd.slice.0-999.json"

start = time.time()

df = spark.read.option("multiline", "true").json(Filepath)

end = time.time()
total_time = end-start

print("took " + str(total_time) + " seconds")

took 3.3237714767456055 seconds


In [4]:
# Taking a look at the schema, we see that all of the headers were successfully imported and 
# we have all of the data we need to begin analyzing
#
# The format of the JSON files is deeply nested and so we only have two main columns:
# (1) info regarding the slice/part of the json files and when it was generated... info that is useless to us
# and (2) a column of playlists, which consists of rows of arrays, meaning each row contains multiple playlists
# 
# We will have to flatten and normalize this dataframe first before we can analyze the data

df.show(5)
df.printSchema()

+--------------------+--------------------+
|                info|           playlists|
+--------------------+--------------------+
|{2017-12-03 08:41...|[{false, null, 11...|
+--------------------+--------------------+

root
 |-- info: struct (nullable = true)
 |    |-- generated_on: string (nullable = true)
 |    |-- slice: string (nullable = true)
 |    |-- version: string (nullable = true)
 |-- playlists: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- collaborative: string (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- duration_ms: long (nullable = true)
 |    |    |-- modified_at: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- num_albums: long (nullable = true)
 |    |    |-- num_artists: long (nullable = true)
 |    |    |-- num_edits: long (nullable = true)
 |    |    |-- num_followers: long (nullable = true)
 |    |    |-- num_tracks: long (nullable = true)
 |    

In [5]:
from pyspark.sql import functions as F

In [6]:
# Here, we've removed the useless info column and exploded the array of playlists so that
# each playlist was it's own row. 
#
# To double check, we counted the number of rows (1,000,000 playlists)

df = df.select(F.explode(F.col("playlists")).alias("playlists"))
#df1.printSchema()
df.show()
print("counting # of playlists...")
df.count()

+--------------------+
|           playlists|
+--------------------+
|{false, null, 115...|
|{false, null, 116...|
|{false, null, 140...|
|{false, null, 289...|
|{false, null, 433...|
|{false, null, 191...|
|{false, null, 340...|
|{false, null, 126...|
|{false, null, 994...|
|{false, null, 429...|
|{false, null, 164...|
|{false, null, 356...|
|{false, null, 198...|
|{false, null, 303...|
|{false, null, 258...|
|{false, null, 181...|
|{false, null, 246...|
|{false, null, 176...|
|{false, null, 159...|
|{false, null, 216...|
+--------------------+
only showing top 20 rows

counting # of playlists...


1000

In [7]:
# Here we take the "playlists" column exploded from earlier and select every column to be mapped to its proper header
# 
# We can now work with the playlists data to analyze, calculate, and answer questions.

df = df.select("playlists.*") #TODO: change * to only columns that we need to speed up operation times
df.show(3)
df.printSchema()

+-------------+-----------+-----------+-----------+----------------+----------+-----------+---------+-------------+----------+---+--------------------+
|collaborative|description|duration_ms|modified_at|            name|num_albums|num_artists|num_edits|num_followers|num_tracks|pid|              tracks|
+-------------+-----------+-----------+-----------+----------------+----------+-----------+---------+-------------+----------+---+--------------------+
|        false|       null|   11532414| 1493424000|      Throwbacks|        47|         37|        6|            1|        52|  0|[{The Cookbook, s...|
|        false|       null|   11656470| 1506556800|Awesome Playlist|        23|         21|        5|            1|        39|  1|[{Eye Of The Tige...|
|        false|       null|   14039958| 1505692800|         korean |        51|         31|       18|            1|        64|  2|[{On And On, spot...|
+-------------+-----------+-----------+-----------+----------------+----------+---------

In [8]:
# Here we isolate the tracks column, which has a lot of information about the tracks nested inside arrays
# we explode it and only select the columns that we are interested in looking at...

df_tracks = df.select("tracks")
df_tracks = df_tracks.select(F.explode(F.col("tracks")).alias("tracks"))
df_tracks = df_tracks.select("tracks.album_name", "tracks.artist_name", "tracks.track_name", "tracks.duration_ms") \
        .where(F.col("duration_ms") > 30000) # ONlY songs > 30 sec 
df_tracks.show(10)
df_tracks.count()

+--------------------+------------------+--------------------+-----------+
|          album_name|       artist_name|          track_name|duration_ms|
+--------------------+------------------+--------------------+-----------+
|        The Cookbook|     Missy Elliott|Lose Control (fea...|     226863|
|         In The Zone|    Britney Spears|               Toxic|     198800|
|Dangerously In Lo...|           Beyoncé|       Crazy In Love|     235933|
|           Justified| Justin Timberlake|      Rock Your Body|     267266|
|            Hot Shot|            Shaggy|        It Wasn't Me|     227600|
|         Confessions|             Usher|               Yeah!|     250373|
|         Confessions|             Usher|              My Boo|     223440|
|                 PCD|The Pussycat Dolls|             Buttons|     225560|
|The Writing's On ...|   Destiny's Child|         Say My Name|     271333|
|Speakerboxxx/The ...|           OutKast|Hey Ya! - Radio M...|     235213|
+--------------------+---

67450

# Track Analysis
---

In [9]:
 # Q1: Who were the most popular artists in 2010-2017 in users' playlists?

df_tracks.groupBy("artist_name").count().orderBy(['count'], ascending=[False]).show()

# TODO: Find % of playlists with drake, kanye or kendrick in them... this is just the count of how many times they appear

+-----------------+-----+
|      artist_name|count|
+-----------------+-----+
|            Drake|  939|
|       Kanye West|  415|
|   Kendrick Lamar|  385|
|          Rihanna|  350|
|           Eminem|  332|
|       The Weeknd|  296|
|     Lil Uzi Vert|  292|
|       Ed Sheeran|  285|
|           Future|  265|
|      Chris Brown|  259|
|    Justin Bieber|  251|
|        Lil Wayne|  242|
|          Beyoncé|  234|
| The Chainsmokers|  232|
|Twenty One Pilots|  226|
|         Big Sean|  222|
|      Post Malone|  221|
|          J. Cole|  219|
|    Kenny Chesney|  204|
|         Maroon 5|  203|
+-----------------+-----+
only showing top 20 rows



In [10]:
# Q2: What were the most popular tracks in 2010-2017 in users' playlists?

df_tracks.groupBy("track_name", "artist_name").count().orderBy(['count'], ascending=[False]).show(15)

+--------------------+----------------+-----+
|          track_name|     artist_name|count|
+--------------------+----------------+-----+
|           One Dance|           Drake|   55|
|             HUMBLE.|  Kendrick Lamar|   52|
|Broccoli (feat. L...|            DRAM|   50|
|              Closer|The Chainsmokers|   46|
|     Congratulations|     Post Malone|   44|
|   Don't Let Me Down|The Chainsmokers|   42|
|               Roses|The Chainsmokers|   39|
|         Bounce Back|        Big Sean|   39|
|iSpy (feat. Lil Y...|            KYLE|   39|
|             Jumpman|           Drake|   39|
|            Mask Off|          Future|   38|
|Bad and Boujee (f...|           Migos|   38|
|       XO TOUR Llif3|    Lil Uzi Vert|   37|
|       White Iverson|     Post Malone|   36|
|               Panda|       Desiigner|   36|
+--------------------+----------------+-----+
only showing top 15 rows



In [11]:
# Q3: What is the average song duration? What is the longest song? Shortest?

# TODO: convert avg song to mm:ss
# TODO: filter -1/0 values from duration_ms
# TODO: investigate why max song is so high

df_tracks

df_tracks.select((F.concat(F.round(F.avg("duration_ms")/60000,2), F.lit(" mins"))).alias("Avg Track Duration"), \
                 (F.concat(F.round(F.max("duration_ms")/60000,2), F.lit(" mins"))).alias("Max Track Duration"), \
                 (F.concat(F.round(F.min("duration_ms")/60000,2), F.lit(" mins"))).alias("Min Track Duration")).show()

+------------------+------------------+------------------+
|Avg Track Duration|Max Track Duration|Min Track Duration|
+------------------+------------------+------------------+
|         3.89 mins|         40.4 mins|          0.5 mins|
+------------------+------------------+------------------+



# Playlist Analysis
---

In [12]:
df_playlists = df.select("pid", "name", "description", "num_tracks", "num_artists", "num_albums", "duration_ms", "tracks")

In [13]:
# Q1: What is the most common word in the playlist name?

# Split playlist name, then capitalize the first letter (to group up same words), 
# then split name into multiple words, then explode each word into its own row
# then do some filtering, grouping, and sorting to get the result

df_playlists.withColumn('name', F.explode(F.split(F.initcap(F.col('name')), ' '))) \
  .where(F.col('name') != '') \
  .groupBy('name') \
  .count() \
  .sort('count', ascending=False) \
  .show()

+---------+-----+
|     name|count|
+---------+-----+
|  Country|   35|
|    Songs|   34|
|    Music|   28|
|    Chill|   24|
|   Summer|   23|
|    Party|   21|
|     Rock|   19|
|       My|   18|
|      New|   17|
|      Mix|   17|
| Playlist|   17|
|     Good|   17|
|Christmas|   16|
|      Rap|   14|
|      The|   13|
|      Old|   12|
|     2016|   11|
|      You|   10|
|  Workout|   10|
|       Up|    9|
+---------+-----+
only showing top 20 rows



In [14]:
# Q2: What is the most common word in the playlist description?

df_playlists.withColumn('description', F.explode(F.split(F.col('description'), ' '))) \
  .groupBy('description') \
  .count() \
  .sort('count', ascending=False) \
  .show()

+-------------+-----+
|  description|count|
+-------------+-----+
|           to|    4|
|          the|    3|
|        songs|    3|
|         from|    2|
|            I|    2|
|       listen|    2|
|           of|    2|
|            a|    2|
|       music.|    1|
|     espanish|    1|
|     chilllll|    1|
|   everything|    1|
|   motorcycle|    1|
|hypochondriac|    1|
|       Buenos|    1|
|          you|    1|
|       couple|    1|
|          sit|    1|
|          mix|    1|
|   &lt;Insert|    1|
+-------------+-----+
only showing top 20 rows



In [15]:
# The most common words are not really useful here in the description,
# So some filtering must be done to remove the common word and find common KEY words

# TODO: Add more filters or find a way to add values into array and filter by that

pl_clean_description = df_playlists.withColumn('words', F.explode(F.split(F.initcap(F.col('description')), ' '))) \
  .where(F.col('words') != '') \
  .groupBy('words') \
  .count() \
  .filter((F.col("words") != "the") & (F.col("words") != "to") & (F.col("words") != " ") & (F.col("words") != "and") &\
          (F.col("words") != "of") & (F.col("words") != "a") & (F.col("words") != "for") & (F.col("words") != "you") &\
          (F.col("words") != "that") & (F.col("words") != "in") & (F.col("words") != "my") & (F.col("words") != "I") &\
          (F.col("words") != "is") & (F.col("words") != "this") & (F.col("words") != "i") & (F.col("words") != "your")&\
          (F.col("words") != "with") & (F.col("words") != "from") & (F.col("words") != "all") & (F.col("words") != "me")&\
          (F.col("words") != "some") & (F.col("words") != "on") & (F.col("words") != "when") & (F.col("words") != "it")&\
          (F.col("words") != "just") & (F.col("words") != "like") & (F.col("words") != "are") & (F.col("words") != "or")&\
          (F.col("words") != "The") & (F.col("words") != "be") & (F.col("words") != " ") & (F.col("words") != "A")&\
          (F.col("words") != "have") & (F.col("words") != "at") & (F.col("words") != "these") & (F.col("words") != "I") &\
          (F.col("words") != "that") & (F.col("words") != "in") & (F.col("words") != "my") & (F.col("words") != "I") &\
          (F.col("words") != "but") & (F.col("words") != "get") & (F.col("words") != "by") & (F.col("words") != "not")) \
  .sort('count', ascending=False) \
  
pl_clean_description.show()

+----------+-----+
|     words|count|
+----------+-----+
|        To|    4|
|     Songs|    3|
|    Listen|    2|
|        Of|    2|
|      From|    2|
|       Sit|    1|
|     Chill|    1|
|       Mix|    1|
|       But|    1|
|      High|    1|
|    Little|    1|
|    Buenos|    1|
|        On|    1|
|        In|    1|
|Motorcycle|    1|
|       Edm|    1|
|   Moment.|    1|
|      Make|    1|
|   Teenage|    1|
|   Pumping|    1|
+----------+-----+
only showing top 20 rows



In [16]:
#list_of_genres = {"pop", "rock", "hip hop", "latin", "dance", "edm", "r&b", "country", "classical", "metal", "jazz"}

In [17]:
# Q3: What is the average playlist duration? Is there a maximum duration to a playlist? min?

df_playlists.select((F.concat(F.round(F.avg("duration_ms")/60000,2), F.lit(" mins"))).alias("Avg Playlist Duration"), \
                    (F.concat(F.round(F.max("duration_ms")/60000,2), F.lit(" mins"))).alias("Max Playlist Duration"), \
                    (F.concat(F.round(F.min("duration_ms")/60000,2), F.lit(" mins"))).alias("Min Playlist Duration")).show()


+---------------------+---------------------+---------------------+
|Avg Playlist Duration|Max Playlist Duration|Min Playlist Duration|
+---------------------+---------------------+---------------------+
|          262.34 mins|          963.62 mins|           16.72 mins|
+---------------------+---------------------+---------------------+



In [18]:
# Q4: What is the average playlist track number? Is there a maximum amount of tracks to a playlist? min?

df_playlists.select((F.avg("num_tracks")).alias("Avg Number of Tracks"), \
                    (F.max("num_tracks")).alias("Max Number of Tracks"), \
                    (F.min("num_tracks")).alias("Min Number of Tracks")).show()

+--------------------+--------------------+--------------------+
|Avg Number of Tracks|Max Number of Tracks|Min Number of Tracks|
+--------------------+--------------------+--------------------+
|              67.503|                 245|                   5|
+--------------------+--------------------+--------------------+



In [19]:
# we do not have the actual playlist ID from spotify unfortunatly
df_playlists.select(F.col("pid")).show(5)

+---+
|pid|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



# Spotify API

In [20]:
# installing spotipy- a Spotify API library for python
!pip install spotipy -q --upgrade

In [21]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from pprint import pprint

os.environ["SPOTIPY_CLIENT_ID"] = "57a3616292974a8592cf33c6b9860806"
os.environ["SPOTIPY_CLIENT_SECRET"] = "9b8d5ee86c2b450ea75b0114bde7c99c"

spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [22]:
# example using a random track's uri- which we can find in the playlist track data

example_uri = 'spotify:track:7Feaw9WAEREY0DUOSXJLOM'
sp_track = spotify.track(example_uri)
pprint(sp_track)

{'album': {'album_type': 'single',
           'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/60d24wfXkVzDSfLS6hyCjZ'},
                        'href': 'https://api.spotify.com/v1/artists/60d24wfXkVzDSfLS6hyCjZ',
                        'id': '60d24wfXkVzDSfLS6hyCjZ',
                        'name': 'Martin Garrix',
                        'type': 'artist',
                        'uri': 'spotify:artist:60d24wfXkVzDSfLS6hyCjZ'},
                       {'external_urls': {'spotify': 'https://open.spotify.com/artist/3JhNCzhSMTxs9WLGJJxWOY'},
                        'href': 'https://api.spotify.com/v1/artists/3JhNCzhSMTxs9WLGJJxWOY',
                        'id': '3JhNCzhSMTxs9WLGJJxWOY',
                        'name': 'Macklemore',
                        'type': 'artist',
                        'uri': 'spotify:artist:3JhNCzhSMTxs9WLGJJxWOY'},
                       {'external_urls': {'spotify': 'https://open.spotify.com/artist/4UXqAaa6dQYAk18Lv7PEgX'},
      

In [23]:
# Sooooo.. the track metadata does not have the genre associated with the track
# but the artist has a genre key which lets us know the genre of music they create, which we can use to generalize
# the actual genre of the track/album

artist_uri = 'spotify:artist:3JhNCzhSMTxs9WLGJJxWOY' # Macklemore
sp_artist = spotify.artist(artist_uri)
pprint(sp_artist)

artist_genres = sp_artist['genres'] #reference to genres
print(artist_genres[0])

{'external_urls': {'spotify': 'https://open.spotify.com/artist/3JhNCzhSMTxs9WLGJJxWOY'},
 'followers': {'href': None, 'total': 1888667},
 'genres': ['pop rap', 'seattle hip hop'],
 'href': 'https://api.spotify.com/v1/artists/3JhNCzhSMTxs9WLGJJxWOY',
 'id': '3JhNCzhSMTxs9WLGJJxWOY',
 'images': [{'height': 640,
             'url': 'https://i.scdn.co/image/ab6761610000e5eb98ae73d49666f2795824b168',
             'width': 640},
            {'height': 320,
             'url': 'https://i.scdn.co/image/ab6761610000517498ae73d49666f2795824b168',
             'width': 320},
            {'height': 160,
             'url': 'https://i.scdn.co/image/ab6761610000f17898ae73d49666f2795824b168',
             'width': 160}],
 'name': 'Macklemore',
 'popularity': 75,
 'type': 'artist',
 'uri': 'spotify:artist:3JhNCzhSMTxs9WLGJJxWOY'}
pop rap
