In [1]:
# DEFINE FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

def get_access_token(client_id:str, client_sc:str):
    import requests
    
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    data = f'grant_type=client_credentials&client_id={client_id}&client_secret={client_sc}'.encode()
    response = requests.post('https://accounts.spotify.com/api/token', headers=headers, data=data).json()
    access_token = response['access_token']

    return access_token

def get_response(access_token:str, endpoint:str, params:dict=None):
    import requests, json

    url = f"https://api.spotify.com/v1/{endpoint}"
    headers = {
        'Authorization': f'Bearer {access_token}',
    }

    if params != None:
        response = requests.get(url=url, params=params, headers=headers)
    else:
        response = requests.get(url=url, headers=headers)
    print(response)
    
    if response.status_code == 200:
        try:
            data = response.json()
            return data
        except json.decoder.JSONDecodeError:
            raise ValueError(f"API Server Error - {endpoint} - Invalid JSON content in response: {response.text}")
    else:
        raise ValueError(f"API Server Error - {endpoint} - Non-200 status code received: {response.status_code}")
    

def post_response(access_token:str, endpoint:str, data:dict=None):
    import requests

    url = f"https://api.spotify.com/v1/{endpoint}"
    headers = {
        'Authorization': f'Bearer {access_token}',
    }

    response = requests.post(url=url, headers=headers)
    print(response)
    
    if response.status_code == 200:
        pass
    else:
        raise ValueError(f"API Server Error - {endpoint} - Non-200 status code received: {response.status_code}")

In [11]:
# INFOS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from configparser import ConfigParser

config = ConfigParser()
config.read("/home/hooniegit/git/Spotify-DemoProject/recommendation/demo/config.ini")

client_id = config.get("spotify", "client_id")
client_sc = config.get("spotify", "client_sc")
user_id = config.get("spotify", "user_id")

In [3]:
# START CODE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from math import ceil
import json

In [12]:
### Build Session
spark = SparkSession.builder \
    .master(config.get("spark", "master")) \
    .appName("pipeline_demo") \
    .getOrCreate()

### Create Access Token
access_token = get_access_token(client_id=client_id, client_sc=client_sc)

### Create Playlist Lists
endpoint = f"users/{user_id}/playlists"
params = {
    "limit": 50,
    "offset": 0
}

playlists = get_response(access_token=access_token, endpoint=endpoint, params=params)
json_string  = json.dumps(playlists)
json_rdd = spark.sparkContext.parallelize([json_string])
df_plinfo = spark.read.json(json_rdd, multiLine=True)

items = df_plinfo \
    .withColumn("items", explode("items")) \
    .select("items.id") \
    .rdd.flatMap(lambda x: x).collect()

### Create Playlist Item Lists
track_list = [] # <---------- "Need To Use"
for id in items:
    endpoint = f"playlists/{id}/tracks"
    playlist_spec = get_response(access_token=access_token, endpoint=endpoint)
    
    json_string  = json.dumps(playlist_spec)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_playlist_spec = spark.read.json(json_rdd, multiLine=True)
    
    ids = df_playlist_spec \
    .withColumn("items", explode("items")) \
    .select("items.track.id") \
    .rdd.flatMap(lambda x: x).collect()
    
    track_list += ids
    
    total = df_playlist_spec.select("total").first()[0]
    print(total)
    left = int(total)-100
    cnt = ceil(left/100)
    
    for i in range(cnt):
        offset = 100 + 100 * i
        params = {"offset":offset}
        
        playlist_spec = get_response(access_token=access_token, endpoint=endpoint, params=params)
        
        json_string  = json.dumps(playlist_spec)
        json_rdd = spark.sparkContext.parallelize([json_string])
        df_playlist_spec = spark.read.json(json_rdd, multiLine=True)
        
        ids = df_playlist_spec \
        .withColumn("items", explode("items")) \
        .select("items.track.id") \
        .rdd.flatMap(lambda x: x).collect()
        
        track_list += ids      

cnt = ceil(len(track_list)/50)

big_list = []
for j in range(cnt):
    big_list.append(track_list[j*50:(j+1)*50])


<Response [200]>


                                                                                

<Response [200]>


                                                                                

121
<Response [200]>


                                                                                

In [13]:
# Create Dataframe : main_df
main_df = None
cnt = 0
for small_list in big_list:
    
    print(cnt)
    
    tracks = ""
    for id in small_list:
        tracks += f",{id}"
    tracks = tracks[1:]
    
    endpoint = "tracks"
    params = {"ids":tracks}
    track = get_response(access_token=access_token, endpoint=endpoint, params=params)
    
    json_string  = json.dumps(track)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_tracks = spark.read.json(json_rdd, multiLine=True)
    
    df_tracks = spark.read.json(json_rdd, multiLine=True) \
        .withColumn("tracks", explode("tracks")) \
        .selectExpr("tracks.id",
                    "tracks.popularity")
    
    endpoint = "audio-features"
    params = {"ids":tracks}
    audio_features = get_response(access_token=access_token, endpoint=endpoint, params=params)
    
    json_string  = json.dumps(audio_features)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_audio_features = spark.read.json(json_rdd, multiLine=True) \
        .withColumn("audio_features", explode("audio_features")) \
        .selectExpr("audio_features.id",
                    "audio_features.key",
                    "audio_features.mode",
                    "audio_features.time_signature",
                    "audio_features.tempo",
                    "audio_features.acousticness",
                    "audio_features.danceability",
                    "audio_features.energy",
                    "audio_features.instrumentalness",
                    "audio_features.liveness",
                    "audio_features.loudness",
                    "audio_features.speechiness",
                    "audio_features.valence")
    
    result_track_df = df_tracks.join(df_audio_features, "id", "left")
    if cnt == 0:
        main_df = result_track_df
    else:
        main_df = main_df.union(result_track_df)
    cnt += 1

0
<Response [200]>
<Response [200]>
1
<Response [200]>
<Response [200]>
2
<Response [200]>
<Response [200]>


In [14]:
### Load Dataframe : df_dw
dw_tracks = spark.read.parquet("file:///home/hooniegit/git/Spotify-DemoProject/spark/data/parquet/tracks/main/*")
dw_audioFeatures = spark.read.parquet("file:///home/hooniegit/git/Spotify-DemoProject/spark/data/parquet/tracks/audio_features/*")
df_dw = dw_tracks.join(dw_audioFeatures, "id", "inner")

df_dw.show() # << TEST



+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+
|                  id|popularity|key|mode|time_signature|  tempo|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|valence|
+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+
|000O8nXpAK5QAppKv...|         5|  1|   1|             4|128.018|     5.12E-4|       0.504|  0.93|          0.0107|   0.281|   -2.89|     0.0539|  0.355|
|000kSCs9tKtH1VXI3...|        11|  2|   1|             4| 129.85|       0.863|       0.425| 0.266|             0.0|  0.0989|  -6.791|     0.0337|  0.253|
|0010mZpCCwlPwoBiB...|        38|  3|   1|             3|124.993|       0.108|       0.527| 0.793|         3.28E-6|   0.144|  -4.823|     0.0352|  0.597|
|0026hQeV7FZ0PaZpW...|        20| 11|   1|             4|169.358|        0.3

                                                                                

In [83]:
print(main_df.count())
print(df_dw.count())

                                                                                

121




1581470


                                                                                

In [85]:
### Union Dataframe : df
from pyspark.sql.functions import col

df = df_dw \
    .filter(~col("id").isin(track_list)) \
    .union(main_df)

df.count() # << TEST

                                                                                

1581569

In [None]:
# 고려해야 할 사항들
"""
    0. 스케일링 가능 항목 : acousticness, danceability, energy, instrumentalness, liveness, speechiness, valence
    1. loudness 항목 : 절댓값 반환 필요 / 계산식 확인 필요
    1. mode 항목 : 0 또는 1만 가짐
    2. key, tempo 항목 : 이상치가 큰 영향을 끼칠 수 있음
"""

In [86]:
from pyspark.sql.functions import abs

df = df \
    .withColumn("loudness", abs("loudness"))
df.show()



+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+
|                  id|popularity|key|mode|time_signature|  tempo|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|valence|
+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+
|000O8nXpAK5QAppKv...|         5|  1|   1|             4|128.018|     5.12E-4|       0.504|  0.93|          0.0107|   0.281|    2.89|     0.0539|  0.355|
|000kSCs9tKtH1VXI3...|        11|  2|   1|             4| 129.85|       0.863|       0.425| 0.266|             0.0|  0.0989|   6.791|     0.0337|  0.253|
|0010mZpCCwlPwoBiB...|        38|  3|   1|             3|124.993|       0.108|       0.527| 0.793|         3.28E-6|   0.144|   4.823|     0.0352|  0.597|
|0026hQeV7FZ0PaZpW...|        20| 11|   1|             4|169.358|        0.3

                                                                                

In [87]:
### Create Features
from pyspark.ml.feature import VectorAssembler

# selected_features = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "speechiness", "valence"]
selected_features = ["popularity", "key", "mode", "time_signature", "tempo", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "valence"]
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")
df_assembled = assembler.transform(df)

df_assembled.show()



+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+
|                  id|popularity|key|mode|time_signature|  tempo|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|valence|            features|
+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+
|000O8nXpAK5QAppKv...|         5|  1|   1|             4|128.018|     5.12E-4|       0.504|  0.93|          0.0107|   0.281|    2.89|     0.0539|  0.355|[5.0,1.0,1.0,4.0,...|
|000kSCs9tKtH1VXI3...|        11|  2|   1|             4| 129.85|       0.863|       0.425| 0.266|             0.0|  0.0989|   6.791|     0.0337|  0.253|[11.0,2.0,1.0,4.0...|
|0010mZpCCwlPwoBiB...|        38|  3|   1|             3|124.993|       0.108|       0.527| 0.793|         3.28E-6|   0.144| 

                                                                                

In [45]:
# <----------------------DEMO--------------------------------

from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col

assembler_demo = VectorAssembler(inputCols=selected_features, outputCol="features")
df_assembled_demo = assembler_demo.transform(df)

# create model == train dataset
kmeans = KMeans().setK(4).setSeed(1)
model = kmeans.fit(df_assembled_demo)

# check centers
centers = model.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

# test dataset
df_result = model.transform(df_assembled_demo)

# check prediction results
df_result.select("features", "prediction").show()

# check mean values
df_result.groupBy("prediction").agg(
    col("prediction")
).show()


                                                                                

Cluster Centers:
[ 5.30442388e+00  5.25986151e+00  6.33216098e-01  3.90068808e+00
  1.26251960e+02  3.85885709e-01  5.81514163e-01  5.61546456e-01
  2.81342948e-01  2.00185878e-01 -1.07680766e+01  7.88508693e-02
  4.65834796e-01]
[ 1.23390427e+01  5.23408698e+00  6.55525586e-01  3.84587689e+00
  1.67466397e+02  3.35039040e-01  4.92082926e-01  6.10620140e-01
  1.94414210e-01  2.08495522e-01 -9.22966755e+00  1.16590383e-01
  4.89392808e-01]
[ 3.81960187e+01  5.27521194e+00  6.05877286e-01  3.93880933e+00
  1.14890647e+02  2.95583223e-01  6.30137124e-01  6.18648537e-01
  1.27659892e-01  1.97130981e-01 -8.14928326e+00  9.33018380e-02
  4.76946989e-01]
[  6.34248749   5.21078692   0.64991465   3.80168955  86.14469493
   0.57447802   0.4768827    0.41216808   0.28717292   0.19524888
 -13.72237139   0.09805415   0.39089009]


                                                                                

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[5.0,1.0,1.0,4.0,...|         0|
|[11.0,2.0,1.0,4.0...|         0|
|[38.0,3.0,1.0,3.0...|         2|
|[20.0,11.0,1.0,4....|         1|
|[19.0,1.0,1.0,4.0...|         3|
|[2.0,8.0,1.0,4.0,...|         3|
|[3.0,8.0,0.0,5.0,...|         1|
|[0.0,6.0,0.0,3.0,...|         3|
|[0.0,10.0,1.0,4.0...|         0|
|[25.0,6.0,0.0,4.0...|         2|
|[0.0,11.0,0.0,5.0...|         1|
|[0.0,3.0,1.0,4.0,...|         3|
|[11.0,10.0,1.0,4....|         3|
|[10.0,9.0,0.0,3.0...|         1|
|[17.0,8.0,0.0,4.0...|         3|
|[3.0,0.0,1.0,4.0,...|         1|
|[21.0,7.0,0.0,4.0...|         3|
|[2.0,2.0,1.0,1.0,...|         0|
|[9.0,1.0,1.0,4.0,...|         3|
|[13.0,4.0,1.0,5.0...|         3|
+--------------------+----------+
only showing top 20 rows





+----------+----------+
|prediction|prediction|
+----------+----------+
|         1|         1|
|         3|         3|
|         2|         2|
|         0|         0|
+----------+----------+



                                                                                

In [88]:
from pyspark.ml.feature import MinMaxScaler

# vector 컬럼이 1개 이상 존재할 수 없음
minmax_scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
minmax_model = minmax_scaler.fit(df_assembled)
minmax_scaled_df = minmax_model.transform(df_assembled) \
    # .drop('features') \
    # .withColumnRenamed('scaledFeatures', 'features')

                                                                                

In [89]:
from pyspark.sql.functions import sum

minmax_scaled_df \
    .select([sum(col(column).isNull().cast("int")).alias(column + "_null_count") for column in df.columns]) \
    .show()



+-------------+---------------------+--------------+---------------+-------------------------+----------------+-----------------------+-----------------------+-----------------+---------------------------+-------------------+-------------------+----------------------+------------------+
|id_null_count|popularity_null_count|key_null_count|mode_null_count|time_signature_null_count|tempo_null_count|acousticness_null_count|danceability_null_count|energy_null_count|instrumentalness_null_count|liveness_null_count|loudness_null_count|speechiness_null_count|valence_null_count|
+-------------+---------------------+--------------+---------------+-------------------------+----------------+-----------------------+-----------------------+-----------------+---------------------------+-------------------+-------------------+----------------------+------------------+
|            0|                    0|             0|              0|                        0|               0|                      0| 

                                                                                

In [90]:
from pyspark.sql.functions import col

minmax_scaled_train = minmax_scaled_df.filter(col("id").isin(track_list))
minmax_scaled_test = minmax_scaled_df.filter(~col("id").isin(track_list))

In [91]:
print(len(track_list))
print(minmax_scaled_train.count())
print(minmax_scaled_test.count())

121


                                                                                

121




1581448


                                                                                

In [92]:
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col

In [95]:
# create model == train dataset
kmeans = KMeans(featuresCol="scaledFeatures", k=4, seed=1)
model = kmeans.fit(minmax_scaled_train)

# check centers
centers = model.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

# test dataset
df_result = model.transform(minmax_scaled_train)
df_result.show()

AssertionError: 

In [94]:
spark.stop()