In [3]:
# DEFINE FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

def get_access_token(client_id:str, client_sc:str):
    import requests
    
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    data = f'grant_type=client_credentials&client_id={client_id}&client_secret={client_sc}'.encode()
    response = requests.post('https://accounts.spotify.com/api/token', headers=headers, data=data).json()
    access_token = response['access_token']

    return access_token

def get_response(access_token:str, endpoint:str, params:dict=None):
    import requests, json

    url = f"https://api.spotify.com/v1/{endpoint}"
    headers = {
        'Authorization': f'Bearer {access_token}',
    }

    if params != None:
        response = requests.get(url=url, params=params, headers=headers)
    else:
        response = requests.get(url=url, headers=headers)
    print(response)
    
    if response.status_code == 200:
        try:
            data = response.json()
            return data
        except json.decoder.JSONDecodeError:
            raise ValueError(f"API Server Error - {endpoint} - Invalid JSON content in response: {response.text}")
    else:
        raise ValueError(f"API Server Error - {endpoint} - Non-200 status code received: {response.status_code}")
    

def post_response(access_token:str, endpoint:str, data:dict=None):
    import requests

    url = f"https://api.spotify.com/v1/{endpoint}"
    headers = {
        'Authorization': f'Bearer {access_token}',
    }

    response = requests.post(url=url, headers=headers)
    print(response)
    
    if response.status_code == 200:
        pass
    else:
        raise ValueError(f"API Server Error - {endpoint} - Non-200 status code received: {response.status_code}")

In [4]:
# INFOS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from configparser import ConfigParser

config = ConfigParser()
config.read("/home/hooniegit/git/Spotify-DemoProject/recommendation/demo/config.ini")

client_id = config.get("spotify", "client_id")
client_sc = config.get("spotify", "client_sc")
user_id = config.get("spotify", "user_id")

In [2]:
# START CODE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from math import ceil
import json

In [5]:
### Build Session
spark = SparkSession.builder \
    .master(config.get("spark", "master")) \
    .appName("pipeline_demo") \
    .getOrCreate()

### Create Access Token
access_token = get_access_token(client_id=client_id, client_sc=client_sc)

### Create Playlist Lists
endpoint = f"users/{user_id}/playlists"
params = {
    "limit": 50,
    "offset": 0
}

playlists = get_response(access_token=access_token, endpoint=endpoint, params=params)
json_string  = json.dumps(playlists)
json_rdd = spark.sparkContext.parallelize([json_string])
df_plinfo = spark.read.json(json_rdd, multiLine=True)

items = df_plinfo \
    .withColumn("items", explode("items")) \
    .select("items.id") \
    .rdd.flatMap(lambda x: x).collect()

### Create Playlist Item Lists
track_list = [] # <---------- "Need To Use"
for id in items:
    endpoint = f"playlists/{id}/tracks"
    playlist_spec = get_response(access_token=access_token, endpoint=endpoint)
    
    json_string  = json.dumps(playlist_spec)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_playlist_spec = spark.read.json(json_rdd, multiLine=True)
    
    ids = df_playlist_spec \
    .withColumn("items", explode("items")) \
    .select("items.track.id") \
    .rdd.flatMap(lambda x: x).collect()
    
    track_list += ids
    
    total = df_playlist_spec.select("total").first()[0]
    print(total)
    left = int(total)-100
    cnt = ceil(left/100)
    
    for i in range(cnt):
        offset = 100 + 100 * i
        params = {"offset":offset}
        
        playlist_spec = get_response(access_token=access_token, endpoint=endpoint, params=params)
        
        json_string  = json.dumps(playlist_spec)
        json_rdd = spark.sparkContext.parallelize([json_string])
        df_playlist_spec = spark.read.json(json_rdd, multiLine=True)
        
        ids = df_playlist_spec \
        .withColumn("items", explode("items")) \
        .select("items.track.id") \
        .rdd.flatMap(lambda x: x).collect()
        
        track_list += ids      

cnt = ceil(len(track_list)/50)

big_list = []
for j in range(cnt):
    big_list.append(track_list[j*50:(j+1)*50])


24/01/07 17:55:12 WARN Utils: Your hostname, workspace resolves to a loopback address: 127.0.0.1; using 220.118.158.128 instead (on interface eno1)
24/01/07 17:55:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/07 17:55:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<Response [200]>


                                                                                

<Response [200]>


                                                                                

121
<Response [200]>


                                                                                

In [6]:
# Create Dataframe : main_df
main_df = None
cnt = 0
for small_list in big_list:
    
    print(cnt)
    
    tracks = ""
    for id in small_list:
        tracks += f",{id}"
    tracks = tracks[1:]
    
    endpoint = "tracks"
    params = {"ids":tracks}
    track = get_response(access_token=access_token, endpoint=endpoint, params=params)
    
    json_string  = json.dumps(track)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_tracks = spark.read.json(json_rdd, multiLine=True)
    
    df_tracks = spark.read.json(json_rdd, multiLine=True) \
        .withColumn("tracks", explode("tracks")) \
        .selectExpr("tracks.id",
                    "tracks.popularity")
    
    endpoint = "audio-features"
    params = {"ids":tracks}
    audio_features = get_response(access_token=access_token, endpoint=endpoint, params=params)
    
    json_string  = json.dumps(audio_features)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_audio_features = spark.read.json(json_rdd, multiLine=True) \
        .withColumn("audio_features", explode("audio_features")) \
        .selectExpr("audio_features.id",
                    "audio_features.key",
                    "audio_features.mode",
                    "audio_features.time_signature",
                    "audio_features.tempo",
                    "audio_features.acousticness",
                    "audio_features.danceability",
                    "audio_features.energy",
                    "audio_features.instrumentalness",
                    "audio_features.liveness",
                    "audio_features.loudness",
                    "audio_features.speechiness",
                    "audio_features.valence")
    
    result_track_df = df_tracks.join(df_audio_features, "id", "left")
    if cnt == 0:
        main_df = result_track_df
    else:
        main_df = main_df.union(result_track_df)
    cnt += 1

0
<Response [200]>


                                                                                

<Response [200]>


                                                                                

1
<Response [200]>


                                                                                

<Response [200]>


                                                                                

2
<Response [200]>


                                                                                

<Response [200]>


In [7]:
### Load Dataframe : df_dw
dw_tracks = spark.read.parquet("file:///home/hooniegit/git/Spotify-DemoProject/spark/data/parquet/tracks/main/*")
dw_audioFeatures = spark.read.parquet("file:///home/hooniegit/git/Spotify-DemoProject/spark/data/parquet/tracks/audio_features/*")
df_dw = dw_tracks.join(dw_audioFeatures, "id", "inner")

df_dw.show() # << TEST

[Stage 25:>                                                         (0 + 1) / 1]

+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+
|                  id|popularity|key|mode|time_signature|  tempo|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|valence|
+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+
|000O8nXpAK5QAppKv...|         5|  1|   1|             4|128.018|     5.12E-4|       0.504|  0.93|          0.0107|   0.281|   -2.89|     0.0539|  0.355|
|000kSCs9tKtH1VXI3...|        11|  2|   1|             4| 129.85|       0.863|       0.425| 0.266|             0.0|  0.0989|  -6.791|     0.0337|  0.253|
|0010mZpCCwlPwoBiB...|        38|  3|   1|             3|124.993|       0.108|       0.527| 0.793|         3.28E-6|   0.144|  -4.823|     0.0352|  0.597|
|0026hQeV7FZ0PaZpW...|        20| 11|   1|             4|169.358|        0.3

                                                                                

In [28]:
### TEST : EACH

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans

# Define Assembler
selected_features = ["popularity", "key", "mode", "time_signature", "tempo", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "valence"]
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")

# Assemble Features
df_assembled_main = assembler.transform(main_df)
df_assembled_s3 = assembler.transform(df_dw)

# Define Scaler
standard_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Scale Assembled Parquet DF - Playlist
standard_model_main = standard_scaler.fit(df_assembled_main)
scaled_df_assembled_main = standard_model_main.transform(df_assembled_main)

# Scale Assembled Parquet DF - DW
standard_model_s3 = standard_scaler.fit(df_assembled_s3)
scaled_df_assembled_s3 = standard_model_s3.transform(df_assembled_s3)

# Create Model == Train Dataset(Playlist)
kmeans = KMeans(featuresCol="scaledFeatures", k=4, seed=1)
model = kmeans.fit(scaled_df_assembled_main)

# Check Centers
centers = model.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

# Test Dataset(DW)
df_result = model.transform(scaled_df_assembled_s3)
df_result.show()


"""
    parquet에서 불러온 데이터프레임의 경우
        - Vector Assembling 각기 진행
        - Standard Scaling 진행
        - KMeans 군집화 진행
        => 정상 동작
"""

                                                                                

Cluster Centers:
[ 2.09515646e+00  1.50236128e+00  1.65469882e+00  3.07697694e+01
  5.96699065e+00  2.16875808e+00  4.39646952e+00  3.00768884e+00
  1.56655839e-03  1.11411292e+00 -3.11720687e+00  7.99477989e-01
  1.45901860e+00]
[ 2.64677188  1.62550565  0.         31.24315051  5.25612042  0.56045294
  6.42199462  4.90811984 10.59962709  0.6982741  -1.16375907  1.01272458
  3.6305724 ]
[ 2.63658022  1.33925308  1.14476019 31.24315051  5.87248036  0.43306078
  5.64519108  5.0335193   0.08347468  1.51129427 -1.67051884  1.72694773
  3.09453942]
[ 1.53944895e+00  1.80877344e+00  8.32756920e-01  3.12431505e+01
  5.48980704e+00  7.70542800e-01  4.91380253e+00  4.58578034e+00
  2.59466620e-03  1.85139590e+00 -2.11223799e+00  1.15618116e+00
  1.82347482e+00]




+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+--------------------+----------+
|                  id|popularity|key|mode|time_signature|  tempo|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|valence|            features|      scaledFeatures|prediction|
+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+--------------------+----------+
|000O8nXpAK5QAppKv...|         5|  1|   1|             4|128.018|     5.12E-4|       0.504|  0.93|          0.0107|   0.281|   -2.89|     0.0539|  0.355|[5.0,1.0,1.0,4.0,...|[0.34752277128777...|         0|
|000kSCs9tKtH1VXI3...|        11|  2|   1|             4| 129.85|       0.863|       0.425| 0.266|             0.0|  0.0989|  -6.791|     0.0337|  0.253|[11.0,2.0,1.0,4.0..

                                                                                

'\n    parquet에서 불러온 데이터프레임의 경우\n        - Vector Assembling 진행\n        - Standard Scaling 진행\n        - KMeans 군집화 진행\n        => 정상 동작\n'

In [8]:
### Union Dataframe : df
from pyspark.sql.functions import col

df = df_dw \
    .filter(~col("id").isin(track_list)) \
    .union(main_df)

In [None]:
# 고려해야 할 사항들
"""
    0. 스케일링 가능 항목 : acousticness, danceability, energy, instrumentalness, liveness, speechiness, valence
    1. loudness 항목 : 절댓값 반환 필요 / 계산식 확인 필요
            from pyspark.sql.functions import abs
            df = df \
                .withColumn("loudness", abs("loudness"))
    2. mode 항목 : 0 또는 1만 가짐
    3. key, tempo 항목 : 이상치가 큰 영향을 끼칠 수 있음
    4. Null 값 확인
            #1번 - scaled_df.filter(col("features").isNull()).count()
            #2번 - from pyspark.sql.functions import sum
                   df \
                       .select([sum(col(column).isNull().cast("int")).alias(column + "_null_count") for column in df.columns]) \
                       .show()
"""

In [29]:
### TEST : UNION
# Define Assembler
selected_features = ["popularity", "key", "mode", "time_signature", "tempo", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "valence"]

# Assemble Features
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")
df_assembled = assembler.transform(df)

# Define Scaler
standard_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Scale Dataframe - Union
standard_model = standard_scaler.fit(df_assembled)
scaled_df = standard_model.transform(df_assembled)

# Create Model == Train Dataset
kmeans = KMeans(featuresCol="scaledFeatures", k=4, seed=1)
model = kmeans.fit(scaled_df)

# Check Centers
centers = model.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

# Test Dataset
df_result = model.transform(scaled_df)
df_result.show()

"""
    StandardScaler 모델의 부분적 한계
        : 현재 시나리오에서 적용 불가
        (전체 데이터셋에 대해서 스케일링 -> 데이터 분할(이 부분이 불가) -> Playlist 데이터셋으로 학습 -> S3 데이터셋으로 추출)
    해결 방법
        : 원본 데이터의 각 컬럼 스케일링
"""

                                                                                

Cluster Centers:
[ 0.3538954   1.42194005  1.4411359   6.95556903  3.42287009  2.31031371
  1.72909809  0.61484765  1.37453573  0.91152591 -2.82729011  0.43023123
  0.84939467]
[ 0.89789533  1.34585524  2.08204658  7.52045263  4.08341734  0.69845268
  2.96760305  2.29603622  0.40922837  1.19286778 -1.1168708   0.58428837
  1.94149372]
[ 0.93704904  1.728517    0.          7.54733868  4.04320023  0.59334652
  3.10980173  2.36243712  0.51894269  1.14234023 -1.11574621  0.69537472
  1.8821592 ]
[ 0.9043051   1.50472815  1.26642417  7.50527711  3.83416934  0.80946278
  3.33430537  2.15850055  0.10341378  1.45230817 -1.30025187  3.4962747
  2.04674485]




+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+--------------------+----------+
|                  id|popularity|key|mode|time_signature|  tempo|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|valence|            features|      scaledFeatures|prediction|
+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+--------------------+----------+
|000O8nXpAK5QAppKv...|         5|  1|   1|             4|128.018|     5.12E-4|       0.504|  0.93|          0.0107|   0.281|   -2.89|     0.0539|  0.355|[5.0,1.0,1.0,4.0,...|[0.34749044357066...|         1|
|000kSCs9tKtH1VXI3...|        11|  2|   1|             4| 129.85|       0.863|       0.425| 0.266|             0.0|  0.0989|  -6.791|     0.0337|  0.253|[11.0,2.0,1.0,4.0..

                                                                                

In [9]:
from pyspark.sql.functions import mean, stddev

raw_scaled_df = df

# Standard Scaling
for col_name in df.columns[1:]:
    mean_val = df.select(mean(col_name)).collect()[0][0]
    stds_val = df.select(stddev(col_name)).collect()[0][0]
    raw_scaled_df = raw_scaled_df.withColumn(col_name, (col(col_name) - mean_val) / stds_val)

raw_scaled_df.show()



+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+
|                  id|          popularity|                key|               mode|     time_signature|              tempo|        acousticness|        danceability|              energy|   instrumentalness|            liveness|           loudness|         speechiness|             valence|
+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+
|000O8nXpAK5QAppKv...| -0.4076598013353838|-1.1919803361617896| 0.7516617180263379|0.26169225141541075| 0.3565664069530006|  -1.13

                                                                                

In [10]:
from pyspark.sql.functions import col

scaled_train = raw_scaled_df.filter(col("id").isin(track_list))
scaled_test = raw_scaled_df.filter(~col("id").isin(track_list))

In [12]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

selected_features = ["popularity", "key", "mode", "time_signature", "tempo", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "valence"]

# Define Assembler
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")

# Assemble Features
df_assembled_train = assembler.transform(scaled_train)
df_assembled_test = assembler.transform(scaled_test)

# Create Model == Train Dataset
kmeans = KMeans(featuresCol="features", k=4, seed=1)
model = kmeans.fit(df_assembled_train)

# Check Centers
centers = model.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

# Test Dataset
df_result = model.transform(df_assembled_test)
df_result.show()

24/01/07 17:59:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/01/07 17:59:50 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

Cluster Centers:
[ 1.94263108 -0.05917615  0.43620012  0.14610517  0.30600786  0.131075
 -0.0107263   0.16158431 -0.68087618 -0.29575747  0.83650879 -0.45908257
 -0.49890063]
[ 4.18651376 -0.18603946 -0.01540807  0.26169225  0.3869236  -0.79874266
  1.01583229  0.94800697 -0.67980518 -0.28380646  0.99853156 -0.15071017
  0.89261601]
[ 2.15449421 -0.04895536 -0.45129853  0.26169225  0.20362544 -0.79026544
  0.70446     1.13851189 -0.67644901 -0.09185571  1.0886037  -0.18437219
  0.95686589]
[ 0.42631438  0.83407828 -0.28936157  0.26169225  0.44684811 -0.56809359
  0.18485176  0.92947314 -0.68080928 -0.07755237  1.01142602 -0.34365202
  0.18815987]




+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+----------+
|                  id|          popularity|                key|               mode|     time_signature|              tempo|        acousticness|        danceability|              energy|   instrumentalness|            liveness|           loudness|         speechiness|             valence|            features|prediction|
+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+----------+
|000O8nXpAK5QAppKv...| -0.40765980

                                                                                

In [13]:
model.getDistanceMeasure()

'euclidean'

In [17]:
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import numpy as np

numpy_coordinates = centers[0]

def calculate_distance(vector):
    return float(np.linalg.norm(vector.toArray() - numpy_coordinates))

calculate_distance_udf = udf(calculate_distance, DoubleType())

df_with_distance = df_result \
    .filter("prediction=0") \
    .select("id", "features") \
    .withColumn('distance', calculate_distance_udf(col('features')))

# NumPy 배열을 Broadcast 변수로 변환하여 전체 클러스터에서 공유
# broadcasted_coordinates = spark.sparkContext.broadcast(numpy_coordinates)
# df_with_distance = df.withColumn('distance', calculate_distance_udf(col('features'), broadcasted_coordinates.value))

[Stage 887:>                                                        (0 + 1) / 1]

+--------------------+--------------------+------------------+
|                  id|            features|          distance|
+--------------------+--------------------+------------------+
|000kSCs9tKtH1VXI3...|[0.00932728974916...| 2.679730034605562|
|0010mZpCCwlPwoBiB...|[1.88576919962963...| 2.516595387109374|
|003Zj5utQfbYDofOq...|[0.56531007786189...| 2.813559663794831|
|0099y8U9Px3fIJX5L...|[-0.7551490439058...|3.2147991916433445|
|009BdcwF2lB7RZiTS...|[0.00932728974916...| 4.012952033482548|
|00B1efLzCB1lUNPT8...|[-0.5466554983635...|3.5230749456986974|
|00CYT9MwnEJva7cCy...|[-0.1296684072790...|3.3332530097812096|
|00CZIZmqtInPEWdxG...|[0.14832298677734...|3.6987545826728816|
|00DxOoAYGMwVBTdhu...|[2.58074768477055...| 2.794095131817052|
|00FjRu5wKkgDfiHog...|[0.77380362340417...| 4.924506791931357|
|00JK3hvSqMRgTs0BO...|[-0.7551490439058...| 5.681175045657208|
|00JOPzHXgTnWpxYzO...|[-0.6161533468776...|  4.59888989364851|
|00L5ETjPbcP9TKG0s...|[0.28731868380553...|3.2198471883

                                                                                

In [20]:
df_with_distance\
    .orderBy("distance", ascending=True)\
    .show()



+--------------------+--------------------+------------------+
|                  id|            features|          distance|
+--------------------+--------------------+------------------+
|2kLaIBv7OwCNbrDlf...|[2.02476489665782...|0.6357580389694055|
|1NghJBnRfOjjH7ffK...|[1.95526704814372...|0.6773515540748449|
|1725ZPzVwEYaFTztQ...|[2.09426274517191...|0.6849070037733087|
|6XLdLztXhlGZsdp7s...|[1.67727565408736...|  0.71556916592349|
|0Cwgls4XXwoIhR0Kk...|[2.02476489665782...|0.7228839190108275|
|6cfGK1pbi8f7ODCeK...|[1.95526704814372...| 0.731438903134188|
|6ZyAlU7tYUGrto7hc...|[2.09426274517191...|0.7427190547223598|
|0iq3HEoB7LZIF4kwY...|[1.81627135111554...|0.7518801998154157|
|1JjK0xXcFxQqF6LRB...|[1.53827995705918...|0.7670359113729857|
|1ntlpw8zsYg1O0nJU...|[1.67727565408736...|0.7871181984300836|
|7lHjxYXlvS3GOPy1Z...|[1.74677350260145...|0.7938864450373423|
|5rUDkGoOp21WT7y8C...|[1.81627135111554...|0.7945844290277523|
|4gQLLI4oii6Dt4IpB...|[2.09426274517191...|0.7953099037

                                                                                

In [94]:
spark.stop()