## Spark ML Music Recommendation (Implicit)

- dataset: http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-1K.html

In [1]:
from pyspark.conf import SparkConf
from pyspark import StorageLevel

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Spark ML") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

In [3]:
schema = StructType([
    StructField("userId", IntegerType()),
    StructField("artistId", IntegerType()),
    StructField("count", IntegerType())
])

In [4]:
df = spark.read.csv("../dataset/user_artist_data_small.txt", schema=schema, sep=" ").cache()
df.show(10)

+-------+--------+-----+
| userId|artistId|count|
+-------+--------+-----+
|1059637| 1000010|  238|
|1059637| 1000049|    1|
|1059637| 1000056|    1|
|1059637| 1000062|   11|
|1059637| 1000094|    1|
|1059637| 1000112|  423|
|1059637| 1000113|    5|
|1059637| 1000114|    2|
|1059637| 1000123|    2|
|1059637| 1000130|19129|
+-------+--------+-----+
only showing top 10 rows



In [5]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|            userId|          artistId|             count|
+-------+------------------+------------------+------------------+
|  count|             49481|             49481|             49481|
|   mean|1328420.1949435137|2003155.0297285826|130.57579677047755|
| stddev| 452991.3131262286|2489609.3644763026|3034.3847545693047|
|    min|           1000647|                 1|                 1|
|    max|           2288164|          10788218|            439771|
+-------+------------------+------------------+------------------+



## Alternating Least Square

In [6]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

In [7]:
(train, test) = df.randomSplit([0.8, 0.2])

In [8]:
als = ALS(userCol="userId", itemCol="artistId", ratingCol="count") \
    .setColdStartStrategy("drop") \
    .setNonnegative(True) \
    .setRank(14) \
    .setMaxIter(20) \
    .setRegParam(.17) \
    .setAlpha(3.0)

alsModel = als.fit(train)
alsModel.userFactors.show(5)

+-------+--------------------+
|     id|            features|
+-------+--------------------+
|1001440|[0.0, 0.0, 0.0, 0...|
|1017610|[0.0, 0.0, 0.0, 0...|
|1021940|[0.0, 0.0, 0.0, 0...|
|1058890|[0.0, 0.0, 0.0, 0...|
|2005710|[0.0, 0.0, 0.0, 0...|
+-------+--------------------+
only showing top 5 rows



In [9]:
predictions = alsModel.transform(test)
predictions.sort("userId", "count").show(5)

+-------+--------+-----+----------+
| userId|artistId|count|prediction|
+-------+--------+-----+----------+
|1000647| 1035554|    1|       0.0|
|1000647|    1598|    1|       0.0|
|1000647| 1004088|    1|       0.0|
|1000647|    4037|    2|       0.0|
|1000647| 1002220|    5|       0.0|
+-------+--------+-----+----------+
only showing top 5 rows



In [10]:
spark.stop()