# Будем использовать Spark

https://spark.apache.org/docs/latest/rdd-programming-guide.html

https://spark.apache.org/docs/latest/sql-getting-started.html

In [None]:
import pyspark
from pyspark.sql import SparkSession, Row
# Reuse the active SparkSession when the kernel already provides one (for
# example a PySpark kernel), otherwise create it. Without this line `spark`
# is undefined on a plain Python kernel and the notebook does not survive
# "Restart Kernel -> Run All".
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
se = spark  # short alias used for se.read / se.sql in the cells below

# Датасет Яндекс.Музыка

In [None]:
# Show the first five lines of the raw artist file.
! head -n 5 artists.jsonl

{"artistId":0,"artistName":"Mack Gordon"}
{"artistId":1,"artistName":"Kenny Dorham"}
{"artistId":2,"artistName":"Max Roach"}
{"artistId":3,"artistName":"Francis Rossi"}
{"artistId":4,"artistName":"Status Quo"}


In [None]:
# Show the header and the first rows of the raw events file.
! head -n 5 events.csv

userId,artistId,plays,skips
0,335,1,0
0,708,1,0
0,710,2,1
0,815,1,1


# Загружаем данные

In [None]:
# Read the artist file into a DataFrame.
artists = se.read.json("artists.jsonl")
# Expose the DataFrame to Spark SQL under the name "artists".
# createOrReplaceTempView is the non-deprecated replacement for
# registerTempTable.
artists.createOrReplaceTempView("artists")
artists.limit(5).toPandas()

Unnamed: 0,artistId,artistName
0,0,Mack Gordon
1,1,Kenny Dorham
2,2,Max Roach
3,3,Francis Rossi
4,4,Status Quo


In [None]:
# Read the events CSV with an explicit schema so the columns are typed as
# integers rather than strings.
events = se.read.csv("events.csv", header=True,
                     schema='userId bigint, artistId bigint, plays INT, skips INT')
# Expose the DataFrame to Spark SQL under the name "events".
# createOrReplaceTempView is the non-deprecated replacement for
# registerTempTable.
events.createOrReplaceTempView("events")
events.limit(5).toPandas()

Unnamed: 0,userId,artistId,plays,skips
0,0,335,1,0
1,0,708,1,0
2,0,710,2,1
3,0,815,1,1
4,0,880,1,1


In [None]:
%%time
# статистики
se.sql("""
select
    count(distinct userId) as users,
    count(distinct artistId) as artists,
    count(*) as interactions,
    count(*) / (count(distinct userId) * count(distinct artistId)) as density
from 
    events
""").toPandas()

CPU times: user 4.57 ms, sys: 7.24 ms, total: 11.8 ms
Wall time: 9.94 s


Unnamed: 0,users,artists,interactions,density
0,4999,53031,3412504,0.012872


In [None]:
%%time
# самые популярные исполнители
se.sql("""
select
    artists.artistName,
    sum(plays) as popularity
from 
    events join artists on events.artistId = artists.artistId
group by artistName
order by popularity desc
limit 30
""").toPandas()

CPU times: user 11 ms, sys: 0 ns, total: 11 ms
Wall time: 4.91 s


Unnamed: 0,artistName,popularity
0,Imagine Dragons,43447
1,Би-2,29415
2,Баста,27264
3,Ленинград,26311
4,Сплин,25062
5,Queen,24905
6,Sia,22803
7,LOBODA,21923
8,Noize MC,21774
9,Linkin Park,21584


# Обучаем iALS

Будем считать, что рейтинг – это log2(plays + 1)

In [None]:
%%time
train, test = events.rdd.randomSplit([0.95, 0.05], seed=0)

# кэшируем для скорости, будем обращаться несколько раз
train.cache()
test.cache()

train.count()
test.count()

CPU times: user 18.4 ms, sys: 7.98 ms, total: 26.3 ms
Wall time: 23.7 s


170048

In [None]:
# Peek at the first five rows of the training split.
train.take(5)

[Row(userId=0, artistId=335, plays=1, skips=0),
 Row(userId=0, artistId=708, plays=1, skips=0),
 Row(userId=0, artistId=710, plays=2, skips=1),
 Row(userId=0, artistId=815, plays=1, skips=1),
 Row(userId=0, artistId=880, plays=1, skips=1)]

In [None]:
%%time
from pyspark.mllib.recommendation import ALS
import numpy as np
model = ALS().trainImplicit(
    train.map(lambda x: (x.userId, x.artistId, np.log2(x.plays + 1))),
    rank=32, iterations=10, lambda_=0.01, alpha=10.0, seed=0
)

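В implicit-режиме ALS значение `log2(plays + 1)` используется так:

- уверенность: `conf = 1 + alpha * log2(plays + 1)`
- предпочтение: `pref = 1`, если `log2(plays + 1) > 0`, иначе `pref = 0`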

CPU times: user 36.4 ms, sys: 6.48 ms, total: 42.9 ms
Wall time: 43.2 s


In [None]:
# Pull every artist's latent profile out of the trained model onto the driver.
import numpy as np

# artistId -> artistName lookup built from the artists DataFrame.
artist_to_name = {row.artistId: row.artistName for row in artists.collect()}

# (artistId, factor vector) pairs for every artist the model has factors for.
product_features = model.productFeatures().collect()

# Three parallel arrays that share the same row order.
artist_ids = np.array([artist_id for artist_id, _ in product_features])
artist_names = np.array([artist_to_name[artist_id] for artist_id, _ in product_features])
artist_profiles = np.vstack([factors for _, factors in product_features])
print(artist_profiles.shape)

(52657, 32)


# Похожести исполнителей

In [None]:
# Row index in the profile arrays -> name, for the artists we want neighbours of.
wanted_names = ("Led Zeppelin", "50 Cent", "AC/DC")
target_artists = {
    position: artist_name
    for position, artist_name in enumerate(artist_names)
    if artist_name in wanted_names
}
target_artists

{19576: 'AC/DC', 39708: '50 Cent', 46770: 'Led Zeppelin'}

In [None]:
import scipy
import scipy.spatial

# For each target artist, print the ten profiles with the highest cosine
# similarity. The target itself is included in the candidates.
for index, name in target_artists.items():
    print("#############", name, "#############")

    query_profile = artist_profiles[index][np.newaxis, :]
    distances = scipy.spatial.distance.cdist(query_profile, artist_profiles, metric='cosine')
    # cosine similarity = 1 - cosine distance
    cosines = 1 - distances[0]
    # NaN similarities are pushed to the very bottom of the ranking.
    cosines = np.where(np.isnan(cosines), -1e20, cosines)

    top_matches = np.argsort(cosines)[::-1][:10]
    for idx in top_matches:
        print(artist_names[idx], "\t", cosines[idx])

############# AC/DC #############
AC/DC 	 1.0
The Offspring 	 0.8920983660176897
Nirvana 	 0.869637282794584
Red Hot Chili Peppers 	 0.8636267038822294
Metallica 	 0.8588127269321177
System of A Down 	 0.8548621643954725
Bon Jovi 	 0.8430942520155917
Limp Bizkit 	 0.8370104483034991
Nickelback 	 0.8320685883899654
Kiss 	 0.8268261566662816
############# 50 Cent #############
50 Cent 	 1.0
Dr. Dre 	 0.8901604614284715
2Chainz 	 0.8311867507297994
Lloyd Banks 	 0.8245230568564915
Ludacris 	 0.8191309370754879
Fat Joe 	 0.809364906119338
Jay-Z 	 0.8060270943142882
Cashis 	 0.8050292055930613
Missy  Elliott 	 0.7990513407065353
Akon 	 0.7939836276765622
############# Led Zeppelin #############
Led Zeppelin 	 1.0
The Rolling Stones 	 0.9450603019905758
The Doors 	 0.9412713431930472
Guns N' Roses 	 0.9025949012228445
Bob  Dylan 	 0.8825744315861317
Pink Floyd 	 0.8665565712846943
Deep Purple 	 0.8592175257359826
Aerosmith 	 0.8544523351053174
Ozzy Osbourne 	 0.8427703981778234
David Bowie 	

# NDCG

In [None]:
def dcg(ratings):
    """Discounted cumulative gain of relevance ratings given in ranked order.

    The item at 1-based position i with rating r contributes
    (2 ** r - 1) / log2(i + 1). Ratings are converted to float32 before
    exponentiation. Returns a Python float (0.0 for an empty sequence).
    """
    gains = 2 ** np.array(ratings, np.float32) - 1
    discounts = np.log2(np.arange(1, len(ratings) + 1) + 1)
    return float(np.sum(gains / discounts))


def ndcg(ratings, at=None):
    """Normalized DCG of relevance ratings given in ranked order.

    Args:
        ratings: relevance values, ordered as the ranking under evaluation.
        at: optional cutoff k. When given, only the top-k positions of the
            ranking and of the ideal ordering are scored (NDCG@k). None
            scores the whole list.

    Returns:
        DCG of the ranking divided by the DCG of the ideal (descending)
        ordering, as a float. 0.0 when the ideal DCG is not positive.

    Raises:
        ValueError: if `at` is negative.
    """
    if at is not None and at < 0:
        raise ValueError("at must be non-negative, got {}".format(at))

    # The ideal ordering is built from the full list before truncation, so
    # relevant items ranked below the cutoff still count against the score.
    ideal = sorted(ratings, reverse=True)
    if at is not None:
        ratings = list(ratings)[:at]
        ideal = ideal[:at]

    idcg = dcg(ideal)
    return dcg(ratings) / idcg if idcg > 0 else 0.0


def ndcg_score(y_true, y_pred):
    """NDCG of the ranking obtained by sorting items by descending y_pred.

    y_true holds the actual relevance of each item and y_pred the predicted
    score. Both must be arrays of the same shape.
    """
    assert y_true.shape == y_pred.shape
    descending = np.argsort(y_pred)[::-1]
    ranked_relevance = y_true[descending]
    return ndcg(ranked_relevance)


# tests
def test1():
    """Two relevant items: rating 2 ranked first, rating 1 ranked third."""
    relevance = np.array([  0,   0,   2,   1,   0])
    scores = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    # Natural logs are fine here: the log base cancels in the DCG / IDCG ratio.
    achieved = 3 / np.log(1 + 1) + 1 / np.log(3 + 1)
    ideal = 3 / np.log(1 + 1) + 1 / np.log(2 + 1)
    assert np.allclose(ndcg_score(relevance, scores), achieved / ideal)

    
def test2():
    """No relevant items at all: the score must be 0."""
    relevance = np.array([0, 0, 0, 0, 0])
    scores = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    result = ndcg_score(relevance, scores)
    assert np.allclose(result, 0.0)

    
def test3():
    """A single relevant item that the predicted ranking puts in fourth place."""
    relevance = np.array([  1,   0,   0,   0,   0])
    scores = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    # Natural logs are fine here: the log base cancels in the DCG / IDCG ratio.
    achieved = 1 / np.log(4 + 1)
    ideal = 1 / np.log(1 + 1)
    assert np.allclose(ndcg_score(relevance, scores), achieved / ideal)


# Run the NDCG sanity checks; any failure raises AssertionError.
for check in (test1, test2, test3):
    check()

In [None]:
# DCG of the same five ratings in three different orders.
for example_ratings in ([5, 4, 3, 2, 1], [3, 4, 5, 2, 1], [5, 4, 1, 2, 3]):
    print(dcg(example_ratings))

45.64282878502658
33.64282878502658
44.963945628433834


# Считаем NDCG для базового решения

Всегда ранжируем исполнителей по популярности

In [None]:
# artistId -> total number of plays in the training split, collected to the
# driver as a plain dict.
artist_to_popularity = (
    train
    .map(lambda event: (event.artistId, event.plays))
    .reduceByKey(lambda left, right: left + right)
    .collectAsMap()
)

In [None]:
# For every held-out interaction emit (popularity score, actual rating) and
# gather the pairs per user. Artists absent from the popularity table score 0.
predictions_and_ratings_per_user = (
    test
    .map(lambda event: (
        event.userId,
        (artist_to_popularity.get(event.artistId, 0), np.log2(event.plays + 1)),
    ))
    .groupByKey()
    .mapValues(list)
)

In [None]:
# Inspect one user's list of (popularity score, log2(plays + 1)) pairs.
predictions_and_ratings_per_user.take(1)

[(0,
  [(2393, 1.0),
   (21848, 3.321928094887362),
   (624, 1.0),
   (7273, 1.0),
   (900, 1.584962500721156),
   (494, 1.584962500721156),
   (4011, 1.0),
   (2271, 1.584962500721156),
   (788, 1.0),
   (1024, 1.584962500721156),
   (4428, 3.584962500721156),
   (230, 1.0),
   (1515, 1.0),
   (2313, 1.584962500721156),
   (1243, 2.321928094887362),
   (5501, 2.321928094887362),
   (7768, 5.977279923499917),
   (783, 1.0),
   (4757, 1.584962500721156),
   (1228, 1.0),
   (47, 0.0),
   (4281, 0.0),
   (3577, 0.0),
   (1263, 0.0),
   (2080, 0.0),
   (181, 0.0),
   (1763, 0.0),
   (975, 0.0),
   (6877, 0.0),
   (9, 0.0),
   (276, 0.0),
   (3294, 0.0),
   (5314, 0.0),
   (444, 0.0),
   (54, 0.0),
   (751, 0.0)])]

In [None]:
def ndcg_for_user(x):
    """NDCG for one user, given that user's (predicted score, actual rating) pairs."""
    predicted = np.array([pair[0] for pair in x])
    actual = np.array([pair[1] for pair in x])
    return ndcg_score(actual, predicted)
    
# Mean per-user NDCG of the popularity baseline.
predictions_and_ratings_per_user.values().map(ndcg_for_user).mean()

0.6641733135543155

# NDCG для iALS

In [None]:
# Score the held-out (user, artist) pairs with the trained model and key each
# prediction by its pair.
test_pairs = test.map(lambda event: (event.userId, event.artistId))
predictions = model.predictAll(test_pairs).map(
    lambda scored: ((scored[0], scored[1]), scored[2])
)

In [None]:
# Peek at five ((userId, artistId), predicted score) records.
predictions.take(5)

[((2760, 57436), 0.3116623521863906),
 ((3013, 57436), 0.38307138485970177),
 ((4698, 57436), 0.6615872307063082),
 ((679, 57436), 0.2201103537036474),
 ((4031, 57436), 0.2704980830675262)]

In [None]:
# Actual relevance of every held-out interaction, keyed by (userId, artistId).
test_ratings = test.map(lambda x: ((x.userId, x.artistId), np.log2(x.plays + 1)))

# predictAll returns no row for a pair whose user or artist was missing from
# the training data, so an inner join would drop those interactions and score
# iALS on an easier subset than the popularity baseline (which scores unseen
# artists as 0). The left outer join keeps every test interaction.
# Unscored pairs get 0.0 -- NOTE(review): chosen as a neutral "no evidence"
# score; confirm this is the desired fallback.
predictions_and_ratings_per_user = (
    test_ratings
    .leftOuterJoin(predictions)
    .map(lambda x: (x[0][0], (x[1][1] if x[1][1] is not None else 0.0, x[1][0])))
    .groupByKey()
    .map(lambda x: (x[0], list(x[1])))
)

In [None]:
# Inspect one user's list of (predicted score, log2(plays + 1)) pairs.
predictions_and_ratings_per_user.take(1)

[(424,
  [(0.4947066591796494, 0.0),
   (0.07256945746376073, 1.0),
   (1.0418824718617612, 1.0),
   (0.8732245315003361, 3.0),
   (0.7800387958389986, 0.0),
   (0.5205558335897855, 1.0),
   (0.8574898932174937, 1.0),
   (0.6705422577860198, 1.0),
   (0.9413614476191352, 1.0),
   (0.5795242043086068, 1.0),
   (0.8416199638867807, 1.0),
   (0.854267842790669, 0.0),
   (0.9193556147106452, 1.0),
   (0.707306054103123, 1.584962500721156),
   (0.5243437466645112, 0.0),
   (0.22297562673322074, 0.0),
   (0.7891040260573013, 1.0),
   (0.9171743495871868, 2.0),
   (0.6758397982855767, 1.0),
   (0.3647776046700404, 1.0),
   (0.8036689210819883, 1.0),
   (0.7416945180026882, 1.584962500721156),
   (0.8661558105316505, 4.700439718141092),
   (-0.012526388109644643, 0.0),
   (0.9405575171945975, 3.584962500721156),
   (0.3767758169618822, 1.0),
   (0.05987667474975478, 2.807354922057604),
   (0.8369160338209602, 1.0),
   (0.20230316995550413, 1.0),
   (0.6800615343067239, 0.0),
   (0.610816912638

In [None]:
def ndcg_for_user(x):
    """NDCG for one user, given that user's (predicted score, actual rating) pairs."""
    predicted = np.array([pair[0] for pair in x])
    actual = np.array([pair[1] for pair in x])
    return ndcg_score(actual, predicted)
    
# Mean per-user NDCG of the iALS model.
predictions_and_ratings_per_user.values().map(ndcg_for_user).mean()

0.719044590203335

In [None]:
# Mean NDCG values copied from the displayed outputs of the popularity-baseline
# and iALS evaluation cells. The previous literals (0.661 and 0.716) did not
# match those outputs. Re-copy both numbers if the evaluation cells are re-run.
BASELINE_NDCG = 0.6641733135543155
IALS_NDCG = 0.719044590203335

# Relative improvement of iALS over the baseline, in percent.
improvement = 100 * (IALS_NDCG / BASELINE_NDCG - 1)
print("Улучшение на {:0.3} процентов!".format(improvement))

Улучшение на 8.32 процентов!
