Build dataset


In [29]:
from typing import TypedDict, Type, Any, Callable

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from spark.config import views
from spark.create_session import create_session

from IPython.display import display

from fitter import Fitter, get_common_distributions

In [30]:
VIEWS = views("v3")
spark = create_session()

for view, file in VIEWS.items():
    df = spark.read.json(file)
    df.createOrReplaceTempView(view)

In [31]:
sessions_full = spark.sql(f"SELECT * FROM sessions").toPandas()
sessions = spark.sql(f"SELECT DISTINCT user_id, track_id FROM sessions WHERE event_type like 'like' order by user_id, track_id").toPandas()
tracks = spark.sql(
    f"SELECT DISTINCT id, acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness, loudness, popularity, EXTRACT(year from `release_date`) as release_year, speechiness, tempo, valence FROM tracks ").toPandas()

In [32]:
display(sessions_full)
display(sessions)
display(tracks)

Unnamed: 0,event_type,session_id,timestamp,track_id,user_id
0,play,124,2023-09-20T07:18:17,4flBNxpPLGVAdufOzRBIen,101
1,like,124,2023-09-20T07:18:30.069000,4flBNxpPLGVAdufOzRBIen,101
2,play,125,2023-04-27T00:20:57.181000,3uMYq07Kj5m564OQwdSCrD,101
3,play,126,2023-02-11T05:34:54.160000,2RChe0r2cMoyOvuKobZy44,101
4,advertisment,126,2023-02-11T05:34:56.685000,,101
...,...,...,...,...,...
580241,like,102015,2023-11-12T09:06:01.022000,4cLdpErILMO8Db8pQVAVcZ,1100
580242,play,102015,2023-11-12T09:09:17.564000,3j8ja2Hq824OaRqIENJPTH,1100
580243,like,102015,2023-11-12T09:12:19.739000,3j8ja2Hq824OaRqIENJPTH,1100
580244,skip,102015,2023-11-12T09:12:58.093000,3j8ja2Hq824OaRqIENJPTH,1100


Unnamed: 0,user_id,track_id
0,101,03LNdMgu3l3Ldc3QMl1bvZ
1,101,0BVCEJJFVsb8nrQGI11Dj2
2,101,0PCpQRd0hMAWjBLOmJdR7X
3,101,0bLOiofyBB62YU2cNnONJG
4,101,0iJfN2CqrX7O8hkzgAMMAf
...,...,...
80366,1100,7FdAQ7CXm9yS4DBtIKopLi
80367,1100,7a9aeLVkn7DIqFjbanKz0k
80368,1100,7an1exwMnfYRcdVQm0yDev
80369,1100,7cioKB5CHVzk09SOtTyn0T


Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,popularity,release_year,speechiness,tempo,valence
0,3KfbEIOC7YIv90FIfNSZpo,0.4490,0.688,146333,0.435,0.000000,9,0.1130,-11.359,74,1965,0.0323,103.239,0.435
1,1dxbAIfCASqv6jix2R1Taj,0.7840,0.698,148413,0.293,0.010600,0,0.0936,-11.361,55,1967,0.0332,117.613,0.724
2,17PXXzOygMyXXUNLngVN5u,0.8820,0.539,173733,0.114,0.009020,2,0.1060,-20.575,60,1970,0.0346,141.378,0.608
3,3Um9toULmYFGCpvaIPFw7l,0.4470,0.283,233000,0.716,0.000000,1,0.3990,-9.632,72,1971,0.0986,201.960,0.828
4,0GjEhVFGZW8afUYGChu3Rr,0.3580,0.543,230400,0.870,0.000939,9,0.7920,-6.514,81,1976,0.0428,100.804,0.754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22407,3VbsXTdxBl6IavGjFZbl3R,0.0394,0.720,225012,0.824,0.000000,10,0.0509,-5.086,53,2020,0.1280,165.998,0.333
22408,2zurBuX8TGOHkZLhye9Xme,0.0551,0.769,179080,0.922,0.000000,9,0.1940,-2.779,51,2020,0.1020,119.991,0.804
22409,3ev1ilsHdVbpXeT7FG6Kw7,0.6410,0.838,229187,0.461,0.000000,1,0.0825,-9.610,54,2017,0.2170,104.979,0.497
22410,5qTwnr6WbWeeUDS9dcl5g6,0.3370,0.444,303097,0.523,0.000000,1,0.1130,-4.522,54,2019,0.0286,185.872,0.420


In [33]:
# d = spark.sql(
#     """
#     SELECT s.user_id, s.track_id, s.weight, acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness, loudness, popularity, EXTRACT(year from `release_date`) as release_year, speechiness, tempo, valence
#     FROM (
#         select user_id, track_id, sum(event_weight) as weight
#         from (
#             SELECT user_id, track_id, CASE WHEN event_type like 'like' THEN 1 ELSE 0.02 END as event_weight
#             FROM sessions
#             WHERE event_type like 'like' or event_type like 'play'
#             ) 
#         group by user_id, track_id
#     ) s
#     inner join tracks t on s.track_id = t.id
#     order by s.user_id, t.id
#     """).toPandas()
d = spark.sql(
    """
    SELECT s.user_id, s.track_id, s.weight, acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness, loudness, popularity, EXTRACT(year from `release_date`) as release_year, speechiness, tempo, valence
    FROM (
        select user_id, track_id, sum(event_weight) as weight
        from (
            SELECT user_id, track_id, 1 as event_weight
            FROM sessions
            WHERE event_type like 'like'
            ) 
        group by user_id, track_id
    ) s
    inner join tracks t on s.track_id = t.id
    order by s.user_id, t.id
    """).toPandas()
d

Unnamed: 0,user_id,track_id,weight,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,popularity,release_year,speechiness,tempo,valence
0,101,03LNdMgu3l3Ldc3QMl1bvZ,1,0.54500,0.674,182549,0.6370,0.000002,0,0.0483,-3.400,52,2014,0.0330,139.867,0.755
1,101,0BVCEJJFVsb8nrQGI11Dj2,1,0.00522,0.656,235253,0.9230,0.000000,2,0.3260,-3.541,56,2011,0.0339,139.930,0.652
2,101,0PCpQRd0hMAWjBLOmJdR7X,1,0.02280,0.481,166040,0.8820,0.000220,2,0.1520,-5.068,51,1999,0.0407,141.721,0.718
3,101,0bLOiofyBB62YU2cNnONJG,1,0.32900,0.457,404933,0.4280,0.000002,1,0.1690,-9.796,55,1975,0.0283,129.980,0.151
4,101,0iJfN2CqrX7O8hkzgAMMAf,1,0.76700,0.222,355634,0.0555,0.923000,0,0.1180,-22.118,53,1994,0.0321,98.941,0.033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80366,1100,7FdAQ7CXm9yS4DBtIKopLi,1,0.88100,0.444,250880,0.2660,0.000000,9,0.1460,-9.001,56,2004,0.0366,130.430,0.404
80367,1100,7a9aeLVkn7DIqFjbanKz0k,1,0.31900,0.145,271867,0.4520,0.920000,2,0.0753,-13.560,55,2007,0.0448,82.297,0.146
80368,1100,7an1exwMnfYRcdVQm0yDev,1,0.21700,0.418,239013,0.4820,0.000000,5,0.1230,-5.769,54,2006,0.0266,175.558,0.261
80369,1100,7cioKB5CHVzk09SOtTyn0T,1,0.10300,0.436,369040,0.5340,0.006430,1,0.5070,-9.416,59,2014,0.0773,122.822,0.325


In [34]:
from scipy import stats



_d_features = pd.DataFrame(d['track_id'].unique(), columns=['track_id']).merge(tracks, left_on="track_id", right_on="id", how="inner", validate="1:1" )
d_features = stats.zscore(_d_features.drop(['id', 'track_id'], axis=1))

In [35]:
from lightfm.data import Dataset

assert d['track_id'].unique().shape[0] == d_features.shape[0]

dataset = Dataset()
dataset.fit(
    users=d['user_id'].unique(),
    items=d['track_id'].unique(),
    # item_features=d_features
)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 1000, num_items 21792.


In [36]:
(interactions, weights) = dataset.build_interactions(d[['user_id', 'track_id', 'weight']].apply(tuple, axis=1))

print(repr(interactions))

<1000x21792 sparse matrix of type '<class 'numpy.int32'>'
	with 80371 stored elements in COOrdinate format>


In [37]:
from lightfm.cross_validation import random_train_test_split

(train, test) = random_train_test_split(interactions)


In [38]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank


In [39]:
# feature_names = d.drop(['user_id', 'track_id', 'weight'], axis=1).columns
# 
# item_features = dataset.build_item_features(((i, feature_names) for i in d['track_id']))
# print(repr(item_features))

In [40]:
model_h = LightFM(
    loss='warp-kos',
    k=3,
    learning_rate=0.02,
    item_alpha=1e-4,
    user_alpha=1e-4,
    no_components=30,
)
model_h.fit(
    interactions=train,
    # item_features=item_features,
    epochs=50,
    num_threads=12)

<lightfm.lightfm.LightFM at 0x7f1a8c09a920>

In [41]:
train_auc_h = auc_score(model_h, train, 
                        # item_features=item_features,
                        num_threads=12).mean()
test_auc_h = auc_score(model_h, test, train_interactions=train, 
                       # item_features=item_features, 
                       num_threads=12).mean()
test_auc_h = auc_score(model_h, test, train_interactions=train, 
                       # item_features=item_features, 
                       num_threads=12).mean()

train_precision_h = precision_at_k(model_h, train, k=10,
                                   # item_features=item_features, 
                                   num_threads=12).mean()
test_precision_h = precision_at_k(model_h, test, k=10, train_interactions=train,
                                  # item_features=item_features, 
                                  num_threads=12).mean()

train_recall_h = recall_at_k(model_h, train, k=10, 
                             # item_features=item_features,
                             num_threads=12).mean()
test_recall_h = recall_at_k(model_h, test, k=10, train_interactions=train,
                            # item_features=item_features, 
                            num_threads=12).mean()

train_reciprocal_rank_h = reciprocal_rank(model_h, train,
                                          # item_features=item_features,
                                          num_threads=12).mean()
test_reciprocal_rank_h = reciprocal_rank(model_h, test, train_interactions=train,
                                         # item_features=item_features,
                                         num_threads=12).mean()

print('AUC: train %.6f, test %.6f.' % (train_auc_h, test_auc_h))
print('Precision: train %.6f, test %.6f.' % (train_precision_h, test_precision_h))
print('Recall: train %.6f, test %.6f.' % (train_recall_h, test_recall_h))
print('Reciprocal rank: train %.6f, test %.6f.' % (train_reciprocal_rank_h, test_reciprocal_rank_h))


AUC: train 0.763055, test 0.493200.
Precision: train 0.770300, test 0.000406.
Recall: train 0.181227, test 0.000159.
Reciprocal rank: train 0.669526, test 0.003383.


In [42]:
predicted_scores = model_h.predict(148, np.arange(interactions.shape[1]), 
                                   # item_features=item_features, 
                                   num_threads=12)
predicted_scores

array([-2.2398512 , -0.17288323, -1.3915393 , ..., -1.8822399 ,
       -2.029627  , -1.7873147 ], dtype=float32)

In [43]:
indices = np.argpartition(predicted_scores, -10)[-10:]
indices

array([ 2243,  4756,  9397,  6250,  9396,  9395,  4119,   383,  3738,
       15331])

In [46]:
pd.DataFrame(d['track_id'].unique()).loc[indices]

Unnamed: 0,0
2243,6v2eEC9Nr9POe5xPUm8361
4756,74oWejX8MG6SEaJiz1334D
9397,2Ygs64z9ywJGakuKU8tr6o
6250,06hBdrgjUendZyH9U1WV22
9396,2TdDRjNiF1HuRvnclprnce
9395,1bx6spmieE655BQvWdTYKA
4119,2D1hlMwWWXpkc3CZJ5U351
383,6IUhPMJf4iJQ3Go1CkHDsa
3738,3EO0tGvdBCw6vdChaNvVlc
15331,2aG2nLUd8NNI7vFuOLBCe0


In [45]:
d[d.user_id == 249]

Unnamed: 0,user_id,track_id,weight,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,popularity,release_year,speechiness,tempo,valence
12067,249,07BuyVse8pYAWd9DXD7B2D,1,0.000831,0.536,235240,0.968,3e-05,6,0.092,-2.373,65,2015,0.0891,86.036,0.57
12068,249,07qHCBBSRswIsfs0waYdjC,1,0.799,0.583,169846,0.418,0.0,5,0.109,-10.898,58,2019,0.0451,129.915,0.357
12069,249,0afhq8XCExXpqazXczTSve,1,0.0735,0.624,170827,0.876,0.0,9,0.327,-3.374,80,2017,0.1,99.943,0.781
12070,249,0gDyuX5rdHulQTUyrIdSR1,1,0.985,0.529,168107,0.104,0.291,9,0.109,-19.669,54,1971,0.0392,135.235,0.183
12071,249,1A9PKAFEHMWgXNLpgf7k4J,1,0.538,0.677,173452,0.722,0.0,2,0.11,-4.443,69,2020,0.0626,172.0,0.786
12072,249,1HKl3RJInVzf5ObVnM644j,1,0.00663,0.717,171172,0.712,0.0,0,0.0756,-6.098,74,2020,0.0765,97.019,0.825
12073,249,1HYnjKqSSHh1tdl2Hi57zH,1,0.612,0.427,266467,0.407,0.386,2,0.109,-9.665,65,2019,0.0382,168.844,0.244
12074,249,1HbyxkzMjXKIuyiZ5Xn4nB,1,0.0166,0.651,239627,0.412,0.00293,2,0.135,-11.539,53,1986,0.0279,118.276,0.532
12075,249,1N5p1acMejThPZZjmxQ3i5,1,0.152,0.495,172787,0.498,0.0,7,0.13,-5.176,59,2017,0.0337,96.322,0.559
12076,249,1bx6spmieE655BQvWdTYKA,1,0.394,0.703,319636,0.982,0.000939,4,0.109,-3.466,57,2019,0.118,110.03,0.697
