In [1]:
import pandas as pd 
import numpy as np 
import scipy
import implicit 
import mlflow
from prefect import flow, task
from sklearn.metrics import precision_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, Trials
import pickle 
from mlflow.tracking import MlflowClient

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
user_item = pd.read_csv("./train_triplets.txt/train_triplets.txt", delimiter = "\t", names = ["user_id", "song_id", "play_count"], engine = "pyarrow", dtype_backend = "pyarrow")

In [17]:
song_info = pd.read_csv("./song_data.csv", engine = "pyarrow", dtype_backend="pyarrow")

In [19]:
user_item.song_id.unique()

<ArrowExtensionArray>
['SOAKIMP12A8C130995', 'SOAPDEY12A81C210A9', 'SOBBMDR12A8C13253B',
 'SOBFNSP12AF72A0E22', 'SOBFOVM12A58A7D494', 'SOBNZDC12A6D4FC103',
 'SOBSUJE12A6D4F8CF5', 'SOBVFZR12A6D4F8AE3', 'SOBXALG12A8C13C108',
 'SOBXHDL12A81C204C0',
 ...
 'SOOFPQN12AB01884A6', 'SOSJRJM12A6701D41E', 'SOXWNXJ12A6701D974',
 'SOPGGXJ12AB018622A', 'SOFYWFV12A6D4F4A7E', 'SOJLGCD12A6D4F4A7C',
 'SONYEUK12A8C13E9A4', 'SOUSSCZ12A8C131D83', 'SOKSHWF12AB018B8CC',
 'SOCDLVW12AB018371F']
Length: 384546, dtype: string[pyarrow]

In [23]:
user_item.columns 

Index(['user_id', 'song_id', 'play_count'], dtype='object')

In [24]:
song_info.columns

Index(['song_id', 'title', 'release', 'artist_name', 'year'], dtype='object')

In [26]:
sample_user_item = user_item.iloc[:10**6]

In [72]:
taste_profile = sample_user_item.merge(song_info, on = "song_id", how = "left")

In [39]:
taste_profile.head()

Unnamed: 0,user_id,song_id,play_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,Nothing from Nothing,To Die For,Billy Preston,1974
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,Under Cold Blue Stars,Under Cold Blue Stars,Josh Rouse,2002
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,Riot Radio (Soundtrack Version),Nick & Norah's Infinite Playlist - Original Mo...,The Dead 60s,0


In [73]:
taste_profile["song"] = taste_profile["artist_name"] + " - " + taste_profile["title"]

In [57]:
taste_profile = taste_profile.drop(columns = ["title", "artist_name", "release", "year"]).reindex(columns = ["user_id", "song_id", "song", "play_count"])

In [79]:
taste_profile["numeric_user_id"], _ = pd.factorize(taste_profile["user_id"])

In [80]:
taste_profile["numeric_song_id"], _ = pd.factorize(taste_profile["song_id"])

In [183]:
taste_profile = taste_profile.drop(index = taste_profile[taste_profile.play_count >= 15].index)

In [None]:
train_profile = pd.DataFrame(columns = ["user_id", "song_id", "song", "play_count", "numeric_user_id", "numeric_song_id"])
test_profile = pd.DataFrame(columns = ["user_id", "song_id", "song", "play_count", "numeric_user_id", "numeric_song_id"])

for user in taste_profile["user_id"].unique():
    data = taste_profile[taste_profile["user_id"] == user]
    user_train, user_test = train_test_split(data, test_size = 0.2, random_state = 42)
    train_profile = pd.concat([train_profile, user_train])
    test_profile = pd.concat([test_profile, user_test])

In [184]:
sample_profile_train = train_profile.drop(columns = ["user_id", "song_id"]).reindex(columns = ["numeric_user_id", "numeric_song_id", "play_count", "song"])
sample_profile_test = test_profile.drop(columns = ["user_id", "song_id"]).reindex(columns = ["numeric_user_id", "numeric_song_id", "play_count", "song"])

In [185]:
#sample_profile = sample_profile.set_index(["numeric_user_id", "numeric_song_id"])
sample_profile_train = sample_profile_train.set_index(["numeric_user_id", "numeric_song_id"])
sample_profile_test = sample_profile_test.set_index(["numeric_user_id", "numeric_song_id"])

In [186]:
def get_sparse_matrix(sample_profile):     
    coo = scipy.sparse.coo_matrix(
        (
            sample_profile.play_count.astype(float), 
            (
                sample_profile.index.get_level_values(0), 
                sample_profile.index.get_level_values(1),
            ),
        )
    )
    csr_matrix = coo.tocsr()
    return csr_matrix

In [187]:
train_matrix = get_sparse_matrix(sample_profile_train)
test_matrix = get_sparse_matrix(sample_profile_test)

In [188]:
train_matrix, test_matrix

<20787x148039 sparse matrix of type '<class 'numpy.float64'>'
	with 973731 stored elements in Compressed Sparse Row format>

In [189]:
als_model = implicit.als.AlternatingLeastSquares(factors = 50, iterations = 10, regularization = 0.01)
lmf_model = implicit.lmf.LogisticMatrixFactorization(factors = 50, iterations = 10, regularization = 0.01)

als_model.fit(csr_matrix)
lmf_model.fit(csr_matrix)

recommendations, scores = implicit_model.recommend(10, csr_matrix[20], N = 20)
rec, scr = lmf_model.recommend(10, csr_matrix[1000], N = 20)

In [307]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "als")
        mlflow.log_params(params)
        # Convert parameters to the appropriate format if needed
        params['factors'] = int(params['factors'])
        
        # Initialize the model with the given parameters
        model = implicit.als.AlternatingLeastSquares(**params, random_state = 42)
        
        # Fit the model and compute the desired metric (e.g., precision@k)
        model.fit(csr_matrix)  # Replace with your data
        score = get_model_score(model, csr_matrix, taste_profile)
        mlflow.log_metric("recall", score)
    
    return -score

space = {
    "factors": hp.quniform("factors", 25, 100, 25),
    "regularization": hp.uniform("regularization", 0.01, 0.1), 
    "alpha": hp.loguniform("alpha", 0, 1)
}

trials = Trials()
best = fmin(
    fn= objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 5, 
    trials = trials
)

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

  0%|          | 0/15 [00:00<?, ?it/s]
[A
  7%|6         | 1/15 [00:13<03:09, 13.56s/it]
[A
 13%|#3        | 2/15 [00:25<02:44, 12.65s/it]
[A
 20%|##        | 3/15 [00:38<02:32, 12.67s/it]
[A
 27%|##6       | 4/15 [00:50<02:18, 12.57s/it]
[A
 33%|###3      | 5/15 [01:02<02:03, 12.35s/it]
[A
 40%|####      | 6/15 [01:14<01:50, 12.29s/it]
[A
 47%|####6     | 7/15 [01:27<01:39, 12.38s/it]
[A
 53%|#####3    | 8/15 [01:41<01:30, 12.86s/it]
[A
 60%|######    | 9/15 [01:58<01:25, 14.32s/it]
[A
 67%|######6   | 10/15 [02:14<01:13, 14.66s/it]
[A
 73%|#######3  | 11/15 [02:32<01:03, 15.75s/it]
[A
 80%|########  | 12/15 [02:54<00:53, 17.71s/it]
[A
 87%|########6 | 13/15 [03:20<00:40, 20.10s/it]
[A
 93%|#########3| 14/15 [03:53<00:24, 24.09s/it]
[A
100%|##########| 15/15 [04:28<00:00, 27.48s/it]
[A
100%|##########| 15/15 [04:28<00:00, 17.93s/it]


 20%|██        | 1/5 [17:23<1:09:35, 1043.91s/trial, best loss: -0.2855770985859546]

  0%|          | 0/15 [00:00<?, ?it/s]
[A
  7%|6         | 1/15 [00:17<04:05, 17.55s/it]
[A
 13%|#3        | 2/15 [00:31<03:24, 15.71s/it]
[A
 20%|##        | 3/15 [00:45<02:58, 14.84s/it]
[A
 27%|##6       | 4/15 [00:58<02:35, 14.17s/it]
[A
 33%|###3      | 5/15 [01:12<02:19, 14.00s/it]
[A
 40%|####      | 6/15 [01:28<02:10, 14.49s/it]
[A
 47%|####6     | 7/15 [01:43<01:59, 14.93s/it]
[A
 53%|#####3    | 8/15 [02:00<01:47, 15.40s/it]
[A
 60%|######    | 9/15 [02:19<01:38, 16.48s/it]
[A
 67%|######6   | 10/15 [02:38<01:26, 17.30s/it]
[A
 73%|#######3  | 11/15 [02:58<01:12, 18.15s/it]
[A
 80%|########  | 12/15 [03:19<00:56, 18.92s/it]
[A
 87%|########6 | 13/15 [03:41<00:40, 20.11s/it]
[A
 93%|#########3| 14/15 [04:04<00:20, 20.76s/it]
[A
100%|##########| 15/15 [04:31<00:00, 22.60s/it]
[A
100%|##########| 15/15 [04:31<00:00, 18.07s/it]


 40%|████      | 2/5 [30:16<44:13, 884.38s/trial, best loss: -0.2855770985859546]   

  0%|          | 0/15 [00:00<?, ?it/s]
[A
  7%|6         | 1/15 [00:09<02:12,  9.47s/it]
[A
 13%|#3        | 2/15 [00:18<02:00,  9.30s/it]
[A
 20%|##        | 3/15 [00:28<01:55,  9.66s/it]
[A
 27%|##6       | 4/15 [00:37<01:43,  9.39s/it]
[A
 33%|###3      | 5/15 [00:46<01:32,  9.25s/it]
[A
 40%|####      | 6/15 [00:55<01:21,  9.07s/it]
[A
 47%|####6     | 7/15 [01:03<01:11,  8.90s/it]
[A
 53%|#####3    | 8/15 [01:13<01:03,  9.06s/it]
[A
 60%|######    | 9/15 [01:23<00:56,  9.40s/it]
[A
 67%|######6   | 10/15 [01:34<00:49,  9.90s/it]
[A
 73%|#######3  | 11/15 [01:44<00:39,  9.95s/it]
[A
 80%|########  | 12/15 [01:55<00:30, 10.11s/it]
[A
 87%|########6 | 13/15 [02:05<00:20, 10.14s/it]
[A
 93%|#########3| 14/15 [02:15<00:10, 10.25s/it]
[A
100%|##########| 15/15 [02:27<00:00, 10.57s/it]
[A
100%|##########| 15/15 [02:27<00:00,  9.81s/it]


 60%|██████    | 3/5 [38:27<23:29, 704.63s/trial, best loss: -0.2855770985859546]

  0%|          | 0/15 [00:00<?, ?it/s]
[A
  7%|6         | 1/15 [00:06<01:34,  6.75s/it]
[A
 13%|#3        | 2/15 [00:12<01:23,  6.41s/it]
[A
 20%|##        | 3/15 [00:18<01:14,  6.19s/it]
[A
 27%|##6       | 4/15 [00:24<01:07,  6.15s/it]
[A
 33%|###3      | 5/15 [00:31<01:03,  6.39s/it]
[A
 40%|####      | 6/15 [00:37<00:56,  6.23s/it]
[A
 47%|####6     | 7/15 [00:43<00:49,  6.16s/it]
[A
 53%|#####3    | 8/15 [00:49<00:42,  6.07s/it]
[A
 60%|######    | 9/15 [00:55<00:35,  5.96s/it]
[A
 67%|######6   | 10/15 [01:01<00:29,  5.89s/it]
[A
 73%|#######3  | 11/15 [01:07<00:23,  5.98s/it]
[A
 80%|########  | 12/15 [01:13<00:18,  6.05s/it]
[A
 87%|########6 | 13/15 [01:19<00:12,  6.19s/it]
[A
 93%|#########3| 14/15 [01:26<00:06,  6.29s/it]
[A
100%|##########| 15/15 [01:36<00:00,  7.45s/it]
[A
100%|##########| 15/15 [01:36<00:00,  6.44s/it]


 80%|████████  | 4/5 [45:59<10:04, 604.98s/trial, best loss: -0.2855770985859546]

  0%|          | 0/15 [00:00<?, ?it/s]
[A
  7%|6         | 1/15 [00:08<01:54,  8.20s/it]
[A
 13%|#3        | 2/15 [00:16<01:46,  8.21s/it]
[A
 20%|##        | 3/15 [00:24<01:35,  7.97s/it]
[A
 27%|##6       | 4/15 [00:31<01:26,  7.84s/it]
[A
 33%|###3      | 5/15 [00:39<01:17,  7.73s/it]
[A
 40%|####      | 6/15 [00:47<01:09,  7.77s/it]
[A
 47%|####6     | 7/15 [00:54<01:02,  7.78s/it]
[A
 53%|#####3    | 8/15 [01:03<00:56,  8.07s/it]
[A
 60%|######    | 9/15 [01:12<00:49,  8.26s/it]
[A
 67%|######6   | 10/15 [01:20<00:41,  8.32s/it]
[A
 73%|#######3  | 11/15 [01:29<00:33,  8.45s/it]
[A
 80%|########  | 12/15 [01:37<00:25,  8.45s/it]
[A
 87%|########6 | 13/15 [01:46<00:17,  8.55s/it]
[A
 93%|#########3| 14/15 [01:55<00:08,  8.60s/it]
[A
100%|##########| 15/15 [02:03<00:00,  8.56s/it]
[A
100%|##########| 15/15 [02:03<00:00,  8.26s/it]


100%|██████████| 5/5 [54:49<00:00, 657.98s/trial, best loss: -0.3145835527080493]


In [310]:
best_params_als = {k: int(v) if k == "factors" else v for k, v in best.items()}

In [311]:
def objective_lmf(params):
    with mlflow.start_run(): 
        mlflow.set_tag("model", "lmf")
        mlflow.log_params(params)
        # Convert parameters to the appropriate format if needed
        params['factors'] = int(params['factors'])
        
        # Initialize the model with the given parameters
        model = implicit.lmf.LogisticMatrixFactorization(**params, random_state = 42)
        
        # Fit the model and compute the desired metric (e.g., precision@k)
        model.fit(csr_matrix)  # Replace with your data
        score = get_model_score(model, csr_matrix, taste_profile)
        mlflow.log_metric("recall", score)
    
    return -score

space_lmf = {
    "factors": hp.quniform("factors", 25, 100, 25),
    "regularization": hp.uniform("regularization", 0.1, 1), 
    "learning_rate": hp.loguniform("learning_rate", 0, 1)
}

trials_lmf = Trials()
best_lmf = fmin(
    fn= objective_lmf, 
    space = space_lmf, 
    algo = tpe.suggest, 
    max_evals = 5, 
    trials = trials_lmf
)

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

  0%|          | 0/30 [00:00<?, ?it/s]
[A
  3%|3         | 1/30 [00:01<00:55,  1.92s/it]
[A
  7%|6         | 2/30 [00:03<00:52,  1.88s/it]
[A
 10%|#         | 3/30 [00:05<00:49,  1.84s/it]
[A
 13%|#3        | 4/30 [00:07<00:47,  1.81s/it]
[A
 17%|#6        | 5/30 [00:09<00:45,  1.81s/it]
[A
 20%|##        | 6/30 [00:10<00:39,  1.66s/it]
[A
 23%|##3       | 7/30 [00:11<00:35,  1.55s/it]
[A
 27%|##6       | 8/30 [00:13<00:33,  1.50s/it]
[A
 30%|###       | 9/30 [00:14<00:32,  1.54s/it]
[A
 33%|###3      | 10/30 [00:16<00:31,  1.57s/it]
[A
 37%|###6      | 11/30 [00:18<00:29,  1.56s/it]
[A
 40%|####      | 12/30 [00:19<00:28,  1.60s/it]
[A
 43%|####3     | 13/30 [00:21<00:28,  1.65s/it]
[A
 47%|####6     | 14/30 [00:23<00:27,  1.69s/it]
[A
 50%|#####     | 15/30 [00:25<00:26,  1.76s/it]
[A
 53%|#####3    | 16/30 [00:26<00:22,  1.62s/it]
[A
 57%|#####6    | 17/30 [00:27<00:19,  1.52s/it]
[A
 60%|######    | 18/30 [00:29<00:18,  1.50s/it]
[A
 63%|######3   | 19/30 [00:31<

 20%|██        | 1/5 [06:44<26:58, 404.65s/trial, best loss: -0.13522417594153308]

  0%|          | 0/30 [00:00<?, ?it/s]
[A
  3%|3         | 1/30 [00:02<01:16,  2.62s/it]
[A
  7%|6         | 2/30 [00:05<01:14,  2.67s/it]
[A
 10%|#         | 3/30 [00:07<01:10,  2.63s/it]
[A
 13%|#3        | 4/30 [00:10<01:09,  2.66s/it]
[A
 17%|#6        | 5/30 [00:13<01:09,  2.77s/it]
[A
 20%|##        | 6/30 [00:16<01:07,  2.81s/it]
[A
 23%|##3       | 7/30 [00:19<01:03,  2.78s/it]
[A
 27%|##6       | 8/30 [00:21<01:00,  2.73s/it]
[A
 30%|###       | 9/30 [00:24<00:56,  2.67s/it]
[A
 33%|###3      | 10/30 [00:27<00:53,  2.67s/it]
[A
 37%|###6      | 11/30 [00:29<00:51,  2.71s/it]
[A
 40%|####      | 12/30 [00:32<00:50,  2.82s/it]
[A
 43%|####3     | 13/30 [00:35<00:47,  2.80s/it]
[A
 47%|####6     | 14/30 [00:38<00:44,  2.76s/it]
[A
 50%|#####     | 15/30 [00:41<00:41,  2.75s/it]
[A
 53%|#####3    | 16/30 [00:43<00:39,  2.81s/it]
[A
 57%|#####6    | 17/30 [00:47<00:37,  2.87s/it]
[A
 60%|######    | 18/30 [00:49<00:34,  2.87s/it]
[A
 63%|######3   | 19/30 [00:53<

 40%|████      | 2/5 [14:26<21:54, 438.10s/trial, best loss: -0.1926494605112163] 

  0%|          | 0/30 [00:00<?, ?it/s]
[A
  3%|3         | 1/30 [00:04<02:22,  4.93s/it]
[A
  7%|6         | 2/30 [00:10<02:23,  5.13s/it]
[A
 10%|#         | 3/30 [00:15<02:16,  5.07s/it]
[A
 13%|#3        | 4/30 [00:20<02:09,  4.97s/it]
[A
 17%|#6        | 5/30 [00:25<02:10,  5.20s/it]
[A
 20%|##        | 6/30 [00:30<02:04,  5.18s/it]
[A
 23%|##3       | 7/30 [00:35<01:55,  5.00s/it]
[A
 27%|##6       | 8/30 [00:40<01:49,  4.97s/it]
[A
 30%|###       | 9/30 [00:45<01:46,  5.07s/it]
[A
 33%|###3      | 10/30 [00:51<01:44,  5.25s/it]
[A
 37%|###6      | 11/30 [00:55<01:35,  5.01s/it]
[A
 40%|####      | 12/30 [01:01<01:33,  5.21s/it]
[A
 43%|####3     | 13/30 [01:06<01:27,  5.15s/it]
[A
 47%|####6     | 14/30 [01:11<01:20,  5.04s/it]
[A
 50%|#####     | 15/30 [01:15<01:13,  4.90s/it]
[A
 53%|#####3    | 16/30 [01:20<01:09,  4.93s/it]
[A
 57%|#####6    | 17/30 [01:25<01:04,  4.97s/it]
[A
 60%|######    | 18/30 [01:30<00:58,  4.91s/it]
[A
 63%|######3   | 19/30 [01:36<

 60%|██████    | 3/5 [24:27<17:05, 512.78s/trial, best loss: -0.1926494605112163]

  0%|          | 0/30 [00:00<?, ?it/s]
[A
  3%|3         | 1/30 [00:04<02:16,  4.72s/it]
[A
  7%|6         | 2/30 [00:10<02:26,  5.23s/it]
[A
 10%|#         | 3/30 [00:14<02:12,  4.91s/it]
[A
 13%|#3        | 4/30 [00:18<01:59,  4.58s/it]
[A
 17%|#6        | 5/30 [00:22<01:48,  4.34s/it]
[A
 20%|##        | 6/30 [00:26<01:40,  4.17s/it]
[A
 23%|##3       | 7/30 [00:30<01:31,  3.98s/it]
[A
 27%|##6       | 8/30 [00:34<01:26,  3.93s/it]
[A
 30%|###       | 9/30 [00:38<01:24,  4.01s/it]
[A
 33%|###3      | 10/30 [00:41<01:17,  3.90s/it]
[A
 37%|###6      | 11/30 [00:45<01:12,  3.82s/it]
[A
 40%|####      | 12/30 [00:49<01:09,  3.84s/it]
[A
 43%|####3     | 13/30 [00:53<01:05,  3.83s/it]
[A
 47%|####6     | 14/30 [00:56<01:00,  3.79s/it]
[A
 50%|#####     | 15/30 [01:00<00:56,  3.76s/it]
[A
 53%|#####3    | 16/30 [01:04<00:51,  3.69s/it]
[A
 57%|#####6    | 17/30 [01:07<00:48,  3.70s/it]
[A
 60%|######    | 18/30 [01:11<00:44,  3.73s/it]
[A
 63%|######3   | 19/30 [01:15<

 80%|████████  | 4/5 [34:21<09:04, 544.60s/trial, best loss: -0.20023930594897846]

  0%|          | 0/30 [00:00<?, ?it/s]
[A
  3%|3         | 1/30 [00:04<02:14,  4.62s/it]
[A
  7%|6         | 2/30 [00:10<02:35,  5.54s/it]
[A
 10%|#         | 3/30 [00:17<02:42,  6.03s/it]
[A
 13%|#3        | 4/30 [00:23<02:35,  5.97s/it]
[A
 17%|#6        | 5/30 [00:29<02:29,  6.00s/it]
[A
 20%|##        | 6/30 [00:34<02:15,  5.65s/it]
[A
 23%|##3       | 7/30 [00:39<02:02,  5.34s/it]
[A
 27%|##6       | 8/30 [00:43<01:51,  5.06s/it]
[A
 30%|###       | 9/30 [00:50<01:59,  5.69s/it]
[A
 33%|###3      | 10/30 [00:55<01:51,  5.58s/it]
[A
 37%|###6      | 11/30 [01:00<01:39,  5.24s/it]
[A
 40%|####      | 12/30 [01:05<01:33,  5.21s/it]
[A
 43%|####3     | 13/30 [01:12<01:38,  5.79s/it]
[A
 47%|####6     | 14/30 [01:20<01:42,  6.38s/it]
[A
 50%|#####     | 15/30 [01:27<01:39,  6.60s/it]
[A
 53%|#####3    | 16/30 [01:35<01:37,  6.94s/it]
[A
 57%|#####6    | 17/30 [01:41<01:29,  6.87s/it]
[A
 60%|######    | 18/30 [01:48<01:19,  6.63s/it]
[A
 63%|######3   | 19/30 [01:54<

100%|██████████| 5/5 [48:19<00:00, 579.96s/trial, best loss: -0.21217974749576204]


In [313]:
best_params_lmf = {k: int(v) if k == "factors" else v for k, v in best_lmf.items()}

In [261]:
#taste_profile[taste_profile.numeric_song_id.isin(recommendations)]["song"].unique()
#taste_profile[taste_profile.numeric_song_id.isin(recommendations_alt)]["song"].unique()
#taste_profile[taste_profile.numeric_song_id.isin(rec)]["song"].unique()

In [211]:
ground_truth = get_ground_truth(taste_profile, 10)

In [230]:
recommendations_als = taste_profile[taste_profile.numeric_song_id.isin(recommendations)]["song"].unique().tolist()
recommendations_lmf = taste_profile[taste_profile.numeric_song_id.isin(rec)]["song"].unique().tolist()

In [326]:
best_params_als

{'alpha': 1.4979221661440207,
 'factors': 100,
 'regularization': 0.06388396548115775}

In [343]:
client = MlflowClient("sqlite:///mlflow.db")

2023/08/21 06:34:54 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/08/21 06:34:54 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [327]:
best_als = implicit.als.AlternatingLeastSquares(factors = best_params_als["factors"], alpha = best_params_als["alpha"], regularization = best_params_als["regularization"])
best_als.fit(csr_matrix)

100%|██████████| 15/15 [03:47<00:00, 15.16s/it]


In [None]:
run_id = client.list_run_infos(experiment_id = "1")[0].run_id
mlflow.register_model(
    model_uri= f"runs:/{run_id}/models", 
    name = "altenatingleastsquares"
)

In [329]:
best_lmf = implicit.lmf.LogisticMatrixFactorization(factors = best_params_lmf["factors"], learning_rate = best_params_lmf["learning_rate"], regularization = best_params_lmf["regularization"])
best_lmf.fit(csr_matrix)

100%|██████████| 30/30 [15:40<00:00, 31.36s/it]


In [None]:
run_id = client.list_run_infos(experiment_id = "2")[0].run_id
mlflow.register_model(
    model_uri= f"runs:/{run_id}/models", 
    name = "logisticmatrixfactorization"
)

In [330]:
models = [best_als, best_lmf]
scores_list = []
for model in models: 
    result = get_model_score(model, csr_matrix, taste_profile)
    scores_list.append(result)
best_model = models[np.argmax(scores_list)]

In [334]:
with open("rec_model.pkl", "wb") as f: 
    pickle.dump(best_model, f)

In [None]:
mlflow.log_artifact(local_path="./rec_model.pkl", artifact_path = "models_pickle")

In [1]:
import pandas as pd 
import numpy as np 
import scipy
import implicit 
import mlflow
from prefect import flow, task
from sklearn.metrics import precision_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, Trials
import pickle 
from mlflow.tracking import MlflowClient

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = MlflowClient("sqlite:///mlflow.db")

In [3]:
@task
def get_ground_truth(profile, user_id): 
    user_data = profile[profile["numeric_user_id"] == user_id]
    relevant_play_count = profile["play_count"].quantile(0.50)
    ground_truth = user_data[user_data["play_count"] >= relevant_play_count]["numeric_song_id"].tolist()
    return ground_truth

In [4]:
@task
def convert_to_binary(recommendations, ground_truth): 
    binary_recommendations = []
    for rec in recommendations: 
        if rec in ground_truth:
            binary_recommendations.append(1)
        else: 
            binary_recommendations.append(0)
    return binary_recommendations

In [5]:
@task
def get_model_score(model, matrix, profile):
    n = 20
    recall_scores = []
    for user_id in profile["numeric_user_id"].unique(): 
        ground_truth = get_ground_truth(profile, user_id)
        recommendation, _ = model.recommend(user_id, matrix[n], N = n)
        binary_recommendation = convert_to_binary(recommendation, ground_truth)
        binary_ground_truth = convert_to_binary(ground_truth, ground_truth)
        min_value = min(len(binary_ground_truth), len(binary_recommendation))
        score = recall_score(binary_ground_truth[:min_value], binary_recommendation[:min_value])
        recall_scores.append(score)
    average_score = sum(recall_scores) / len(recall_scores)
    return average_score

In [6]:
@task(name = "get_data", retries = 3, retry_delay_seconds = 2)
def read_and_prepare_data(user_item_path, song_info_path): 
    user_item = pd.read_csv(user_item_path, delimiter = "\t", names = ["user_id", "song_id", "play_count"], engine = "pyarrow", dtype_backend = "pyarrow")
    song_info = pd.read_csv(song_info_path, engine = "pyarrow", dtype_backend="pyarrow")
    sample_user_item = user_item.iloc[:10**6]
    taste_profile = sample_user_item.merge(song_info, on = "song_id", how = "left")
    taste_profile["song"] = taste_profile["artist_name"] + " - " + taste_profile["title"]
    taste_profile = taste_profile.drop(columns = ["title", "artist_name", "release", "year"]).reindex(columns = ["user_id", "song_id", "song", "play_count"])
    taste_profile["numeric_user_id"], _ = pd.factorize(taste_profile["user_id"])
    taste_profile["numeric_song_id"], _ = pd.factorize(taste_profile["song_id"])
    taste_profile = taste_profile.drop(index = taste_profile[taste_profile.play_count >= 15].index)
    train_profile = pd.DataFrame(columns = ["user_id", "song_id", "song", "play_count", "numeric_user_id", "numeric_song_id"])
    test_profile = pd.DataFrame(columns = ["user_id", "song_id", "song", "play_count", "numeric_user_id", "numeric_song_id"])
    
    for user in taste_profile["user_id"].unique():
        data = taste_profile[taste_profile["user_id"] == user]
        if len(data) >= 5:
            user_train, user_test = train_test_split(data, test_size = 0.2, random_state = 42)
            train_profile = pd.concat([train_profile, user_train])
            test_profile = pd.concat([test_profile, user_test])
        else:
            continue
    sample_profile_train = train_profile.drop(columns = ["user_id", "song_id"]).reindex(columns = ["numeric_user_id", "numeric_song_id", "play_count", "song"])
    sample_profile_test = test_profile.drop(columns = ["user_id", "song_id"]).reindex(columns = ["numeric_user_id", "numeric_song_id", "play_count", "song"])
    sample_profile_train = sample_profile_train.set_index(["numeric_user_id", "numeric_song_id"])
    sample_profile_test = sample_profile_test.set_index(["numeric_user_id", "numeric_song_id"])

    return sample_profile_train, sample_profile_test

In [7]:
@task
def process_data(sample_profile): 
    coo = scipy.sparse.coo_matrix(
        (
            sample_profile.play_count.astype(float), 
            (
                sample_profile.index.get_level_values(0), 
                sample_profile.index.get_level_values(1),
            ),
        )
    )
    csr_matrix = coo.tocsr()
    return csr_matrix

In [8]:
@task
def tune_model(rec_model, csr_matrix, taste_profile): 
    def objective(params):
        with mlflow.start_run():
            mlflow.set_tag("model", f"{rec_model[1]}")
            mlflow.log_params(params)
            # Convert parameters to the appropriate format if needed
            params['factors'] = int(params['factors'])
            
            # Initialize the model with the given parameters
            if rec_model[1] == "als":
                model = rec_model[0].AlternatingLeastSquares(**params, random_state = 42)
            elif rec_model[1] == "lmf": 
                model = rec_model[0].LogisticMatrixFactorization(**params, random_state = 42)
            
            # Fit the model and compute the desired metric (e.g., precision@k)
            model.fit(csr_matrix)  # Replace with your data
            score = get_model_score(model, csr_matrix, taste_profile)
            mlflow.log_metric("recall", score)
        
        return -score
    if rec_model[1] == "als": 
        space = {
            "factors": hp.quniform("factors", 25, 100, 25),
            "regularization": hp.uniform("regularization", 0.01, 0.1), 
            "alpha": hp.loguniform("alpha", 0, 1)
        }
        
        trials = Trials()
        best = fmin(
            fn= objective, 
            space = space, 
            algo = tpe.suggest, 
            max_evals = 5, 
            trials = trials
        )
    elif rec_model[1] == "lmf": 
        space = {
            "factors": hp.quniform("factors", 25, 100, 25),
            "regularization": hp.uniform("regularization", 0.1, 1), 
            "learning_rate": hp.loguniform("learning_rate", 0, 1)
        }
        
        trials = Trials()
        best = fmin(
            fn= objective, 
            space = space, 
            algo = tpe.suggest, 
            max_evals = 5, 
            trials = trials
        )
        
    best_params = {k: int(v) if k == "factors" else v for k, v in best.items()}
    return best_params

In [9]:
@task(log_prints = True) 
def train_best_model(profile_train, matrix_train, profile_test, matrix_test): 
    als_model = (implicit.als, "als")
    lmf_model = (implicit.lmf, "lmf") 
    models = [als_model, lmf_model]
    params_list = []
    for model in models: 
        param = tune_model(model, matrix_train, profile_train)
        params_list.append(param)
    best_als = als_model[0].AlternatingLeastSquares(factors = params_list[0]["factors"], alpha = params_list[0]["alpha"], regularization = params_list[0]["regularization"])
    best_als.fit(matrix_train)
    best_lmf = lmf_model[0].LogisticMatrixFactorization(factors = params_list[1]["factors"], learning_rate = params_list[1]["learning_rate"], regularization = params_list[1]["regularization"])
    best_lmf.fit(matrix_train)

    best_models = [best_als, best_lmf]
    scores_list_train = []
    scores_list_test = []
    for model in best_models: 
        result_train = get_model_score(model, matrix_train, profile_train)
        scores_list_train.append(result_train)
        result_test = get_model_score(model, matrix_test, profile_test)
        scores_list_test.append(result_test)
    best_model = best_models[np.argmax(scores_list_test)]

    with open("rec_model.pkl", "wb") as f: 
        pickle.dump(best_model, f)
    mlflow.log_artifact(local_path="./rec_model.pkl", artifact_path = "models_pickle")
    
    return None

In [None]:
@flow
def main_flow(user_item_path = "./train_triplets.txt/train_triplets.txt", song_info_path = "./song_data.csv"): 
    mlflow.set_tracking_uri("sqlite:///mlflow.db")
    mlflow.set_experiment("music-recommender-experiment")
    
    profile_train, profile_test = read_and_prepare_data(user_item_path, song_info_path)
    matrix_train, matrix_test = process_data(profile_train), process_data(profile_test)

    train_best_model(profile_train, matrix_train, profile_test, matrix_test)

if __name__ == "__main__": 
    main_flow()