In [1]:
import json
import os
from collections import defaultdict

import pandas as pd
import numpy as np
import scipy.stats as ss

import lightfm
import lightfm.data as ld
import lightfm.evaluation as lv

import tqdm
import optuna

import tensorboardX as tb

import matplotlib.pyplot as pl
import seaborn as sns

# Seed NumPy's global random generator so every np.random draw in this
# notebook is repeatable from a fresh kernel.
SEED = 31337
np.random.seed(SEED)

ModuleNotFoundError: No module named 'lightfm'

In [None]:
# Directory that holds the input JSON files and receives every generated
# artifact. Set the DATA_DIR environment variable to point the notebook at a
# different location; without it the original default is used. Paths are built
# by plain string concatenation (DATA_DIR + "file"), so os.path.join(..., "")
# is used to guarantee a trailing separator.
DATA_DIR = os.path.join(os.environ.get("DATA_DIR") or "/Users/ivan/Desktop/", "")

In [None]:
# Read the line-delimited JSON log, then collapse repeated (user, track) pairs
# down to a single row each.
data = pd.read_json(DATA_DIR + "input.json", lines=True)
data = data.drop_duplicates(subset=["user", "track"])

In [None]:
# Only rows whose "time" value exceeds 0.8 are treated as positive feedback.
liked = data["time"] > 0.8
positives = data.loc[liked].copy()

# Random hold-out: a uniform draw of 0.7 or more flags the row as test data.
holdout_draws = np.random.random(len(positives))
positives["test"] = holdout_draws >= 0.7

In [None]:
# Number of training (non-test) positives per user; users with fewer than 5
# are excluded from the model.
user_counts = positives.loc[~positives["test"]].groupby("user").size()
has_enough_positives = (user_counts >= 5).values
users = set(user_counts.index.values[has_enough_positives])

In [None]:
# Number of training (non-test) positives per track; tracks with fewer than 5
# are excluded from the model.
track_counts = positives.loc[~positives["test"]].groupby("track").size()
is_popular_enough = (track_counts >= 5).values
tracks = set(track_counts.index.values[is_popular_enough])

## Train LightFM

In [None]:
# Rows whose user and track both passed the minimum-count filters.
is_known = positives["user"].isin(users) & positives["track"].isin(tracks)
is_test = positives["test"]

train_data = positives[~is_test & is_known]
test_data = positives[is_test & is_known]

# Displayed as the cell output: sizes of the two splits.
len(train_data), len(test_data)

In [None]:
# Register the retained user and track ids with a fresh LightFM dataset.
dataset = ld.Dataset()
dataset.fit(users=users, items=tracks)

In [None]:
# Feed plain (user, track) tuples to the dataset; only the first value returned
# by build_interactions is kept, the second is discarded.
train_pairs = train_data[["user", "track"]].itertuples(index=False, name=None)
train_interactions, _ = dataset.build_interactions(train_pairs)

test_pairs = test_data[["user", "track"]].itertuples(index=False, name=None)
test_interactions, _ = dataset.build_interactions(test_pairs)

In [None]:
def fit_model(
    epochs=1,
    at=10,
    loss="warp",
    no_components=30,
    learning_rate=0.01,
    max_sampled=10,
    user_alpha=0.0,
    item_alpha=0.0,
    threads=30,
    verbose=False,
    patience=3,
    epsilon=1e-6
):
    """Train a LightFM model one epoch at a time, scoring it after every epoch.

    The function reads the notebook-level ``train_interactions`` and
    ``test_interactions`` matrices: it fits on the former and measures
    precision@``at`` on the latter.

    Args:
        epochs: Maximum number of training epochs.
        at: The ``k`` passed to ``precision_at_k``.
        loss, no_components, learning_rate, max_sampled, user_alpha,
            item_alpha: Forwarded unchanged to ``lightfm.LightFM``.
        threads: Thread count used for both fitting and evaluation.
        verbose: When true, print the mean precision (with 1.96 * standard
            error) after each epoch and a note when stopping early.
        patience: Number of most recent epochs inspected by early stopping.
            A value of 0 or less disables early stopping.
        epsilon: Minimum gain over the reference epoch that counts as an
            improvement.

    Returns:
        A ``(model, precisions_at)`` tuple, where ``precisions_at`` holds the
        mean precision recorded after each completed epoch. The model is the
        one from the last completed epoch, not necessarily the best one.
    """
    model = lightfm.LightFM(
        no_components=no_components,
        loss=loss,
        learning_rate=learning_rate,
        max_sampled=max_sampled,
        user_alpha=user_alpha,
        item_alpha=item_alpha,
    )

    precisions_at = []

    for epoch in range(epochs):
        model = model.fit_partial(train_interactions, num_threads=threads)
        precision_at = lv.precision_at_k(
            model,
            test_interactions,
            train_interactions=train_interactions,
            k=at,
            num_threads=threads,
        )
        mean_precision = np.mean(precision_at)
        if verbose:
            print(f"{epoch}:\t{mean_precision} +/- {ss.sem(precision_at) * 1.96}")
        precisions_at.append(mean_precision)

        # Early stopping: quit once none of the last `patience` epochs beat the
        # epoch just before them by at least `epsilon`. The `patience > 0`
        # guard matters because an empty window makes all() vacuously true,
        # which would end training regardless of the scores.
        if patience > 0 and epoch > patience:
            reference = precisions_at[-patience - 1]
            recent = precisions_at[-patience:]
            if all(score - reference < epsilon for score in recent):
                if verbose:
                    print("Early stopping!")
                break

    return model, precisions_at


def objective(trial):
    """Optuna objective: sample hyper-parameters, train briefly, score the result.

    Each hyper-parameter is drawn from a fixed list of candidates via
    ``trial.suggest_categorical``. The model is then trained with ``fit_model``
    for at most 5 epochs, and the precision@10 recorded after the last
    completed epoch is returned as the value to optimise.
    """
    # Candidate values, listed in the order in which they are requested from the trial.
    search_space = (
        ("loss", ["warp", "bpr"]),
        ("no_components", [10, 30, 50]),
        ("learning_rate", [0.0001, 0.001, 0.01]),
        ("max_sampled", [10, 20, 50, 100]),
        ("user_alpha", [0.0, 0.0001]),
        ("item_alpha", [0.0, 0.0001]),
    )
    sampled = {
        name: trial.suggest_categorical(name, choices)
        for name, choices in search_space
    }

    _, precisions_at = fit_model(epochs=5, at=10, **sampled)

    return precisions_at[-1]


In [None]:
# Hyper-parameter search, kept disabled. Uncomment to re-run the Optuna study
# and take its winner instead of the fixed values below (presumably the result
# of an earlier run of this search).
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=30)
# best_params = study.best_params

best_params = dict(
    loss="warp",
    no_components=50,
    learning_rate=0.01,
    max_sampled=50,
    user_alpha=0.0,
    item_alpha=0.0001,
)

In [None]:
# Final training run: up to 200 epochs with the selected hyper-parameters,
# printing per-epoch precision@10.
tuned_names = (
    "loss",
    "no_components",
    "learning_rate",
    "max_sampled",
    "user_alpha",
    "item_alpha",
)
tuned = {name: best_params[name] for name in tuned_names}

model, precisions_at = fit_model(epochs=200, at=10, verbose=True, **tuned)

In [None]:
# Learning curve: one point per completed training epoch.
figure, ax = pl.subplots()

ax.plot(np.arange(len(precisions_at)), precisions_at)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("epoch")
ax.set_ylabel("mean test precision@k")
ax.set_title("LightFM precision per training epoch")

# Trailing statement keeps the cell from echoing the last call's return value.
pass

## Save track embeddings

In [None]:
# Pull the per-item biases and embedding vectors out of the trained model.
item_representations = model.get_item_representations()
biases, embeddings = item_representations

In [None]:
# Overwrite the model's item biases in place with themselves times zero.
model.item_biases[...] = model.item_biases * 0.0

In [None]:
track_meta = pd.read_json(DATA_DIR + "tracks.json", lines=True)

# Element 2 of dataset.mapping() serves as the track id -> dataset index
# lookup; tracks absent from it end up with a missing dataset_index.
index_by_track = dataset.mapping()[2]
track_meta["dataset_index"] = track_meta["track"].map(index_by_track.get)

In [None]:
# Metadata rows for tracks that have a dataset index, ordered by that index.
dataset_tracks = (
    track_meta
    .dropna(subset=["dataset_index"])
    .sort_values("dataset_index")
)

In [None]:
# Export the track embeddings, labelled with (artist, title), for TensorBoard.
writer = tb.SummaryWriter(comment='msd_ligtfm_embeddings', log_dir=DATA_DIR + "tb")
try:
    track_labels = list(dataset_tracks[["artist", "title"]].itertuples(index=False, name=None))
    writer.add_embedding(embeddings, metadata=track_labels, tag="lightfm", metadata_header=["artist", "title"])
finally:
    # Close the writer even when add_embedding raises, so the log file is
    # not left open.
    writer.close()

## Compute top recommendations

In [None]:
k = 20
max_tracks_from_same_artist = 5

# Resolve every dataset index to its (track id, artist) once, up front, instead
# of filtering the whole dataset_tracks frame for each neighbour inside the
# loops. setdefault keeps the first row seen for an index.
track_by_index = {}
for index, track_id, artist in dataset_tracks[["dataset_index", "track", "artist"]].itertuples(index=False, name=None):
    track_by_index.setdefault(int(index), (int(track_id), artist))

# One JSON line per track in track_meta: the track's own fields plus a
# "recommendations" list (left empty for tracks without a dataset index).
with open(DATA_DIR + f"recommendations_{k}_{max_tracks_from_same_artist}.json", "w") as rf:
    for _, track in tqdm.tqdm(track_meta.iterrows(), total=len(track_meta)):
        j = track["dataset_index"]

        recommendations = []
        if pd.notna(j):
            embedding = embeddings[int(j)]
            # Indices of all tracks ordered by decreasing dot product with this track.
            neighbours = np.argsort(-np.dot(embeddings, embedding))

            artists = defaultdict(int)
            for neighbour in neighbours:
                known = track_by_index.get(int(neighbour))
                if known is None:
                    # No metadata row for this index: skip it rather than fail.
                    continue
                recommendation, artist = known

                # Never recommend the track to itself.
                if recommendation == track["track"]:
                    continue

                # Cap how many recommendations may come from a single artist.
                if artists[artist] >= max_tracks_from_same_artist:
                    continue

                recommendations.append(recommendation)
                artists[artist] += 1

                if len(recommendations) == k:
                    break

        track_with_recommendations = dict(track)
        track_with_recommendations["recommendations"] = recommendations

        rf.write(json.dumps(track_with_recommendations) + "\n")

## How many unique artists per recommendation list?

In [None]:
# Track id -> artist lookup for every track that has a dataset index.
track_ids = dataset_tracks["track"].values.tolist()
artist_names = dataset_tracks["artist"].values.tolist()
dataset_track_artists = {
    track_id: artist_name
    for track_id, artist_name in zip(track_ids, artist_names)
}

In [None]:
# Reload the exported recommendations and keep the tracks that have a dataset index.
recs = pd.read_json(DATA_DIR + f"recommendations_{k}_{max_tracks_from_same_artist}.json", lines=True)
recs = recs[recs["dataset_index"].notnull()]

# Draw exactly one row. Sampling a 10% fraction and taking its first row did
# the same job at greater cost and raised IndexError when the fraction rounded
# down to zero rows.
sample = recs.sample(n=1).iloc[0]

# Show the seed track, then the artist of each recommended track, one per line.
print(sample["title"], "by" , sample["artist"], "\n===")
# str() keeps the join working when an artist value is not a string.
print("\n".join(str(dataset_track_artists[track]) for track in sample["recommendations"]))

In [None]:
def count_artists(tracks):
    """Return how many distinct artists the given track ids belong to.

    Artists are looked up in the notebook-level ``dataset_track_artists``
    mapping; a track id missing from it raises ``KeyError``.
    """
    distinct_artists = {dataset_track_artists[track_id] for track_id in tracks}
    return len(distinct_artists)


# Distinct-artist count for each recommendation list, followed by how often
# each count occurs (displayed as the cell output).
recommendation_lists = recs["recommendations"]
artist_counts = recommendation_lists.map(count_artists)
artist_counts.value_counts()