In [None]:
import pandas as pd
import numpy as np

In [None]:
plays = pd.read_csv('data/user_artists.dat', sep='\t')
artists = pd.read_csv('data/artists.dat', sep='\t', usecols=['id','name'])

# Merge artist and user pref data
ap = pd.merge(artists, plays, how="inner", left_on="id", right_on="artistID")
ap = ap.rename(columns={"weight": "playCount"})

# Group artist by name
artist_rank = ap.groupby(['name']) \
    .agg({'userID' : 'count', 'playCount' : 'sum'}) \
    .rename(columns={"userID" : 'totalUsers', "playCount" : "totalPlays"}) \
    .sort_values(['totalPlays'], ascending=False)

artist_rank['avgPlays'] = artist_rank['totalPlays'] / artist_rank['totalUsers']
print(artist_rank)

In [None]:
ap

In [None]:
# Merge into ap matrix
ap = ap.join(artist_rank, on="name", how="inner") \
    .sort_values(['playCount'], ascending=False)

# Preprocessing
pc = ap.playCount
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())
ap = ap.assign(playCountScaled=play_count_scaled)
#print(ap)

# Build a user-artist rating matrix 
ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
ratings = ratings_df.fillna(0).values

# Show sparsity
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100
print("sparsity: %.2f" % sparsity)

In [5]:
ratings_df.shape

(1892, 17632)

In [6]:
from scipy.sparse import csr_matrix

# Build a sparse matrix
X = csr_matrix(ratings)

n_users, n_items = ratings_df.shape
print("rating matrix shape", ratings_df.shape)

user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()
#print(type(X))

rating matrix shape (1892, 17632)


In [7]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset
np.random
# Build data references + train test
Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data)) 
train, test = random_train_test_split(interactions, random_state=42)

# Ignore that (weight seems to be ignored...)
#train = train_.tocsr()
#test = test_.tocsr()
#train[train==1] = X[train==1]
#test[test==1] = X[test==1]

# To be completed...

In [8]:
print(Dataset())

<lightfm.data.Dataset object at 0x7f4774faf6a0>


In [9]:
# Train
model = LightFM(learning_rate=0.05, loss='warp', random_state=42)
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f4774faf640>

In [10]:
# Evaluate
train_precision1 = precision_at_k(model, train, k=10).mean()
test_precision1 = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc1 = auc_score(model, train).mean()
test_auc1 = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision1, test_precision1))
print('AUC: train %.2f, test %.2f.' % (train_auc1, test_auc1))

Precision: train 0.38, test 0.13.
AUC: train 0.97, test 0.86.


In [11]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(len(top_items))

17632


In [12]:
len()

TypeError: len() takes exactly one argument (0 given)

1. logistic

In [None]:
# Train
model = LightFM(learning_rate=0.05, loss='logistic',random_state=42)
model.fit(train, epochs=10, num_threads=2)

# Evaluate
train_precision2 = precision_at_k(model, train, k=10).mean()
test_precision2 = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc2 = auc_score(model, train).mean()
test_auc2 = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision2, test_precision2))
print('AUC: train %.2f, test %.2f.' % (train_auc2, test_auc2))

# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

2.bpr

In [None]:
# Train
model = LightFM(learning_rate=0.05, loss='bpr',random_state=42)
model.fit(train, epochs=10, num_threads=2)

# Evaluate
train_precision3 = precision_at_k(model, train, k=10).mean()
test_precision3 = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc3 = auc_score(model, train).mean()
test_auc3 = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision3, test_precision3))
print('AUC: train %.2f, test %.2f.' % (train_auc3, test_auc3))

# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

3.warp-kos

In [None]:
# Train
model = LightFM(learning_rate=0.05, loss='warp-kos',random_state=42)
model.fit(train, epochs=10, num_threads=2)

# Evaluate
train_precision3 = precision_at_k(model, train, k=10).mean()
test_precision3 = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc3 = auc_score(model, train).mean()
test_auc3 = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision3, test_precision3))
print('AUC: train %.2f, test %.2f.' % (train_auc3, test_auc3))

# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

In [None]:
def score():
    df = pd.DataFrame(columns=["WARP", "LOGISTIC", "BPR" ,"KOS-WARP" ])
    loss = ['warp', 'logistic', 'bpr', 'warp-kos' ]

    for i,j in enumerate(loss):

        model = LightFM(learning_rate=0.05, loss= j,random_state=42)
        model.fit(train, epochs=10, num_threads=2)
        a = precision_at_k(model, train, k=5).mean()
        b = precision_at_k(model, test, k=5, train_interactions=train).mean()
        c = auc_score(model, train).mean()
        d = auc_score(model, test, train_interactions=train).mean()
        this_column = df.columns[i]
        df[this_column] = [a,b,c,d]
    return(pd.DataFrame(df))

In [None]:
score()

## Part 1

Voici deux sous taches supplémentaire qui vont nous aider à evaluer/interpréter notre modéle, après l'obtention des tableaux de résultats :

1. faire la fonction get_recommandation qui prend en entrée un User et renvoie les Artists recommandé (du meilleurs au moins bon au sens du score de recommandation)

    2. get_ground_truth qui renvoie les artistes ecoutés par un utilisateur par ordre décroissant du playCountScaled

Ceci nous permettra d"evaluer qualitatement les résultats que retourne le modéle et le comparer avec la vérité terrain

In [None]:
def get_recommendations(user_id):
    # initialize the model
    model = LightFM(learning_rate=0.05, loss='bpr', random_state=42)
    model.fit(train, epochs=10, num_threads=2)
    # predict
    scores = model.predict(user_id, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    return(pd.DataFrame(top_items[:10]))

In [None]:
get_recommendations(1)

In [None]:
#get_ground_truth qui renvoie les artistes ecoutés par un utilisateur par ordre décroissant du playCountScaled
def get_ground_truth(user_id): 
    t = get_recommendations(user_id)
    z = top_items.join(artist_rank, on="")
    return(z)

In [None]:
def get_ground_truth(user_id):
    ground_ap = ap(user_id)(userID,name,playCountScaled)
    ground_truth = ground.sort_values(by='playCountScaled', ascending=True)

In [None]:
get_ground_truth(1)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

param_grid = {'learning_schedule':['adagrad', 'adadelta'], 
              'loss':['warp', 'logistic','bpr','warp-kos'],
             'random_state':[0, 42, 100]}
params = list(ParameterGrid(param_grid))

In [None]:
score = []
for grid in params:
    model = LightFM(**grid)
    pred = model.fit(train)
    score.append(round(auc_score(model, train).mean(),3))

In [None]:
max_value = np.argmax(score)
max_value_item = params[max_value].items()
print(max_value)
print(max_value_item)

In [None]:
value =  dict()
value[max_value] = max_value_item
value