In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
import os

ModuleNotFoundError: No module named 'pandas'

In [2]:
# NOTE(review): machine-specific absolute Windows path. The relative "../data/..." path
# used by the loading cell is resolved against the working directory set here, so
# this line has to be adapted before the notebook can run on another machine.
os.chdir("C:\\Users\\simon.weiss\\Documents\\Freaky-Friday\\recommender\\recommender-lastfm\\code")

In [3]:
# Load the tab-separated play-count file. Only columns 0, 2 and 3 are read and
# they are named user / artist / plays; NA detection is switched off so that
# no artist name is converted to NaN.
lastfm = pd.read_csv(
    "../data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv",
    sep="\t",
    usecols=[0, 2, 3],
    names=["user", "artist", "plays"],
    na_filter=False,
    encoding="utf-8",
)

In [4]:
# Preview the first rows of the raw play-count table.
lastfm.head()

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


I print basic stats relevant for recommender systems.

In [5]:
def print_stats(df, user, item):
    """Print the number of users, the number of items and the matrix sparsity.

    Sparsity is the percentage of (user, item) cells that have no row in
    ``df``: ``(1 - rows / (n_users * n_items)) * 100``.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table with one row per observed (user, item) pair.
    user : str
        Name of the column holding the user identifier.
    item : str
        Name of the column holding the item (artist) identifier.

    Returns
    -------
    None
        The three statistics are written to stdout.
    """
    # nunique() ignores missing values, like drop_duplicates().count() did.
    # Converting to Python ints keeps the product below free of fixed-width
    # integer overflow, whatever the platform's default integer size is.
    n_users = int(df[user].nunique())
    n_artists = int(df[item].nunique())
    n_cells = n_users * n_artists

    if n_cells:
        sparsity = (1 - float(df.shape[0]) / n_cells) * 100
    else:
        # An empty table has no user-item matrix; report NaN instead of
        # raising ZeroDivisionError.
        sparsity = float("nan")

    print("Number of Users: {}".format(n_users))
    print("Number of Artists: {}".format(n_artists))
    # Format the number itself. Slicing str(sparsity) to 8 characters dropped
    # the exponent of very small values and truncated instead of rounding.
    print("Sparsity: {:.5f} %".format(sparsity))

In [6]:
# Baseline statistics of the unfiltered dataset.
print_stats(lastfm, "user", "artist")

Number of Users: 358868
Number of Artists: 292365
Sparsity: 99.98328 %


The sparsity is above 99%, which means that more than 99% of the values in the user-artist matrix are missing. Matrix factorization algorithms excel in these sparse data contexts; however, at more than 99% sparsity there might not be enough "signal" to produce valuable recommendations. I will deal with this problem next by removing some users and artists from the dataset.

Looking at artists that were played by only one user, but many times, can give insights about hardcore fans of unknown bands. For example, the artist *Mr. Silakka* has been played by only one user, but that user played *Mr. Silakka* 32366 times!

In [7]:
# Per row: how many artist rows the row's user has, and how many user rows
# the row's artist has. Both counts are stored as new columns on lastfm.
lastfm["dist_user_plays"] = lastfm.groupby("user")["artist"].transform("count")
lastfm["dist_artist_plays"] = lastfm.groupby("artist")["user"].transform("count")

In [8]:
# Artists with the fewest distinct listeners first; within the same listener
# count, the rows with the most plays come first.
lastfm.sort_values(["dist_artist_plays", "plays"], ascending=[True, False]).head()

Unnamed: 0,user,artist,plays,dist_user_plays,dist_artist_plays
13497754,c5403d0be4692edf6f57f871c078c288955e2a3b,Макс иванов,36285,43,1
6525045,5f662478181f3f7bc04266885f8136f28be4fc3f,mr. silakka,32366,50,1
2911499,2a8f828c08d6737cd9bcdd762348060c07d557f8,coca cola christmas,27604,50,1
9714637,8ddf34c21d1247e24966e7188bbac69261b34fd7,2brother,23675,49,1
160002,024dafcb227af455f06ccbe504bb7dc366264d2c,big lonz,22202,42,1


However, from a recommendation perspective there is not much to learn from these users and artists. Hence, I remove any user that listened to only one artist. Next, I remove artists that were played by only one distinct user (although the user might have played that artist multiple times).

Naturally, we won't be able to get recommendations for the users we remove nor will the removed artists be recommended. In a production application I would treat these users and artists like new users and artists. For these one has to define a cold-start strategy like recommending *most popular on average*. 

In [9]:
# Keep only rows whose user has more than one artist row and whose artist has
# more than one user row. reset_index() keeps the old row labels in an
# "index" column.
keep_rows = (lastfm["dist_user_plays"] > 1) & (lastfm["dist_artist_plays"] > 1)
data = lastfm[keep_rows].reset_index()

In [10]:
# Same statistics after the filtering step, for comparison with the raw data.
print_stats(data, "user", "artist")

Number of Users: 358833
Number of Artists: 159602
Sparsity: 98.78800 %


  after removing the cwd from sys.path.


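Sparsity went down to 98.8%, perfect! This also reduces computational complexity significantly. We now need to produce only `358833 users * 159602 artists = 57,270,464,466` recommendations instead of `104,920,442,820`. It nearly halved the number of recommendations to calculate. (Note: the 98.8% figure should be double-checked. With roughly 17 million remaining rows spread over 57,270,464,466 cells the sparsity works out to about 99.97%; the printed value looks like the result of a 32-bit integer overflow in `n_users*n_artists` from an earlier run.)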

In [11]:
# Turn both identifier columns into categoricals so that every user and every
# artist gets an integer code.
for id_column in ("user", "artist"):
    data[id_column] = data[id_column].astype("category")

# Sparse matrix of play counts with artists on the rows and users on the
# columns, addressed by the category codes.
play_counts = data["plays"].astype(float)
artist_codes = data["artist"].cat.codes
user_codes = data["user"].cat.codes
artist_user_mat = coo_matrix((play_counts, (artist_codes, user_codes)))

Dictionaries to translate between names and ids and vice versa

In [12]:
# Category code -> artist name, and the inverse lookup.
artist_names = data["artist"].cat.categories
ids_to_artists = {code: name for code, name in enumerate(artist_names)}
artists_to_ids = {name: code for code, name in ids_to_artists.items()}

# Category code -> user hash, and the inverse lookup.
user_names = data["user"].cat.categories
ids_to_users = {code: name for code, name in enumerate(user_names)}
users_to_ids = {name: code for code, name in ids_to_users.items()}

In [13]:
# disable internal multithreading
# NOTE(review): this sets the environment variable for the running kernel; whether an
# already-imported numpy/MKL picks it up should be confirmed.
%env MKL_NUM_THREADS=1 

env: MKL_NUM_THREADS=1


In [14]:
#TODO: Confidence !!

In [15]:
# ALS hyperparameters: 40 latent factors, float64 factor matrices, 15 iterations.
als_params = dict(factors=40, dtype=np.float64, iterations=15)
model = AlternatingLeastSquares(**als_params)

# Fit on the artist x user play-count matrix.
model.fit(artist_user_mat)

In [16]:
# enable internal multithreading
# NOTE(review): the value 4 is hard-coded; presumably it matches the core count of the
# author's machine, so adjust it when running elsewhere.
%env MKL_NUM_THREADS=4 

env: MKL_NUM_THREADS=4


As a sanity check one can look at the similarity between items. Looks good to me.

In [17]:
# Ten most similar artists to "metallica", with ids translated back to names.
metallica_id = artists_to_ids["metallica"]
[(ids_to_artists[artist_id], score) for artist_id, score in model.similar_items(metallica_id, 10)]

[('metallica', 1.0000000000000002),
 ('system of a down', 0.9848837074352589),
 ('rammstein', 0.9745327928234145),
 ('ac/dc', 0.9684443468017192),
 ('marilyn manson', 0.9639789155981889),
 ('the offspring', 0.9629375831115958),
 ('rage against the machine', 0.9623200350814303),
 ('nirvana', 0.9553169242264237),
 ('green day', 0.9499000909667074),
 ('tool', 0.9461640766784706)]

Next, I pick a random user with enough plays to calculate recommendations and look at the artists that user listens to most

In [18]:
# The example user, referenced once by hash and once by integer id.
specific_user = "d6eb36bb5fe4081f06b4fcf5fd608fafd77ad687"
specific_user_id = users_to_ids[specific_user]

# That user's rows, most-played artists first.
specific_user_rows = data[data["user"] == specific_user]
specific_user_rows.sort_values(["plays"], ascending=False).head()

Unnamed: 0,index,user,artist,plays,dist_user_plays,dist_artist_plays
14607267,14718530,d6eb36bb5fe4081f06b4fcf5fd608fafd77ad687,bob dylan,3047,112,31799
14607268,14718531,d6eb36bb5fe4081f06b4fcf5fd608fafd77ad687,bruce springsteen,1370,112,13648
14607269,14718532,d6eb36bb5fe4081f06b4fcf5fd608fafd77ad687,the clash,1028,112,19423
14607270,14718533,d6eb36bb5fe4081f06b4fcf5fd608fafd77ad687,tom waits,803,112,19976
14607271,14718534,d6eb36bb5fe4081f06b4fcf5fd608fafd77ad687,håkan hellström,699,112,2751


Now we recommend artists to that specific user. The recommendations look good to me.

In [19]:
# recommend() is given the user x artist orientation, so transpose the
# artist x user matrix and convert it to CSR.
user_artist_mat = artist_user_mat.T.tocsr()

# Top 15 recommendations for the example user, with artist ids mapped to names.
user_recommendations = model.recommend(specific_user_id, user_artist_mat, N=15)
[(ids_to_artists[artist_id], score) for artist_id, score in user_recommendations]

[('glasvegas', 1.1588939004469023),
 ('buddy holly', 1.1463831720994624),
 ('josh rouse', 1.1204749747771507),
 ('roy orbison', 1.1121315549741937),
 ('the soundtrack of our lives', 1.0903597384375263),
 ('lou reed', 1.0777825705680872),
 ('dusty springfield', 1.0626501871007736),
 ('van morrison', 1.0598874312014752),
 ('billy bragg', 1.0481429305472343),
 ('dolly parton', 1.042571271234622),
 ('the velvet underground', 1.0359946433881955),
 ('adam green', 1.0196747932334036),
 ('the pogues', 1.0183723786909082),
 ('ron sexsmith', 1.0177700468442497),
 ('antony and the johnsons', 1.0113270299816617)]

As a next step we can calculate which user-item interaction, i.e. the user listening to an artist, had the biggest influence on a specific recommendation. This is useful for explaining the recommendations to users, e.g. "Because you listened to *Metallica*, here are some other metal bands...". These explanations were found to increase user experience and acceptance.

In [20]:
# Explain why "billy bragg" is recommended to the example user. Only the
# second element of the returned triple is used: (artist id, weight) pairs.
explained_artist_id = artists_to_ids["billy bragg"]
_, explanations, _ = model.explain(
    userid=specific_user_id,
    user_items=user_artist_mat,
    itemid=explained_artist_id,
)
[(ids_to_artists[artist_id], weight) for artist_id, weight in explanations]

[('bruce springsteen', 0.2242125635860653),
 ('the clash', 0.14642937494579433),
 ('drive-by truckers', 0.11852761308854332),
 ('jonathan richman', 0.09948623819309899),
 ('the hold steady', 0.08622294041591065),
 ('lloyd cole and the commotions', 0.08326799576045336),
 ('madness', 0.07016338112690264),
 ('moneybrother', 0.06481854344063194),
 ('townes van zandt', 0.056589804787619845),
 ('elvis costello & the attractions', 0.05018096418104496)]

One of the biggest influences on the recommendation *Billy Bragg* is *The Clash* (second only to *Bruce Springsteen* in the output above). Personally, I don't know *Billy Bragg* but Wikipedia says that "Bragg was particularly influenced by the Clash, whom he'd seen play live in London in May 1977 on their White Riot Tour,..."

To get all recommendations one just multiplies the item factor matrix with the transposed user factor matrix. This is a pretty big operation since the resulting matrix is completely dense, with a size of `number_of_items x number_of_users`.

In [22]:
# Dense score matrix for every (artist, user) pair: item_factors @ user_factors.T.
# With the counts printed earlier (159602 artists x 358833 users) the result
# holds 57,270,464,466 float64 values, i.e. roughly 458 GB, so the allocation
# fails with MemoryError.
all_recommendations = np.dot(model.item_factors, model.user_factors.T)

# dgemm implementation
#from scipy.linalg import blas as blas
#X = blas.dgemm(alpha=1., a=np.asfortranarray(model.item_factors), b=np.asfortranarray(model.user_factors), trans_b=True)

MemoryError: 

In [69]:
# Memory footprint of the user factor matrix (element count times element size).
print("{} bytes".format(model.user_factors.nbytes))

114826560 bytes


The following also throws a memory error because the zeros matrix is already too big.

In [65]:
# https://github.com/numpy/numpy/issues/4062
#
# Accumulate item_factors @ user_factors.T block by block along the factor
# axis. Summing the partial products of matching column blocks gives the same
# result as the single full product.
#
# NOTE: the accumulator B is a dense (n_items, n_users) float64 matrix, so on
# the full dataset this cell still fails with MemoryError when B is allocated.

item_factors, user_factors = model.item_factors, model.user_factors

# Take the dimensions from the factor matrices instead of hard-coding them.
n_items, n_factors = item_factors.shape
n_users = user_factors.shape[0]
B = np.zeros((n_items, n_users))

# array_split returns views, so the model's factor matrices are left untouched
# (assigning to .shape would have reshaped them in place).
n_blocks = min(4, n_factors)
item_blocks = np.array_split(item_factors, n_blocks, axis=1)
user_blocks = np.array_split(user_factors, n_blocks, axis=1)

# zip pairs each item block with the user block covering the same factors.
for item_block, user_block in zip(item_blocks, user_blocks):
    B += np.dot(item_block, user_block.T)

MemoryError: 

# Old stuff

In [None]:
def _print_prune_stats(data, user_col, item_col):
    """Print user count, item count and sparsity (in percent) of ``data``."""
    n_users = int(data[user_col].nunique())
    n_artists = int(data[item_col].nunique())
    n_cells = n_users * n_artists
    if n_cells:
        sparsity = (1 - float(data.shape[0]) / n_cells) * 100
    else:
        # Nothing left to build a matrix from; avoid ZeroDivisionError.
        sparsity = float("nan")
    print("Number of Users: {}".format(n_users))
    print("Number of Artists: {}".format(n_artists))
    # Format the number itself instead of slicing its string representation.
    print("Sparsity: {:.2f} %".format(sparsity))


def prune(data, user_col, item_col, user_min, item_min):
    """Repeatedly drop low-activity users and items until nothing changes.

    In every pass, rows of users with at most ``user_min`` rows are removed
    first; then, on the remaining rows, rows of items with at most
    ``item_min`` rows are removed. Passes repeat until a pass removes no row
    or no rows are left. Statistics are printed before and after pruning.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table with one row per (user, item) pair. It is not
        modified.
    user_col, item_col : str
        Names of the user and item columns.
    user_min, item_min : int
        A user / item is kept only if its row count is strictly greater than
        this threshold.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with the same columns as ``data``. If ``data`` is
        empty it is returned as is.
    """
    _print_prune_stats(data, user_col, item_col)

    pruned = data
    while not pruned.empty:
        rows_before = pruned.shape[0]

        # The counts are kept as standalone Series, so no helper column is
        # written into the caller's frame or leaks into the result.
        rows_per_user = pruned.groupby(user_col)[item_col].transform("count")
        pruned = pruned[rows_per_user > user_min]
        if pruned.empty:
            break

        # Item counts are taken after the user filter, so they reflect the
        # rows that are still present.
        rows_per_item = pruned.groupby(item_col)[user_col].transform("count")
        pruned = pruned[rows_per_item > item_min]

        if pruned.shape[0] == rows_before:
            break

    _print_prune_stats(pruned, user_col, item_col)
    return pruned