In [1]:
# common stuff
# Python 2/3 compatibility imports; on Python 3 these features are already on.
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
import warnings
# NOTE: this silences *all* warnings for the rest of the notebook, including
# deprecation and numerical ones.
warnings.simplefilter('ignore')
# %pylab populates the interactive namespace from numpy and matplotlib.
%pylab inline
%config InlineBackend.figure_format = 'png' 
from pylab import rcParams
# Default figure size (width, height) for every plot in the notebook.
rcParams['figure.figsize'] = 8,5
import numpy as np
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Install the latest implicit from source: the default version on Ubuntu 16.04
# appears to have a bug in CosineRecommender.
#   git clone https://github.com/benfred/implicit.git
#   cd implicit
#   sudo -H python3 setup.py install

In [3]:
# Clean up the dataset: drop malformed rows before handing the file to pandas.
raw_path = "lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv"
clean_path = "lastfm-dataset-360K/usersha1-artmbid-artname-plays.cleaned"

# Both files are managed by `with`, so the input handle is closed as well.
# The input is opened first: if it is missing, no empty output file is created.
with open(raw_path) as source:
    with open(clean_path, "w") as output:
        for i, line in enumerate(source):
            # Expected columns: user, artist-mbid, artist-name, total-plays
            tokens = line.strip().split("\t")
            if len(tokens) != 4:
                print("wrong # of tokens", i)
                continue
            if not tokens[3].isdigit():
                print("non integer play count", i)
                continue
            # tokens[2] is the artist *name*; a literal pair of double quotes
            # means the name is missing.
            if tokens[2] == '""':
                print("invalid artist name", i)
                continue
            # some lines contain carriage returns (without newlines), which
            # randomly messes pandas up
            line = line.replace('\r', '')
            output.write(line)

invalid artist id ""
invalid artist id ""
non integer play count 16890410


In [4]:
# Read in dataset
import csv

col_names = ["user", "artist-mbid", "artist-name", "total-plays"]
data = pd.read_csv(
    "lastfm-dataset-360K/usersha1-artmbid-artname-plays.cleaned",
    sep="\t",
    header=None,
    names=col_names,
    # Artist names are free text and may contain double quotes. With the
    # default quoting mode a field starting with '"' is parsed as a quoted
    # field, which can swallow tabs/newlines and silently merge rows. The
    # file is plain tab-separated text, so quote handling is disabled.
    quoting=csv.QUOTE_NONE,
)
data.head()

Unnamed: 0,user,artist-mbid,artist-name,total-plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [5]:
# Replace missing values (e.g. absent artist MBIDs) with the string "None",
# then map users and artists to dense 1-based integer ids.
filled = data.fillna("None")
user_codes = filled["user"].astype("category").cat.codes
artist_codes = filled["artist-mbid"].astype("category").cat.codes

# Two separate assign() calls keep the column order user_id, artist_id.
data = (
    filled
    .assign(user_id=user_codes + 1)
    .assign(artist_id=artist_codes + 1)
    .drop(["artist-name", "artist-mbid", "user"], axis=1)
)
display(data.head())
display(data.describe())

Unnamed: 0,total-plays,user_id,artist_id
0,2137,1,37426
1,1099,1,152041
2,897,1,112367
3,717,1,38435
4,706,1,117444


Unnamed: 0,total-plays,user_id,artist_id
count,17535652.0,17535652.0,17535652.0
mean,215.193186,179404.100173,80813.51224
std,614.481504,103588.706302,46071.36686
min,0.0,1.0,1.0
25%,35.0,89700.0,41580.0
50%,94.0,179376.0,83024.0
75%,224.0,269104.0,118609.0
max,419157.0,358868.0,160113.0


In [6]:
# Make a sparse matrix
from scipy.sparse import coo_matrix, csr_matrix, lil_matrix

def sparse_info(sparse_matrix: csr_matrix) -> None:
    """Print basic statistics of a sparse matrix.

    Prints the shape, the number of stored entries, the fill ratio and the
    mean / max / min of the stored values (``sparse_matrix.data``).

    Note that the value printed after ``NNZ%`` is a fraction in [0, 1]
    (stored entries divided by rows * columns), not a percentage.

    A matrix without stored entries, or with a zero-sized dimension, is
    reported with a fill ratio of 0.0 and ``nan`` statistics instead of
    raising.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix exposing ``shape``, ``nnz`` and ``data``.
    """
    n_rows, n_cols = sparse_matrix.shape
    if n_rows and n_cols:
        fill_ratio = sparse_matrix.nnz / n_rows / n_cols
    else:
        # Zero-sized matrix: avoid dividing by zero.
        fill_ratio = 0.0

    print("Shape: {}".format(sparse_matrix.shape))
    print("NNZ count: {}".format(sparse_matrix.nnz))
    print("NNZ%: {}".format(fill_ratio))

    stored = sparse_matrix.data
    if stored.size == 0:
        # max()/min() of an empty array raise ValueError; report nan instead.
        nan = float("nan")
        mean_value, max_value, min_value = nan, nan, nan
    else:
        mean_value, max_value, min_value = stored.mean(), stored.max(), stored.min()
    print("mean: {}".format(mean_value))
    print("max: {}".format(max_value))
    print("min: {}".format(min_value))

# Assemble the user x artist play-count matrix from (value, (row, col))
# triplets and convert it to CSR for row slicing.
play_counts = data["total-plays"].astype(np.float32)
user_rows = data["user_id"]
artist_cols = data["artist_id"]

plays = coo_matrix((play_counts, (user_rows, artist_cols))).tocsr()

sparse_info(plays)

Shape: (358869, 160114)
NNZ count: 17535285
NNZ%: 0.0003051740329513771
mean: 215.19761657714844
max: 419157.0
min: 0.0


In [7]:
# Now we want to make a train/test split, however, we need to be careful.
# We would like to use prec@k as a metric later. A k of 15 would be nice, but
# if we move 15 items from training to test for some of the users, then they may not have any data
# left in the training set. Thus, the train_test_split only looks for people who have at
# least 2*k plays of unique artists before moving some of their data to the test set.
def train_test_split(plays, split_count, fraction, seed=None):
    """Hold out `split_count` plays for a random subset of users.

    Parameters
    ----------
    plays : scipy.sparse.csr_matrix
        Matrix with one row per user and one column per item.
    split_count : int
        Number of stored entries moved from train to test for every selected
        user. Only users with at least ``2 * split_count`` stored entries are
        eligible.
    fraction : float
        The number of selected users is ``floor(fraction * plays.shape[0])``,
        i.e. a fraction of *all* rows, not of the eligible ones.
    seed : int, optional
        Seed for a private random generator, making the split reproducible.
        When omitted, the global ``np.random`` state is used.

    Returns
    -------
    train : csr_matrix
        Copy of `plays` with the held-out entries removed.
    test : csr_matrix
        Matrix of the same shape holding only the held-out entries.
    user_index : list of int
        Row indices of the users that received test entries.

    Raises
    ------
    ValueError
        If more test users are requested than there are eligible users.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    train = plays.copy().tocoo()
    test = lil_matrix(train.shape)

    entries_per_user = np.bincount(train.row, minlength=train.shape[0])
    eligible_users = np.where(entries_per_user >= split_count * 2)[0]
    n_test_users = int(np.floor(fraction * train.shape[0]))
    if n_test_users > len(eligible_users):
        raise ValueError(
            "requested {} test users but only {} users have at least {} plays"
            .format(n_test_users, len(eligible_users), split_count * 2)
        )
    if n_test_users == 0:
        user_index = []
    else:
        user_index = rng.choice(
            eligible_users, replace=False, size=n_test_users
        ).tolist()

    # LIL supports the per-row item assignment used below.
    train = train.tolil()
    for user in user_index:
        test_plays = rng.choice(
            plays.getrow(user).indices, size=split_count, replace=False
        )
        train[user, test_plays] = 0.
        test[user, test_plays] = plays[user, test_plays]

    # No entry may be present in both train and test.
    assert train.multiply(test).nnz == 0
    return train.tocsr(), test.tocsr(), user_index
# Seed the global NumPy RNG so the random split (and every metric computed
# from it) is reproducible after a kernel restart.
np.random.seed(42)
train, test, test_user_indices = train_test_split(plays, 15, 0.2)
# These are still user*item matrices, transpose
train = train.T
test = test.T
sparse_info(train)
sparse_info(test)

Shape: (160114, 358869)
NNZ count: 16458690
NNZ%: 0.0002864375916557103
mean: 215.186279296875
max: 419157.0
min: 0.0
Shape: (160114, 358869)
NNZ count: 1076595
NNZ%: 1.873644129566687e-05
mean: 215.37216594912664
max: 92357.0
min: 1.0


In [8]:
%%time

# Let's use cosine recommender
from implicit.nearest_neighbours import CosineRecommender
model = CosineRecommender()
model.fit(train)
sparse_info(model.similarity)

Shape: (160114, 160114)
NNZ count: 3153068
NNZ%: 0.00012299139357680342
mean: 0.25543252243529674
max: 1.0000000585200308
min: 0.0
CPU times: user 30.6 s, sys: 252 ms, total: 30.9 s
Wall time: 5.76 s


In [27]:
%%time

# Save predictions
! mkdir -p recs
train_user_item = train.T
with open("recs/test1.recs.tsv", "w") as output_file:
    for user_id in test_user_indices:
        for artist_id, score in model.recommend(user_id, train_user_item):
            output_file.write("%s\t%s\t%s\n" % (user_id, artist_id, score))

CPU times: user 44.6 s, sys: 56 ms, total: 44.6 s
Wall time: 45 s


In [22]:
%%time

# Save test ground truth
clmns = ["user_id", "artist_id", "total-plays"]
test_user_item = test.T
rows, cols = test_user_item.nonzero()
records = []
for i in range(len(rows)):    
    records.append((rows[i], cols[i], test_user_item[rows[i], cols[i]]))
df = pd.DataFrame.from_records(records, columns=clmns)
df.to_csv('test1', sep="\t", header=False, index=False)

In [28]:
# Evaluate
! mrec_evaluate \
    --input_format=tsv --test_input_format=tsv \
    --train test1 \
    --recsdir recs
    
# prec@k is only meaningful up to k=15, because 15 artists per test user were
# held out in the train/test split.
# NOTE(review): in the recorded output prec@15 and prec@20 are exactly
# prec@10 * 10/15 and prec@10 * 10/20, which suggests only 10 recommendations
# per user were available in recs/ -- confirm how many are written.

[2017-11-25 21:41:03,732] INFO: processing /home/stas/Projects/1-netology/Netology/lab2/test1...
None
mrr            0.2300 +/- 0.0000
prec@5         0.0947 +/- 0.0000
prec@10        0.0744 +/- 0.0000
prec@15        0.0496 +/- 0.0000
prec@20        0.0372 +/- 0.0000


In [33]:
%%time

# Now let's use ALS, let's hardcode factors for now

from implicit.als import AlternatingLeastSquares
model = AlternatingLeastSquares(factors=50)
model.fit(train)



CPU times: user 10min 30s, sys: 10min 56s, total: 21min 27s
Wall time: 2min 46s


In [34]:
%%time

# Save predictions
! mkdir -p recs2
with open("recs2/test1.recs.tsv", "w") as output_file:
    for user_id in test_user_indices:
        for artist_id, score in model.recommend(user_id, train_user_item):
            output_file.write("%s\t%s\t%s\n" % (user_id, artist_id, score))

CPU times: user 25min 44s, sys: 24min 4s, total: 49min 48s
Wall time: 6min 24s


In [35]:
# Evaluate
! mrec_evaluate \
    --input_format=tsv --test_input_format=tsv \
    --train test1 \
    --recsdir recs2
    
# ALS is clearly better than the cosine kNN model in the recorded runs:
# MRR 0.23 -> 0.43, prec@5 0.095 -> 0.212 (~2.2x), prec@15 0.050 -> 0.118 (~2.4x).

[2017-11-25 22:05:17,148] INFO: processing /home/stas/Projects/1-netology/Netology/lab2/test1...
None
mrr            0.4299 +/- 0.0000
prec@5         0.2120 +/- 0.0000
prec@10        0.1772 +/- 0.0000
prec@15        0.1182 +/- 0.0000
prec@20        0.0886 +/- 0.0000
