In [1]:
# common stuff
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
import warnings
warnings.simplefilter('ignore')
%pylab inline
%config InlineBackend.figure_format = 'png' 
from pylab import rcParams
rcParams['figure.figsize'] = 8,5
import numpy as np
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the user-assigned tags and lower-case them so that tags differing
# only in letter case are counted as the same tag.
tags_df = pd.read_csv("ml-latest-small/tags.csv")
tags_df["tag"] = tags_df["tag"].str.lower()
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
tags_df.info()
display(tags_df.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296 entries, 0 to 1295
Data columns (total 4 columns):
userId       1296 non-null int64
movieId      1296 non-null int64
tag          1296 non-null object
timestamp    1296 non-null int64
dtypes: int64(3), object(1)
memory usage: 50.6+ KB


None

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,cambodia,1170560997
3,15,32892,russian,1170626366
4,15,34162,forgettable,1141391765


In [3]:
# First look at the data: there are 560 distinct tags and many of them look
# like noise (e.g. "getdvd", "toplist07"). Tags that were probably set on a
# single movie only are of no use, so count how often each tag occurs.
vc = tags_df["tag"].value_counts()
display(vc.head(20), len(vc))

getdvd        33
ei muista     30
toplist07     26
tivo          26
tcm           20
toplist12     20
toplist11     20
toplist15     19
toplist08     19
toplist14     18
toplist10     18
toplist09     18
funny         17
toplist06     16
holes00s      16
holes70s      16
holes80s      14
sightsound    14
toplist13     14
holes40s      13
Name: tag, dtype: int64

560

In [4]:
# Tags that were applied more than once.
vc_g1 = vc[vc > 1]
display(len(vc_g1))
# There're only 161 tags that are used more than once, let's refine our dataframe
# we'll keep only lines with those tags
tags_df = tags_df[tags_df["tag"].isin(vc_g1.index)]
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
tags_df.info()
display(tags_df.head(20))

161

<class 'pandas.core.frame.DataFrame'>
Int64Index: 897 entries, 9 to 1294
Data columns (total 4 columns):
userId       897 non-null int64
movieId      897 non-null int64
tag          897 non-null object
timestamp    897 non-null int64
dtypes: int64(3), object(1)
memory usage: 35.0+ KB


None

Unnamed: 0,userId,movieId,tag,timestamp
9,15,100365,documentary,1425876220
12,68,2174,music,1249808064
13,68,2174,weird,1249808102
14,68,8623,steve martin,1249808497
15,73,107999,action,1430799184
16,73,107999,anime,1430799184
17,73,107999,kung fu,1430799184
18,73,111624,drama,1431584497
19,73,111624,indie,1431584497
20,73,111624,love,1431584497


In [5]:
# Let's have a look at how many words can each "tag" have
# Most of the tags are single words, but there're 2+ word tags as well
import re
# Split on runs of spaces/tabs. The pattern is a raw string: the previous
# non-raw literal contained an invalid backslash-space escape, which newer
# Python versions warn about. The regex itself matches exactly the same text.
tl = tags_df["tag"].apply(lambda x: len(re.split(r"[ \t]+", x)))
tl.value_counts()

1    696
2    183
3     12
4      6
Name: tag, dtype: int64

In [6]:
# Peek at a few of the tags made of 2, 3 and 4 words.
# They look like perfectly valid tags, so they are kept unchanged.
for n_words in (2, 3, 4):
    display(tags_df[tl == n_words].head(5))
# Renumber the rows 0..n-1 now that the rare tags have been filtered out.
tags_df = tags_df.reset_index(drop=True)

Unnamed: 0,userId,movieId,tag,timestamp
14,68,8623,steve martin,1249808497
17,73,107999,kung fu,1430799184
29,77,5909,takashi miike,1163219591
37,94,64957,original plot,1291781246
48,138,260,science fiction,1440379018


Unnamed: 0,userId,movieId,tag,timestamp
94,152,52319,world war ii,1335900622
187,212,66934,neil patrick harris,1253926160
196,212,68157,world war ii,1253926408
391,364,1732,nudity (full frontal),1444535170
396,364,2068,coming of age,1444530920


Unnamed: 0,userId,movieId,tag,timestamp
24,77,1199,trilogy of the imagination,1163220043
26,77,2968,trilogy of the imagination,1163220039
27,77,4467,trilogy of the imagination,1163220065
812,531,6942,nudity (topless - notable),1243454950
821,531,35836,nudity (topless - notable),1243454603


In [7]:
from scipy.sparse import coo_matrix, csr_matrix, lil_matrix

def sparse_info(sparse_matrix: csr_matrix) -> None:
    """Print basic statistics of a sparse matrix.

    Prints the shape, the number of stored entries (``nnz``), the fill ratio
    (``nnz / rows / cols``) and the mean / max / min of the stored values.

    A matrix without stored values reports ``nan`` for mean / max / min, and a
    matrix with a zero-sized dimension reports a fill ratio of ``0.0``; both
    cases used to raise.

    Parameters
    ----------
    sparse_matrix : csr_matrix
        Matrix to describe; only ``shape``, ``nnz`` and ``data`` are read.
    """
    n_rows, n_cols = sparse_matrix.shape
    print("Shape: {}".format(sparse_matrix.shape))
    print("NNZ count: {}".format(sparse_matrix.nnz))
    # Same arithmetic as before (two successive divisions) so the printed
    # value is unchanged; only the zero-sized case is guarded.
    if n_rows and n_cols:
        fill_ratio = sparse_matrix.nnz / n_rows / n_cols
    else:
        fill_ratio = 0.0
    print("NNZ%: {}".format(fill_ratio))

    values = sparse_matrix.data
    if values.size:
        mean, largest, smallest = values.mean(), values.max(), values.min()
    else:
        # max()/min() of an empty array raise ValueError.
        mean = largest = smallest = float("nan")
    print("mean: {}".format(mean))
    print("max: {}".format(largest))
    print("min: {}".format(smallest))
    
# Let's write a movie-tag sparse matrix getter, it accepts movieId-based category
# and generates movie indices from it.
def get_movie_tag_matrix(ct, tags=None):
    """Build a binary movie x tag sparse matrix.

    Parameters
    ----------
    ct : pandas.Series with category dtype
        movieId values. A movie's row index is its category code + 1, so
        row 0 is never used.
    tags : pandas.DataFrame, optional
        Frame with at least the columns ``movieId`` and ``tag``. Defaults to
        the module-level ``tags_df``.

    Returns
    -------
    scipy.sparse.csr_matrix
        float32 matrix of shape ``(ct.cat.codes.max() + 2, n_tags)`` holding
        1.0 for every distinct (movie, tag) pair. ``n_tags`` is the number of
        distinct tags left after dropping rows whose movie is not in ``ct``
        (rows containing any missing value are dropped as well).
    """
    if tags is None:
        tags = tags_df
    movie_index = dict(zip(ct, ct.cat.codes + 1))
    # Movies that are absent from `ct` map to NaN and are removed by dropna().
    df = tags.assign(movie_id=tags["movieId"].map(movie_index)).dropna()
    tag_cat = df["tag"].astype("category")
    # The mapped ids are floats whenever a NaN was produced, so cast both id
    # columns to integers before using them as matrix indices.
    pairs = pd.DataFrame({
        "movie_id": df["movie_id"].astype(np.int64),
        "tag_id": tag_cat.cat.codes.astype(np.int64),
    }).drop_duplicates()
    n_rows = int(ct.cat.codes.max()) + 2
    # Equals "largest tag code + 1" and stays defined when no row survives.
    n_tags = len(tag_cat.cat.categories)
    movie_tag_matrix = csr_matrix(
        (
            np.ones(len(pairs), dtype=np.float32),
            (pairs["movie_id"].values, pairs["tag_id"].values),
        ),
        shape=(n_rows, n_tags),
        dtype=np.float32
    )
    return movie_tag_matrix
# Sanity check: build the matrix from the movies of the tags dataframe itself
mtm = get_movie_tag_matrix(tags_df["movieId"].astype("category"))
sparse_info(mtm)
n_tagged_movies = tags_df["movieId"].nunique()
# One row per tagged movie plus the unused row 0, one column per kept tag
assert mtm.shape[0] == n_tagged_movies + 1
assert mtm.shape[1] == len(vc_g1)
# Every stored value must be exactly 1
for stat in (mtm.data.mean(), mtm.data.min(), mtm.data.max()):
    assert stat == 1
# Looks correct

Shape: (585, 161)
NNZ count: 872
NNZ%: 0.00925837447576578
mean: 1.0
max: 1.0
min: 1.0


In [8]:
# Let's open ratings and replace the raw ids by dense 1-based indices
# (category code + 1), keeping the movie category for the tag matrix later.
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
movie_cat = ratings_df["movieId"].astype("category")
ratings_df["movie_id"] = movie_cat.cat.codes.copy() + 1
ratings_df["user_id"] = ratings_df["userId"].astype("category").cat.codes.copy() + 1
ratings_df = ratings_df.drop(['userId', 'timestamp', 'movieId'], axis=1)
last_movie_id = ratings_df["movie_id"].max()
last_user_id = ratings_df["user_id"].max()
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
ratings_df.info()
display(ratings_df.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100004 entries, 0 to 100003
Data columns (total 3 columns):
rating      100004 non-null float64
movie_id    100004 non-null int16
user_id     100004 non-null int16
dtypes: float64(1), int16(2)
memory usage: 1.9 MB


None

Unnamed: 0,rating,movie_id,user_id
0,2.5,31,1
1,3.0,834,1
2,3.0,860,1
3,2.0,907,1
4,4.0,932,1


In [9]:
# Let's create user*movie matrix
# `.values` replaces DataFrame.as_matrix(), which no longer exists in
# current pandas releases.
user_x_item = ratings_df[["user_id", "movie_id"]].values
mean_rating = ratings_df["rating"].mean()
# A rating above the global mean counts as a "like" (1.0); every other rating
# becomes 0.0 and is removed from the matrix by eliminate_zeros() below.
liked = (ratings_df["rating"] > mean_rating).values.astype(np.float32)
user_item_matrix = csr_matrix(
    (
        liked,
        (user_x_item[:, 0], user_x_item[:, 1]),
    ),
    shape=(last_user_id + 1, last_movie_id + 1),
    dtype=np.float32
)
user_item_matrix.eliminate_zeros()
sparse_info(user_item_matrix)
# And corresponding movie-tag matrix
movie_tag_matrix = get_movie_tag_matrix(movie_cat)
sparse_info(movie_tag_matrix)

Shape: (672, 9067)
NNZ count: 51568
NNZ%: 0.008463449347975653
mean: 1.0
max: 1.0
min: 1.0
Shape: (9067, 160)
NNZ count: 813
NNZ%: 0.0005604113819344876
mean: 1.0
max: 1.0
min: 1.0


In [10]:
# train_test_split from lab2:

# Now we want to make a train/test split, however, we need to be careful.
# We would like to use prec@k as a metric later. A k of 15 would be nice, but
# if we move 15 items from training to test for some of the users, then they may not have any data
# left in the training set. Thus, the train_test_split only looks for users who have at
# least 2*k liked movies (in lab2 these were plays of unique artists) before moving some
# of their data to the test set.
def train_test_split(plays, split_count, fraction, seed=None):
    """Move ``split_count`` interactions of randomly chosen users into a test set.

    Parameters
    ----------
    plays : scipy.sparse.csr_matrix
        User x item interaction matrix.
    split_count : int
        Number of interactions moved to the test set for every selected user.
        Only users with at least ``2 * split_count`` stored interactions are
        eligible.
    fraction : float
        Fraction of the matrix rows to select as test users. When fewer users
        are eligible than requested, all eligible users are taken; this
        situation used to raise a ValueError.
    seed : int, optional
        Seed of a private ``numpy.random.RandomState`` used for both random
        choices. With the default ``None`` the global NumPy random state is
        used, exactly as before.

    Returns
    -------
    (train, test, user_index)
        ``train`` and ``test`` are CSR matrices with the shape of ``plays``
        that share no stored entry; ``user_index`` is the list of selected
        user rows.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    train = plays.copy().tocoo()
    test = lil_matrix(train.shape)
    # Users with enough interactions to give away `split_count` of them.
    eligible = np.where(np.bincount(train.row) >= split_count * 2)[0]
    n_requested = int(np.floor(fraction * train.shape[0]))
    n_selected = min(n_requested, len(eligible))
    if n_selected > 0:
        user_index = rng.choice(eligible, replace=False, size=n_selected).tolist()
    else:
        user_index = []
    train = train.tolil()
    for user in user_index:
        test_plays = rng.choice(plays.getrow(user).indices,
                                size=split_count,
                                replace=False)
        train[user, test_plays] = 0.
        test[user, test_plays] = plays[user, test_plays]
    # No interaction may be present in both parts.
    assert train.multiply(test).nnz == 0
    return train.tocsr(), test.tocsr(), user_index

# Seed the global NumPy random state so that Restart & Run All always yields
# the same split (and therefore the same metrics below).
np.random.seed(123)
train, test, test_user_indices = train_test_split(user_item_matrix, 15, 0.2)
# Users are indexed from 1; row 0 of the matrices is unused.
train_user_indices = list(range(1, last_user_id + 1))
# These are still user*item matrices, transpose
train = train.T
test = test.T
sparse_info(train)
sparse_info(test)

Shape: (9067, 672)
NNZ count: 49558
NNZ%: 0.008133563892083799
mean: 1.0
max: 1.0
min: 1.0
Shape: (9067, 672)
NNZ count: 2010
NNZ%: 0.00032988545589185273
mean: 1.0
max: 1.0
min: 1.0


In [11]:
# Let's use cosine recommender: an item-item nearest-neighbour model from `implicit`

from implicit.nearest_neighbours import CosineRecommender
model = CosineRecommender()
# The model is fitted on movie_tag_matrix (movies x tags) rather than on the
# ratings, so model.similarity is a movie x movie matrix of similarity by tags
# (see the printed shape below).
model.fit(movie_tag_matrix)
sparse_info(model.similarity)

Shape: (9067, 9067)
NNZ count: 7135
NNZ%: 8.678941199340147e-05
mean: 0.8219122717531071
max: 1.0000000000000002
min: 0.0


In [12]:
import itertools

# Now we want to get scores for train and test data, for test data
# it's simple, but for train we can't use model.recommend, because it drops all
# movies that were already liked by the user, thus, we'll write our own recommend that
# doesn't do this (i.e. it's a copy-pasted code from implicit with liked = empty set)

def recommend_train(model, userid, user_items, N=10, filter_items=None, recalculate_user=False):
    """Return the top-N ``(item, score)`` pairs for a user, liked items included.

    The row ``user_items[userid]`` is multiplied by ``model.similarity`` and
    the resulting scores are returned in descending order. Unlike
    ``model.recommend`` nothing is filtered out, so items the user already
    liked may appear in the result.

    ``filter_items`` and ``recalculate_user`` are accepted but not used.
    """
    scores = user_items[userid].dot(model.similarity)
    ranked = sorted(
        zip(scores.indices, scores.data),
        key=lambda item_score: -item_score[1],
    )
    # No item is excluded, so the first N ranked pairs are the answer.
    return list(itertools.islice(ranked, N))

! mkdir -p recs1
! mkdir -p recs2
train_user_item = train.T
test_user_item = test.T
with open("recs1/test1.recs.tsv", "w") as output_file:
    for user_id in train_user_indices:
        for movie_id, score in recommend_train(model, user_id, train_user_item):
            output_file.write("%s\t%s\t%s\n" % (user_id, movie_id, score))
with open("recs2/test2.recs.tsv", "w") as output_file:
    for user_id in test_user_indices:
        for movie_id, score in model.recommend(user_id, train_user_item):
            output_file.write("%s\t%s\t%s\n" % (user_id, movie_id, score))
            
clmns = ["user_id", "movie_id", "score"]

rows, cols = train_user_item.nonzero()
records = []
for i in range(len(rows)):
    records.append((rows[i], cols[i], train_user_item[rows[i], cols[i]]))
df = pd.DataFrame.from_records(records, columns=clmns)
df.to_csv('test1', sep="\t", header=False, index=False)

rows, cols = test_user_item.nonzero()
records = []
for i in range(len(rows)):    
    records.append((rows[i], cols[i], test_user_item[rows[i], cols[i]]))
df = pd.DataFrame.from_records(records, columns=clmns)
df.to_csv('test2', sep="\t", header=False, index=False)

In [13]:
# Get scores for train set: the recommendations in recs1/ are evaluated
# against the file `test1`, which holds the train interactions.
# NOTE(review): the interaction file is passed through --train; presumably
# mrec_evaluate uses it as the ground truth for recsdir - confirm in the mrec docs.
! mrec_evaluate \
    --input_format=tsv --test_input_format=tsv \
    --train test1 \
    --recsdir recs1

# And for test set: recs2/ is evaluated against `test2` (held-out interactions)
! mrec_evaluate \
    --input_format=tsv --test_input_format=tsv \
    --train test2 \
    --recsdir recs2
    
# We'll use mrr, prec@5 and prec@15 as our metrics

[2017-11-26 16:12:58,425] INFO: processing /home/stas/Projects/1-netology/Netology/lab3/test1...
None
mrr            0.5246 +/- 0.0000
prec@5         0.2966 +/- 0.0000
prec@10        0.2588 +/- 0.0000
prec@15        0.1725 +/- 0.0000
prec@20        0.1294 +/- 0.0000
[2017-11-26 16:12:59,400] INFO: processing /home/stas/Projects/1-netology/Netology/lab3/test2...
None
mrr            0.0417 +/- 0.0000
prec@5         0.0179 +/- 0.0000
prec@10        0.0149 +/- 0.0000
prec@15        0.0100 +/- 0.0000
prec@20        0.0075 +/- 0.0000


In [14]:
from lightfm import LightFM
from lightfm.evaluation import reciprocal_rank, precision_at_k, auc_score

# Baseline LightFM model (BPR loss) trained on the interactions alone
model = LightFM(loss="bpr", random_state=123)
model.fit(train_user_item, epochs=30)

# Each metric is computed on the train split first, then on the test split
splits = (train_user_item, test_user_item)
train_mrr, test_mrr = [
    reciprocal_rank(model, split).mean() for split in splits
]
train_p5, test_p5 = [
    precision_at_k(model, split, k=5).mean() for split in splits
]
train_p15, test_p15 = [
    precision_at_k(model, split, k=15).mean() for split in splits
]

report = [
    ("train mrr ", train_mrr),
    ("train prec@5 ", train_p5),
    ("train prec@15 ", train_p15),
    ("test mrr ", test_mrr),
    ("test prec@5 ", test_p5),
    ("test prec@15 ", test_p15),
]
for label, value in report:
    print(label, value)

train mrr  0.743988
train prec@5  0.543964
train prec@15  0.44302
test mrr  0.237852
test prec@5  0.0970149
test prec@15  0.0985075


In [15]:
from scipy.sparse import hstack, identity

# Item features: one indicator column per movie (identity block) followed by
# the movie-tag columns
features = hstack([
    identity(movie_tag_matrix.shape[0]),
    movie_tag_matrix
])

model = LightFM(loss="bpr", random_state=123)
model.fit(train_user_item, item_features=features, epochs=30)

# Each metric is computed on the train split first, then on the test split
splits = (train_user_item, test_user_item)
train_mrr, test_mrr = [
    reciprocal_rank(model, split, item_features=features).mean()
    for split in splits
]
train_p5, test_p5 = [
    precision_at_k(model, split, k=5, item_features=features).mean()
    for split in splits
]
train_p15, test_p15 = [
    precision_at_k(model, split, k=15, item_features=features).mean()
    for split in splits
]

# NOTE: surprisingly, test prec@5 and prec@15 come out lower than for the
# model without item_features, although the train metrics and test mrr are
# slightly higher. The reason has not been investigated yet.

report = [
    ("train mrr ", train_mrr),
    ("train prec@5 ", train_p5),
    ("train prec@15 ", train_p15),
    ("test mrr ", test_mrr),
    ("test prec@5 ", test_p5),
    ("test prec@15 ", test_p15),
]
for label, value in report:
    print(label, value)

train mrr  0.76829
train prec@5  0.568107
train prec@15  0.4461
test mrr  0.239784
test prec@5  0.0880597
test prec@15  0.0835821
