* ALS (+ different NN approaches) + implicit
* KNN approaches at MF or aka wordembs

![image.png](attachment:image.png)

In [8]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.neighbors import NearestNeighbors
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
%matplotlib inline

ModuleNotFoundError: No module named 'gensim'

In [None]:
#pip install implicit
from implicit.nearest_neighbours import CosineRecommender

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

In [None]:
from scipy.sparse import csr_matrix

In [None]:
"""Information Retrieval metrics
Useful Resources:
http://www.cs.utexas.edu/~mooney/ir-course/slides/Evaluation.ppt
http://www.nii.ac.jp/TechReports/05-014E.pdf
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
http://hal.archives-ouvertes.fr/docs/00/72/67/60/PDF/07-busa-fekete.pdf
Learning to Rank for Information Retrieval (Tie-Yan Liu)
"""


def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def r_precision(r):
    """Score is precision after all relevant documents have been retrieved
    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> r_precision(r)
    0.33333333333333331
    >>> r = [0, 1, 0]
    >>> r_precision(r)
    0.5
    >>> r = [1, 0, 0]
    >>> r_precision(r)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        R Precision
    """
    r = np.asarray(r) != 0
    z = r.nonzero()[0]
    if not z.size:
        return 0.
    return np.mean(r[:z[-1] + 1])


def precision_at_k(r, k):
    """Score is precision @ k
    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> precision_at_k(r, 1)
    0.0
    >>> precision_at_k(r, 2)
    0.0
    >>> precision_at_k(r, 3)
    0.33333333333333331
    >>> precision_at_k(r, 4)
    Traceback (most recent call last):
        File "<stdin>", line 1, in ?
    ValueError: Relevance score length < k
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)


def average_precision(r):
    """Score is average precision (area under PR curve)
    Relevance is binary (nonzero is relevant).
    >>> r = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1]
    >>> delta_r = 1. / sum(r)
    >>> sum([sum(r[:x + 1]) / (x + 1.) * delta_r for x, y in enumerate(r) if y])
    0.7833333333333333
    >>> average_precision(r)
    0.78333333333333333
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Average precision
    """
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)


def mean_average_precision(rs):
    """Score is mean average precision
    Relevance is binary (nonzero is relevant).
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]]
    >>> mean_average_precision(rs)
    0.78333333333333333
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0]]
    >>> mean_average_precision(rs)
    0.39166666666666666
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean average precision
    """
    return np.mean([average_precision(r) for r in rs])


def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

# Loading data

In [None]:
dataset = pd.read_csv('./data/ml-latest-small/ratings.csv')
dataset['timestamp'] = pd.to_datetime(dataset['timestamp'], unit='s')
movies = pd.read_csv('./data/ml-latest-small/movies.csv')
dataset.movieId = dataset.movieId.map(movies.set_index('movieId')['title'].to_dict())
TIME_TRESHOLD = '2018-01-01'
train = dataset[dataset.timestamp<TIME_TRESHOLD]
test = dataset[dataset.timestamp>=TIME_TRESHOLD]
train.shape, test.shape

In [7]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,Toy Story (1995),4.0,2000-07-30 18:45:03
1,1,Grumpier Old Men (1995),4.0,2000-07-30 18:20:47
2,1,Heat (1995),4.0,2000-07-30 18:37:04
3,1,Seven (a.k.a. Se7en) (1995),5.0,2000-07-30 19:03:35
4,1,"Usual Suspects, The (1995)",5.0,2000-07-30 18:48:51


In [8]:
id2user = {key: value for key, value in enumerate(train.userId.unique())}
id2items = {key: value for key, value in enumerate(train.movieId.unique())} # Get our unique products that were purchased
rating = list(train.rating) # All of our purchases

In [9]:
user2id  = {value:key for key, value in id2user.items()}
items2id  = {value:key for key, value in id2items.items()}

In [10]:
rows = train.userId.map(user2id)
# Get the associated row indices
cols = train.movieId.map(items2id)
# Get the associated column indices

In [11]:
rating_sparse = csr_matrix((rating, (rows, cols)), shape=(len(user2id), len(items2id)))

In [12]:
rating_sparse

<581x8827 sparse matrix of type '<class 'numpy.float64'>'
	with 94415 stored elements in Compressed Sparse Row format>

In [13]:
train.userId.nunique(), train.movieId.nunique()

(581, 8827)

## ALS (implicit library)

In [14]:
#! pip install implicit

In [15]:
from implicit.als import AlternatingLeastSquares

In [16]:
ALS = AlternatingLeastSquares(num_threads=4, factors = 10,regularization=0.1)



In [17]:
ALS.fit(rating_sparse.T)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [18]:
ALS.item_factors.shape

(8827, 10)

In [19]:
ALS.user_factors.shape

(581, 10)

In [20]:
ALS.user_factors

array([[ 1.3527158 ,  0.01491007,  1.2371342 , ...,  3.4895258 ,
         0.19226548,  0.58524555],
       [-0.20711175,  0.05650488,  0.63691694, ...,  0.2078228 ,
         0.35244894, -0.14382915],
       [ 0.10414543, -0.01079307,  0.18197979, ...,  0.26068655,
        -0.1423418 ,  0.06395322],
       ...,
       [ 2.2472835 ,  1.9442282 ,  1.5741363 , ...,  3.8858483 ,
         2.8513823 ,  3.9966185 ],
       [-0.14965855, -0.5366759 ,  0.34867975, ...,  0.8380644 ,
         0.18506303,  0.684798  ],
       [ 2.7623782 ,  5.8487196 ,  8.784322  , ...,  4.2150064 ,
         1.8804636 ,  4.0603757 ]], dtype=float32)

In [21]:
%%time
predict = ALS.recommend_all(rating_sparse, filter_already_liked_items=True)

HBox(children=(FloatProgress(value=0.0, max=581.0), HTML(value='')))


CPU times: user 531 ms, sys: 240 ms, total: 771 ms
Wall time: 159 ms


In [22]:
predict.shape

(581, 10)

In [23]:
predict

array([[1028, 1165, 2129, ...,  361, 1027,  723],
       [  16,   20,   34, ...,  166,  774, 1055],
       [1028, 1166, 1165, ...,  164, 1457,  187],
       ...,
       [ 198,  531,  847, ..., 2034,  977,  153],
       [  34,  472,  531, ...,   28,  322,   17],
       [1240, 1057, 2037, ..., 1106,  147,  103]], dtype=int32)

In [24]:
test.head()

Unnamed: 0,userId,movieId,rating,timestamp
1839,18,"Maltese Falcon, The (1941)",4.0,2018-02-01 22:57:29
1852,18,One Flew Over the Cuckoo's Nest (1975),4.5,2018-02-03 19:27:05
1980,18,Sleuth (1972),4.5,2018-01-15 22:08:35
2012,18,All the President's Men (1976),4.0,2018-08-11 20:54:56
2053,18,Born into Brothels (2004),4.5,2018-05-12 21:18:13


In [25]:
cold_users = [i for i in test.userId.unique() if i not in train.userId.unique()]
cold_items = [i for i in test.movieId.unique() if i not in train.movieId.unique()]

In [26]:
# flag cold user
test['cold_users'] = test.userId.map(lambda x: x in cold_users)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
# flag cold item
test['cold_items'] = test.movieId.map(lambda x: x in cold_items)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
test_result = test[(test.cold_items==False)&(test.cold_users==False)]

In [29]:
test_result = test_result.groupby(['userId'])['movieId'].apply(lambda x: list(x)).reset_index()

In [30]:
test_result['movieId_pred'] = test_result.userId.map(lambda x: predict[user2id[x], :])

In [31]:
test_result['movieId_pred'] = test_result.movieId_pred.map(lambda x: [id2items[i] for i in x])

In [32]:
test_result.head()

Unnamed: 0,userId,movieId,movieId_pred
0,18,"[Maltese Falcon, The (1941), One Flew Over the...","[WALL·E (2008), American Beauty (1999), Aliens..."
1,50,"[Twelve Monkeys (a.k.a. 12 Monkeys) (1995), Ta...","[Dark Knight, The (2008), Star Wars: Episode I..."
2,68,"[Four Rooms (1995), Atlantis: The Lost Empire ...","[Charlie and the Chocolate Factory (2005), Chi..."
3,103,"[Mask, The (1994), On the Waterfront (1954), B...","[American Beauty (1999), Monty Python and the ..."
4,105,"[Band Wagon, The (1953), Autumn Sonata (Höstso...","[Children of Men (2006), Bourne Ultimatum, The..."


In [33]:
test_result['top_3'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:3]) for pred in x.movieId[:3]], axis=1)
test_result['top_5'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:5]) for pred in x.movieId[:5]], axis=1)
test_result['top_10'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:10]) for pred in x.movieId[:10]], axis=1)

In [34]:
test_result.head(10)

Unnamed: 0,userId,movieId,movieId_pred,top_3,top_5,top_10
0,18,"[Maltese Falcon, The (1941), One Flew Over the...","[WALL·E (2008), American Beauty (1999), Aliens...","[0, 0, 0]","[0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
1,50,"[Twelve Monkeys (a.k.a. 12 Monkeys) (1995), Ta...","[Dark Knight, The (2008), Star Wars: Episode I...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,68,"[Four Rooms (1995), Atlantis: The Lost Empire ...","[Charlie and the Chocolate Factory (2005), Chi...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,103,"[Mask, The (1994), On the Waterfront (1954), B...","[American Beauty (1999), Monty Python and the ...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,105,"[Band Wagon, The (1953), Autumn Sonata (Höstso...","[Children of Men (2006), Bourne Ultimatum, The...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,112,"[Inglourious Basterds (2009), Up (2009), Distr...","[Shawshank Redemption, The (1994), Pulp Fictio...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]"
6,210,"[Batman Forever (1995), Coneheads (1993), Back...","[Lord of the Rings: The Return of the King, Th...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,212,"[Lion King, The (1994), Terminator 2: Judgment...","[Lord of the Rings: The Return of the King, Th...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
8,233,"[Toy Story (1995), Seven (a.k.a. Se7en) (1995)...","[Forrest Gump (1994), Matrix, The (1999), Lord...","[0, 0, 0]","[0, 0, 0, 0, 1]","[0, 1, 1, 0, 1, 0, 0, 0, 0, 0]"
9,249,"[Kill Bill: Vol. 2 (2004), John Wick (2014), T...","[John Wick (2014), Meet the Parents (2000), Ca...","[0, 1, 0]","[0, 1, 0, 0, 0]","[0, 1, 0, 1, 0, 1, 0, 0, 0, 0]"


In [35]:
score_results = pd.DataFrame(index=['top_3', 'top_5', 'top_10'], columns=['MRR', 'MAP@K', 'NDCG@k'])

In [36]:
for top in [3, 5, 10]:
    score_results.loc['top_'+str(top), 'MAP@K'] = mean_reciprocal_rank(list(test_result['top_'+str(top)].values))
    score_results.loc['top_'+str(top), 'MRR'] = mean_average_precision(list(test_result['top_'+str(top)].values))
    score_results.loc['top_'+str(top), 'NDCG@k'] = np.mean([ndcg_at_k(i, top) for i in list(test_result['top_'+str(top)].values)])

In [37]:
# factors = 10 and regularisation = .1
score_results

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0.025,0.025,0.05
top_5,0.06,0.06,0.121534
top_10,0.139444,0.135,0.205142


In [38]:
score_results

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0.025,0.025,0.05
top_5,0.06,0.06,0.121534
top_10,0.139444,0.135,0.205142


# ALS with implicit feedback

у кейса есть понятие confidence - уверенность в кейсе

если пользователь добавил объект в корзину - уверенность в рейтинге 1

если просто посмотрел - уверенность 0.5, например

если никак не провзамидействовал - 0


In [None]:
train.head()

In [40]:
train['imp_rating'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [41]:
id2user = {key: value for key, value in enumerate(train.userId.unique())}
id2items = {key: value for key, value in enumerate(train.movieId.unique())} # Get our unique products that were purchased
imp_rating = list(train.imp_rating) # All of our purchases

user2id  = {value:key for key, value in id2user.items()}
items2id  = {value:key for key, value in id2items.items()}

rows = train.userId.map(user2id)
# Get the associated row indices
cols = train.movieId.map(items2id)
# Get the associated column indices

CONF = 5

rating_sparse = csr_matrix((imp_rating, (rows, cols)), shape=(len(user2id), len(items2id)))

rating_sparse = csr_matrix(rating_sparse.todense()*CONF+1)

In [42]:
ALS = AlternatingLeastSquares(num_threads=4, factors = 3,regularization=0.01 )
ALS.fit(rating_sparse.T)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [43]:
rating_sparse = csr_matrix((imp_rating, (rows, cols)), shape=(len(user2id), len(items2id)))

rating_sparse = csr_matrix(rating_sparse.todense()*CONF)

In [44]:
%%time
predict = ALS.recommend_all(rating_sparse, recalculate_user=True, filter_already_liked_items=True)

HBox(children=(FloatProgress(value=0.0, max=581.0), HTML(value='')))


CPU times: user 2.34 s, sys: 358 ms, total: 2.7 s
Wall time: 2.07 s


In [45]:
test_result = test[(test.cold_items==False)&(test.cold_users==False)]

test_result = test_result.groupby(['userId'])['movieId'].apply(lambda x: list(x)).reset_index()

test_result['movieId_pred'] = test_result.userId.map(lambda x: predict[user2id[x], :])

test_result['movieId_pred'] = test_result.movieId_pred.map(lambda x: [id2items[i] for i in x])

test_result['top_3'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:3]) for pred in x.movieId[:3]], axis=1)
test_result['top_5'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:5]) for pred in x.movieId[:5]], axis=1)
test_result['top_10'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:10]) for pred in x.movieId[:10]], axis=1)

In [46]:
test_result.head()

Unnamed: 0,userId,movieId,movieId_pred,top_3,top_5,top_10
0,18,"[Maltese Falcon, The (1941), One Flew Over the...","[Houseguest (1994), Train to Busan (2016), Hom...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,50,"[Twelve Monkeys (a.k.a. 12 Monkeys) (1995), Ta...","[Houseguest (1994), Train to Busan (2016), Hom...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,68,"[Four Rooms (1995), Atlantis: The Lost Empire ...","[Houseguest (1994), Train to Busan (2016), Hom...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,103,"[Mask, The (1994), On the Waterfront (1954), B...","[Houseguest (1994), Train to Busan (2016), Hom...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,105,"[Band Wagon, The (1953), Autumn Sonata (Höstso...","[Houseguest (1994), Train to Busan (2016), Hom...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [47]:
score_results_imp = pd.DataFrame(index=['top_3', 'top_5', 'top_10'], columns=['MRR', 'MAP@K', 'NDCG@k'])

In [48]:
for top in [3, 5, 10]:
    score_results_imp.loc['top_'+str(top), 'MAP@K'] = mean_reciprocal_rank(list(test_result['top_'+str(top)].values))
    score_results_imp.loc['top_'+str(top), 'MRR'] = mean_average_precision(list(test_result['top_'+str(top)].values))
    score_results_imp.loc['top_'+str(top), 'NDCG@k'] = np.mean([ndcg_at_k(i, top) for i in list(test_result['top_'+str(top)].values)])

In [49]:
# binary implicit feedback on the base rating existence CONF 40
score_results_imp

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0,0,0
top_5,0,0,0
top_10,0,0,0


In [50]:
# binary implicit feedback on the base rating existence CONF 5 
score_results_imp

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0,0,0
top_5,0,0,0
top_10,0,0,0


In [51]:
# binary implicit feedback on the base rating existence CONF 5 with regularisation 1.0
score_results_imp

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0,0,0
top_5,0,0,0
top_10,0,0,0


In [52]:
# binary implicit feedback on the base rating existence CONF 5 with regularisation 1.0 and #factors = 3
score_results_imp

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0,0,0
top_5,0,0,0
top_10,0,0,0


In [53]:
# binary implicit feedback on the base rating existence CONF 5 with regularisation 0.001 and #factors = 3
score_results_imp

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0,0,0
top_5,0,0,0
top_10,0,0,0


## ALS with annoy nn backend

* https://github.com/spotify/annoy
* https://github.com/facebookresearch/faiss

# Content-based approaches

## Content-based on the base of MF

In [54]:
ALS.similar_users(45, N=20)

[(45, 1.0000001),
 (266, 1.0),
 (82, 1.0),
 (479, 1.0),
 (365, 1.0),
 (88, 1.0),
 (224, 0.9999999),
 (151, 0.9999999),
 (89, 0.9999999),
 (233, 0.9999999),
 (442, 0.9999999),
 (38, 0.9999999),
 (170, 0.9999999),
 (437, 0.9999999),
 (425, 0.9999999),
 (400, 0.9999999),
 (201, 0.9999999),
 (91, 0.9999999),
 (394, 0.9999998),
 (539, 0.9999998)]

In [55]:
ALS.similar_items(34)

[(34, 1.0),
 (192, 0.99999994),
 (70, 0.9999998),
 (0, 0.9999998),
 (36, 0.99999976),
 (4452, 0.99999976),
 (1149, 0.9999997),
 (1217, 0.9999997),
 (401, 0.99999964),
 (850, 0.9999995)]

In [56]:
_id = 21

In [57]:
id2items[_id]

'Jungle Book, The (1994)'

![image-3.png](attachment:image-3.png)
![image-2.png](attachment:image-2.png)
![image-4.png](attachment:image-4.png)

In [58]:
[(id2items[item], score) for item, score in ALS.similar_items(_id)]

[('Jungle Book, The (1994)', 1.0),
 ('Megamind (2010)', 0.9999999),
 ('American Sniper (2014)', 0.99999976),
 ('Last House on the Left, The (1972)', 0.9999995),
 ('Antitrust (2001)', 0.99999946),
 ('Inconvenient Truth, An (2006)', 0.99999946),
 ('Godsend (2004)', 0.9999994),
 ('Closer (2004)', 0.9999994),
 ("She's All That (1999)", 0.9999994),
 ("It's Pat (1994)", 0.99999934)]

## Cousine content based

In [59]:
cos = CosineRecommender(K = 10)

In [60]:
last_films = train[train.rating>=3.0].sort_values(by = ['timestamp'], ascending=False).groupby('userId')['movieId', 'rating']\
    .apply(lambda x: x.head(5)).reset_index()

  """Entry point for launching an IPython kernel.


In [61]:
last_films.head(10)

Unnamed: 0,userId,level_1,movieId,rating
0,1,161,20 Dates (1998),4.0
1,1,119,Back to the Future Part III (1990),4.0
2,1,160,¡Three Amigos! (1986),4.0
3,1,31,Tombstone (1993),5.0
4,1,95,McHale's Navy (1997),3.0
5,2,247,"Town, The (2010)",4.5
6,2,259,Mad Max: Fury Road (2015),5.0
7,2,234,Good Will Hunting (1997),4.5
8,2,250,Warrior (2011),5.0
9,2,248,Inside Job (2010),5.0


In [62]:
cos.fit(ALS.item_factors)

HBox(children=(FloatProgress(value=0.0, max=8827.0), HTML(value='')))




In [63]:
#id2user = {key: value for key, value in enumerate(train.userId.unique())}
#id2items = {key: value for key, value in enumerate(train.movieId.unique())} # Get our unique products that were purchased
imp_rating = list(last_films.rating) # All of our purchases
rows = last_films.userId.map(user2id)
# Get the associated row indices
cols = last_films.movieId.map(items2id)
# Get the associated column indices

rating_sparse_last = csr_matrix((imp_rating, (rows, cols)), shape=(len(user2id), len(items2id)))

In [64]:
rating_sparse_last

<581x8827 sparse matrix of type '<class 'numpy.float64'>'
	with 2899 stored elements in Compressed Sparse Row format>

In [65]:
def predict_cos(model, user, rating):
    res = model.recommend(user, rating, filter_already_liked_items=True)
    return [i for i, k in res]

test_result['movieId_pred'] = test_result.userId\
    .map(lambda x: predict_cos(cos, user2id[x], rating_sparse_last))

In [66]:
test_result['movieId_pred'] = test_result.movieId_pred.map(lambda x: [id2items[i] for i in x])

In [67]:
test_result['top_3'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:3]) for pred in x.movieId[:3]], axis=1)
test_result['top_5'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:5]) for pred in x.movieId[:5]], axis=1)
test_result['top_10'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:10]) for pred in x.movieId[:10]], axis=1)

In [68]:
test_result.head(10)

Unnamed: 0,userId,movieId,movieId_pred,top_3,top_5,top_10
0,18,"[Maltese Falcon, The (1941), One Flew Over the...","[Due Date (2010), Monsters University (2013), ...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,50,"[Twelve Monkeys (a.k.a. 12 Monkeys) (1995), Ta...","[Spy Game (2001), Boys Don't Cry (1999), Super...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,68,"[Four Rooms (1995), Atlantis: The Lost Empire ...","[Child's Play 2 (1990), Rat Race (2001), Blank...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,103,"[Mask, The (1994), On the Waterfront (1954), B...","[Rounders (1998), Jarhead (2005), Bug's Life, ...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,105,"[Band Wagon, The (1953), Autumn Sonata (Höstso...","[Bring It On (2000), Signs (2002), Despicable ...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,112,"[Inglourious Basterds (2009), Up (2009), Distr...","[Excess Baggage (1997), Rookie, The (1990), We...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]"
6,210,"[Batman Forever (1995), Coneheads (1993), Back...","[That Munchhausen (1979), Ricki and the Flash ...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,212,"[Lion King, The (1994), Terminator 2: Judgment...","[Clockers (1995), Recruit, The (2003), And You...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
8,233,"[Toy Story (1995), Seven (a.k.a. Se7en) (1995)...","[Princess Mononoke (Mononoke-hime) (1997), Hur...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9,249,"[Kill Bill: Vol. 2 (2004), John Wick (2014), T...","[Bring It On (2000), Hanna (2011), Signs (2002...","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [69]:
score_results_cos = pd.DataFrame(index=['top_3', 'top_5', 'top_10'], columns=['MRR', 'MAP@K', 'NDCG@k'])

In [70]:
for top in [3, 5, 10]:
    score_results_cos.loc['top_'+str(top), 'MAP@K'] = mean_reciprocal_rank(list(test_result['top_'+str(top)].values))
    score_results_cos.loc['top_'+str(top), 'MRR'] = mean_average_precision(list(test_result['top_'+str(top)].values))
    score_results_cos.loc['top_'+str(top), 'NDCG@k'] = np.mean([ndcg_at_k(i, top) for i in list(test_result['top_'+str(top)].values)])

In [71]:
score_results_cos

Unnamed: 0,MRR,MAP@K,NDCG@k
top_3,0,0,0
top_5,0,0,0
top_10,0,0,0


## aka Word2vec

In [72]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,imp_rating
0,1,Toy Story (1995),4.0,2000-07-30 18:45:03,1
1,1,Grumpier Old Men (1995),4.0,2000-07-30 18:20:47,1
2,1,Heat (1995),4.0,2000-07-30 18:37:04,1
3,1,Seven (a.k.a. Se7en) (1995),5.0,2000-07-30 19:03:35,1
4,1,"Usual Suspects, The (1995)",5.0,2000-07-30 18:48:51,1


In [73]:
corpus = list(train.sort_values('timestamp').groupby('userId')['movieId'].apply(lambda x: [str(i) for i in x]))

In [74]:
# starspace

In [75]:
from gensim.models import Word2Vec
model = Word2Vec(min_count=3, size = 10)
model.build_vocab(corpus)  # prepare the model vocabulary
model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)  # train word vectors

  after removing the cwd from sys.path.


(444450, 472090)

In [76]:
model.init_sims()

In [77]:
#model.wv.vocab

In [78]:
model.wv.word_vec('Excalibur (1981)', use_norm=True)

array([-0.47700855, -0.7808306 ,  0.15501912, -0.13717349, -0.26545772,
        0.00866778,  0.06765513,  0.13152057, -0.15749684,  0.05192179],
      dtype=float32)

In [79]:
emb_matrix = np.empty((len(model.wv.vocab), 10))

In [80]:
for i, key in enumerate(model.wv.vocab.keys()):
    emb_matrix[i, :]= model.wv.word_vec(key, use_norm=True)

In [81]:
cos = CosineRecommender(K = 10)

In [82]:
cos.fit(emb_matrix)

HBox(children=(FloatProgress(value=0.0, max=4726.0), HTML(value='')))




In [None]:
# fix it
def predict_cos(model, user, rating):
    res = model.recommend(user, rating, filter_already_liked_items=True)
    return [i for i, k in res]

test_result['movieId_pred'] = test_result.userId\
    .map(lambda x: predict_cos(cos, user2id[x], rating_sparse_last))

In [None]:
test_result['movieId_pred'] = test_result.movieId_pred.map(lambda x: [id2items[i] for i in x])

In [None]:
test_result['top_3'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:3]) for pred in x.movieId[:3]], axis=1)
test_result['top_5'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:5]) for pred in x.movieId[:5]], axis=1)
test_result['top_10'] = test_result.apply(lambda x: [int(pred in x.movieId_pred[:10]) for pred in x.movieId[:10]], axis=1)

In [None]:
test_result.head()

In [None]:
score_results_emb = pd.DataFrame(index=['top_3', 'top_5', 'top_10'], columns=['MRR', 'MAP@K', 'NDCG@k'])

In [None]:
for top in [3, 5, 10]:
    score_results_emb.loc['top_'+str(top), 'MAP@K'] = mean_reciprocal_rank(list(test_result['top_'+str(top)].values))
    score_results_emb.loc['top_'+str(top), 'MRR'] = mean_average_precision(list(test_result['top_'+str(top)].values))
    score_results_emb.loc['top_'+str(top), 'NDCG@k'] = np.mean([ndcg_at_k(i, top) for i in list(test_result['top_'+str(top)].values)])

In [None]:
score_results_emb