In [19]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import implicit
from sklearn.utils import validation as skval

def make_sparse_matrix(data, rows, cols):
    skval.check_consistent_length(data, rows, cols)
    rows = rows.astype("category").cat.codes
    cols = cols.astype("category").cat.codes
    data, rows, cols = (np.asarray(x) for x in (data.astype(float), rows, cols))
    shape = (np.unique(rows).shape[0], np.unique(cols).shape[0])

    return sparse.csr_matrix((data, (rows, cols)), shape = shape)

In [20]:
class ALSRecommender():
    def __init__(self, iterations = 20, latent = 10, alpha = 40, regularizer = 0.1):        
        self.iterations = iterations
        self.latent = latent
        self.alpha = 40
        self.regularizer = 0.1
        self.S = None

    def fit(self, sample):
        #Instead of pivot_table the als from the implicit library expects a sparse matrix
        self.S = sample
        self.S.movieId = self.S.movieId.astype("category").cat.codes
        self.S.userId = self.S.userId.astype("category").cat.codes
        self.movie_user_matrix = make_sparse_matrix(self.S.rating, self.S.movieId, self.S.userId)
        self.user_movie_matrix = make_sparse_matrix(self.S.rating, self.S.userId, self.S.movieId)
        
        confidence = (self.movie_user_matrix * self.alpha).astype("double")

        #Als model with 10 latent factor, lambda = 0.1 and 20 alternating iterations
        als_model = implicit.als.AlternatingLeastSquares(factors = self.latent, regularization = self.regularizer, iterations = self.iterations)
        als_model.fit(confidence)
        
        self.user_vectors = als_model.user_factors
        self.movie_vectors = als_model.item_factors
        
        return self.user_vectors, self.movie_vectors
    
    def similar_to_movie(self, movie_id, n_similar):
        #scores = V dot V.T[movie] -> Item recommendation
        movie_vec = self.movie_vectors[movie_id].T #será que posso fazer com a movie_user_matrix? assim dava para testar

        movie_norms = np.sqrt((self.movie_vectors * self.movie_vectors).sum(axis = 1))

        scores = np.dot(self.movie_vectors, movie_vec) / movie_norms
        top_10 = np.argpartition(scores, -n_similar)[-n_similar:]
        similar = sorted(zip(top_10, scores[top_10] / movie_norms[n_similar]), key=lambda x: -x[1])
        
        titles = []
        similarity = []
        for i in similar:
            idx, sim = i
            titles.append(self.S.loc[self.S.movieId == idx].title.iloc[0])
            similarity.append(sim)
            
        return pd.DataFrame({"movie_title": titles, "similarity": similarity})
    
    #user_movie_matrix deve ser um parametro (para poder passar uma test sample)
    def recommend_to_user(self, user_id, n_movies, user_movie_matrix):
        user_ratings = user_movie_matrix[user_id,:].toarray().reshape(-1) + 1 #ratings do user escolhido (1d array)
        #ratings de filmes não vistos ficam com rating 1, enquanto os que já foram vistos são postos a 0 para não serem recomendados
        #outra vez ao utilizador
        user_ratings[user_ratings > 1] = 0

        user_vectors = sparse.csr_matrix(self.user_vectors)
        movie_vectors = sparse.csr_matrix(self.movie_vectors)
        
        #formula de cálculo de recomendações para utilizadores
        #Ui dot V.T (produto escalar entre vetor do utilizador i e dos filmes todos transpostos)
        recommendation_vector = np.dot(user_vectors[user_id,:], movie_vectors.T).toarray()

        #remover os filmes já vistos pelo utilizador
        recommend_vector = user_ratings * recommendation_vector.reshape(-1)

        #argsort é ascendente
        #::-1 coloca o valor mais alto no inicio do array, o segundo mais alto de seguida, etc.
        scores_idx = np.argsort(recommend_vector)[::-1][:n_movies] #dez valores mais parecidos

        titles = []
        scores = []
        for i in scores_idx:
            titles.append(self.S.loc[self.S.movieId == i].title.iloc[0])
            scores.append(recommend_vector[i])

        return pd.DataFrame({"movie_title" : titles, "score": scores})

In [21]:
ratings = pd.read_csv("datasets/100k/ratings.csv")
ratings.dropna(inplace = True)
ratings.drop(columns = "timestamp", inplace = True)
print(ratings.shape)
ratings.head()

(100836, 3)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [22]:
movies = pd.read_csv("datasets/100k/movies.csv")
movies.dropna(inplace = True)
movies.drop(columns = "genres", inplace = True)
print(movies.shape)
movies.head()

(9742, 2)


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [23]:
movie_ratings = ratings.join(movies.set_index("movieId"), on = "movieId")
print(movie_ratings.shape)
print("Users:", movie_ratings.userId.unique().size)
print("Movies:", movie_ratings.movieId.unique().size)
print(movie_ratings.drop_duplicates().shape)
movie_ratings.head()

(100836, 4)
Users: 610
Movies: 9724
(100836, 4)


Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,3,4.0,Grumpier Old Men (1995)
2,1,6,4.0,Heat (1995)
3,1,47,5.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,"Usual Suspects, The (1995)"


## Apply baseline predictors to ratings:

In [24]:
import baseline as base
baseline_df = base.make_baseline(movie_ratings, damping_factor = 25).get_ratings()
baseline_df.drop(columns = ["rating", "bi", "bu"], inplace = True)
baseline_df.rename(columns = {"bui": "rating"}, inplace = True)
baseline_df.head()

Unnamed: 0,userId,movieId,title,rating
0,1,1,Toy Story (1995),5.089208
1,1,3,Grumpier Old Men (1995),4.637278
2,1,6,Heat (1995),5.099497
3,1,47,Seven (a.k.a. Se7en) (1995),6.150616
4,1,50,"Usual Suspects, The (1995)",6.360803


## Alternating Least Squares (ALS):
Fitting the model

In [25]:
#import als_recommender as als
model = ALSRecommender(iterations = 10, latent = 10, alpha = 40, regularizer = 0.1)
model.fit(movie_ratings)



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




(array([[17.734818  , 10.219808  , 20.813026  , ...,  5.5213423 ,
         10.684315  , 19.552555  ],
        [-0.48394808, -0.651731  ,  0.8520135 , ..., 15.950156  ,
          7.9368134 ,  0.6330999 ],
        [16.189411  ,  4.126651  ,  2.3362892 , ...,  4.1413717 ,
         -2.904066  , 12.085226  ],
        ...,
        [17.705315  , 19.287487  , 14.987953  , ..., 18.50695   ,
          7.219401  , 17.254366  ],
        [ 6.7488194 , -4.3674917 ,  0.14057973, ...,  3.9427464 ,
         15.290184  , -0.2532464 ],
        [ 4.924457  , 22.745455  , 15.1717205 , ...,  8.501083  ,
         10.5501795 , 24.10157   ]], dtype=float32),
 array([[ 5.19224536e-03, -1.01913884e-02,  8.10204074e-03, ...,
          2.05841586e-02,  2.23653223e-02, -1.89959642e-03],
        [ 3.32620391e-03, -3.29579671e-05,  8.36145133e-03, ...,
          1.31441634e-02,  2.22663935e-02, -4.21004649e-03],
        [ 8.21489468e-03, -7.29986466e-03,  3.69444489e-04, ...,
          1.39502445e-02,  1.16375275e-02

### Item-Item collaborative filter
Find similar movies according to a given movie

In [26]:
#Find movies similar to a certain movie with class
model.similar_to_movie(movie_id = 0, n_similar = 10)

Unnamed: 0,movie_title,similarity
0,Toy Story (1995),1.081008
1,Forrest Gump (1994),1.079774
2,"Lion King, The (1994)",1.078086
3,Star Wars: Episode IV - A New Hope (1977),1.077847
4,"Shawshank Redemption, The (1994)",1.077643
5,Jurassic Park (1993),1.076894
6,"Silence of the Lambs, The (1991)",1.076187
7,Aladdin (1992),1.0751
8,Braveheart (1995),1.074699
9,Pulp Fiction (1994),1.074584


### User-Item collaborative filter
Recommend movies to a certain user

In [27]:
user_movie_matrix = sparse.csr_matrix((movie_ratings.rating.astype(float), (movie_ratings.userId, movie_ratings.movieId)))
recs = model.recommend_to_user(user_id = 10, n_movies = 9730, user_movie_matrix = user_movie_matrix)

In [28]:
recs.loc[recs.movie_title == "Dead Man Walking (1995)"]

Unnamed: 0,movie_title,score
5573,Dead Man Walking (1995),0.0


In [29]:
movie_ratings.loc[movie_ratings.userId == 10].title.unique()

array(['Heat (1995)', 'GoldenEye (1995)', 'Dead Man Walking (1995)',
       'Mortal Kombat (1995)', 'Broken Arrow (1996)', 'Braveheart (1995)',
       'Apollo 13 (1995)', 'Batman Forever (1995)',
       'Die Hard: With a Vengeance (1995)', 'Hackers (1995)',
       'Waterworld (1995)', 'Outbreak (1995)',
       'Shawshank Redemption, The (1994)',
       'Clear and Present Danger (1994)', 'Forrest Gump (1994)',
       'Maverick (1994)', 'River Wild, The (1994)', 'Speed (1994)',
       'True Lies (1994)', 'Cliffhanger (1993)', 'Fugitive, The (1993)',
       'Hot Shots! Part Deux (1993)', 'In the Line of Fire (1993)',
       'Jurassic Park (1993)', 'Menace II Society (1993)',
       'Program, The (1993)', 'Searching for Bobby Fischer (1993)',
       'Terminator 2: Judgment Day (1991)',
       'Silence of the Lambs, The (1991)', 'Mission: Impossible (1996)',
       'Rock, The (1996)', 'Twister (1996)',
       'Independence Day (a.k.a. ID4) (1996)', 'Days of Thunder (1990)',
       'Top Gun 

## Using the implicit library:

In [30]:
movie_user_matrix = sparse.csr_matrix((movie_ratings.rating.astype(float), (movie_ratings.movieId, movie_ratings.userId)))

als_model = implicit.als.AlternatingLeastSquares(factors=10, regularization=0.1, iterations=10)
confidence = (movie_user_matrix * 40).astype("double")
als_model.fit(confidence)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [31]:
similar = als_model.similar_items(0, 10)

movies = []
scores = []
for i in similar:
    idx, score = i
    movies.append(movie_ratings.loc[movie_ratings.movieId == idx].title.iloc[0])
    scores.append(score)
    
pd.DataFrame({"movie_title": movies, "similarity": scores})

Unnamed: 0,movie_title,similarity
0,Toy Story (1995),0.049867
1,Forrest Gump (1994),0.049788
2,"Silence of the Lambs, The (1991)",0.049738
3,"Shawshank Redemption, The (1994)",0.049687
4,Terminator 2: Judgment Day (1991),0.049654
5,Braveheart (1995),0.04963
6,Schindler's List (1993),0.049595
7,Seven (a.k.a. Se7en) (1995),0.049569
8,Jurassic Park (1993),0.049563
9,Star Wars: Episode IV - A New Hope (1977),0.049527


In [32]:
similar = als_model.recommend(10, user_movie_matrix)

movies = []
scores = []
for i in similar:
    idx, score = i
    movies.append(movie_ratings.loc[movie_ratings.movieId == idx].title.iloc[0])
    scores.append(score)
    
pd.DataFrame({"movie_title": movies, "similarity": scores})

Unnamed: 0,movie_title,similarity
0,101 Dalmatians (1996),1.179175
1,Jack (1996),1.169049
2,Beavis and Butt-Head Do America (1996),1.160122
3,Kingpin (1996),1.119681
4,"Usual Suspects, The (1995)",1.097537
5,Sleepers (1996),1.095358
6,Matilda (1996),1.094412
7,Friday (1995),1.093864
8,Schindler's List (1993),1.091154
9,Pulp Fiction (1994),1.089453


# Evaluation:

In [33]:
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(movie_ratings, train_size = 0.5)
print("x_train:", x_train.shape, "\nx_test:", x_test.shape)

x_train: (50418, 4) 
x_test: (50418, 4)


In [34]:
#Não pode haver users e movies no x_train que depois não existam no x_test
x_train = x_train[x_train.movieId.isin(x_test.movieId.unique())]
x_test = x_test[x_test.movieId.isin(x_train.movieId.unique())]
print("x_train:", x_train.shape, "\nx_test:", x_test.shape)
print("x_train_users:", x_train.userId.unique().shape, "\nx_test_users:", x_test.userId.unique().shape)
print("x_train_movies:", x_train.movieId.unique().shape, "\nx_test_movies:", x_test.movieId.unique().shape)

x_train: (47495, 4) 
x_test: (47566, 4)
x_train_users: (610,) 
x_test_users: (610,)
x_train_movies: (5349,) 
x_test_movies: (5349,)


In [35]:
train_model = ALSRecommender(iterations = 10, latent = 10, alpha = 40, regularizer = 0.1)
user_vec, movie_vec = train_model.fit(x_train)
print(user_vec.shape, movie_vec.shape)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


(610, 10) (5349, 10)


In [39]:
train_matrix = make_sparse_matrix(x_train.rating, x_train.userId, x_train.movieId)
train_matrix.shape

(610, 5349)

In [36]:
test_matrix = make_sparse_matrix(x_test.rating, x_test.userId, x_test.movieId)
test_matrix.shape

(610, 5349)

In [37]:
predictions = train_model.recommend_to_user(user_id = 10, n_movies = 10, user_movie_matrix = test_matrix)
print(predictions.shape)
predictions

(10, 2)


Unnamed: 0,movie_title,score
0,Crimson Tide (1995),1.212612
1,Star Trek: Generations (1994),1.179942
2,Escape from L.A. (1996),1.111863
3,"Firm, The (1993)",1.089559
4,Executive Decision (1996),1.086759
5,Dances with Wolves (1990),1.082071
6,Die Hard: With a Vengeance (1995),1.067426
7,Johnny Mnemonic (1995),1.058139
8,Judge Dredd (1995),1.040145
9,Stargate (1994),1.029331


In [None]:
truth = train_model.recommend_to_user(user_id = 10, n_movies = 10, )