In [80]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import implicit
#from sklearn.utils import validation as skval
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k#, train_test_split
#from sklearn.model_selection import train_test_split


def make_sparse_matrix(df):
    # Convert movie and users into numerical IDs
    df['user_id'] = df.userId.astype("category").cat.codes
    df['movie_id'] = df.movieId.astype("category").cat.codes

    # Create a lookup frame so we can get the movie titles back in 
    # readable form later.
    item_lookup = df[['movie_id', 'title']].drop_duplicates()
    item_lookup['movie_id'] = item_lookup.movie_id.astype(str)

    df = df.drop(['userId', 'movieId'], axis=1)

    # Drop any rows that have rating 0
    df = df.loc[df.rating != 0]

    # Create lists of all users, artists and plays
    users = list(np.sort(df.user_id.unique()))
    movies = list(np.sort(df.movie_id.unique()))
    ratings = list(df.rating)

    # Get the rows and columns for our new matrix
    rows = df.user_id.astype(int)
    cols = df.movie_id.astype(int)

    # Contruct a sparse matrix for our users and items containing number of plays
    user_item = sparse.csr_matrix((ratings, (rows, cols)), shape=(len(users), len(movies)))
    item_user = sparse.csr_matrix((ratings, (cols, rows)), shape=(len(movies), len(users)))
    return user_item, item_user

In [81]:
ratings = pd.read_csv("datasets/100k/ratings.csv")
ratings.dropna(inplace = True)
ratings.drop(columns = "timestamp", inplace = True)
print(ratings.shape)
ratings.head()

(100836, 3)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [82]:
movies = pd.read_csv("datasets/100k/movies.csv")
movies.dropna(inplace = True)
movies.drop(columns = "genres", inplace = True)
print(movies.shape)
movies.head()

(9742, 2)


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [83]:
movie_ratings = ratings.join(movies.set_index("movieId"), on = "movieId")
print(movie_ratings.shape)
print("Users:", movie_ratings.userId.unique().size)
print("Movies:", movie_ratings.movieId.unique().size)
print(movie_ratings.drop_duplicates().shape)
movie_ratings.head()
data = movie_ratings.copy()

(100836, 4)
Users: 610
Movies: 9724
(100836, 4)


In [84]:
user_movie_matrix, movie_user_matrix = make_sparse_matrix(movie_ratings)
print(user_movie_matrix.shape, movie_user_matrix.shape)

(610, 9724) (9724, 610)


## Apply baseline predictors to ratings:

In [85]:
import baseline as base
baseline_df = base.make_baseline(data, damping_factor = 25).get_ratings()
baseline_df.drop(columns = ["rating", "bi", "bu"], inplace = True)
baseline_df.rename(columns = {"bui": "rating"}, inplace = True)
baseline_df.head()

Unnamed: 0,userId,movieId,title,rating
0,1,1,Toy Story (1995),5.089208
1,1,3,Grumpier Old Men (1995),4.637278
2,1,6,Heat (1995),5.099497
3,1,47,Seven (a.k.a. Se7en) (1995),6.150616
4,1,50,"Usual Suspects, The (1995)",6.360803


## Using the implicit library:

In [86]:
als_model = implicit.als.AlternatingLeastSquares(factors=10, regularization=0.1, iterations=10)
confidence = (movie_user_matrix * 40).astype("double")
als_model.fit(confidence)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [87]:
similar = als_model.similar_items(0, 10)

movies = []
scores = []
for i in similar:
    idx, score = i
    movies.append(item_lookup.loc[item_lookup.movie_id == str(idx)].title.iloc[0])
    scores.append(score)

pd.DataFrame({"movie_title": movies, "similarity": scores})

Unnamed: 0,movie_title,similarity
0,Toy Story (1995),0.048282
1,Aladdin (1992),0.048146
2,Apollo 13 (1995),0.048031
3,"Lion King, The (1994)",0.04801
4,Forrest Gump (1994),0.047963
5,Terminator 2: Judgment Day (1991),0.047942
6,Jurassic Park (1993),0.047931
7,Star Wars: Episode IV - A New Hope (1977),0.047931
8,"Fugitive, The (1993)",0.047817
9,Dances with Wolves (1990),0.047788


In [88]:
similar = als_model.recommend(10, user_movie_matrix)

movies = []
scores = []
for i in similar:
    idx, score = i
    movies.append(item_lookup.loc[item_lookup.movie_id == str(idx)].title.iloc[0])
    scores.append(score)
    
pd.DataFrame({"movie_title": movies, "similarity": scores})

Unnamed: 0,movie_title,similarity
0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),1.111498
1,Heavy Metal (1981),1.107557
2,Chain Reaction (1996),1.099048
3,Black Sheep (1996),1.093331
4,Friday (1995),1.090675
5,Toy Story (1995),1.086233
6,Happy Gilmore (1996),1.082253
7,Tommy Boy (1995),1.081758
8,Sgt. Bilko (1996),1.076707
9,Dumb & Dumber (Dumb and Dumber) (1994),1.062917


# Evaluation:
* Split the data into two set, train and test
* Make sure that the train set and test set only have ratings for common movies

In [112]:
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(movie_ratings, train_size = 0.8)
print("x_train:", x_train.shape, "\nx_test:", x_test.shape)

x_train: (80668, 6) 
x_test: (20168, 6)


In [113]:
#Não pode haver users e movies no x_train que depois não existam no x_test
x_train = x_train[x_train.movie_id.isin(x_test.movie_id.unique())]
x_test = x_test[x_test.movie_id.isin(x_train.movie_id.unique())]
x_train = x_train[x_train.user_id.isin(x_test.user_id.unique())]
x_test = x_test[x_test.user_id.isin(x_train.user_id.unique())]
print("x_train:", x_train.shape, "\nx_test:", x_test.shape)
print("x_train_users:", x_train.user_id.unique().shape, "\nx_test_users:", x_test.user_id.unique().shape)
print("x_train_movies:", x_train.movie_id.unique().shape, "\nx_test_movies:", x_test.movie_id.unique().shape)

x_train: (71406, 6) 
x_test: (19344, 6)
x_train_users: (610,) 
x_test_users: (610,)
x_train_movies: (4376,) 
x_test_movies: (4376,)


#### Train the model:
With the train set

In [114]:
train_user_movie, train_movie_user = make_sparse_matrix(x_train)
train_user_movie.shape

(610, 4376)

In [115]:
train_model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.001, iterations=10, calculate_training_loss = True)
train_confidence = (train_movie_user * 40).astype("double")
train_model.fit(train_confidence)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [120]:
test_user_movie, test_movie_user = make_sparse_matrix(x_test)
test_confidence.shape

(4376, 610)

#### Calculate metrics:
* Grid search
* Test parameters: alpha, iterations, latent factors, lambda and K

In [124]:
alpha = [15, 25, 40, 100]
iterations = [5, 10, 20, 30, 50, 100, 500, 1000]
lat_factors = [5, 10, 20, 50, 100]
lmbda = [1, 0.1, 0.01, 0.001]
k_vals = [5, 10, 20]

In [None]:
for alpha_val in alpha:
    for it in iterations:
        for lat in lat_factors:
            for l in lmbda:
                for k in k_vals:
                    model = implicit.als.AlternatingLeastSquares(factors=lat, regularization=l, iterations=it)
                    confidence = (train_movie_user * alpha_val).astype("double")
                    train_model.fit(confidence)
                    print("")

In [119]:
precision_at_k(train_model, train_confidence, test_user_movie, K=10)

HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




0.08795040681906238

In [123]:
mean_average_precision_at_k(train_model, train_confidence, test_user_movie, K = 10)

HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




0.03078392583505777

In [122]:
AUC_at_k(train_model, train_confidence, test_user_movie, K = 10)

HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




0.5266431148470124

In [121]:
ndcg_at_k(train_model, train_confidence, test_user_movie, K = 10)

HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




0.08007263942375745