In [115]:
import pandas as pd
import numpy as np
import scipy
from sklearn.model_selection import train_test_split

# read ratings
ratings = pd.read_csv("ml-latest-small/ratings.csv", usecols=[0,1,2])
# assumption: uid/mid is of string type
ratings = ratings.astype({"userId": "category", "movieId": "category"})

# n_user
n_user = ratings["userId"].nunique()
# n_item
n_item = ratings["movieId"].nunique()

# inner id -> raw id
user_index = dict(enumerate(ratings["userId"].unique()))
# raw_id -> inner_id
user_inverted_index = { v: k for k, v in user_index.items()}

# inner id -> raw id
item_index = dict(enumerate(ratings["movieId"].unique()))
# raw_id -> inner_id
item_inverted_index = { v: k for k, v in item_index.items()}

# add inner id
uids = [user_inverted_index[userId] for userId in ratings["userId"].values]
iids = [item_inverted_index[itemId] for itemId in ratings["movieId"].values]
ratings = ratings.assign(
    uid=uids, 
    iid=iids)
# drop unused columns
ratings.drop(columns=["userId", "movieId"], axis=1, inplace=True)

# drop ratings less then (preparation for implicit dataset)
min_rating = 3
ratings = ratings[ratings["rating"] >= min_rating]
ratings["rating"] = 1

In [196]:
ratings.describe()

Unnamed: 0,rating,uid,iid
count,81763.0,81763.0,81763.0
mean,1.0,321.286315,1726.459254
std,0.0,181.744214,1900.710684
min,1.0,0.0,0.0
25%,1.0,169.0,401.0
50%,1.0,320.0,1077.0
75%,1.0,473.0,2316.0
max,1.0,609.0,9723.0


In [211]:
# split
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=42)
def stat(df):
    print("dataframe shape: ", df.shape)
    print(f"unique user count: {df['uid'].nunique()}, unique item count: {df['iid'].nunique()}")

print(ratings.dtypes)
stat(ratings)
stat(ratings_train)
stat(ratings_test)

rating    int64
uid       int64
iid       int64
dtype: object
dataframe shape:  (81763, 3)
unique user count: 609, unique item count: 8452
dataframe shape:  (65410, 3)
unique user count: 609, unique item count: 7774
dataframe shape:  (16353, 3)
unique user count: 606, unique item count: 4379


In [212]:
# to sparse matrix
sparse_ratings_train = scipy.sparse.csr_matrix((ratings_train.rating, (ratings_train.iid, ratings_train.uid)), 
                                               shape=(n_item, n_user))

In [213]:
# training
from implicit.als import AlternatingLeastSquares
import time
import logging
model_name = "als"
log = logging.getLogger("implicit")

model = AlternatingLeastSquares(iterations=20, calculate_training_loss=True)
start = time.time()
model.fit(sparse_ratings_train)
log.debug("trained model '%s' in %s", model_name, time.time() - start)





In [214]:
def evaluate(model, ratings_train, ratings_test, K=10):
    # recommendations matrix: n_user x K (recommend list length)
    recommendations = model.recommend_all(ratings_train, 
                                          N = K, 
                                          filter_already_liked_items=True)
    # uid as index
    interactions = ratings_test.groupby(['uid'])['iid'].apply(list)
    
    # recall@k, precision@k, f1@k
    # https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)
    records = []
    for index, interactionList in interactions.items():
        # every user should has recommendations
        recommendationList = recommendations[index]
        intersection = set(recommendationList).intersection(set(interactionList))
        record = {
            "n_intersection": len(intersection),
            "n_interaction": len(interactionList),
#             "n_recommendation": len(recommendationList),
            f"P@{K}": len(intersection) * 1.0 / len(recommendationList),
            f"R@{K}": len(intersection) * 1.0 / len(interactionList),
        }
        if record["n_intersection"] > 0:
            record[f"F1@{K}"] = 2 * record[f"P@{K}"] * record[f"R@{K}"] / (record[f"P@{K}"] + record[f"R@{K}"])
        records.append(record)
    
    df = pd.DataFrame(records)
    print(df.describe())

In [215]:
evaluate(model, sparse_ratings_train.transpose(), ratings_test, K=10)


       n_intersection  n_interaction        P@10        R@10       F1@10
count      606.000000     606.000000  606.000000  606.000000  480.000000
mean         2.084158      26.985149    0.208416    0.163357    0.192434
std          1.802986      41.353163    0.180299    0.192219    0.138601
min          0.000000       1.000000    0.000000    0.000000    0.008811
25%          1.000000       6.000000    0.100000    0.015968    0.086957
50%          2.000000      12.000000    0.200000    0.100000    0.148701
75%          3.000000      30.000000    0.300000    0.250000    0.270270
max          9.000000     443.000000    0.900000    1.000000    0.700000


In [216]:
evaluate(model, sparse_ratings_train.transpose(), ratings_test, K=5)


       n_intersection  n_interaction         P@5         R@5        F1@5
count      606.000000     606.000000  606.000000  606.000000  413.000000
mean         1.257426      26.985149    0.251485    0.104346    0.183928
std          1.203192      41.353163    0.240638    0.145151    0.137033
min          0.000000       1.000000    0.000000    0.000000    0.007905
25%          0.000000       6.000000    0.000000    0.000000    0.076923
50%          1.000000      12.000000    0.200000    0.049390    0.153846
75%          2.000000      30.000000    0.400000    0.166667    0.260870
max          5.000000     443.000000    1.000000    1.000000    0.666667


In [217]:
# TODO: compare with SparkALS