# Unit 7: Content-based Filtering for Rating Prediction using a Factorization Machine

In this section, we switch from collaborative to content-based filtering. Where collaborative filtering exploits similarities among interactions, content-based filtering exploits similarities between user and/or item features. It finds combinations of user-item features that help to predict ratings or rankings.

Although we discussed the superiority of the ranking approach before, we do rating prediction again here for simplicity. The rating predictions are then used to impose an ordering on the items that are recommended to the user.

The model we use for the relationship between features and ratings is a factorization machine, which is similar to matrix factorization but offers more flexibility in modeling.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from typing import Dict, List, Tuple
import math

import numpy as np
import scipy as sp
from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_context("poster")
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style("whitegrid")

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

In [2]:
from pyfm import pylibfm

In [3]:
from recsys_training.data import Dataset, genres
from recsys_training.evaluation import get_relevant_items

In [4]:
ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'
ml100k_item_filepath = '../data/raw/ml-100k/u.item'
ml100k_user_filepath = '../data/raw/ml-100k/u.user'

## Load Data

In [5]:
# Load the MovieLens 100k ratings and split them; later cells read the result
# via data.train_ratings and data.test_ratings
# NOTE(review): seed=42 presumably makes the split reproducible - confirm in Dataset
data = Dataset(ml100k_ratings_filepath)
data.rating_split(seed=42)

In [6]:
# Map every user to a {item id: rating} dict built from the training ratings
user_ratings = {}
grouped = data.train_ratings[['user', 'item', 'rating']].groupby('user')
for user in data.users:
    # Transposing the (n, 2) array yields one row of item ids and one row of ratings
    item_ids, scores = grouped.get_group(user)[['item', 'rating']].values.T
    user_ratings[user] = dict(zip(item_ids.astype(int), scores.astype(float)))

In [7]:
# Read the pipe-separated item file (no header row); the columns are named manually:
# item id, title, release date, video release date, IMDb URL, followed by one
# indicator column per entry in `genres`
item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,
                        names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,
                        engine='python')

In [8]:
# Read the pipe-separated user file (no header row) with manually assigned column names
user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,
                        names=['user', 'age', 'gender', 'occupation', 'zip'])

## User and Item Content (Features)

### Exploration

In [9]:
item_feat.head()

Unnamed: 0,item,title,release,video_release,imdb_url,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [10]:
user_feat.head()

Unnamed: 0,user,age,gender,occupation,zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Preprocessing

#### Items

We keep the following information for items:
* release year
* genres

In [11]:
def min_max_scale(val, bounds):
    """Linearly rescale ``val`` so that ``bounds['min']`` maps to 0 and ``bounds['max']`` to 1.

    Args:
        val: value to rescale.
        bounds: mapping with the keys ``'min'`` and ``'max'`` describing the
            range that ``val`` is scaled against.

    Returns:
        ``(val - min) / (max - min)``. If ``max == min`` the range is degenerate
        and 0 is returned instead of dividing by zero.
    """
    lower = bounds['min']
    min_max_range = bounds['max'] - lower
    if min_max_range == 0:
        # A constant feature carries no information; map it to the lower end of
        # the target range. Multiplying by 0.0 keeps the type/shape of the input.
        return (val - lower) * 0.0
    return (val - lower) / min_max_range

In [13]:
# Extract the year (third '-'-separated token) from the 'DD-Mon-YYYY' release strings
idxs = item_feat.index[item_feat['release'].notnull()]
release_years = item_feat.loc[idxs, 'release'].str.split('-').str[2].astype(int)
item_feat.loc[idxs, 'release_year'] = release_years

# Items without a release date receive the median of the known release years
top_year = release_years.median()
idx = item_feat.index[item_feat['release'].isnull()]
item_feat.loc[idx, 'release_year'] = top_year

# Rescale the release year to [0, 1] using the observed minimum and maximum
item_year_bounds = {
    'min': item_feat['release_year'].min(),
    'max': item_feat['release_year'].max(),
}
item_feat['release_year'] = item_feat['release_year'].map(
    lambda year: min_max_scale(year, item_year_bounds))

# Only the item id, the genre indicators and the scaled release year are kept
item_feat.drop(columns=['title', 'release', 'video_release', 'imdb_url'], inplace=True)

In [14]:
item_feat.head()

Unnamed: 0,item,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.960526
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.960526
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.960526
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0.960526
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0.960526


#### Users

We keep the following information for users:
* age
* gender
* occupation
* zip-code

In [15]:
# Min-max scale the age
user_age_bounds = {'min': user_feat['age'].min(),
                   'max': user_feat['age'].max()}
user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))

# Transform gender characters to numerical values (categories);
# codes follow the alphabetical order of the distinct values
genders = sorted(user_feat['gender'].unique())
user_gender_map = dict(zip(genders, range(len(genders))))
user_feat['gender'] = user_feat['gender'].map(user_gender_map)

# Transform occupation strings to numerical values (categories);
# codes follow the alphabetical order of the distinct values
occupations = sorted(user_feat['occupation'].unique())
user_occupation_map = dict(zip(occupations, range(len(occupations))))
user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)

# Transform the zip codes to categories: non-numeric zip codes are replaced by '00000',
# then the last `zip_digits_to_cut` digits are cut off by integer division, i.e. for a
# five-digit zip code only the first two digits remain (categories 0..99)
idxs = user_feat[~user_feat['zip'].str.isnumeric()].index
user_feat.loc[idxs, 'zip'] = '00000'
zip_digits_to_cut = 3
user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)

In addition, we infer profiles by combining item information with rating data for each user to get features that represent the users' preferred genres and film age.

In [16]:
def user_profiler(group):
    """Summarise the items of one user group into a single profile Series.

    Args:
        group: DataFrame holding the genre indicator columns listed in
            ``genres`` and a ``release_year`` column.

    Returns:
        Series with the mean of every genre column followed by the mean,
        standard deviation and median (labelled ``'50%'``) of ``release_year``.
    """
    year_stats = group['release_year'].describe().loc[['mean', 'std', '50%']]
    return pd.concat([group[genres].mean(), year_stats], axis=0)

In [18]:
# Only training ratings of at least `min_rating` contribute to a user's profile
min_rating = 4
liked = data.train_ratings.rating >= min_rating
ratings = (data.train_ratings.loc[liked, ['user', 'item']]
           .merge(item_feat, on='item', how='left')
           .drop(columns=['item']))

# One profile row per user; the '50%' statistic is exposed as 'median'
grouped = ratings.groupby('user')
profiles = (grouped.apply(user_profiler)
            .reset_index()
            .rename(columns={'50%': 'median'}))

Finally, we join the original user information with their profiles' information and one-hot-encode categorical information

In [19]:
# Attach the inferred profile to every user (users without a profile get NaNs here)
user_feat = user_feat.merge(profiles, on='user', how='left')

# One-hot encode the categorical columns and swap them in for the raw codes
occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')
zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')
user_feat = pd.concat(
    [user_feat.drop(columns=['occupation', 'zip']), occupation_1H, zip_1H],
    axis=1)

# Missing values (e.g. from the left join above) are set to zero
user_feat = user_feat.fillna(0)

We remove the user/item id columns and replace the current dataframe indices with their values

In [20]:
# Move the id column of each feature frame into its (unnamed) index, in place
for frame, id_col in ((user_feat, 'user'), (item_feat, 'item')):
    frame.index = frame.pop(id_col).values

### Final Check

In [21]:
item_feat.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.960526
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.960526
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.960526
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0.960526
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0.960526


In [22]:
user_feat.head()

Unnamed: 0,age,gender,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,mean,std,median,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,zip_0,zip_1,zip_2,zip_3,zip_4,zip_5,zip_6,zip_7,zip_8,zip_9,zip_10,zip_11,zip_12,zip_13,zip_14,...,zip_36,zip_37,zip_38,zip_39,zip_40,zip_41,zip_42,zip_43,zip_44,zip_45,zip_46,zip_47,zip_48,zip_49,zip_50,zip_51,zip_52,zip_53,zip_54,zip_55,zip_56,zip_57,zip_58,zip_59,zip_60,zip_61,zip_62,zip_63,zip_64,zip_65,zip_66,zip_67,zip_68,zip_70,zip_71,zip_73,zip_74,zip_75,zip_76,zip_77,zip_78,zip_79,zip_80,zip_81,zip_82,zip_83,zip_84,zip_85,zip_87,zip_89,zip_90,zip_91,zip_92,zip_93,zip_94,zip_95,zip_96,zip_97,zip_98,zip_99
1,0.257576,1,0.00813,0.235772,0.097561,0.04065,0.04065,0.308943,0.081301,0.04065,0.447154,0.0,0.00813,0.03252,0.04065,0.02439,0.186992,0.178862,0.186992,0.097561,0.01626,0.871095,0.146112,0.934211,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0.69697,0,0.0,0.193548,0.064516,0.032258,0.032258,0.258065,0.193548,0.0,0.580645,0.0,0.064516,0.0,0.0,0.032258,0.322581,0.064516,0.193548,0.064516,0.0,0.960102,0.073141,0.986842,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0.242424,1,0.0,0.25,0.166667,0.0,0.0,0.25,0.25,0.0,0.5,0.0,0.0,0.0,0.0,0.333333,0.166667,0.166667,0.333333,0.166667,0.0,0.983553,0.016948,0.986842,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.257576,1,0.0,0.285714,0.142857,0.0,0.0,0.142857,0.214286,0.071429,0.285714,0.0,0.0,0.0,0.0,0.214286,0.142857,0.285714,0.357143,0.142857,0.0,0.968045,0.070898,0.986842,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0.393939,0,0.021277,0.382979,0.234043,0.12766,0.06383,0.595745,0.148936,0.0,0.12766,0.021277,0.021277,0.12766,0.06383,0.0,0.106383,0.340426,0.085106,0.148936,0.0,0.849384,0.168231,0.934211,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Sparsity of user/item content information

In [23]:
(user_feat==0).sum().sum()/user_feat.size

0.861337364529982

In [24]:
(item_feat==0).sum().sum()/item_feat.size

0.8640309155766944

## Factorization Machine

[Steffen Rendle: Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)

[pyFM - Factorization Machines in Python](https://github.com/coreylynch/pyFM)

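A second-order factorization machine predicts the target for a feature vector $x \in \mathbb{R}^n$ as

$$\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle \, x_i x_j$$

where $w_0$ is the global bias, $w_i$ are the linear weights and $v_i \in \mathbb{R}^k$ are the latent factor vectors whose inner products model the pairwise feature interactions.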

#### Create Feature Matrices

In [25]:
# fetch content information for all observed user-item rating combinations:
# each row of the resulting arrays belongs to one rating, looked up by the
# user id / item id of that rating in the id-indexed feature frames
user_cb_feat_train = user_feat.loc[data.train_ratings.user.values].values
user_cb_feat_test = user_feat.loc[data.test_ratings.user.values].values
item_cb_feat_train = item_feat.loc[data.train_ratings.item.values].values
item_cb_feat_test = item_feat.loc[data.test_ratings.item.values].values

In [26]:
# Build the design matrices: user features and item features side by side
# (one row per rating), stored in Compressed Sparse Row (CSR) format
X_train = sp.sparse.csr_matrix(
    np.concatenate([user_cb_feat_train, item_cb_feat_train], axis=1))
X_test = sp.sparse.csr_matrix(
    np.concatenate([user_cb_feat_test, item_cb_feat_test], axis=1))

In [27]:
def get_sparsity(sparse_arr) -> float:
    """Return the fraction of entries of a 2-D sparse matrix that are not stored.

    Args:
        sparse_arr: object exposing ``shape`` (rows, columns) and ``nnz``
            (number of stored elements), e.g. a SciPy CSR matrix.

    Returns:
        ``1 - nnz / (rows * columns)``.
    """
    total_cells = sparse_arr.shape[0] * sparse_arr.shape[1]
    return 1 - sparse_arr.nnz / total_cells

In [28]:
X_train

<80000x161 sparse matrix of type '<class 'numpy.float64'>'
	with 1995351 stored elements in Compressed Sparse Row format>

In [29]:
# Sparsity of Training Data
get_sparsity(X_train)

0.8450814440993789

In [30]:
X_test

<20000x161 sparse matrix of type '<class 'numpy.float64'>'
	with 498098 stored elements in Compressed Sparse Row format>

In [31]:
# Sparsity of Test Data
get_sparsity(X_test)

0.8453111801242236

#### Create Target Vectors for Rating Predictions

In [32]:
y_train = data.train_ratings.rating.values.astype(float)
y_test = data.test_ratings.rating.values

#### Train Factorization Machine for Rating Prediction as Regressor using pyFM

In [33]:
n_epochs = 50  # number of full stochastic passes through the training data
k = 16  # passed to the factorization machine as `num_factors`
random_seed = 28  # passed to the factorization machine as `seed`

In [34]:
# Fit a pyFM factorization machine as a regressor on the content-based design matrix.
# With verbose=True the training MSE is printed after every epoch; pyFM also reports
# holding out 0.01 of the training data as validation set for adaptive regularization.
fm_cb = pylibfm.FM(num_factors=k,
                   num_iter=n_epochs,
                   verbose=True,
                   task="regression",
                   initial_learning_rate=0.001,
                   learning_rate_schedule="optimal",
                   seed=random_seed)
fm_cb.fit(X_train, y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.58234
-- Epoch 2
Training MSE: 0.54275
-- Epoch 3
Training MSE: 0.52975
-- Epoch 4
Training MSE: 0.52318
-- Epoch 5
Training MSE: 0.52044
-- Epoch 6
Training MSE: 0.51692
-- Epoch 7
Training MSE: 0.51528
-- Epoch 8
Training MSE: 0.51396
-- Epoch 9
Training MSE: 0.51294
-- Epoch 10
Training MSE: 0.51197
-- Epoch 11
Training MSE: 0.51168
-- Epoch 12
Training MSE: 0.51125
-- Epoch 13
Training MSE: 0.51067
-- Epoch 14
Training MSE: 0.51097
-- Epoch 15
Training MSE: 0.51078
-- Epoch 16
Training MSE: 0.50997
-- Epoch 17
Training MSE: 0.51024
-- Epoch 18
Training MSE: 0.51071
-- Epoch 19
Training MSE: 0.51002
-- Epoch 20
Training MSE: 0.50968
-- Epoch 21
Training MSE: 0.50932
-- Epoch 22
Training MSE: 0.50995
-- Epoch 23
Training MSE: 0.50949
-- Epoch 24
Training MSE: 0.50883
-- Epoch 25
Training MSE: 0.50892
-- Epoch 26
Training MSE: 0.50831
-- Epoch 27
Training MSE: 0.50846
-- Epoch 28
Tra

## Evaluation on Test Set

In [35]:
y_pred = fm_cb.predict(X_test)

$MSE$

In [36]:
mean_squared_error(y_test, y_pred)

1.054013611878566

$MAE$

In [37]:
mean_absolute_error(y_test, y_pred)

0.8316663556320505

In [38]:
def get_prediction(fm: object, user: int, user_feat: pd.DataFrame, item_feat: pd.DataFrame,
                   items: np.ndarray = None, remove_known_pos: bool = True) -> Dict[int, Dict[str, float]]:
    """Predict the ratings of one user for a set of items, best prediction first.

    Args:
        fm: fitted model exposing ``predict(X)`` for a sparse design matrix.
        user: user id; must be part of the index of ``user_feat``.
        user_feat: user features indexed by user id.
        item_feat: item features indexed by item id.
        items: item id(s) to score; a single integer, a list or an array.
            If ``None``, the candidates are taken from the notebook-level
            ``data.items``.
        remove_known_pos: only used when ``items`` is ``None``; if true, every
            item found in the notebook-level ``user_ratings[user]`` is removed
            from the candidates.

    Returns:
        Dict mapping item id to ``{'pred': predicted rating}``, inserted in
        descending order of the prediction. Empty if there is nothing to score.
    """
    if items is None:
        if remove_known_pos:
            # Predict from unobserved items; a user without known ratings
            # simply has nothing to remove
            known_items = np.array(list(user_ratings.get(user, {})), dtype=int)
            items = np.setdiff1d(data.items, known_items)
        else:
            items = data.items
    # Accept any scalar integer type as well as lists: the fancy indexing
    # `items[sorting]` below requires a 1-d array
    items = np.atleast_1d(items)

    n_items = len(items)
    if n_items == 0:
        return {}

    # Repeat the user's feature row once per item and put the item features next to it,
    # mirroring the column layout used to build the training design matrix
    single_user_cb_feat = user_feat.loc[user].values.reshape(1, -1).repeat(n_items, axis=0)
    all_items_cb_feat = item_feat.loc[items].values

    input_data = np.concatenate((single_user_cb_feat, all_items_cb_feat), axis=1)
    input_data = sp.sparse.csr_matrix(input_data)

    preds = fm.predict(input_data)
    sorting = np.argsort(preds)[::-1]

    return {item: {'pred': pred} for item, pred in
            zip(items[sorting], preds[sorting])}

In [39]:
get_prediction(fm_cb, 1, user_feat, item_feat)

{656: {'pred': 4.808310660845486},
 1122: {'pred': 4.636857180467904},
 1542: {'pred': 4.578625157375732},
 675: {'pred': 4.5530277711596865},
 835: {'pred': 4.542644627002138},
 484: {'pred': 4.531647492781998},
 1203: {'pred': 4.520766241481584},
 498: {'pred': 4.5174534045898085},
 617: {'pred': 4.513777102943216},
 836: {'pred': 4.511424197193398},
 1453: {'pred': 4.501179852252117},
 478: {'pred': 4.489745527990144},
 615: {'pred': 4.4883290235534234},
 1458: {'pred': 4.477009470440476},
 1286: {'pred': 4.477009470440476},
 525: {'pred': 4.470681867883593},
 1604: {'pred': 4.458037665541619},
 1397: {'pred': 4.448929048510699},
 607: {'pred': 4.446012944301404},
 1455: {'pred': 4.417843459871075},
 1580: {'pred': 4.417175867962432},
 967: {'pred': 4.41650502129444},
 1124: {'pred': 4.404672945676309},
 1198: {'pred': 4.402028447098578},
 612: {'pred': 4.400293007686313},
 429: {'pred': 4.398373996767872},
 1299: {'pred': 4.3876403735565885},
 611: {'pred': 4.385114862716462},
 633

In [40]:
def get_recommendations(fm_cb: object, user: int, N: int, user_feat: pd.DataFrame, item_feat: pd.DataFrame,
                        remove_known_pos: bool = True) -> List[Tuple[int, Dict[str, float]]]:
    """Return the ``N`` items with the highest predicted rating for a user.

    Args:
        fm_cb: fitted model that is handed to ``get_prediction``.
        user: user id to recommend for.
        N: maximum number of recommendations; values <= 0 yield an empty list.
        user_feat: user features indexed by user id.
        item_feat: item features indexed by item id.
        remove_known_pos: forwarded to ``get_prediction``.

    Returns:
        List of ``(item id, {'pred': predicted rating})`` tuples, best first.
        Shorter than ``N`` if fewer candidate items are available.
    """
    predictions = get_prediction(fm_cb, user, user_feat, item_feat,
                                 remove_known_pos=remove_known_pos)
    # get_prediction returns its dict ordered best-first, so the top N are the
    # first N entries. max() guards against N <= 0, for which the former
    # append-and-break loop never hit `len(...) == N` and returned every item.
    return list(predictions.items())[:max(N, 0)]

In [42]:
get_recommendations(fm_cb, 1, N=10, user_feat=user_feat, item_feat=item_feat)

[(656, {'pred': 4.808310660845486}),
 (1122, {'pred': 4.636857180467904}),
 (1542, {'pred': 4.578625157375732}),
 (675, {'pred': 4.5530277711596865}),
 (835, {'pred': 4.542644627002138}),
 (484, {'pred': 4.531647492781998}),
 (1203, {'pred': 4.520766241481584}),
 (498, {'pred': 4.5174534045898085}),
 (617, {'pred': 4.513777102943216}),
 (836, {'pred': 4.511424197193398})]

## Evaluation

In [43]:
N = 10

In [44]:
relevant_items = get_relevant_items(data.test_ratings)

In [46]:
# Precision@N per user; users without relevant test items keep the value None
users = relevant_items.keys()
prec_at_N = dict.fromkeys(data.users)

for user in users:
    top_n = get_recommendations(fm_cb, user, N,
                                user_feat=user_feat,
                                item_feat=item_feat,
                                remove_known_pos=True)
    # Keep only the item ids of the (item, prediction) tuples
    recommendations = [item for item, *_ in top_n]
    hits = np.intersect1d(recommendations, relevant_items[user])
    prec_at_N[user] = len(hits)/N

In [47]:
recommendations

[498, 491, 1122, 133, 483, 1542, 1453, 510, 617, 601]

In [48]:
np.mean([val for val in prec_at_N.values() if val is not None])

0.02553191489361702