# Unit 8: Hybrid Recommender Model using both Collaborative Filtering and Content-based Filtering using a Factorization Machine

In this section, we combine CF and CBF.

Therefore, we simply add the one-hot-encoded user and item IDs to the data. Thus, the model is capable of factorizing the similarities in rating and features for rating prediction. This combination is called hybrid as it combines two recommenders.

In [1]:
from collections import OrderedDict
import itertools
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyfm import pylibfm
from scipy import sparse
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
from recsys_training.data import Dataset, genres
from recsys_training.evaluation import get_relevant_items
from recsys_training.utils import get_sparsity

In [4]:
ml100k_ratings_filepath = '../../data/raw/ml-100k/u.data'
ml100k_item_filepath = '../../data/raw/ml-100k/u.item'
ml100k_user_filepath = '../../data/raw/ml-100k/u.user'

## Load Data

In [5]:
data = Dataset(ml100k_ratings_filepath)
data.rating_split(seed=42)
user_ratings = data.get_user_ratings()

In [6]:
item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,
                        names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,
                        engine='python')

In [7]:
user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,
                        names=['user', 'age', 'gender', 'occupation', 'zip'])

## User and Item Content (Features)

### Preprocessing

#### Items

We keep the following information for items:
* release year
* genres

In [8]:
def min_max_scale(val, bounds):
    min_max_range = bounds['max']-bounds['min']
    return (val-bounds['min'])/min_max_range

In [9]:
# Infer the release year
idxs = item_feat[item_feat['release'].notnull()].index
item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release'].str.split('-')
item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)

# Impute median release year value for the items with missing release year
top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']
idx = item_feat[item_feat['release'].isnull()].index
item_feat.loc[idx, 'release_year'] = top_year

# Min-max scale the release year
item_year_bounds = {'min': item_feat['release_year'].min(),
                    'max': item_feat['release_year'].max()}
item_feat['release_year'] = item_feat['release_year'].apply(
    lambda year: min_max_scale(year, item_year_bounds))

# Drop other columns
item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)

#### users

We keep the following information for users:
* age
* gender
* occupation
* zip-code

In [10]:
# Min-max scale the age
user_age_bounds = {'min': user_feat['age'].min(),
                   'max': user_feat['age'].max()}
user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))

# Transform gender characters to numerical values (categories)
genders = sorted(user_feat['gender'].unique())
user_gender_map = dict(zip(genders, range(len(genders))))
user_feat['gender'] = user_feat['gender'].map(user_gender_map)

# Transform occupation strings to numerical values (categories)
occupations = sorted(user_feat['occupation'].unique())
user_occupation_map = dict(zip(occupations, range(len(occupations))))
user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)

# Transform the zip codes to categories keeping the first three digits and impute for missing
idxs = user_feat[~user_feat['zip'].str.isnumeric()].index
user_feat.loc[idxs, 'zip'] = '00000'
zip_digits_to_cut = 3
user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)

In addition, we infer profiles by combining item information with rating data for each user to get features that represent the users' preferred genres and film age

In [11]:
def user_profiler(group):
    genre_dist = group[genres].mean()
    year_dist = group['release_year'].describe()[['mean', 'std', '50%']]

    return pd.concat((genre_dist, year_dist), axis=0)

In [12]:
def get_user_profiles(ratings: pd.DataFrame,
                      item_feat: pd.DataFrame,
                      min_rating: float = 4.0) -> pd.DataFrame:
    ratings = ratings[ratings.rating >= min_rating]
    ratings = ratings[['user', 'item']]
    ratings = ratings.merge(item_feat, on='item', how='left')
    ratings.drop(['item'], axis=1, inplace=True)

    grouped = ratings.groupby('user')
    profiles = grouped.apply(user_profiler).reset_index()
    profiles.rename(columns={'50%': 'median'}, inplace=True)
    
    return profiles

Finally, we join the original user information with their profiles' information and one-hot-encode categorical information

In [13]:
profiles = get_user_profiles(data.train_ratings, item_feat)
user_feat = user_feat.merge(profiles, on='user', how='left')

occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')
zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')

user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)
user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)

user_feat.fillna(0, inplace=True)

We remove the user/item id columns and replace the current dataframe indices with their values

In [14]:
user_feat.index = user_feat['user'].values
user_feat.drop('user', axis=1, inplace=True)

item_feat.index = item_feat['item'].values
item_feat.drop('item', axis=1, inplace=True)

## Factorization Machine for a Hybrid Recommender

[Steffen Rendle: Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)

[pyFM - Factorization Machines in Python](https://github.com/coreylynch/pyFM)

#### Create Feature Matrices

In [15]:
# fetch content information for all observed user-item rating combinations
user_cb_feat_train = user_feat.loc[data.train_ratings.user.values].values
user_cb_feat_test = user_feat.loc[data.test_ratings.user.values].values
item_cb_feat_train = item_feat.loc[data.train_ratings.item.values].values
item_cb_feat_test = item_feat.loc[data.test_ratings.item.values].values

![](../Parrot.png)

**Task:** Implement additional arrays for user and item IDs and adjust the design matrices `X_train` and `X_test` accordingly.

In [16]:
def one_hot_encode_ids(ids: np.array, length):
    one_hot_enc = np.zeros((len(ids), length))
    one_hot_enc[np.arange(len(ids)), ids] = 1
    return one_hot_enc

In [17]:
# Subtract 1 to turn 1-base-indexed into 0-base-indexed IDs for 0-base-indexed array
user_cf_feat_train = one_hot_encode_ids(data.train_ratings.user.values-1, data.n_users)
user_cf_feat_test = one_hot_encode_ids(data.test_ratings.user.values-1, data.n_users)
item_cf_feat_train = one_hot_encode_ids(data.train_ratings.item.values-1, data.n_items)
item_cf_feat_test = one_hot_encode_ids(data.test_ratings.item.values-1, data.n_items)

In [18]:
# concatenate user and item content information to form design matrices
# and convert to sparse matrix in Compressed Sparse Row (CSR) format
X_train = np.concatenate((user_cb_feat_train, item_cb_feat_train,
                          user_cf_feat_train, item_cf_feat_train), axis=1)
X_train = sparse.csr_matrix(X_train)
X_test = np.concatenate((user_cb_feat_test, item_cb_feat_test,
                         user_cf_feat_test, item_cf_feat_test), axis=1)
X_test = sparse.csr_matrix(X_test)

In [20]:
X_train

<80000x2786 sparse matrix of type '<class 'numpy.float64'>'
	with 2155351 stored elements in Compressed Sparse Row format>

In [21]:
# Sparsity of Training Data
get_sparsity(X_train)

0.9903295450466619

In [22]:
X_test

<20000x2786 sparse matrix of type '<class 'numpy.float64'>'
	with 538098 stored elements in Compressed Sparse Row format>

In [23]:
# Sparsity of Test Data
get_sparsity(X_test)

0.9903428212491027

#### Create Target Matrices for Rating Predictions

In [24]:
y_train = data.train_ratings.rating.values.astype(float)
y_test = data.test_ratings.rating.values

#### Train Factorization Machine for Rating Prediction as Regressor using pyFM

In [27]:
n_epochs = 30  # number of full stochastic passes through the training data
k = 16
random_seed = 28

In [28]:
fm_hybrid = pylibfm.FM(num_factors=k,
                       num_iter=n_epochs,
                       verbose=True,
                       task="regression",
                       initial_learning_rate=0.001,
                       learning_rate_schedule="optimal",
                       seed=random_seed)
fm_hybrid.fit(X_train, y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.54742
-- Epoch 2
Training MSE: 0.47356
-- Epoch 3
Training MSE: 0.44877
-- Epoch 4
Training MSE: 0.43495
-- Epoch 5
Training MSE: 0.42588
-- Epoch 6
Training MSE: 0.41863
-- Epoch 7
Training MSE: 0.41311
-- Epoch 8
Training MSE: 0.40761
-- Epoch 9
Training MSE: 0.40307
-- Epoch 10
Training MSE: 0.39861
-- Epoch 11
Training MSE: 0.39402
-- Epoch 12
Training MSE: 0.39086
-- Epoch 13
Training MSE: 0.38718
-- Epoch 14
Training MSE: 0.38318
-- Epoch 15
Training MSE: 0.37987
-- Epoch 16
Training MSE: 0.37659
-- Epoch 17
Training MSE: 0.37341
-- Epoch 18
Training MSE: 0.37014
-- Epoch 19
Training MSE: 0.36745
-- Epoch 20
Training MSE: 0.36448
-- Epoch 21
Training MSE: 0.36187
-- Epoch 22
Training MSE: 0.35923
-- Epoch 23
Training MSE: 0.35631
-- Epoch 24
Training MSE: 0.35428
-- Epoch 25
Training MSE: 0.35159
-- Epoch 26
Training MSE: 0.34939
-- Epoch 27
Training MSE: 0.34738
-- Epoch 28
Tra

## Evaluation on Test Set

In [29]:
y_pred = fm_hybrid.predict(X_test)

$MSE$

In [30]:
mean_squared_error(y_test, y_pred)

0.8414891043110517

$MAE$

In [31]:
mean_absolute_error(y_test, y_pred)

0.7167074022523409

In [32]:
def get_prediction(fm: object, user: int, user_feat: pd.DataFrame, item_feat: pd.DataFrame,
                   items: np.array = None, remove_known_pos: bool = True) -> Dict[int, Dict[str, float]]:
    
    if items is None:
        if remove_known_pos:
            # Predict from unobserved items
            known_items = np.array(list(user_ratings[user].keys()))
            items = np.setdiff1d(data.items, known_items)
        else:
            items = np.array(data.items)
    if type(items) == np.int64:
        items = np.array([items])
    
    n_items = len(items)
    
    single_user_cb_feat = user_feat.loc[user].values.reshape(1, -1).repeat(n_items, axis=0)
    all_items_cb_feat = item_feat.loc[items].values
    
    input_data = np.concatenate((single_user_cb_feat, all_items_cb_feat), axis=1)
    input_data = sparse.csr_matrix(input_data)
    
    preds = fm.predict(input_data)
    sorting = np.argsort(preds)[::-1]
    
    preds = {item: {'pred': pred} for item, pred in
             zip(items[sorting], preds[sorting])}
    
    return preds

In [33]:
predictions = get_prediction(fm_hybrid, 1, user_feat, item_feat)
list(predictions.items())[:10]

[(611, {'pred': 4.813609502259241}),
 (1122, {'pred': 4.740596963321308}),
 (1453, {'pred': 4.625803040835942}),
 (656, {'pred': 4.588156414546111}),
 (1366, {'pred': 4.541011702303663}),
 (1561, {'pred': 4.541011702303663}),
 (847, {'pred': 4.509516644092099}),
 (1542, {'pred': 4.476956271572494}),
 (1064, {'pred': 4.458898036095825}),
 (617, {'pred': 4.4184344080264975})]

In [34]:
def get_recommendations(fm_cb: object,
                        user: int,
                        N: int,
                        user_feat: pd.DataFrame,
                        item_feat: pd.DataFrame,
                        remove_known_pos: bool = True) -> List[Tuple[int, Dict[str, float]]]:
    
    recommendations = []
    
    predictions = get_prediction(fm_cb, user, user_feat, item_feat,
                                 remove_known_pos=remove_known_pos)

    for item, pred in predictions.items():
        add_item = (item, pred)
        recommendations.append(add_item)
        if len(recommendations) == N:
            break

    return recommendations

In [35]:
get_recommendations(fm_hybrid, 1, N=10, user_feat=user_feat, item_feat=item_feat)

[(611, {'pred': 4.813609502259241}),
 (1122, {'pred': 4.740596963321308}),
 (1453, {'pred': 4.625803040835942}),
 (656, {'pred': 4.588156414546111}),
 (1366, {'pred': 4.541011702303663}),
 (1561, {'pred': 4.541011702303663}),
 (847, {'pred': 4.509516644092099}),
 (1542, {'pred': 4.476956271572494}),
 (1064, {'pred': 4.458898036095825}),
 (617, {'pred': 4.4184344080264975})]

## Evaluation

In [36]:
N = 10

In [37]:
relevant_items = get_relevant_items(data.test_ratings)

In [38]:
users = relevant_items.keys()
prec_at_N = dict.fromkeys(data.users)

for user in users:
    recommendations = get_recommendations(fm_hybrid, user, N,
                                          user_feat=user_feat, item_feat=item_feat)
    recommendations = [val[0] for val in recommendations]
    hits = np.intersect1d(recommendations,
                          relevant_items[user])
    prec_at_N[user] = len(hits)/N

In [39]:
recommendations

[133, 483, 510, 17, 1122, 1204, 245, 1453, 1542, 1299]

In [40]:
np.mean([val for val in prec_at_N.values() if val is not None])

0.019468085106382982