# Setting up Colab environment

In [None]:
import os
username = 'Personalization-Technologies-Lab'
repo = 'Sber-RecSys-w2024'

# remove local directory if it already exists
if os.path.isdir(repo):
    !rm -rf {repo}

!git clone https://github.com/{username}/{repo}.git

In [None]:
!pip install lightfm

In [None]:
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from tqdm.auto import tqdm

from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange

from polara.evaluation.pipelines import random_grid
from polara.lib.earlystopping import early_stopping_callback
from polara.tools.display import print_frames

# navigating to cloned repo directory in Colab
%cd {repo}
from evaluation import topn_recommendations
# restoring original location
%cd -

# Data Preparation

The code below is adapted from official `LightFM`'s documentation for a cold-start scenario:  
https://making.lyst.com/lightfm/docs/examples/hybrid_crossvalidated.html

## Reading data

You will use the StackExchange data dump. The dataset consists of users and questions they answered.

**Task**:  
Find users that are most qualified for answering new questions.

Your recommendation algorithm must tailor matching between users and questions based on user expertise. You will need to use hybrid approach that utilizes side information about items. The dataset contains question labels in the form of user-assigned `tags`. Hence, even though questions will be "cold" (i.e., unanswered), you can still find the best match between experts and questions based on their answering history and tags used in the past.

In [2]:
data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

In [3]:
data.keys()

dict_keys(['train', 'test', 'item_features', 'item_feature_labels'])

In [4]:
entities = ['users', 'questions']
training_data = pd.DataFrame(dict(zip(entities, data['train'].nonzero())))
test_data = pd.DataFrame(dict(zip(entities, data['test'].nonzero())))

In [5]:
item_tags = (
    pd.DataFrame(dict(zip(['questions', 'tags'], data['item_features'].nonzero())))
    .assign(tags = lambda x: data['item_feature_labels'][x['tags'].values])
    .groupby('questions')
    ['tags'].apply(list)
    .to_frame()
)

In [6]:
print_frames([training_data.head(), # data for training and validation
              test_data.head(), # data for testing
              item_tags.head()]) # item features data

Unnamed: 0_level_0,users,questions
Unnamed: 0_level_1,users,questions
Unnamed: 0_level_2,tags,Unnamed: 2_level_2
questions,Unnamed: 1_level_3,Unnamed: 2_level_3
0,0,2
1,0,4
2,1,5
3,1,13
4,1,17
0,1,31
1,1,115
2,1,151
3,1,251
4,1,1140

Unnamed: 0,users,questions
0,0,2
1,0,4
2,1,5
3,1,13
4,1,17

Unnamed: 0,users,questions
0,1,31
1,1,115
2,1,151
3,1,251
4,1,1140

Unnamed: 0_level_0,tags
questions,Unnamed: 1_level_1
0,"[bayesian, prior, elicitation]"
1,"[distributions, normality]"
2,"[software, open-source]"
3,"[distributions, statistical-significance]"
4,[machine-learning]


## Data Split

The dataset already provides splitting. But an additional step is still required: splitting into validation and actual test parts.

To simplify evaluation, only a single true expert will be withheld from each "cold" question.

In [7]:
random_state = np.random.RandomState(100)
final_test = (
    test_data
    .sample(frac=1, random_state=random_state)
    .drop_duplicates(subset=['questions'])
    .sample(frac=0.75, random_state=random_state) # make test and validation sizes more balanced
    .sort_values('questions')
)
validation = (
    test_data
    .drop(final_test.index)
    .sample(frac=1, random_state=random_state)
    .drop_duplicates(subset=['questions'])
    .sort_values('questions')
)

In [10]:
print_frames([validation.head(), final_test.head()])

Unnamed: 0_level_0,users,questions
Unnamed: 0_level_1,users,questions
187,137,2.0
2970,2364,42.0
50,46,80.0
2658,1931,87.0
958,421,91.0
67,84,15.0
23,19,19.0
2717,2067,26.0
0,1,31.0
29,21,36.0

Unnamed: 0,users,questions
187,137,2
2970,2364,42
50,46,80
2658,1931,87
958,421,91

Unnamed: 0,users,questions
67,84,15
23,19,19
2717,2067,26
0,1,31
29,21,36


In [11]:
validation.nunique()

users         345
questions    1356
dtype: int64

In [12]:
final_test.nunique()

users         509
questions    2823
dtype: int64

# Defining LightFM

In [40]:
def build_lfm_model(config, data, data_description, early_stop_config=None, iterator=None):
    # the model
    model = LightFM(
        no_components = config['num_components'],
        max_sampled = config['max_sampled'],
        loss = config['loss'],
        learning_schedule = config['learning_schedule'],
        user_alpha = config['user_alpha'],
        item_alpha = config['item_alpha']
    )
    if iterator is None:
        iterator = lambda x: x
    # early stoppping configuration
    es_config = check_early_stop_config(early_stop_config)
    # training
    for epoch in iterator(range(config['max_epochs'])):
        try:
            train_lfm_epoch(epoch, model, data, data_description, es_config)
        except StopIteration:
            break
    return model


def check_early_stop_config(early_stop_config):
    if early_stop_config is None:
        early_stop_config = {}
    try:
        es_dict = dict(
            early_stopper = early_stop_config['evaluation_callback'],
            callback_interval = early_stop_config['callback_interval'],
            holdout = early_stop_config['holdout'],
            stop_early = True
        )
    except KeyError:
        es_dict = dict(stop_early = False)
    return es_dict


def train_lfm_epoch(
    epoch, model, train, data_description, es_config,
):
    model.fit_partial(
        train,
        user_features=data_description['user_features'],
        item_features=data_description['item_features'],
        epochs=1
    )
    if es_config['stop_early'] and ((epoch+1) % es_config['callback_interval'] == 0):
        # evaluate model and raise StopIteration if early stopping condition is met
        es_config['early_stopper'](epoch, model, es_config['holdout'], data_description)


In [41]:
def lightfm_scoring(model, data, data_description):
    dtype = 'i4'
    all_users = np.arange(data_description['n_users'], dtype=dtype)
    test_items = data_description['cold_items'].astype(dtype)
    item_index, user_index = np.meshgrid(test_items, all_users, copy=False)

    scores = model.predict(
        user_index.ravel(),
        item_index.ravel(),
        item_features=data_description['item_features'],
    ).reshape(len(test_items), len(all_users), order='F')
    return scores

## Early stopping

Early stopping is based on the `polara`'s `early_stopping_callback` callback.

In [42]:
def coldstart_evaluate(recommended_users, holdout, data_description, topn=10):
    userid = data_description['users']
    holdout_users = holdout[userid].values
    assert recommended_users.shape[0] == len(holdout_users)
    hits_mask = recommended_users[:, :topn] == holdout_users.reshape(-1, 1)
    # HR calculation
    hr = np.mean(hits_mask.any(axis=1))
    # MRR calculation
    n_test_items = recommended_users.shape[0]
    hit_rank = np.where(hits_mask)[1] + 1.0
    mrr = np.sum(1 / hit_rank) / n_test_items
    return {'hr': hr, 'mrr': mrr}

def lfm_evaluator(model, holdout, data_description, target_metric='hr'):
    lfm_scores = lightfm_scoring(model, None, data_description)
    lfm_recs = topn_recommendations(lfm_scores)
    metrics = coldstart_evaluate(lfm_recs, holdout, {'users': 'users'})
    return metrics[target_metric]

## Quick check

In [43]:
lfm_config = dict(
    num_components = 30,
    loss = 'warp',
    max_sampled = 3,
    max_epochs = 100,
    learning_schedule = 'adagrad',
    user_alpha = 1e-3,
    item_alpha = 1e-3,
)

try_early_stop = early_stopping_callback(
        lfm_evaluator, max_fails=3, verbose=True
)

early_stop_config = dict(
    evaluation_callback = try_early_stop,
    callback_interval = 10, # break between consequent evaluation in epochs
    holdout = validation,
)

In [61]:
data_description = dict(
    users = 'users',
    items = 'questions',
    n_users = data['train'].shape[0],
    cold_items = validation['questions'].values,
    user_features = data.get('user_features'),
    item_features = data.get('item_features'),
)
data_description

{'users': 'users',
 'items': 'questions',
 'n_users': 3213,
 'cold_items': array([    2,    42,    80, ..., 72314, 72323, 72358], dtype=int32),
 'user_features': None,
 'item_features': <72360x1246 sparse matrix of type '<class 'numpy.float32'>'
 	with 198963 stored elements in Compressed Sparse Row format>}

In [45]:
lfm_params = build_lfm_model(
    lfm_config,
    data['train'],
    data_description,
    early_stop_config=early_stop_config,
    iterator=tqdm
)

 10%|█         | 10/100 [00:06<01:07,  1.34it/s]

Step 9 metric score: 0.021386430678466076


 20%|██        | 20/100 [00:12<01:05,  1.23it/s]

Step 19 metric score: 0.019174041297935103


 30%|███       | 30/100 [00:18<00:54,  1.28it/s]

Step 29 metric score: 0.021386430678466076


 40%|████      | 40/100 [00:24<00:43,  1.37it/s]

Step 39 metric score: 0.022123893805309734


 50%|█████     | 50/100 [00:30<00:33,  1.47it/s]

Step 49 metric score: 0.022861356932153392


 60%|██████    | 60/100 [00:36<00:28,  1.39it/s]

Step 59 metric score: 0.022123893805309734


 70%|███████   | 70/100 [00:42<00:21,  1.36it/s]

Step 69 metric score: 0.021386430678466076


 79%|███████▉  | 79/100 [00:47<00:12,  1.65it/s]

Step 79 metric score: 0.02064896755162242
Metric no longer improves. Best score 0.022861356932153392, attained in 50 iterations.





# Hyper-parameters tuning

In [46]:
lfm_params_grid = dict(
    num_components = [8, 12, 16, 24, 32, 48, 64],
    loss = ['warp'],
    max_sampled = [3, 10, 30, 100],
    max_epochs = [100],
    learning_schedule = ['adagrad'],
    user_alpha = [1e-5],
    item_alpha = [1e-5],
)

In [47]:
param_grid, param_names = random_grid(lfm_params_grid, n=5)

In [48]:
early_stop_config = dict(
    callback_interval = 10, # break between consequent evaluation in epochs
    holdout = validation,
)

lfm_results = {}

for grid_params in tqdm(param_grid):
    lfm_config = dict(zip(param_names, grid_params))
    early_stop_config['evaluation_callback'] = es_call = early_stopping_callback(
        lfm_evaluator, max_fails=3, verbose=False
    )
    lfm_params = build_lfm_model(
        lfm_config,
        data['train'],
        data_description,
        early_stop_config=early_stop_config,
    )
    num_epochs = es_call.iter + 1 # store optimal number of epochs
    lfm_results[grid_params+(num_epochs,)] = es_call.target # store optimal value

100%|██████████| 5/5 [02:32<00:00, 30.48s/it]


In [50]:
optimal_lfm_config = pd.Series(lfm_results).idxmax()

(48, 'warp', 30, 100, 'adagrad', 1e-05, 1e-05, 10)

# Final evaluation

In [53]:
lfm_optimal_config = dict(zip(param_names, optimal_lfm_config[:-1]))
lfm_optimal_config['max_epochs'] = optimal_lfm_config[-1]
lfm_optimal_config

{'num_components': 48,
 'loss': 'warp',
 'max_sampled': 30,
 'max_epochs': 10,
 'learning_schedule': 'adagrad',
 'user_alpha': 1e-05,
 'item_alpha': 1e-05}

In [67]:
def matrix_from_observations(data, data_description, dtype='f4'):
    useridx = data[data_description['users']]
    itemidx = data[data_description['items']]
    values = np.ones(data.shape[0])
    return csr_matrix((values, (useridx, itemidx)), dtype=dtype)

In [69]:
final_train = training_data.append(validation, ignore_index=True)
train_matrix = matrix_from_observations(
    final_train, data_description, dtype=data['train'].dtype
)

In [70]:
lfm_params = build_lfm_model(
    lfm_optimal_config,
    train_matrix,
    data_description,
    early_stop_config=None,
    iterator=tqdm
)

100%|██████████| 10/10 [00:11<00:00,  1.19s/it]


In [72]:
data_description['cold_items'] = final_test[data_description['items']].values

In [73]:
lfm_scores = lightfm_scoring(lfm_params, None, data_description)
lfm_recs = topn_recommendations(lfm_scores)
metrics = coldstart_evaluate(lfm_recs, final_test, data_description)


In [74]:
metrics

{'hr': 0.024087849805171802, 'mrr': 0.008928922850283105}