# Fit WMF (weighted matrix factorization) to the binarized ML20M

In [21]:
import itertools
import os
import sys
# Cap OpenBLAS at a single thread. This is set before numpy is imported below;
# presumably it avoids oversubscribing cores when factorization runs with
# several joblib workers (n_jobs) -- TODO confirm.
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import numpy as np
import pandas as pd
from scipy import sparse

In [2]:
import content_wmf
import batched_inv_joblib
import rec_eval

### Load pre-processed data

Change `DATA_DIR` below to the directory where you saved the pre-processed data produced by [this notebook](./preprocess_ML20M.ipynb).

In [5]:
# Directory holding the pre-processed files read by this notebook
# (unique_uid.txt, unique_sid.txt, train.csv, validation.csv, test.csv).
# Set the ML20M_DATA_DIR environment variable to point somewhere else without
# editing the notebook; the path below is only the fallback.
DATA_DIR = os.environ.get('ML20M_DATA_DIR', '/home/net1/appry001/data/ml-20m/pro')

In [6]:
# One whitespace-stripped entry per line of unique_uid.txt, in file order;
# the length of this list is used as the number of users.
with open(os.path.join(DATA_DIR, 'unique_uid.txt'), 'r') as f:
    unique_uid = [line.strip() for line in f]

# Same layout for unique_sid.txt; its length is used as the number of items.
with open(os.path.join(DATA_DIR, 'unique_sid.txt'), 'r') as f:
    unique_sid = [line.strip() for line in f]

In [7]:
# Matrix dimensions come straight from the lengths of the two id lists.
n_users, n_items = len(unique_uid), len(unique_sid)

# Parenthesised single-argument form prints "<n_users> <n_items>" on one line.
print('%d %d' % (n_users, n_items))

111148 11711


In [10]:
def load_data(csv_file, shape=(n_users, n_items)):
    """Load an interaction CSV into a sparse count matrix.

    Only the 'uid' and 'sid' columns of the file are read: 'uid' supplies the
    row index and 'sid' the column index of each stored entry.

    Parameters
    ----------
    csv_file : str
        Path to a CSV file readable by ``pandas.read_csv`` that contains
        'uid' and 'sid' columns.
    shape : tuple of int, optional
        (n_rows, n_cols) of the returned matrix. Defaults to the values of
        ``(n_users, n_items)`` at the time this function is defined.

    Returns
    -------
    scipy.sparse.csr_matrix
        int16 matrix holding a 1 for every (uid, sid) row of the file.
        Repeated (uid, sid) pairs are summed by the sparse constructor, so
        such a cell would hold a count greater than 1.
    """
    tp = pd.read_csv(csv_file)
    rows, cols = np.array(tp['uid']), np.array(tp['sid'])
    return sparse.csr_matrix((np.ones_like(rows), (rows, cols)),
                             dtype=np.int16, shape=shape)

In [11]:
# Training interactions as a sparse (n_users x n_items) matrix.
train_data = load_data(
    os.path.join(DATA_DIR, 'train.csv')
)

In [12]:
# Validation interactions, same layout as the training matrix; passed to
# the factorizer to compute the per-iteration validation NDCG.
vad_data = load_data(
    os.path.join(DATA_DIR, 'validation.csv')
)

### Train the model

In [13]:
# Settings forwarded to content_wmf.factorize.
num_factors = 100   # second positional argument
num_iters = 50      # num_iters=
batch_size = 1000   # batch_size=

n_jobs = 4          # n_jobs=

# Regularization weights: lam_theta is passed as lambda_U_reg and lam_beta
# as lambda_V_reg; both use the same value.
lam_beta = 1e-5
lam_theta = lam_beta

In [14]:
# Track the best validation NDCG seen so far and the factors that produced it.
best_alpha, best_ndcg = 0, -np.inf
U_best = V_best = None

# Sweep the confidence parameter alpha; every other setting is held fixed.
for alpha in (2, 5, 10, 30, 50):
    S = content_wmf.linear_surplus_confidence_matrix(train_data, alpha=alpha)

    U, V, vad_ndcg = content_wmf.factorize(
        S,
        num_factors,
        vad_data=vad_data,
        num_iters=num_iters,
        init_std=0.01,
        lambda_U_reg=lam_theta,
        lambda_V_reg=lam_beta,
        dtype='float32',
        random_state=98765,
        verbose=True,
        recompute_factors=batched_inv_joblib.recompute_factors_batched,
        batch_size=batch_size,
        n_jobs=n_jobs,
    )

    # Only a strict improvement replaces the current best.
    if not vad_ndcg > best_ndcg:
        continue
    best_ndcg, best_alpha = vad_ndcg, alpha
    # Copy so the stored winners are separate objects from the loop's U and V.
    U_best, V_best = U.copy(), V.copy()

Precompute S^T (if necessary)
  took 0.146 seconds
Iteration 0:
	Updating user factors: time=16.11
	Updating item factors: time=3.96


  return DCG / IDCG


	Validation NDCG@k: 0.21821
Iteration 1:
	Updating user factors: time=16.00
	Updating item factors: time=4.20
	Validation NDCG@k: 0.30866
Iteration 2:
	Updating user factors: time=16.28
	Updating item factors: time=3.74
	Validation NDCG@k: 0.31885
Iteration 3:
	Updating user factors: time=16.71
	Updating item factors: time=3.98
	Validation NDCG@k: 0.32216
Iteration 4:
	Updating user factors: time=16.33
	Updating item factors: time=3.73
	Validation NDCG@k: 0.32435
Iteration 5:
	Updating user factors: time=16.85
	Updating item factors: time=3.70
	Validation NDCG@k: 0.32606
Iteration 6:
	Updating user factors: time=16.68
	Updating item factors: time=3.92
	Validation NDCG@k: 0.32745
Iteration 7:
	Updating user factors: time=16.94
	Updating item factors: time=3.98
	Validation NDCG@k: 0.32839
Iteration 8:
	Updating user factors: time=16.39
	Updating item factors: time=3.86
	Validation NDCG@k: 0.32909
Iteration 9:
	Updating user factors: time=16.71
	Updating item factors: time=3.70
	Validatio

In [15]:
# Report the alpha selected by the sweep together with its validation NDCG.
# NOTE(review): other cells use Python 2 print statements; under Python 2 this
# parenthesised form prints a tuple, which matches the recorded output below.
# Under Python 3 it would print the two values separated by a space instead.
print(best_alpha, best_ndcg)

(10, 0.35495107675688048)


In [17]:
# Held-out test interactions, loaded the same way as train and validation.
test_data = load_data(os.path.join(DATA_DIR, 'test.csv'))
# Overwrite every stored value with 1 (same shape and dtype) so the matrix
# is strictly binary.
test_data.data = np.ones(test_data.data.shape, dtype=test_data.data.dtype)

In [18]:
# alpha = 10 gives the best validation performance
# Each metric is computed from the best factors (U_best, V_best) against the
# test matrix; train_data and vad_data are passed to the evaluator as well.
print('Test Recall@20: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U_best, V_best, k=20, vad_data=vad_data))
print('Test Recall@50: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U_best, V_best, k=50, vad_data=vad_data))
print('Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k(
    train_data, test_data, U_best, V_best, k=100, vad_data=vad_data))
print('Test MAP@100: %.4f' % rec_eval.map_at_k(
    train_data, test_data, U_best, V_best, k=100, vad_data=vad_data))

  recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))


Test Recall@20: 0.1338
Test Recall@50: 0.1650
Test NDCG@100: 0.1603
Test MAP@100: 0.0474


In [19]:
# Persist the selected factors; they are stored in the archive under the
# keys 'U' and 'V'.
np.savez(
    'WMF_K100_ML20M.npz',
    U=U_best,
    V=V_best,
)

In [1]:
# Display entry 0 of U (the first value returned by content_wmf.factorize).
# NOTE(review): the recorded NameError together with execution count 1 shows
# this cell was run on a fresh kernel before any cell had assigned U; it only
# works after a training cell has run.
U[0]

NameError: name 'U' is not defined

In [20]:
# one off
# Refit with alpha = 10 only. The settings are restated here so the cell does
# not depend on the earlier configuration cell.
num_factors, num_iters, batch_size = 100, 50, 1000

n_jobs = 4
lam_beta = 1e-5
lam_theta = lam_beta

# Reset the best-so-far bookkeeping before refitting.
best_alpha, best_ndcg = 0, -np.inf
U_best = V_best = None

for alpha in (10,):
    S = content_wmf.linear_surplus_confidence_matrix(train_data, alpha=alpha)

    U, V, vad_ndcg = content_wmf.factorize(
        S,
        num_factors,
        vad_data=vad_data,
        num_iters=num_iters,
        init_std=0.01,
        lambda_U_reg=lam_theta,
        lambda_V_reg=lam_beta,
        dtype='float32',
        random_state=98765,
        verbose=True,
        recompute_factors=batched_inv_joblib.recompute_factors_batched,
        batch_size=batch_size,
        n_jobs=n_jobs,
    )

    # Only a strict improvement replaces the current best.
    if not vad_ndcg > best_ndcg:
        continue
    best_ndcg, best_alpha = vad_ndcg, alpha
    # Copy so the stored factors are separate objects from the loop's U and V.
    U_best, V_best = U.copy(), V.copy()

Precompute S^T (if necessary)
  took 0.148 seconds
Iteration 0:
	Updating user factors: time=16.41
	Updating item factors: time=3.77
	Validation NDCG@k: 0.19086
Iteration 1:
	Updating user factors: time=16.61
	Updating item factors: time=3.92
	Validation NDCG@k: 0.31008
Iteration 2:
	Updating user factors: time=16.72
	Updating item factors: time=4.24
	Validation NDCG@k: 0.32825
Iteration 3:
	Updating user factors: time=17.78
	Updating item factors: time=3.77
	Validation NDCG@k: 0.33675
Iteration 4:
	Updating user factors: time=16.91
	Updating item factors: time=4.02
	Validation NDCG@k: 0.34171
Iteration 5:
	Updating user factors: time=16.87
	Updating item factors: time=4.17
	Validation NDCG@k: 0.34491
Iteration 6:
	Updating user factors: time=16.94
	Updating item factors: time=4.32
	Validation NDCG@k: 0.34729
Iteration 7:
	Updating user factors: time=16.74
	Updating item factors: time=3.77
	Validation NDCG@k: 0.34894
Iteration 8:
	Updating user factors: time=16.79
	Updating item factor

In [None]:
# alpha = 10 gives the best validation performance
# Test metrics for whatever factors U_best / V_best currently hold.
print('Test Recall@20: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U_best, V_best, k=20, vad_data=vad_data))
print('Test Recall@50: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U_best, V_best, k=50, vad_data=vad_data))
print('Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k(
    train_data, test_data, U_best, V_best, k=100, vad_data=vad_data))
