# Fit Exposure MF with exposure covariates (Location ExpoMF) to the Gowalla dataset

In [1]:
import glob
import os
# if you are using OPENBLAS, you might want to turn this option on. Otherwise, joblib might get stuck
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.sparse
import pandas as pd

In [2]:
# Directory with the pre-processed data read by the cells below
# (unique_uid.txt, unique_sid.txt, {train,vad,test}.num.csv, feat_venue_locs.tsv).
DATA_ROOT = '../../data/USERCOUNT_20000_MINSC_20'

In [3]:
# One id per line; surrounding whitespace (including the newline) is stripped.
with open(os.path.join(DATA_ROOT, 'unique_uid.txt'), 'r') as uid_file:
    unique_uid = [raw.strip() for raw in uid_file]

# Same layout for the item (sid) ids.
with open(os.path.join(DATA_ROOT, 'unique_sid.txt'), 'r') as sid_file:
    unique_sid = [raw.strip() for raw in sid_file]

In [4]:
# "songs" are the venues here (sid is the venue id); the name is kept because
# load_data uses n_songs for its default shape.
n_users = len(unique_uid)
n_songs = len(unique_sid)
print('%d %d' % (n_songs, n_users))

617 9636


### Load the data and train the model

In [5]:
def load_data(csv_file, shape=(n_users, n_songs)):
    """Read a CSV of (uid, sid, rating) rows into a sparse matrix.

    Parameters
    ----------
    csv_file : path of a CSV file with `uid`, `sid` and `rating` columns.
    shape : (n_rows, n_cols) of the returned matrix. The default is built from
        n_users and n_songs as they were when this function was defined.

    Returns
    -------
    (matrix, rows, cols) : the int16 CSR matrix built from the ratings, and the
        int32 row (uid) and column (sid) index arrays it was built from.
    """
    triplets = pd.read_csv(csv_file)
    rows = np.array(triplets['uid'], dtype=np.int32)
    cols = np.array(triplets['sid'], dtype=np.int32)
    ratings = triplets['rating']
    matrix = scipy.sparse.csr_matrix((ratings, (rows, cols)), dtype=np.int16, shape=shape)
    return matrix, rows, cols

In [6]:
train_csv = os.path.join(DATA_ROOT, 'train.num.csv')
train_data, rows, cols = load_data(train_csv)
# Binarize: every stored entry becomes 1, whatever rating it carried.
train_data.data = np.ones(train_data.data.shape, dtype=train_data.data.dtype)

In [7]:
vad_csv = os.path.join(DATA_ROOT, 'vad.num.csv')
vad_data, rows_vad, cols_vad = load_data(vad_csv)
# Binarize: every stored entry becomes 1, whatever rating it carried.
vad_data.data = np.ones(vad_data.data.shape, dtype=vad_data.data.dtype)

In [8]:
test_csv = os.path.join(DATA_ROOT, 'test.num.csv')
test_data, rows_test, cols_test = load_data(test_csv)
# Binarize: every stored entry becomes 1, whatever rating it carried.
test_data.data = np.ones(test_data.data.shape, dtype=test_data.data.dtype)

`feat_venue_locs.tsv` contains the location features (part of the [pre-processed data](http://dawenl.github.io/data/gowalla_pro.zip)), which are generated in the following way: 
- Fit a Gaussian mixture model (GMM, from [scikit-learn](http://scikit-learn.org/)) on all the venue locations.
- For each venue, take the expected cluster assignment as location features `pi`.

In [9]:
# Venue location features, read as float32 (the filtering cell below re-reads this file).
venue_feat_path = os.path.join(DATA_ROOT, 'feat_venue_locs.tsv')
pi = np.loadtxt(venue_feat_path, dtype='float32')

In [10]:
# Keep only the feature rows whose venue is listed in unique_sid; the feature
# file also contains venues that should have been filtered out (see filter_triplets).
# The file is re-read here so that this cell can be re-run on its own.
pi = np.loadtxt(os.path.join(DATA_ROOT, 'feat_venue_locs.tsv'), dtype='float32')

# The first column of pi is the sid; the remaining columns are the location features.
# NOTE(review): the sids are held as float32, which represents integers exactly
# only up to 2**24 -- confirm that every id fits.
kept_sids = set(unique_sid)  # set lookup instead of scanning the list once per row
row_mask = np.array(["%d" % sid in kept_sids for sid in pi[:, 0]], dtype=bool)

# A 1-D row mask keeps pi two-dimensional, so no reshape with a hard-coded
# column count (and no Python-2-only integer "/") is needed afterwards.
pi = pi[row_mask]

# Sanity check: row i of pi must belong to unique_sid[i]. The first mismatch,
# if any, is printed as "index expected_sid found_sid".
for i, s in enumerate(unique_sid):
    if s != "%d" % pi[i, 0]:
        print("%d %s %s" % (i, s, pi[i, 0]))
        break

In [11]:
# The first column holds the sid and is not a feature, so drop it.
# NOTE: pi is overwritten in place, so running this cell a second time drops
# another column; re-run the filtering cell above first if you need to repeat it.
pi = pi[:, 1:]

In [12]:
import gibbs3_expomf_cov

# Hyper-parameters passed to ExpoMF below.
n_components = 100
max_iter = 10
max_epoch = 10
n_jobs = 1
# A single value is shared by lam_theta, lam_beta and lam_nu.
lam = 1e-5
# here we use the best performing init_mu from per-item \mu_i experiment
init_mu = 0.01

# Passed as save_dir below; the evaluation cell later reads .npz files from it.
save_dir = "Gowalla_Location_ExpoMF_params_K%d_lam%1.0E_initmu%1.0E_maxepoch%d" % (
    n_components, lam, init_mu, max_epoch)

# An earlier configuration called expomf_cov.ExpoMF with batch_size=1000 and
# batch_sgd=10; every other argument was the same as below.
expomf_kwargs = dict(
    n_components=n_components,
    max_iter=max_iter,
    batch_size=10000,
    batch_sgd=100,
    max_epoch=max_epoch,
    init_std=0.01,
    n_jobs=n_jobs,
    random_state=98765,
    save_params=True,
    save_dir=save_dir,
    early_stopping=True,
    verbose=True,
    lam_y=1.,
    lam_theta=lam,
    lam_beta=lam,
    lam_nu=lam,
    init_mu=init_mu,
    learning_rate=.5,
)
coder = gibbs3_expomf_cov.ExpoMF(**expomf_kwargs)

In [13]:
from numpy import zeros

In [14]:
# NOTE(review): this path names a YELP run although the notebook fits Gowalla --
# confirm that it is the intended parameter file.
# NOTE(review): the recorded run of this cell stopped with
# "LinAlgError: SVD did not converge" during iteration #1.
para_dir = 'YELP_20000USERS_100_feature_restaurant_only_location_only_Numeric_Id100_lxam1E-05_initmu1E-02_maxepoch10/ExpoMF_cov_K100_mu1.0e-02_iter12.npz'
coder.fit(
    train_data,
    pi,
    para_dir,
    init_only_mu=False,
    random_mu=True,
)

Start to sample...
Iteration: #0
	Sampling exposure covariate: time=1.29
	Sampling user factors: time=55.72
	Sampling item factors: time=623.10
	Sampling location exposure covariates: time=0.99
The MSE are:515340697600000.00 and 1735649022471.28
Iteration: #1
	Sampling exposure covariate: time=1.00
	Sampling user factors: time=61.49
	Sampling item factors...

LinAlgError: SVD did not converge

It seems that after a few epochs the validation loss no longer decreases. However, we found empirically that it is still better to train for more epochs instead of stopping the SGD at that point.

## Evaluate the performance on heldout testset

In [None]:
# Number of .npz files currently present in save_dir.
n_params = len(glob.glob(os.path.join(save_dir, '*.npz')))

# Load the file whose name carries iteration 5 and pull out the four arrays
# used by the evaluation below.
snapshot_name = 'ExpoMF_cov_K%d_mu%.1e_iter%d.npz' % (n_components, init_mu, 5)
params = np.load(os.path.join(save_dir, snapshot_name))
U, V, nu, alpha = (params[key] for key in ('U', 'V', 'nu', 'alpha'))

### Rank by $\mathbb{E}[y_{ui}] = \mu_{ui}\theta_u^\top\beta_i$

In [None]:
import rec_eval

# Passed to every metric as `mu`: the get_mu function and the arrays it is given.
mu = {'params': [nu, pi, alpha], 'func': gibbs3_expomf_cov.get_mu}
# Keyword arguments common to all four metric calls.
eval_kwargs = dict(mu=mu, vad_data=vad_data)

recall_20 = rec_eval.recall_at_k(train_data, test_data, U, V, k=20, **eval_kwargs)
print('Test Recall@20: %.4f' % recall_20)

recall_50 = rec_eval.recall_at_k(train_data, test_data, U, V, k=50, **eval_kwargs)
print('Test Recall@50: %.4f' % recall_50)

ndcg_100 = rec_eval.normalized_dcg_at_k(train_data, test_data, U, V, k=100, **eval_kwargs)
print('Test NDCG@100: %.4f' % ndcg_100)

map_100 = rec_eval.map_at_k(train_data, test_data, U, V, k=100, **eval_kwargs)
print('Test MAP@100: %.4f' % map_100)