In [1]:
import numpy as np
from scipy.linalg import lu
from scipy.sparse.linalg import svds
from numpy.linalg import svd

In [2]:
def check_spanrd(vectors, d):
    """
    Tell whether the numerical rank of `vectors` is exactly `d`.

    The matrix is first reduced with an LU factorisation (Gaussian
    elimination with pivoting); the rank is then measured on the upper
    factor with `np.linalg.matrix_rank`.

    Inputs:
        - vectors (array): matrix (N, d), one vector per row
        - d (int): dimension of the space to be spanned
    Return:
        - True if the measured rank equals d, False otherwise
    """
    # Background on the approach:
    # https://math.stackexchange.com/questions/56201/how-to-tell-if-a-set-of-vectors-spans-a-space
    # https://stackoverflow.com/questions/15638650/is-there-a-standard-solution-for-gauss-elimination-in-python
    # With permute_l=True, lu returns (P @ L, U); only U is needed here.
    upper = lu(vectors, permute_l=True)[1]
    measured_rank = int(np.linalg.matrix_rank(upper))
    return d == measured_rank

def span(vectors):
    """
    Return the dimension of the space spanned by the rows of `vectors`.

    The rank is measured the same way as in `check_spanrd` (LU
    factorisation followed by `np.linalg.matrix_rank` on the upper
    factor), but only once, instead of testing every candidate
    dimension from d down to 1 with a fresh factorisation each time.

    Inputs:
        - vectors (array): matrix (N, d), one vector per row
    Return:
        - int: numerical rank of the matrix, between 0 and min(N, d).
          A matrix of rank 0 now yields 0 (the previous loop fell
          through and implicitly returned None in that case).
    """
    # With permute_l=True, lu returns (P @ L, U); only U is needed here.
    _, upper = lu(vectors, permute_l=True)
    return int(np.linalg.matrix_rank(upper))

In [3]:
# Load every stored representation.
# `dims` lists the feature dimensions and `spns` the span value that appears
# in the corresponding file name (lastfm_d<dim>_span<span>.npz).
dims = [6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61]
spns = [2, 7, 6, 11, 16, 21, 24, 25, 29, 33, 46, 35]

files = ["lastfm_d{0}_span{1}.npz".format(i,j) for i,j in zip(dims,spns)]

# both dictionaries are keyed by the feature dimension d
features = {}
thetas = {}

for file, d in zip(files, dims):
    # np.load on an .npz returns a lazily-read archive that keeps the file
    # open; the context manager closes it once both arrays have been read.
    with np.load(file) as f:
        features[d] = f['features']
        thetas[d] = f['theta']
    print("Loaded d={}".format(d))
print()

Loaded d=6
Loaded d=11
Loaded d=16
Loaded d=21
Loaded d=26
Loaded d=31
Loaded d=36
Loaded d=41
Loaded d=46
Loaded d=51
Loaded d=56
Loaded d=61



In [5]:
# remove useless features
#
# For every representation, a truncated SVD of the flattened feature matrix
# (fmat = U diag(s) Vt) is used to build a lower-dimensional representation:
# new features are U diag(s) and the new parameter is Vt theta, so that
# features . theta is reproduced (checked at the end of each iteration).

tol = 1e-8  # threshold to consider a singular value equal to zero

# both dictionaries are keyed by (original dimension, reduced dimension)
new_features = {}
new_thetas = {}

for d in dims:

    print("Starting d={}".format(d))
    # one d-dimensional feature vector per row
    fmat = features[d].reshape(-1, d)

    U, s, Vt = svd(fmat, full_matrices=False)
    sp = np.sum(s > tol)  # number of singular values above the threshold
    print("[d={0}] span: {1}".format(d,sp))
    s = s[:sp]
    U = U[:, :sp]
    Vt = Vt[:sp, :]

    # Scale column j of U by s[j]. Broadcasting gives the same result as
    # U @ diag(s) without building the dense diagonal matrix or paying for
    # a full matrix product.
    U = U * s
    M = U.dot(Vt)  # reconstruction of fmat from the retained components
    rmse = np.sqrt(np.mean(np.abs(M - fmat) ** 2))
    print("[d={0}] Reconstruction rmse: {1}".format(d, rmse))

    idx = (d, sp)

    # create new features/parameters
    # (features[d] is indexed as a 3-D array: its first two axes are kept,
    # the last one shrinks from d to sp)
    new_features[idx] = U.reshape(features[d].shape[0], features[d].shape[1], sp)
    new_thetas[idx] = Vt.dot(thetas[d])

    # normalize parameters: theta gets unit norm and the features absorb the
    # scale, which leaves the product features . theta unchanged
    norm = np.linalg.norm(new_thetas[idx])
    new_thetas[idx] /= norm
    new_features[idx] *= norm

    # check errors: predictions before vs. after the change of representation
    old_mu = features[d].dot(thetas[d])
    new_mu = new_features[idx].dot(new_thetas[idx])
    err = np.abs(old_mu - new_mu)
    print("[d={0}] mu error: max {1} - mean {2}".format(d, np.max(err), np.mean(err)))

    del old_mu, new_mu, err

    print()

Starting d=6
[d=6] span: 2
[d=6] Reconstruction rmse: 6.632094340375261e-08
[d=6] mu error: max 9.5367431640625e-07 - mean 2.1124025195717877e-08

Starting d=11
[d=11] span: 7
[d=11] Reconstruction rmse: 2.53853556841932e-07
[d=11] mu error: max 3.814697265625e-06 - mean 6.319343270888567e-08

Starting d=16
[d=16] span: 6
[d=16] Reconstruction rmse: 1.389801127515966e-07
[d=16] mu error: max 2.384185791015625e-06 - mean 4.5735070841601555e-08

Starting d=21
[d=21] span: 11
[d=21] Reconstruction rmse: 2.692454188490956e-07
[d=21] mu error: max 1.9073486328125e-06 - mean 3.200731768515652e-08

Starting d=26
[d=26] span: 17
[d=26] Reconstruction rmse: 2.369277609659548e-07
[d=26] mu error: max 2.86102294921875e-06 - mean 4.4091358120113e-08

Starting d=31
[d=31] span: 21
[d=31] Reconstruction rmse: 2.4554216793148953e-07
[d=31] mu error: max 2.1457672119140625e-06 - mean 5.456265483871903e-08

Starting d=36
[d=36] span: 26
[d=36] Reconstruction rmse: 5.455304972201702e-07
[d=36] mu error:

In [12]:
# check prediction errors and select ground-truth representation
#
# The representation whose predictions have the lowest mean squared error
# against the standardized ratings is taken as ground truth.

# load data

data_path = "lastfmlog.npy"
ratings = np.load(data_path)
# standardize to zero mean and unit standard deviation
ratings = (ratings - np.mean(ratings)) / np.std(ratings)

d_gt = None
# Start from +inf so that a representation is always selected; with a finite
# starting value, d_gt would stay None whenever every MSE is at or above it.
min_mse = np.inf

# keys of new_features are (original dimension, reduced dimension) tuples
for d in new_features.keys():
    mu = new_features[d].dot(new_thetas[d])
    mse = np.mean(np.abs(mu - ratings)**2)
    print("{0} MSE: {1}".format(d, mse))
    if mse < min_mse:
        d_gt = d
        min_mse = mse
    del mu

print()
print("Ground truth: {0} - MSE: {1}".format(d_gt, min_mse))

(41, 26) MSE: 0.05806022679265509
(11, 7) MSE: 0.05503259350574423
(61, 40) MSE: 0.05759062640887625
(16, 6) MSE: 0.06025843054627975
(56, 48) MSE: 0.06954942736096482
(36, 26) MSE: 0.05020914764015797
(6, 2) MSE: 0.4588333044236889
(51, 35) MSE: 0.060536053410608234
(26, 17) MSE: 0.0649631016765901
(21, 11) MSE: 0.06300121264719635
(31, 21) MSE: 0.06588723283874275
(46, 30) MSE: 0.05691209225231987

Ground truth: (36, 26) - MSE: 0.05020914764015797


In [13]:
# filter gaps
#
# Keep only the contexts in which, under the ground-truth representation,
# the smallest non-zero gap to the best predicted reward exceeds `thresh`.

thresh = 0.01

# ground truth
mu_gt = np.dot(new_features[d_gt], new_thetas[d_gt])
# gap of every arm with respect to the best arm of its context
gap_gt = mu_gt.max(axis=1)[:, None] - mu_gt
# entries with zero gap (the maximizing arm and any arm exactly tied with it)
# are replaced by a large sentinel so that they are ignored by the minimum
gap_gt[gap_gt == 0] = 100
print("gap min:", np.min(gap_gt))
gap_gt = gap_gt.min(axis=1)

# indexes of contexts with minimum gap above threshold
good_contexts = gap_gt > thresh
n_contexts = np.sum(good_contexts)
print("# contexts with gap_min > {0}: {1}".format(thresh, n_contexts))

# filter: drop the discarded contexts from every representation
for d in list(new_features):
    new_features[d] = new_features[d][good_contexts]

mu_gt = mu_gt[good_contexts]

gap min: 0.00016403198
# contexts with gap_min > 0.01: 1291


In [15]:
# check misspecification
#
# Compare the rewards predicted by each representation with the ones of the
# ground-truth representation, over all (context, arm) pairs and then taking
# the smallest error inside each context.

for d in new_features.keys():
    mu = np.dot(new_features[d], new_thetas[d])
    err = np.absolute(mu - mu_gt)
    print("[d={0}] error wrt ground truth: max {1} - min {2} - mean {3}".format(d, np.max(err), err.min(), err.mean()))
    # smallest error among the arms of each context
    err_cont = err.min(axis=1)
    print("[d={0}] min error per context: max {1} - min {2} - mean {3}".format(d, np.max(err_cont), err_cont.min(), err_cont.mean()))
    del mu, err

[d=(41, 26)] error wrt ground truth: max 4.173834800720215 - min 5.960464477539063e-08 - mean 0.06733388453722
[d=(41, 26)] min error per context: max 0.0041342973709106445 - min 5.960464477539063e-08 - mean 7.758802530588582e-05
[d=(11, 7)] error wrt ground truth: max 5.3570380210876465 - min 2.9802322387695312e-08 - mean 0.06417728215456009
[d=(11, 7)] min error per context: max 0.0006009042263031006 - min 2.9802322387695312e-08 - mean 7.032357825664803e-05
[d=(61, 40)] error wrt ground truth: max 4.190573215484619 - min 1.4901161193847656e-07 - mean 0.07328637689352036
[d=(61, 40)] min error per context: max 0.0015204846858978271 - min 1.4901161193847656e-07 - mean 0.00010984537948388606
[d=(16, 6)] error wrt ground truth: max 4.845789909362793 - min 5.960464477539063e-08 - mean 0.06859587877988815
[d=(16, 6)] min error per context: max 0.005938291549682617 - min 5.960464477539063e-08 - mean 0.0001079004505299963
[d=(56, 48)] error wrt ground truth: max 4.5373969078063965 - min 2.98

In [18]:
# check span optimal arms
#
# For every representation: take the features of the arm with the highest
# predicted reward in each context, measure the dimension they span, and
# compute the smallest eigenvalue of their average outer product.

span_opt = {}

for d in new_features.keys():

    mu = new_features[d].dot(new_thetas[d])
    astar = np.argmax(mu, axis=1)
    # features of the selected arm of every context: (n_contexts, d[1])
    fstar = new_features[d][np.arange(n_contexts), astar]

    # Use the span() helper defined with check_spanrd rather than binding a
    # local called `span`, which would overwrite that function in the kernel.
    # If the helper reports no matching dimension (None), fall back to the
    # full dimension d[1], as the inline search used here previously did.
    opt_span = span(fstar)
    if opt_span is None:
        opt_span = d[1]

    span_opt[d] = opt_span

    outer = np.matmul(fstar.T, fstar) / n_contexts
    # `outer` is symmetric by construction, so the symmetric solver is used:
    # it always returns real eigenvalues, whereas the general eigvals solver
    # may return a complex array.
    lambda_hls = np.linalg.eigvalsh(outer).min()

    print("[d={0}] span optimal arms: {1} - lambda HLS: {2}".format(d, opt_span, lambda_hls))

    del mu, astar, fstar, outer

[d=(41, 26)] span optimal arms: 25 - lambda HLS: -2.672402772165017e-13
[d=(11, 7)] span optimal arms: 7 - lambda HLS: 0.0007666386081837118
[d=(61, 40)] span optimal arms: 35 - lambda HLS: -9.876667483865731e-09
[d=(16, 6)] span optimal arms: 6 - lambda HLS: 0.009543540887534618
[d=(56, 48)] span optimal arms: 45 - lambda HLS: -1.7859152649180032e-05
[d=(36, 26)] span optimal arms: 24 - lambda HLS: -4.0317371485798503e-07
[d=(6, 2)] span optimal arms: 2 - lambda HLS: 0.007481295149773359
[d=(51, 35)] span optimal arms: 33 - lambda HLS: -1.5818211117449603e-15
[d=(26, 17)] span optimal arms: 16 - lambda HLS: -2.1248512931809627e-12
[d=(21, 11)] span optimal arms: 11 - lambda HLS: 0.00011938941315747797
[d=(31, 21)] span optimal arms: 21 - lambda HLS: 0.0002496532688383013
[d=(46, 30)] span optimal arms: 29 - lambda HLS: 2.1365487910774902e-14


In [21]:
# save
#
# One compressed archive per representation, named after its reduced
# dimension d[1] and the span of its optimal arms; the ground-truth
# representation gets a "_gt" suffix.
# NOTE(review): the original dimension d[0] is not part of the file name, so
# two representations sharing both the reduced dimension and the optimal-arm
# span would overwrite each other -- confirm this cannot happen.

for d in new_features.keys():
    # compare keys by value: an identity test (`is`) on tuples only works
    # while d_gt happens to be the very same object as the dictionary key
    suffix = "_gt" if d == d_gt else ""
    np.savez_compressed('lastfm_post_d{0}_span{1}{2}.npz'.format(d[1], span_opt[d], suffix),
                        features=new_features[d], theta=new_thetas[d])