In [1]:
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from scipy.linalg import lu

In [2]:
def check_spanrd(vectors, d):
    """
    Tell whether the rows of `vectors` span R^d.

    Inputs:
        - vectors (array): matrix (N, d), one vector per row
        - d (int): dimension of the space to be spanned
    Return:
        - True when the numerical rank equals d, False otherwise
    """
    # Gaussian elimination through a pivoted LU factorisation; the rank is read
    # off the upper factor only, so the permuted lower factor is discarded.
    # https://math.stackexchange.com/questions/56201/how-to-tell-if-a-set-of-vectors-spans-a-space
    # https://stackoverflow.com/questions/15638650/is-there-a-standard-solution-for-gauss-elimination-in-python
    upper = lu(vectors, permute_l=True)[1]
    return int(np.linalg.matrix_rank(upper)) == d

In [3]:
# Load the ratings matrix from disk and rescale it

data_path = "jester_data_40jokes_19181users.npy"
ratings = np.load(data_path)
print("Loaded dataset: {}".format(ratings.shape))

# divide every rating by 10 (normalisation), then record the matrix dimensions
ratings = ratings / 10
n_users, n_items = ratings.shape
print("ratings: max {0} - min {1}".format(np.max(ratings), np.min(ratings)))

Loaded dataset: (19181, 40)
ratings: max 0.9710000000000001 - min -0.9949999999999999


In [4]:
# Truncated SVD of the ratings matrix (K largest singular triplets)

K = 36
U, s, Vt = svds(ratings, k=K)
s = np.diag(s)  # singular values as a (K, K) diagonal matrix
U = U @ s       # fold the singular values into the left factors

# Reconstruction error of the rank-K approximation (RMSE and worst-case entry)
Yhat = np.dot(U, Vt)
rmse = np.sqrt((np.abs(Yhat - ratings) ** 2).mean())
print("K: ", K)
print("RMSE:", rmse)
print("MAX_ERR:", np.max(np.abs(Yhat - ratings)))

K:  36
RMSE: 0.09934649260590526
MAX_ERR: 1.4339307776635835


In [5]:
def save_model(net, normalize=False):
    """
    Export the last-layer linear representation of a fitted network.

    The network output is rewritten as <phi(user, item), theta>, where phi is
    the last hidden-layer activation with a constant 1 appended (bias feature)
    and theta stacks the output-layer weights and the output intercept. Several
    diagnostics are printed and (phi, theta) is written to a compressed .npz
    file in the current working directory.

    Reads the notebook globals X (network inputs, one row per (user, item)
    pair in user-major order), y (flattened ratings), ratings, n_users,
    n_items and the helper check_spanrd.

    Inputs:
        - net (MLPRegressor): fitted network; its private `_forward_pass`
          method is used to obtain the hidden activations
        - normalize (bool): if True, every feature vector and theta are scaled
          to unit Euclidean norm (mu is then no longer the network output)
    Return:
        - mu (array): matrix (n_users, n_items) with entries phi.dot(theta)
    """

    # Build features
    X_pred = X

    hidden_layer_sizes = list(net.hidden_layer_sizes)

    # forward pass that keeps the activations of every layer
    layer_units = [X_pred.shape[1]] + hidden_layer_sizes + [1]
    activations = [X_pred]
    for i in range(net.n_layers_ - 1):
        activations.append(np.empty((X_pred.shape[0], layer_units[i + 1])))

    net._forward_pass(activations)
    y_pred = activations[-1]
    # always measured against the global `y`, whatever targets `net` was fitted on
    print("MSE (original):", np.mean((y_pred.flatten() - y) ** 2))

    # get weights: output-layer coefficients with the intercept as last row
    last_w = net.coefs_[-1]
    bias = np.array(net.intercepts_[-1]).reshape((1, 1))
    last_w = np.concatenate([last_w, bias])

    # get last-layer features, with a column of ones matching the intercept row
    last_feat = np.array(activations[-2], dtype=np.float32)
    last_feat = np.concatenate([last_feat, np.ones((X_pred.shape[0], 1))], axis=1)

    # get prediction
    pred = last_feat.dot(last_w)
    print("MSE (recomputed with last layer only):", np.mean((pred.flatten() - y) ** 2))

    # get feature matrix
    d = hidden_layer_sizes[-1] + 1
    print("d={0}".format(d))
    # rows must enumerate the (user, item) pairs in user-major order
    assert last_feat.shape == (n_users * n_items, d)
    feats = last_feat
    if normalize:
        # a row can never be all zeros: the appended bias feature is 1
        feats = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    phi = feats.reshape(n_users, n_items, d).astype(np.float32)

    # get param
    theta = np.array(last_w, dtype=np.float32).squeeze()
    if normalize:
        theta = theta / np.linalg.norm(theta)

    phi_norm = round(np.linalg.norm(phi, axis=2).max(), 2)
    print("phi max norm:", phi_norm)
    theta_norm = round(np.linalg.norm(theta), 2)
    print("theta norm:", theta_norm)

    # check predictions
    mu = phi.dot(theta)
    print("MSE (mu):", np.mean(np.abs(ratings - mu).flatten()**2))
    print("mu: max {0} - min {1}".format(mu.max(), mu.min()))
    gap = np.max(mu, axis=1)[:, np.newaxis] - mu
    print("gap max:", gap.max())
    # mask the zero gaps (the best item of each user) so the minimum below is
    # taken over the remaining items only
    gap[gap == 0] = 100
    print("gap min:", gap.min())
    gap = np.min(gap, axis=1)
    print("# contexts with gap_min > 0.001:", np.sum(gap > 0.001))
    print("# contexts with gap_min > 0.01:", np.sum(gap > 0.01))
    print("# contexts with gap_min > 0.1:", np.sum(gap > 0.1))

    # check span: features of the best item of every user
    astar = np.argmax(mu, axis=1)
    fstar = phi[np.arange(n_users), astar]

    # largest k <= d such that the optimal features have rank k; 0 when none
    # matches (previously this case was silently reported as full span d)
    span = next((k for k in range(d, 0, -1) if check_spanrd(fstar, k)), 0)

    # NOTE(review): the WARNING prefix is printed when the optimal features span
    # the whole R^d -- confirm that this is the intended condition.
    print("{0}Spanning R^{1}".format("WARNING: " if span == d else "", span))

    # compute lambda HLS: smallest eigenvalue of the averaged Gram matrix of the
    # optimal features

    outer = np.matmul(fstar.T, fstar) / n_users
    # `outer` is symmetric, so use the symmetric solver (always real-valued);
    # it is also positive semi-definite, so a negative result can only be
    # rounding noise and is clamped to zero
    lambda_hls = max(np.linalg.eigvalsh(outer).min(), 0.0)
    print("lambda HLS:", lambda_hls)

    # save
    np.savez_compressed('jester_d{0}_span{1}_L{2:.2f}_S{3:.2f}_hls{4:.5f}.npz'.format(d,span,phi_norm,theta_norm, lambda_hls), 
                        features=phi, theta=theta)

    return mu

In [None]:
# fit large "ground-truth" network

hidden = 32
test_size = 0.25
seed = 0  # fixed so that the split and the network initialisations are reproducible

# One input row per (user, item) pair, in user-major order: [U[t], Vt[:, z]].
# Filled by broadcasting instead of a Python loop over all n_users * n_items pairs.
n_factors = U.shape[1]
X = np.empty((n_users, n_items, n_factors + Vt.shape[0]), dtype=np.result_type(U, Vt))
X[:, :, :n_factors] = U[:, np.newaxis, :]
X[:, :, n_factors:] = Vt.T[np.newaxis, :, :]
X = X.reshape(n_users * n_items, -1)
y = ratings.flatten()  # same user-major order as the rows of X

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)
print("Training NN -- Size {0}".format((hidden, hidden)))
net = MLPRegressor(
    hidden_layer_sizes=(hidden, hidden), max_iter=500, verbose=True, random_state=seed
).fit(X_train, y_train)
print("R^2:", net.score(X_test, y_test))
print()
print("Saving model...")
mu = save_model(net)
print()

# fit smaller networks

ds = [30, 28, 26, 24]
nets = {}

# redefine targets based on the outputs of the larger network (the input is still X)
y_mu = mu.flatten()

# free the large intermediates before training the smaller networks
del mu, net

# The smaller networks are fitted and scored on all (user, item) pairs: there is
# no held-out split here, so the reported R^2 is an in-sample value.
for j in ds:
    print("Training NN -- Size {0}".format((hidden, j)))
    nets[j] = MLPRegressor(
        hidden_layer_sizes=(hidden, j), max_iter=500, tol=1e-6, verbose=True, random_state=seed
    ).fit(X, y_mu)
    print("R^2 (size {0}): {1}".format(j, nets[j].score(X, y_mu)))
    print()
    print("Saving model...")
    save_model(nets[j])
    print()
    nets[j] = None  # drop the fitted model, only the saved file is kept

Training NN -- Size (32, 32)
Iteration 1, loss = 0.05848277
Iteration 2, loss = 0.02352317
Iteration 3, loss = 0.01624333
Iteration 4, loss = 0.01466286
Iteration 5, loss = 0.01404582
Iteration 6, loss = 0.01369594
Iteration 7, loss = 0.01347050
Iteration 8, loss = 0.01326524
Iteration 9, loss = 0.01317906
Iteration 10, loss = 0.01305848
Iteration 11, loss = 0.01298343
Iteration 12, loss = 0.01290867
Iteration 13, loss = 0.01284185
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
R^2: 0.9018798276644511

Saving model...
MSE (original): 0.025166534201624412
MSE (recomputed with last layer only): 0.025166534201903564
d=33
phi max norm: 4.21
theta norm: 3.79
MSE (mu): 0.02516653480510046
mu: max 1.783227562904358 - min -1.7974305152893066
gap max: 2.9464345
gap min: 1.5974045e-05
# contexts with gap_min > 0.001: 18964
# contexts with gap_min > 0.01: 17042
# contexts with gap_min > 0.1: 5603
Spanning R^32
lambda HLS: 0.0

Training NN -- Size (32, 3