In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

n_users, n_movies = 20, 10

# Seed torch's global RNG so the ratings matrix and every later random draw
# (factor initialisation, NaN placement) are reproducible on a fresh kernel.
torch.manual_seed(0)

# A is a matrix of size (n_users, n_movies) of integer-valued ratings drawn
# uniformly from 1 to 5 (the upper bound 6 is exclusive), stored as floats
A = torch.randint(1, 6, (n_users, n_movies), dtype=torch.float)
A

tensor([[2., 1., 3., 2., 1., 5., 2., 1., 3., 1.],
        [2., 5., 2., 2., 1., 2., 4., 3., 4., 1.],
        [5., 3., 5., 1., 1., 5., 4., 1., 3., 1.],
        [4., 2., 5., 1., 3., 2., 4., 2., 3., 3.],
        [3., 1., 3., 2., 2., 3., 5., 3., 4., 3.],
        [4., 3., 5., 4., 3., 5., 1., 3., 1., 3.],
        [1., 4., 5., 3., 3., 2., 2., 2., 2., 5.],
        [5., 4., 4., 5., 3., 5., 2., 4., 3., 4.],
        [1., 5., 1., 5., 3., 4., 4., 4., 2., 5.],
        [4., 4., 3., 5., 5., 4., 2., 4., 2., 4.],
        [5., 3., 4., 3., 1., 3., 1., 2., 1., 1.],
        [4., 3., 5., 5., 5., 2., 3., 4., 5., 1.],
        [4., 4., 5., 5., 3., 3., 4., 3., 2., 3.],
        [1., 1., 4., 2., 5., 2., 1., 5., 1., 1.],
        [3., 1., 3., 2., 2., 5., 3., 1., 3., 3.],
        [5., 5., 4., 4., 4., 3., 3., 5., 3., 1.],
        [4., 4., 2., 4., 1., 5., 1., 2., 3., 3.],
        [5., 3., 5., 4., 1., 4., 4., 4., 4., 5.],
        [1., 3., 4., 1., 5., 1., 3., 4., 2., 4.],
        [4., 2., 5., 5., 4., 3., 1., 3., 4., 2.]])

In [3]:
# Sanity check: A has one row per user and one column per movie.
A.shape

torch.Size([20, 10])

In [4]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Rank-r factors drawn from a standard normal: W is (n_users, r), H is (r, n_movies).
r = 3
W = torch.randn((n_users, r), device=device, requires_grad=True)
H = torch.randn((r, n_movies), device=device, requires_grad=True)

A = A.to(device)

# Frobenius norm of the reconstruction error for the untrained factors;
# no autograd graph is needed just to report it.
with torch.no_grad():
    loss = (W @ H - A).norm()
    print(loss)

tensor(50.4185)


In [5]:
# First rows of the reconstruction W @ H produced by the untrained factors.
pd.DataFrame((W @ H).detach().cpu().numpy()).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.025517,-0.563092,-0.025753,-0.829404,0.513992,-0.248029,-0.091841,-0.199761,0.472571,-0.094532
1,-5.458479,0.149226,-1.662141,-3.959543,3.646095,0.308745,-3.372631,-3.370413,0.984193,1.541942
2,-0.421752,-0.07571,0.267425,1.09072,-0.775943,0.673958,-0.334709,-0.214609,-0.921345,1.143325
3,0.993805,-0.958298,0.312328,-0.467862,0.044746,-0.400146,0.482238,0.317088,0.487932,-0.34596
4,4.433204,0.87832,1.217203,4.024843,-3.373741,-0.092877,2.89976,3.032882,-1.187196,-1.501388


In [6]:
# First rows of the true ratings, for comparison with the reconstruction.
pd.DataFrame(A.detach().cpu().numpy()).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.0,1.0,3.0,2.0,1.0,5.0,2.0,1.0,3.0,1.0
1,2.0,5.0,2.0,2.0,1.0,2.0,4.0,3.0,4.0,1.0
2,5.0,3.0,5.0,1.0,1.0,5.0,4.0,1.0,3.0,1.0
3,4.0,2.0,5.0,1.0,3.0,2.0,4.0,2.0,3.0,3.0
4,3.0,1.0,3.0,2.0,2.0,3.0,5.0,3.0,4.0,3.0


In [7]:
# Adam updates both factor matrices jointly.
optimizer = optim.Adam(params=[W, H], lr=0.01)

# Fit W and H by minimising the Frobenius norm of the reconstruction error.
for i in range(600):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass: reconstruction error of the current factors
    loss = (W @ H - A).norm()

    # Backward pass, then the parameter update
    loss.backward()
    optimizer.step()

    # Report the loss (as computed before this step's update) every 10th step
    if i % 10 == 0:
        print(i, loss.item())

0 50.41853713989258
10 48.356048583984375
20 46.19203186035156
30 43.69348907470703
40 40.736228942871094
50 37.35312271118164
60 33.741363525390625
70 30.15149688720703
80 26.817447662353516
90 23.925443649291992
100 21.544315338134766
110 19.651371002197266
120 18.222267150878906
130 17.21449089050293
140 16.540056228637695
150 16.09402084350586
160 15.785305976867676
170 15.553569793701172
180 15.364673614501953
190 15.19947338104248
200 15.045867919921875
210 14.895147323608398
220 14.741188049316406
230 14.580561637878418
240 14.412769317626953
250 14.240296363830566
260 14.068126678466797
270 13.90241813659668
280 13.748640060424805
290 13.610075950622559
300 13.487528800964355
310 13.380098342895508
320 13.28618049621582
330 13.204069137573242
340 13.132193565368652
350 13.0691556930542
360 13.013742446899414
370 12.964905738830566
380 12.9217529296875
390 12.88353157043457
400 12.849600791931152
410 12.81942081451416
420 12.792531967163086
430 12.768539428710938
440 12.74710559

In [10]:
# First two rows of the reconstruction after training.
pd.DataFrame((W @ H).detach().cpu().numpy()).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3.575108,1.702532,2.909967,1.962098,0.512695,3.499099,2.060418,0.994849,2.341826,1.372867
1,2.551795,2.15423,3.68168,1.887276,2.698015,1.948732,3.099001,2.675164,2.8122,1.965179


In [9]:
# Matching two rows of the true ratings.
pd.DataFrame(A.cpu().numpy()).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.0,1.0,3.0,2.0,1.0,5.0,2.0,1.0,3.0,1.0
1,2.0,5.0,2.0,2.0,1.0,2.0,4.0,3.0,4.0,1.0


In [11]:
def factorize(A, k, device=torch.device("cpu"), n_iters=1000, lr=0.01):
    """Factorize the fully observed matrix A into W and H with Adam.

    W and H are initialised from a standard normal and fitted by minimising
    the Frobenius norm of (W @ H - A).

    A: input matrix of size (n_users, n_movies); the norm is taken over every
       entry, so a NaN anywhere in A makes the loss NaN
    k: number of latent features
    device: torch device on which A, W and H are placed
    n_iters: number of optimisation steps (must be at least 1)
    lr: learning rate passed to Adam

    Returns W, H and loss
    W: matrix of size (n_users, k)
    H: matrix of size (k, n_movies)
    loss: scalar tensor with the loss computed in the last iteration, i.e.
          before that iteration's parameter update

    Raises ValueError if n_iters is smaller than 1.
    """
    if n_iters < 1:
        # Without at least one iteration there is no loss to return.
        raise ValueError("n_iters must be at least 1")

    A = A.to(device)
    # Randomly initialize W and H
    W = torch.randn(A.shape[0], k, requires_grad=True, device=device)
    H = torch.randn(k, A.shape[1], requires_grad=True, device=device)

    # Optimizer
    optimizer = optim.Adam([W, H], lr=lr)

    # Train the model
    for _ in range(n_iters):
        # Compute the loss
        loss = torch.norm(torch.mm(W, H) - A)

        # Zero the gradients
        optimizer.zero_grad()

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()

    return W, H, loss

In [12]:
# Sweep the number of latent features and print the final training loss for each.
# W, H, loss and k from the last pass (k = 9) remain bound after the loop.
for k in (1, 2, 3, 4, 5, 6, 9):
    W, H, loss = factorize(A, k=k, device=device)
    print(k, loss.item())

1 17.237581253051758
2 14.64045524597168
3 12.626526832580566
4 10.090561866760254
5 8.068482398986816
6 6.445639610290527
9 2.5516481399536133


In [13]:
# First two rows reconstructed by the last factorization of the sweep.
pd.DataFrame((W @ H).detach().cpu().numpy()).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.246811,1.106579,2.772262,1.764285,1.264044,4.958205,1.843984,0.829514,3.163038,1.138836
1,2.047336,5.023692,1.958394,1.962901,1.051947,1.987499,3.977806,2.955147,4.026937,1.025083


In [14]:
# Matching two rows of the true ratings.
pd.DataFrame(A.cpu().numpy()).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.0,1.0,3.0,2.0,1.0,5.0,2.0,1.0,3.0,1.0
1,2.0,5.0,2.0,2.0,1.0,2.0,4.0,3.0,4.0,1.0


In [15]:
# With missing values
#
# Draw a fresh ratings matrix, then overwrite each entry with NaN
# independently with probability 0.5. This rebinds A, replacing the
# dense matrix used in the cells above.
A = torch.randint(low=1, high=6, size=(n_users, n_movies), dtype=torch.float)
A.masked_fill_(torch.rand(A.shape) < 0.5, float('nan'))
A

tensor([[nan, 5., nan, 5., 5., 4., nan, 2., nan, 5.],
        [4., nan, 1., 5., nan, 3., 1., 4., 5., 3.],
        [nan, 5., nan, nan, 1., nan, 4., nan, nan, 1.],
        [nan, 5., 1., 3., 4., 5., 3., 3., nan, 2.],
        [nan, nan, nan, nan, 1., nan, nan, nan, 1., 4.],
        [nan, 1., nan, 5., nan, 3., 3., nan, nan, 1.],
        [nan, 2., 1., 2., 2., nan, 5., nan, nan, 1.],
        [nan, nan, nan, 1., nan, 4., 5., 4., nan, 3.],
        [nan, nan, 1., nan, nan, nan, 4., 5., 5., nan],
        [3., nan, nan, 4., 1., nan, 4., 5., 5., 3.],
        [5., 4., nan, nan, nan, nan, 4., nan, 4., nan],
        [1., nan, nan, 3., nan, nan, 4., 5., 4., 5.],
        [nan, 3., nan, 5., nan, 4., nan, nan, nan, nan],
        [nan, nan, nan, 2., 5., 4., 5., 4., 1., nan],
        [3., 4., 2., nan, 3., 2., 1., nan, nan, 4.],
        [nan, 5., 3., 4., 5., 5., 5., nan, 3., 1.],
        [nan, nan, nan, 4., 5., 1., 2., nan, nan, 4.],
        [nan, nan, 3., nan, 2., nan, nan, 1., 2., nan],
        [2., nan, 5

In [16]:
# Fit the NaN-containing matrix with the factorize defined above: its norm runs
# over every entry, NaN included, so the returned loss is NaN.
W, H, loss = factorize(A, 2, device=device)
loss

tensor(nan, grad_fn=<LinalgVectorNormBackward0>)

In [17]:
# The NaN replacement leaves the shape unchanged.
A.shape

torch.Size([20, 10])

In [18]:
# Boolean mask of observed entries: True where A holds a rating, False where it is NaN.
mask = torch.isnan(A).logical_not()
mask

tensor([[False,  True, False,  True,  True,  True, False,  True, False,  True],
        [ True, False,  True,  True, False,  True,  True,  True,  True,  True],
        [False,  True, False, False,  True, False,  True, False, False,  True],
        [False,  True,  True,  True,  True,  True,  True,  True, False,  True],
        [False, False, False, False,  True, False, False, False,  True,  True],
        [False,  True, False,  True, False,  True,  True, False, False,  True],
        [False,  True,  True,  True,  True, False,  True, False, False,  True],
        [False, False, False,  True, False,  True,  True,  True, False,  True],
        [False, False,  True, False, False, False,  True,  True,  True, False],
        [ True, False, False,  True,  True, False,  True,  True,  True,  True],
        [ True,  True, False, False, False, False,  True, False,  True, False],
        [ True, False, False,  True, False, False,  True,  True,  True,  True],
        [False,  True, False,  True, Fal

In [19]:
# Number of observed (non-NaN) entries.
mask.sum()

tensor(110)

In [20]:
# Number of latent features for this demo. It is set explicitly so the cell does
# not rely on a loop variable leaked from the earlier rank sweep (whose last
# value was 9).
k = 9
W = torch.randn(A.shape[0], k, requires_grad=True, device=device)
H = torch.randn(k, A.shape[1],  requires_grad=True, device=device)

# Residual of the random reconstruction; NaN entries of A propagate into it.
diff_matrix = torch.mm(W, H)-A.to(device)
diff_matrix.shape

torch.Size([20, 10])

In [21]:
# Inspect the residual: it is NaN exactly where A is NaN.
diff_matrix

tensor([[     nan,  -7.8159,      nan,  -9.2992,  -2.0465,  -3.0997,      nan,
          -3.7238,      nan,  -0.3907],
        [ -4.9107,      nan,   1.1464,  -7.3782,      nan,  -2.9703,  -6.8321,
          -4.4837,  -5.0601,  -4.0388],
        [     nan,  -5.6427,      nan,      nan,   0.4110,      nan,  -1.2884,
              nan,      nan,  -3.2478],
        [     nan,  -6.7050,  -1.3138,  -4.4632,  -2.2068,  -5.2276,  -1.3345,
          -6.3128,      nan,   1.6927],
        [     nan,      nan,      nan,      nan,  -2.9860,      nan,      nan,
              nan,  -0.7004,  -4.3378],
        [     nan,   0.6152,      nan,  -3.9136,      nan,  -4.0817,   0.2097,
              nan,      nan,  -2.7844],
        [     nan,   0.8705,  -3.6554,   2.8964,  -2.2121,      nan,  -5.0491,
              nan,      nan,  -0.3602],
        [     nan,      nan,      nan,  -0.4069,      nan,  -6.1057,  -4.9046,
          -2.6270,      nan,  -0.1867],
        [     nan,      nan,  -0.5713,      nan,

In [22]:
# Boolean indexing keeps only the observed entries, flattened to 1-D:
# one element per True in mask.
diff_matrix[mask].shape

torch.Size([110])

In [23]:
# Modify the loss function to ignore NaN values

def factorize(A, k, device=torch.device("cpu"), n_iters=1000, lr=0.01):
    """Factorize the partially observed matrix A into W and H with Adam.

    NaN entries of A are treated as missing: the loss is the norm of
    (W @ H - A) restricted to the non-NaN entries of A, so missing entries
    do not contribute to the loss or to the gradients.

    A: input matrix of size (n_users, n_movies), NaN where unobserved
    k: number of latent features
    device: torch device on which A, W and H are placed
    n_iters: number of optimisation steps (must be at least 1)
    lr: learning rate passed to Adam

    Returns W, H and loss
    W: matrix of size (n_users, k)
    H: matrix of size (k, n_movies)
    loss: scalar tensor with the masked loss computed in the last iteration,
          i.e. before that iteration's parameter update

    Raises ValueError if n_iters is smaller than 1.
    """
    if n_iters < 1:
        # Without at least one iteration there is no loss to return.
        raise ValueError("n_iters must be at least 1")

    A = A.to(device)
    # Randomly initialize W and H
    W = torch.randn(A.shape[0], k, requires_grad=True, device=device)
    H = torch.randn(k, A.shape[1], requires_grad=True, device=device)
    # Optimizer
    optimizer = optim.Adam([W, H], lr=lr)
    # True where A holds an observed value; computed after the move so that
    # the mask lives on the same device as A.
    mask = ~torch.isnan(A)
    # The observed targets never change, so select them once outside the loop.
    observed = A[mask]
    # Train the model
    for _ in range(n_iters):
        # Compute the loss on the observed entries only
        diff_vector = torch.mm(W, H)[mask] - observed
        loss = torch.norm(diff_vector)

        # Zero the gradients
        optimizer.zero_grad()

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()

    return W, H, loss

In [24]:
# Refit with the masked loss using 5 latent features; the loss is now finite.
W, H, loss = factorize(A, 5, device=device)
loss

tensor(0.9144, grad_fn=<LinalgVectorNormBackward0>)

In [25]:
# Full reconstruction. Entries that are NaN in A were excluded from the loss,
# so the values predicted there are unconstrained by the fit.
W @ H

tensor([[  2.5415,   5.0024,  18.7974,   5.0142,   4.9969,   3.9942,   9.7405,
           2.0035,   0.8238,   4.9845],
        [  4.0599,   6.7069,   0.9981,   4.9338,   8.0675,   3.0101,   0.9941,
           4.0033,   4.9817,   3.0420],
        [  6.7136,   5.0008, -15.3943,   5.3809,   0.9999,   6.1654,   3.9998,
          -4.1563,   6.7969,   1.0010],
        [  3.3376,   4.8514,   1.0618,   3.4958,   3.9746,   4.9690,   2.9209,
           3.0104,   4.5712,   1.6154],
        [  4.0478,   1.0344, -15.9771,   4.9748,   1.0000,  -5.9608,  -1.8545,
          -7.5956,   1.0004,   3.9999],
        [  6.7118,   1.0085, -26.4594,   4.9814,  -5.9511,   2.9992,   3.0036,
          -9.6801,   5.7766,   1.0123],
        [  2.1379,   1.9227,   1.0254,   2.1982,   1.9975,   1.2639,   4.9633,
          -5.3452,  -0.7114,   0.8575],
        [  0.2025,  -5.4665,  -1.9851,   0.9977, -17.1253,   4.0034,   4.9972,
           3.9981,   4.3034,   3.0048],
        [  1.5030,   0.5447,   0.9971,   1.2707,

In [27]:
# First two rows of the reconstruction from the masked fit.
pd.DataFrame((W @ H).detach().cpu().numpy()).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2.541451,5.002381,18.797421,5.014181,4.996908,3.994238,9.740495,2.003469,0.823797,4.984483
1,4.059857,6.706933,0.998094,4.933802,8.067493,3.010138,0.994078,4.003255,4.981701,3.042045


In [28]:
# Matching two rows of the ratings; missing entries show up as NaN.
pd.DataFrame(A.cpu().numpy()).head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,5.0,,5.0,5.0,4.0,,2.0,,5.0
1,4.0,,1.0,5.0,,3.0,1.0,4.0,5.0,3.0
