In [1]:
from random import random
import math
import numpy as np
import copy

In [2]:
def loadMovieLens(path='./data/movielens'):
    #Get movie titles
    movies={}
    rev_movies={}
    for idx,line in enumerate(open(path+'/u.item')):
        idx,title=line.split('|')[0:2]
        movies[idx]=title
        rev_movies[title]=idx

    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        
    return prefs,rev_movies

In [3]:
data,movies = loadMovieLens("data/ml-100k")

In [4]:
data['3']

{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Lia

In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)

In [6]:
# splitting while avoiding to reduce the dataset too much
def split_train_test(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            #print(data[u][movie])
            if (random()<percent_test):
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test

In [7]:
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        for movie in data[u]:
            if (random()<percent_test):
                try:
                    test[movie][u]=data[u][movie]
                except KeyError:
                    test.setdefault(movie,{})
                    test[movie][u]=data[u][movie]
            else:
                try:
                    train[movie][u]=data[u][movie]
                except KeyError:
                    train.setdefault(movie,{})
                    train[movie][u]=data[u][movie]
    return train, test

In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)

In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)

In [10]:
def deleteUnseenInTest(train,test):
    for k in test.keys():
        try:
            train[k]
        except KeyError:
            test.pop(k,None)

In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)

In [12]:
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)

---

## Baseline: mean by user

In [13]:
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
        
    def predict(self,users):
        return [self.users[u] for u in users]

In [14]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:,0])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArray[:,2], float)) ** 2).mean())

Mean Error 1.065301


In [15]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
        
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except:
                res.append(3)
        return res

In [16]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:,1])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArrayTest[:,2], float)) ** 2).mean())

Mean Error 1.004342


In [17]:
m_test['Adventures of Pinocchio, The (1996)']

{'109': 4.0,
 '125': 4.0,
 '274': 4.0,
 '592': 2.0,
 '648': 2.0,
 '83': 3.0,
 '887': 5.0,
 '911': 4.0,
 '919': 3.0}

In [18]:
rawArray[:5]

array([['344', 'Birdcage, The (1996)', '4.0'],
       ['344', 'Enchanted April (1991)', '4.0'],
       ['344', 'Diabolique (1996)', '2.0'],
       ['344', 'Face/Off (1997)', '4.0'],
       ['344', 'My Fellow Americans (1996)', '3.0']], 
      dtype='|S81')

In [19]:
len(m_train['Birdcage, The (1996)'])

230

---

In [20]:
print(len(movies))
print(len(data.keys()))

1664
943


In [21]:
movies["Adventures of Pinocchio, The (1996)"]

'1060'

In [22]:
rawMatrix = np.zeros((len(data.keys())+1,1682+1))
for u in data:
    for m in data[u]:
        rawMatrix[int(u)][int(movies[m])] = data[u][m]

In [23]:
np.shape(rawMatrix)

(944, 1683)

In [24]:
train["1"]["101 Dalmatians (1996)"]

2.0

In [25]:
rawMatrixTrain = np.zeros((len(data.keys())+1,1682+1))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u)][int(movies[m])] = train[u][m]
        
rawMatrixTest = np.zeros((len(data.keys())+1,1682+1))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u)][int(movies[m])] = test[u][m]

In [26]:
rawMatrixTrain[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  0.,  3.,  3.,  0.,  0.,  1.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.]])

In [27]:
rawMatrixTest[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  4.,  0.,  0.,  5.,  4.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [28]:
np.shape(rawMatrixTest)

(944, 1683)

In [30]:
import numpy as np
from scipy import linalg
from numpy import dot

def nmf(X, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6, eps = 1e-5):
    """
    Decompose X to A*Y
    """
    eps = 1e-5
    print 'Starting NMF decomposition with {} latent features and {} iterations.'.format(latent_features, max_iter)
    #X = X.toarray()  # I am passing in a scipy sparse matrix

    # mask
    mask = np.sign(X)

    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)

    masked_X = mask * X
    X_est_prev = dot(A, Y)
    for i in range(1, max_iter + 1):
        # updates
        top = dot(masked_X, Y.T)
        bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        A = np.maximum(A, eps)
        # print 'A',  np.round(A, 2)

        top = dot(A.T, masked_X)
        bottom = dot(A.T, mask * dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # print 'Y', np.round(Y, 2)


        # evaluation
        if i % 200 == 0 or i == 1 or i == max_iter:
            print 'Iteration {}:'.format(i),
            X_est = dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est

            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print 'fit residual', np.round(fit_residual, 4),
            print 'total residual', np.round(curRes, 4)
            if curRes < error_limit or fit_residual < fit_error_limit:
                break

    return A, Y

In [170]:
cpr = copy.deepcopy(rawMatrixTrain)

In [118]:
cpr[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  0.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.,  0.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.]])

In [119]:
t1 = np.array([[7,1],
               [1,1]])
t2 = np.array([[1,2],
               [3,4]])

In [120]:
t1.dot(t2)

array([[10, 18],
       [ 4,  6]])

In [121]:
(t1*t2).sum()

16

In [122]:
cpr[1,1]=0

In [171]:
%%time
A,Y = nmf(cpr,100,max_iter=4000)

Starting NMF decomposition with 100 latent features and 4000 iterations.
Iteration 1: fit residual 341.2397 total residual 251.295


KeyboardInterrupt: 

In [131]:
resMatrix = A.dot(Y)

In [132]:
resMatrix[1,1]

3.8662150994784032

In [29]:
class evalMF:
    def __init__(self,resMatrix,dicU,dicI):
        self.resMatrix=resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
        
    def predict(self,user,movie):
        return self.resMatrix[int(user)][int(self.dicI[movie])]

In [134]:
mf= evalMF(resMatrix,data,movies)

In [139]:
data["200"]

{'101 Dalmatians (1996)': 4.0,
 '20,000 Leagues Under the Sea (1954)': 4.0,
 '2001: A Space Odyssey (1968)': 4.0,
 'Absolute Power (1997)': 3.0,
 'Adventures of Pinocchio, The (1996)': 3.0,
 'Aladdin (1992)': 5.0,
 'Alice in Wonderland (1951)': 5.0,
 'Alien (1979)': 5.0,
 'Alien 3 (1992)': 4.0,
 'Aliens (1986)': 5.0,
 'All Dogs Go to Heaven 2 (1996)': 2.0,
 'Amadeus (1984)': 5.0,
 'American President, The (1995)': 3.0,
 'Andre (1994)': 4.0,
 'Apollo 13 (1995)': 5.0,
 'Around the World in 80 Days (1956)': 3.0,
 'Assassins (1995)': 4.0,
 'Babe (1995)': 4.0,
 'Back to the Future (1985)': 5.0,
 'Barbarella (1968)': 3.0,
 'Batman Forever (1995)': 4.0,
 'Batman Returns (1992)': 4.0,
 'Beauty and the Beast (1991)': 5.0,
 'Birdcage, The (1996)': 4.0,
 'Birds, The (1963)': 5.0,
 'Blade Runner (1982)': 5.0,
 'Boot, Das (1981)': 5.0,
 'Brady Bunch Movie, The (1995)': 2.0,
 'Braveheart (1995)': 4.0,
 'Broken Arrow (1996)': 3.0,
 'Cape Fear (1962)': 5.0,
 'Cape Fear (1991)': 5.0,
 'Carrie (1976)': 

In [142]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")
print "***"
print data["18"]["Don Juan DeMarco (1995)"]
print mf.predict("1","Don Juan DeMarco (1995)")
print data["18"]["Winnie the Pooh and the Blustery Day (1968)"]
print mf.predict("1","Winnie the Pooh and the Blustery Day (1968)")
print "***"
print data["200"]["Assassins (1995)"]
print mf.predict("1","Assassins (1995)")
print data["200"]["Casablanca (1942)"]
print mf.predict("1","Casablanca (1942)")

4.0
3.39729089826
1.0
1.62532843747
***
2.0
4.57738258746
3.0
4.07796516329
***
4.0
2.86867907628
5.0
4.7192522862


In [143]:
summ=0
for i in data["1"]:
    summ+=(float(data["1"][i]) - mf.predict("1",i))**2
summ/len(data["1"])

0.72992292053033703

In [144]:
summ=0
for i in data["3"]:
    summ+=(float(data["3"][i]) - mf.predict("3",i))**2
summ/len(data["3"])

1.0374567107180328

In [175]:
tot=[]
ttt=[]
for j in test:
    summ=0
    for i in test[j]:
        summ+=(float(test[j][i]) - mf.predict(j,i))**2
    #print j, ">>", summ/len(data[j])
    ttt.append(len(test[j]))
    tot.append(summ)
    
#import pdb
#pdb.set_trace()

In [176]:
t = np.array(tot)
tt = np.array(ttt)

In [177]:
t.mean()/tt.mean()

1.1942361914404664

In [162]:
tt.std()

100.56729085359559

In [169]:
t.sum()/tt.sum()

0.60572217944012463

In [161]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()

1.1942361914404731

---

In [225]:
%%time
A,Y = nmf(rawMatrixTrain,500,max_iter=300)

Starting NMF decomposition with 500 latent features and 300 iterations.
Iteration 1: fit residual 5298.0864 total residual 244.6212
Iteration 200: fit residual 240.2685 total residual 13.9941
Iteration 300: fit residual 7.7158 total residual 6.8284
CPU times: user 2min 45s, sys: 5.49 s, total: 2min 50s
Wall time: 1min 2s


In [226]:
resMatrix = A.dot(Y)

In [227]:
a=np.array((1,2,4))
b=np.array((1,3,6))
(a-b).dot(a-b)

masqueTest=np.sign(rawMatrixTest)
masqueTest[:10,:10]

A=masqueTest*rawMatrix


In [228]:
aa = masqueTest*resMatrix

In [229]:
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5

In [235]:
q = masqueTest*resMatrix - rawMatrixTest

In [236]:
(q*q).sum()/ masqueTest.sum()

1.1273948891755721

In [None]:
masqueTest=np.sign(rawMatrixTest)

aa=masqueTest*rawMatrix

for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5
            
q = masqueTest*resMatrix - rawMatrixTest

(q*q).sum()/ masqueTest.sum()

In [232]:
aa[:10,:10]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  4.25790085,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        

In [111]:
rawMatrix[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.]])

In [65]:
resMatrix[:10,:10]

array([[  1.00000000e-09,   2.11263017e-05,   1.75772784e-05,
          1.59467964e-05,   1.91700638e-05,   1.73865300e-05,
          2.06765659e-05,   1.94248575e-05,   2.17730688e-05,
          2.03784277e-05],
       [  1.78801970e-04,   3.95625546e+00,   3.54997734e+00,
          3.72895234e+00,   4.27096338e+00,   2.72442378e+00,
          4.01474525e+00,   4.52884136e+00,   3.69341713e+00,
          3.98399173e+00],
       [  1.96184298e-04,   4.03306906e+00,   2.99267665e+00,
          2.07003583e+00,   3.73621474e+00,   3.57324546e+00,
          4.50847524e+00,   4.03266244e+00,   4.35395407e+00,
          4.65985982e+00],
       [  1.76302740e-04,   3.61148196e+00,   3.27282155e+00,
          2.35550426e+00,   2.02886726e+00,   3.87498134e+00,
          1.48801040e+00,   2.95799674e+00,   2.77898478e+00,
          3.14122547e+00],
       [  2.63225365e-04,   4.84628612e+00,   3.96173386e+00,
          4.22460058e+00,   3.47028279e+00,   3.99521914e+00,
          5.25970696e+00

In [59]:
mf = evalMF(resMatrix,data,movies)

In [69]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")

4.0
3.68121941569
1.0
0.96164687419


In [61]:
print train["1"]["All Dogs Go to Heaven 2 (1996)"]
print test["1"]["Akira (1988)"]

1.0


KeyError: 'Akira (1988)'

In [80]:
len(rawMatrixTest)

944

In [78]:
t = []
c = 10
for idxi,i in enumerate(rawMatrixTest):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrixTest[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()

(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)


2.0909226578962459

In [87]:
t = []
c = 10
for idxi,i in enumerate(resMatrix):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrix[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrix[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()

(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)


2.0909226578962459

In [108]:
t = []
c = 3
for idxi,i in enumerate(rawMatrixTrain):
    for idxj,j in enumerate(i):
        if rawMatrixTrain[idxi][idxj] != 0:
            t.append( (float(rawMatrixTrain[idxi][idxj]) - resMatrix[idxi][idxj])**2 )
            if c>0:
                print(rawMatrixTrain[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()

(5.0, 4.3114944585785064)
(3.0, 3.3257920434476187)
(3.0, 4.0022181201367522)


0.45320954210197834

In [80]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawMatrixTest]).mean()

KeyError: 0.0

---

In [60]:
R = [
     [5,3,5,3],
     [4,0,0,1],
     [1,5,1,5],
     [1,0,1,4],
     [0,4,5,4],
    ]

R = np.array(R)
K = 10
np.random.rand

<function rand>

In [87]:
%%time
nP, nQ = matrix_factorization(R, K, steps=1000)

CPU times: user 1.55 s, sys: 18.9 ms, total: 1.57 s
Wall time: 1.57 s


In [88]:
nR = np.dot(nP, nQ.T)

In [89]:
((nR-R)**2).sum()/np.sign(R).sum()

2.7470816228443531

---

In [110]:
np.shape(R)

(5, 4)

In [189]:
def matrix_factorization(R, K, steps=100, eps=0.0001, beta=0.02):
    N,M = np.shape(R)
    P = np.random.rand(N,K)
    #P = np.maximum(P, eps)
    
    #Q = np.random.rand(M,K).T
    Q = linalg.lstsq(P, R)[0]
    Q = np.maximum(Q, eps)

    #masked_X = mask * X
    #X_est_prev = dot(A, Y)
    
    #mask = np.sign(R)
    #masked_R = mask * R
    for step in xrange(steps):
        #"""
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])
        """
        ###
        top = np.dot(masked_R, Q.T)
        bottom = (np.dot((mask * np.dot(P, Q)), Q.T)) + eps
        
        P *= top / bottom

        P = np.maximum(P, eps)
        # print 'A',  np.round(A, 2)

        top = np.dot(P.T, masked_R)
        bottom = np.dot(P.T, mask * np.dot(P, Q)) + eps
        Q *= top / bottom
        Q = np.maximum(Q, eps)
        # print 'Y', np.round(Y, 2)
        """
        
        
    return P, Q.T

In [145]:
%%time
nP, nQ = matrix_factorization(R, K, steps=4000,eps=1e-5)

CPU times: user 6.14 s, sys: 63.6 ms, total: 6.21 s
Wall time: 6.29 s


In [146]:
nR = np.dot(nP, nQ.T)
((nR-R)**2).sum()/np.sign(R).sum()

1.2305583063164482

In [None]:
%%time
N,M = np.shape(R)
P = np.random.rand(N,K)
#P = np.maximum(P, eps)

#Q = np.random.rand(M,K).T
Q = linalg.lstsq(P, R)[0]
Q = np.maximum(Q, eps)

#masked_X = mask * X
#X_est_prev = dot(A, Y)

#mask = np.sign(R)
#masked_R = mask * R

for i in xrange(len(R)):
    for j in xrange(len(R[i])):
        if R[i][j] > 0:
            eij = R[i][j] - np.dot(P[i,:],Q[:,j])
            for k in xrange(K):
                P[i][k] = P[i][k] + eps * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + eps * (2 * eij * P[i][k] - beta * Q[k][j])

In [161]:
for _ in range(1,5):
    nP, nQ = matrix_factorization(R, K, steps=1000,eps=1e-3)
    nR = np.dot(nP, nQ.T)
    print ((nR-GR)**2).sum()/np.sign(GR).sum()

54.0298250116
56.7997458309
54.0097982207
54.3613471474


In [141]:
GR = [
     [0,0,0,0],
     [0,1,1,0],
     [0,0,0,0],
     [0,4,0,0],
     [4,0,0,0],
    ]

In [139]:
R

array([[5, 3, 5, 3],
       [4, 0, 0, 1],
       [1, 5, 1, 5],
       [1, 0, 1, 4],
       [0, 4, 5, 4]])

In [149]:
nR

array([[ 4.98064509,  3.00337373,  4.98567601,  2.99254764],
       [ 3.98478181,  1.19632854,  1.66596662,  1.0013092 ],
       [ 0.99961108,  4.9783092 ,  1.01288321,  4.97950296],
       [ 0.99817346,  2.34980003,  1.0067616 ,  3.97615341],
       [ 1.26528218,  3.99790715,  4.98389234,  3.9907852 ]])

In [55]:
R[1,1]

5.0

---

---

---

In [98]:
from scipy import linalg

In [100]:
rows, columns = R.shape
A = np.random.rand(rows, 2)

In [None]:
    mask = np.sign(X)
    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)

    masked_X = mask * X
    X_est_prev = dot(A, Y)
# updates
        top = dot(masked_X, Y.T)
        bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        A = np.maximum(A, eps)
        # print 'A',  np.round(A, 2)

        top = dot(A.T, masked_X)
        bottom = dot(A.T, mask * dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # print 'Y', np.round(Y, 2)


        # evaluation
        if i % 200 == 0 or i == 1 or i == max_iter:
            print 'Iteration {}:'.format(i),
            X_est = dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est

            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print 'fit residual', np.round(fit_residual, 4),
            print 'total residual', np.round(curRes, 4)
            if curRes < error_limit or fit_residual < fit_error_limit:
                break

In [190]:
%%time

R = rawMatrixTrain

nP, nQ = matrix_factorization(R, 10, steps=40,eps=1e-3)
nR = np.dot(nP, nQ.T)

CPU times: user 5min 57s, sys: 1.89 s, total: 5min 59s
Wall time: 6min 1s


In [191]:
masqueTest=np.sign(rawMatrixTest)

aa=masqueTest*rawMatrix
"""
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5
            """
q = masqueTest*nR - rawMatrixTest

(q*q).sum()/ masqueTest.sum()

0.93011954506905736

In [30]:
nR[:5,:5]

array([[ 2.2328501 ,  2.86127689,  2.78138309,  2.18936119,  1.94124401],
       [ 2.873597  ,  3.01024103,  3.39995977,  2.86989372,  2.98342851],
       [ 2.39437824,  2.9654892 ,  2.95946595,  2.25596745,  2.07565552],
       [ 1.51408587,  2.19458866,  2.42556026,  1.70330838,  1.66845702],
       [ 3.28392919,  3.92596689,  3.90235368,  3.32824406,  3.11129257]])

In [31]:
rawMatrix[:5,:5]

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.],
       [ 0.,  4.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [32]:
mf= evalMF(nR,data,movies)
mf.predict("1","Akira (1988)")

NameError: name 'evalMF' is not defined

In [47]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()

1.5532842864204328