In [1]:
from random import random
import math
import numpy as np
import copy

In [2]:
def loadMovieLens(path='./data/movielens'):
    #Get movie titles
    movies={}
    rev_movies={}
    for idx,line in enumerate(open(path+'/u.item')):
        idx,title=line.split('|')[0:2]
        movies[idx]=title
        rev_movies[title]=idx

    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        
    return prefs,rev_movies

In [3]:
data,movies = loadMovieLens("data/ml-100k")

In [4]:
data['3']

{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Lia

In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)

In [6]:
# splitting while avoiding to reduce the dataset too much
def split_train_test(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            #print(data[u][movie])
            if (random()<percent_test):
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test

In [7]:
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        for movie in data[u]:
            if (random()<percent_test):
                try:
                    test[movie][u]=data[u][movie]
                except KeyError:
                    test.setdefault(movie,{})
                    test[movie][u]=data[u][movie]
            else:
                try:
                    train[movie][u]=data[u][movie]
                except KeyError:
                    train.setdefault(movie,{})
                    train[movie][u]=data[u][movie]
    return train, test

In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)

In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)

In [10]:
def deleteUnseenInTest(train,test):
    for k in test.keys():
        try:
            train[k]
        except KeyError:
            test.pop(k,None)

In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)

In [12]:
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)

---

## Baseline: mean by user

In [13]:
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
        
    def predict(self,users):
        return [self.users[u] for u in users]

In [14]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:,0])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArray[:,2], float)) ** 2).mean())

Mean Error 1.065209


In [15]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
        
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except:
                res.append(3)
        return res

In [16]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:,1])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArrayTest[:,2], float)) ** 2).mean())

Mean Error 1.002688


In [17]:
m_test['Adventures of Pinocchio, The (1996)']

{'181': 1.0,
 '381': 5.0,
 '395': 2.0,
 '399': 3.0,
 '699': 3.0,
 '83': 3.0,
 '887': 5.0}

In [18]:
rawArray[:5]

array([['344', 'Birdcage, The (1996)', '4.0'],
       ['344', 'Enchanted April (1991)', '4.0'],
       ['344', 'Diabolique (1996)', '2.0'],
       ['344', 'Face/Off (1997)', '4.0'],
       ['344', 'My Fellow Americans (1996)', '3.0']], 
      dtype='|S81')

In [19]:
len(m_train['Birdcage, The (1996)'])

249

---

In [21]:
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000, alternate=0):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
        self.alternate = alternate
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.couples = couples
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k)
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1)
                tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                if (optimP):
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                if (optimQ):
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (self.alternate != 0):
                if (i % self.alternate == 0):
                    oprimP = False if optimQ else True
                    print i, loss / len(couples)
            else:
                if (i % 100 == 0):
                    print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred

In [176]:
model3 = matrixFactorisation(10, alternate=0)
model3.fit(trainUsers, trainItems, trainCouples)

0 2.77841840868
100 1.26732246398
200 1.05346402861
300 0.983480018466
400 0.940916112566
500 0.912340589664
600 0.895851403893
700 0.880687032448


KeyboardInterrupt: 

In [22]:

dm = np.dok_matrix(train)

AttributeError: 'module' object has no attribute 'dok_matrix'

In [20]:
print(len(movies))
print(len(data.keys()))

1664
943


In [21]:
movies["Adventures of Pinocchio, The (1996)"]

'1060'

In [38]:
rawMatrix = np.zeros((len(data.keys())+1,1682+1))
for u in data:
    for m in data[u]:
        rawMatrix[int(u)][int(movies[m])] = data[u][m]

In [40]:
np.shape(rawMatrix)

(944, 1683)

In [24]:
train["1"]["101 Dalmatians (1996)"]

2.0

In [39]:
rawMatrixTrain = np.zeros((len(data.keys())+1,1682+1))
for u in train:
    for m in train[u]:
        rawMatrixTrain[int(u)][int(movies[m])] = train[u][m]
        
rawMatrixTest = np.zeros((len(data.keys())+1,1682+1))
for u in test:
    for m in test[u]:
        rawMatrixTest[int(u)][int(movies[m])] = test[u][m]

In [28]:
rawMatrixTrain[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.]])

In [29]:
rawMatrixTest[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [33]:
np.shape(rawMatrixTest)

(944, 1683)

In [166]:
import numpy as np
from scipy import linalg
from numpy import dot

def nmf(X, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6):
    """
    Decompose X to A*Y
    """
    eps = 1e-5
    print 'Starting NMF decomposition with {} latent features and {} iterations.'.format(latent_features, max_iter)
    #X = X.toarray()  # I am passing in a scipy sparse matrix

    # mask
    mask = np.sign(X)

    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)

    masked_X = mask * X
    X_est_prev = dot(A, Y)
    for i in range(1, max_iter + 1):
        # updates
        top = dot(masked_X, Y.T)
        bottom = (dot((mask * dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        A = np.maximum(A, eps)
        # print 'A',  np.round(A, 2)

        top = dot(A.T, masked_X)
        bottom = dot(A.T, mask * dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)
        # print 'Y', np.round(Y, 2)


        # evaluation
        if i % 200 == 0 or i == 1 or i == max_iter:
            print 'Iteration {}:'.format(i),
            X_est = dot(A, Y)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est

            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            print 'fit residual', np.round(fit_residual, 4),
            print 'total residual', np.round(curRes, 4)
            if curRes < error_limit or fit_residual < fit_error_limit:
                break

    return A, Y

In [112]:
cpr = copy.deepcopy(rawMatrix)

In [124]:
cpr[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.]])

In [136]:
t1 = np.array([[7,1],
               [1,1]])
t2 = np.array([[1,2],
               [3,4]])

In [137]:
t1.dot(t2)

array([[10, 18],
       [ 4,  6]])

In [146]:
(t1*t2).sum()

16

In [123]:
cpr[1,1]=0

In [139]:
%%time
A,Y = nmf(cpr,100,max_iter=3000)

Starting NMF decomposition with 100 latent features and 3000 iterations.
Iteration 1: fit residual 393.8573 total residual 281.7802
Iteration 50: fit residual 193.9636 total residual 161.7064
Iteration 100: fit residual 59.0108 total residual 132.3743
Iteration 150: fit residual 32.0228 total residual 118.1091
Iteration 200: fit residual 21.1338 total residual 109.4591
Iteration 250: fit residual 15.359 total residual 103.5741
Iteration 300: fit residual 11.9724 total residual 99.2019
Iteration 350: fit residual 9.952 total residual 95.7222
Iteration 400: fit residual 8.5715 total residual 92.85
Iteration 450: fit residual 7.4787 total residual 90.4282
Iteration 500: fit residual 6.6231 total residual 88.3419
Iteration 550: fit residual 5.9386 total residual 86.5123
Iteration 600: fit residual 5.4035 total residual 84.8834
Iteration 650: fit residual 4.9618 total residual 83.4181
Iteration 700: fit residual 4.5829 total residual 82.0886
Iteration 750: fit residual 4.276 total residual 

In [144]:
resMatrix = A.dot(Y)

In [145]:
resMatrix[1,1]

3.0386369718493849

In [73]:
class evalMF:
    def __init__(self,resMatrix,dicU,dicI):
        self.resMatrix=resMatrix
        self.dicU = dicU
        self.dicI = dicI
    def fit(self):
        pass
        
    def predict(self,user,movie):
        return self.resMatrix[int(user)][int(self.dicI[movie])]

In [74]:
mf= evalMF(resMatrix,data,movies)

In [75]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")

4.0
4.01525359804
1.0
1.00798119413


In [76]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()

2.0909226578962512

---

In [225]:
%%time
A,Y = nmf(rawMatrixTrain,500,max_iter=300)

Starting NMF decomposition with 500 latent features and 300 iterations.
Iteration 1: fit residual 5298.0864 total residual 244.6212
Iteration 200: fit residual 240.2685 total residual 13.9941
Iteration 300: fit residual 7.7158 total residual 6.8284
CPU times: user 2min 45s, sys: 5.49 s, total: 2min 50s
Wall time: 1min 2s


In [226]:
resMatrix = A.dot(Y)

In [227]:
a=np.array((1,2,4))
b=np.array((1,3,6))
(a-b).dot(a-b)

masqueTest=np.sign(rawMatrixTest)
masqueTest[:10,:10]

A=masqueTest*rawMatrix


In [228]:
aa = masqueTest*resMatrix

In [229]:
for idxi,i in enumerate(aa):
    for idxj,j in enumerate(i):
        if j>5:
            aa[idxi][idxj]=5

In [235]:
q = masqueTest*resMatrix - rawMatrixTest

In [236]:
(q*q).sum()/ masqueTest.sum()

1.1273948891755721

In [232]:
aa[:10,:10]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  4.25790085,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        

In [111]:
rawMatrix[:10,:10]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.,  3.,  5.,  4.,  1.,  5.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.,  0.,  0.,  2.,  4.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  5.,  5.,  5.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  4.,  0.,  0.]])

In [65]:
resMatrix[:10,:10]

array([[  1.00000000e-09,   2.11263017e-05,   1.75772784e-05,
          1.59467964e-05,   1.91700638e-05,   1.73865300e-05,
          2.06765659e-05,   1.94248575e-05,   2.17730688e-05,
          2.03784277e-05],
       [  1.78801970e-04,   3.95625546e+00,   3.54997734e+00,
          3.72895234e+00,   4.27096338e+00,   2.72442378e+00,
          4.01474525e+00,   4.52884136e+00,   3.69341713e+00,
          3.98399173e+00],
       [  1.96184298e-04,   4.03306906e+00,   2.99267665e+00,
          2.07003583e+00,   3.73621474e+00,   3.57324546e+00,
          4.50847524e+00,   4.03266244e+00,   4.35395407e+00,
          4.65985982e+00],
       [  1.76302740e-04,   3.61148196e+00,   3.27282155e+00,
          2.35550426e+00,   2.02886726e+00,   3.87498134e+00,
          1.48801040e+00,   2.95799674e+00,   2.77898478e+00,
          3.14122547e+00],
       [  2.63225365e-04,   4.84628612e+00,   3.96173386e+00,
          4.22460058e+00,   3.47028279e+00,   3.99521914e+00,
          5.25970696e+00

In [59]:
mf = evalMF(resMatrix,data,movies)

In [69]:
print data["1"]["Akira (1988)"]
print mf.predict("1","Akira (1988)")
print data["1"]["All Dogs Go to Heaven 2 (1996)"]
print mf.predict("1","All Dogs Go to Heaven 2 (1996)")

4.0
3.68121941569
1.0
0.96164687419


In [61]:
print train["1"]["All Dogs Go to Heaven 2 (1996)"]
print test["1"]["Akira (1988)"]

1.0


KeyError: 'Akira (1988)'

In [80]:
len(rawMatrixTest)

944

In [78]:
t = []
c = 10
for idxi,i in enumerate(rawMatrixTest):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrixTest[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrixTest[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()

(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)


2.0909226578962459

In [87]:
t = []
c = 10
for idxi,i in enumerate(resMatrix):
    for idxj,j in enumerate(i):
        if rawMatrixTest[idxi][idxj] != 0:
            t.append( (resMatrix[idxi][idxj] - float(rawMatrix[idxi][idxj]))**2 )
            if c>0:
                print(rawMatrix[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()

(5.0, 2.5931883684545336)
(4.0, 5.0047797241971219)
(4.0, 3.469010363892524)
(4.0, 3.8143373120809678)
(4.0, 5.1669137101192657)
(3.0, 4.4963428645445518)
(5.0, 3.6366298516628395)
(3.0, 3.1448263066152959)
(5.0, 2.7517463531147883)
(5.0, 4.6741099156866328)


2.0909226578962459

In [108]:
t = []
c = 3
for idxi,i in enumerate(rawMatrixTrain):
    for idxj,j in enumerate(i):
        if rawMatrixTrain[idxi][idxj] != 0:
            t.append( (float(rawMatrixTrain[idxi][idxj]) - resMatrix[idxi][idxj])**2 )
            if c>0:
                print(rawMatrixTrain[idxi][idxj],resMatrix[idxi][idxj])
                c-=1
np.array(t).mean()

(5.0, 4.3114944585785064)
(3.0, 3.3257920434476187)
(3.0, 4.0022181201367522)


0.45320954210197834

In [80]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawMatrixTest]).mean()

KeyError: 0.0

---

In [124]:
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]

R = numpy.array(R)

N = len(R)
M = len(R[0])
K = 2

P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)

In [125]:
nR

array([[ 4.99650255,  2.93418578,  3.99234219,  0.99820419],
       [ 3.96298764,  2.33573468,  3.36035393,  0.99707133],
       [ 1.0716961 ,  0.82538354,  5.337668  ,  4.96193011],
       [ 0.9633362 ,  0.72181627,  4.33820223,  3.97311633],
       [ 1.81106643,  1.21517699,  4.91344271,  4.03428495]])

---

In [41]:
import numpy

def matrix_factorization(R, K, steps=100, alpha=0.0002, beta=0.02):
    N = len(R)
    M = len(R[0])

    P = numpy.random.rand(N,K)
    Q = numpy.random.rand(M,K)
    Q = Q.T
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = numpy.dot(P,Q)
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T

In [43]:
%%time

R = rawMatrix

nP, nQ = matrix_factorization(R, 10, steps=2)
nR = numpy.dot(nP, nQ.T)

CPU times: user 34.9 s, sys: 229 ms, total: 35.1 s
Wall time: 35.3 s


In [44]:
nR[:5,:5]

array([[ 2.44278256,  2.87154427,  1.80549856,  2.37639505,  2.57721859],
       [ 2.58347667,  3.63952216,  1.88785598,  2.42137971,  3.78439782],
       [ 3.00074652,  3.17910098,  2.18194343,  3.32871375,  3.48184956],
       [ 3.98021546,  4.53548673,  2.88747289,  3.50115542,  3.84920728],
       [ 3.26661088,  4.4907536 ,  2.7049045 ,  3.2442335 ,  4.14210547]])

In [45]:
rawMatrix[:5,:5]

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  3.,  4.,  3.],
       [ 0.,  4.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [46]:
mf= evalMF(nR,data,movies)
mf.predict("1","Akira (1988)")

2.7812982370145853

In [47]:
np.array([ (float(ra[2]) - mf.predict(ra[0],ra[1]))**2 for ra in rawArrayTest]).mean()

1.5532842864204328