In [1]:
from random import random
import math
import numpy as np
import copy

In [2]:
def loadMovieLens(path='./data/movielens'):
    #Get movie titles
    movies={}
    for line in open(path+'/u.item'):
        id,title=line.split('|')[0:2]
        movies[id]=title

    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        
    return prefs

## Analysing data

In [3]:
data = loadMovieLens("data/ml-100k")

In [4]:
data['3']

{'187 (1997)': 2.0,
 'Air Force One (1997)': 2.0,
 'Alien: Resurrection (1997)': 3.0,
 'Apostle, The (1997)': 4.0,
 'Bean (1997)': 2.0,
 'Boogie Nights (1997)': 5.0,
 'Chasing Amy (1997)': 3.0,
 'Conspiracy Theory (1997)': 5.0,
 'Contact (1997)': 2.0,
 'Cop Land (1997)': 4.0,
 'Crash (1996)': 1.0,
 'Critical Care (1997)': 1.0,
 "Dante's Peak (1997)": 2.0,
 'Deconstructing Harry (1997)': 3.0,
 'Deep Rising (1998)': 1.0,
 'Desperate Measures (1998)': 4.0,
 "Devil's Advocate, The (1997)": 3.0,
 "Devil's Own, The (1997)": 1.0,
 'Edge, The (1997)': 4.0,
 'Event Horizon (1997)': 4.0,
 'Everyone Says I Love You (1996)': 2.0,
 'Fallen (1998)': 3.0,
 'G.I. Jane (1997)': 2.0,
 'Game, The (1997)': 2.0,
 'Good Will Hunting (1997)': 2.0,
 'Hard Rain (1998)': 3.0,
 'Hoodlum (1997)': 3.0,
 'House of Yes, The (1997)': 1.0,
 'How to Be a Player (1997)': 1.0,
 'In the Name of the Father (1993)': 2.0,
 'Jackie Brown (1997)': 5.0,
 'Kiss the Girls (1997)': 1.0,
 'L.A. Confidential (1997)': 2.0,
 'Liar Lia

## Splitting, and getting raw datas for convenience

In [5]:
def getRawArray(data):
    d = []
    for u in data.keys():
        for i in data[u].keys():
            d.append([u,i,data[u][i]])
    return np.array(d)

In [6]:
# splitting while avoiding to reduce the dataset too much
def split_train_test(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        test.setdefault(u,{})
        train.setdefault(u,{})
        for movie in data[u]:
            #print(data[u][movie])
            if (random()<percent_test):
                test[u][movie]=data[u][movie]
            else:
                train[u][movie]=data[u][movie]
    return train, test

In [7]:
def split_train_test_by_movies(data,percent_test):
    test={}
    train={}
    movie={}
    for u in data.keys():
        for movie in data[u]:
            if (random()<percent_test):
                try:
                    test[movie][u]=data[u][movie]
                except KeyError:
                    test.setdefault(movie,{})
                    test[movie][u]=data[u][movie]
            else:
                try:
                    train[movie][u]=data[u][movie]
                except KeyError:
                    train.setdefault(movie,{})
                    train[movie][u]=data[u][movie]
    return train, test

In [8]:
percent_test=0.2
train,test=split_train_test(data,percent_test)

In [9]:
percent_test=0.2
m_train,m_test=split_train_test_by_movies(data,percent_test)

In [10]:
def deleteUnseenInTest(train,test):
    for k in test.keys():
        try:
            train[k]
        except KeyError:
            test.pop(k,None)

In [11]:
deleteUnseenInTest(train,test)
deleteUnseenInTest(m_train,m_test)

In [12]:
rawArray = getRawArray(data)
rawArrayTest = getRawArray(test)

---

## Baseline: mean by user

In [13]:
class baselineMeanMovie:
    def __init__(self):
        self.users={}
        self.movies={}
    def fit(self,train):
        movies = get_moove(train)
        for movie in movies:
            note=0.0
            cpt=0
            for user in train:
                try:
                    note+=train[user][movie]
                    cpt+=1
                except KeyError:
                    pass
            note=note/cpt
            self.movies[movie]=note
        
    def predict(self,user,movie):
        return self.movies[movie]
    def score(self,X):
        nb_movies = len(get_moove(X))
        score = 0.0
        for user in X:
            for movie in X[user]:
                score += (self.predict(user,movie) - X[user][movie])**2
        return score/nb_movies

In [14]:
class baselineMeanUser:
    def __init__(self):
        self.users={}
    def fit(self,train):
        for user in train.keys():
            note=0.0
            for movie in train[user].keys():
                note+=train[user][movie]
            note=note/len(train[user])
            self.users[user]=note
        
    def predict(self,users):
        return [self.users[u] for u in users]

In [15]:
baseline_mu= baselineMeanUser()
baseline_mu.fit(train)
pred = baseline_mu.predict(rawArray[:,0])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArray[:,2], float)) ** 2).mean())

Mean Error 1.065380


## Baseline: mean by movies

In [16]:
class baselineMeanMovie:
    def __init__(self):
        self.movies={}
    def fit(self,train):
        for movie in train.keys():
            note=0.0
            for user in train[movie].keys():
                note+=train[movie][user]
            note=note/len(train[movie])
            self.movies[movie]=note
        
    def predict(self,movies):
        res=[]
        for m in movies:
            try:
                res.append(self.movies[m])
            except:
                res.append(3)
        return res

In [17]:
baseline_mm= baselineMeanMovie()
baseline_mm.fit(m_train)
pred = baseline_mm.predict(rawArrayTest[:,1])
print("Mean Error %0.6f" %(
        (np.array(pred) - np.array(rawArrayTest[:,2], float)) ** 2).mean())

Mean Error 1.007572


In [18]:
m_test['Adventures of Pinocchio, The (1996)']

{'189': 5.0,
 '348': 3.0,
 '434': 3.0,
 '496': 1.0,
 '60': 4.0,
 '637': 2.0,
 '650': 3.0}

In [19]:
rawArray[:5]

array([['344', 'Birdcage, The (1996)', '4.0'],
       ['344', 'Enchanted April (1991)', '4.0'],
       ['344', 'Diabolique (1996)', '2.0'],
       ['344', 'Face/Off (1997)', '4.0'],
       ['344', 'My Fellow Americans (1996)', '3.0']], 
      dtype='|S81')

In [20]:
len(m_train['Birdcage, The (1996)'])

230

---
## Matrix Factorisation

In [27]:
# Way too slow, not ready yet matrix Factorisation
# With massive help from Remi Cadene
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000, alternate=0):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
        self.alternate = alternate
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.couples = couples
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k)*4
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1)*4
                tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                if (optimP):
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                if (optimQ):
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (self.alternate != 0):
                if (i % self.alternate == 0):
                    oprimP = False if optimQ else True
                    print i, loss / len(couples)
            else:
                if (i % 100 == 0):
                    print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred

In [31]:
model3 = matrixFactorisation(10, alternate=100)
model3.fit(trainUsers, trainItems, trainCouples)

0 1288.56302008
100 12.1687668873
200 5.69295900495
300 4.08720310516
400 3.24075311118
500 2.81686181623
600 2.43916635356
700 2.23621131174
800 2.04653316974


KeyboardInterrupt: 

In [36]:
pred = model3.predict(testCouples)
print "Erreur de test:", ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

 Erreur de test: 2.65974007908


---