In [9]:
# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
      'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
      'The Night Listener': 3.0},
     'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
      'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
      'You, Me and Dupree': 3.5},
     'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
      'Superman Returns': 3.5, 'The Night Listener': 4.0},
     'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
      'The Night Listener': 4.5, 'Superman Returns': 4.0,
      'You, Me and Dupree': 2.5},
     'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
      'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
      'You, Me and Dupree': 2.0},
     'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
      'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
     'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

In [10]:
critics['Lisa Rose']['Lady in the Water']

2.5

In [11]:
critics['Toby']['Snakes on a Plane']=4.5

In [12]:
critics['Toby']

{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

# 1. User-based filtering¶

## 1.0 Finding similar users¶

In [2]:
# 欧几里得距离   坐标系中的两个点
import numpy as np
np.sqrt(np.power(5-4, 2) + np.power(4-1, 2))   #2 平方

3.1622776601683795

In [3]:
1.0 /(1 + np.sqrt(np.power(5-4, 2) + np.power(4-1, 2)) )

0.2402530733520421

In [4]:
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
    # Get the list of shared_items
    si={}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1   
    # if they have no ratings in common, return 0
    if len(si)==0: return 0
    # Add up the squares of all the differences
    sum_of_squares=np.sum([np.power(prefs[person1][item]-prefs[person2][item],2)
                      for item in prefs[person1] if item in prefs[person2]])    #如果有共同，取距离
    return 1/(1+sum_of_squares)

In [5]:
sim_distance(critics, 'Lisa Rose','Gene Seymour')

NameError: name 'critics' is not defined

In [6]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    # Add up all the preferences
    sum1=np.sum([prefs[p1][it] for it in si])
    sum2=np.sum([prefs[p2][it] for it in si])
    # Sum up the squares
    sum1Sq=np.sum([np.power(prefs[p1][it],2) for it in si])
    sum2Sq=np.sum([np.power(prefs[p2][it],2) for it in si])
    # Sum up the products
    pSum=np.sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score  皮尔逊相关系数
    num=pSum-(sum1*sum2/n)
    den=np.sqrt((sum1Sq-np.power(sum1,2)/n)*(sum2Sq-np.power(sum2,2)/n))
    if den==0: return 0
    return num/den

In [17]:
sim_pearson(critics, 'Lisa Rose','Gene Seymour')

0.39605901719066977

In [18]:
# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):      #返回跟他最相似的5个人
    scores=[(similarity(prefs,person,other),other)
        for other in prefs if other!=person]
    # Sort the list so the highest scores appear at the top 
    scores.sort( )
    scores.reverse( )
    return scores[0:n]

In [19]:
topMatches(critics,'Toby',n=3) # topN

[(0.99124070716192991, 'Lisa Rose'),
 (0.92447345164190486, 'Mick LaSalle'),
 (0.89340514744156474, 'Claudia Puig')]

## 1.1 Recommending Items  基于人

In [7]:
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:   
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items()]   #总分／相似总和
    # Return the sorted list
    rankings.sort()
    rankings.reverse()
    return rankings

In [8]:
# Now you can find out what movies I should watch next:
getRecommendations(critics,'Toby')

NameError: name 'critics' is not defined

In [22]:
# You’ll find that the results are only affected very slightly by the choice of similarity metric.
getRecommendations(critics,'Toby',similarity=sim_distance)   #基于欧几里得

[(3.5002478401415877, 'The Night Listener'),
 (2.7561242939959363, 'Lady in the Water'),
 (2.4619884860743739, 'Just My Luck')]

## 2. Item-based filtering 基于物

### 将item-user字典的键值翻转¶

In [23]:
# you just need to swap the people and the items. 
def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            # Flip item and person
            result[item][person]=prefs[person][item]
    return result

movies = transformPrefs(critics)

In [24]:
topMatches(movies,'Superman Returns')

[(0.65795169495976946, 'You, Me and Dupree'),
 (0.48795003647426888, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.17984719479905439, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

In [25]:
getRecommendations(movies,'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

In [26]:
getRecommendations(movies, 'You, Me and Dupree')

[(3.1637361366111816, 'Michael Phillips')]

In [27]:
def calculateSimilarItems(prefs,n=10):
    # Create a dictionary of items showing which other items they
    # are most similar to.
    result={}
    # Invert the preference matrix to be item-centric
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
        # Status updates for large datasets
        c+=1
        if c%100==0: print "%d / %d" % (c,len(itemPrefs))
        # Find the most similar items to this one
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
        result[item]=scores
    return result

itemsim=calculateSimilarItems(critics) 
itemsim

{'Just My Luck': [(0.22222222222222221, 'Lady in the Water'),
  (0.18181818181818182, 'You, Me and Dupree'),
  (0.15384615384615385, 'The Night Listener'),
  (0.10526315789473684, 'Snakes on a Plane'),
  (0.064516129032258063, 'Superman Returns')],
 'Lady in the Water': [(0.40000000000000002, 'You, Me and Dupree'),
  (0.2857142857142857, 'The Night Listener'),
  (0.22222222222222221, 'Snakes on a Plane'),
  (0.22222222222222221, 'Just My Luck'),
  (0.090909090909090912, 'Superman Returns')],
 'Snakes on a Plane': [(0.22222222222222221, 'Lady in the Water'),
  (0.18181818181818182, 'The Night Listener'),
  (0.16666666666666666, 'Superman Returns'),
  (0.10526315789473684, 'Just My Luck'),
  (0.05128205128205128, 'You, Me and Dupree')],
 'Superman Returns': [(0.16666666666666666, 'Snakes on a Plane'),
  (0.10256410256410256, 'The Night Listener'),
  (0.090909090909090912, 'Lady in the Water'),
  (0.064516129032258063, 'Just My Luck'),
  (0.053333333333333337, 'You, Me and Dupree')],
 'Th

In [28]:
4.5*0.222+4.0*0.091+1.0*0.4 

1.763/0.713

2.472650771388499

In [29]:
4.5*0.105+4.0*0.065+1.0*0.182 

0.914/0.352 

2.596590909090909

In [30]:
def getRecommendedItems(prefs,itemMatch,user):
    userRatings=prefs[user]
    scores={}
    totalSim={}
    # Loop over items rated by this user
    for (item,rating) in userRatings.items( ):
        # Loop over items similar to this one
        for (similarity,item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2,0)
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim.setdefault(item2,0)
            totalSim[item2]+=similarity
    # Divide each total score by total weighting to get an average
    rankings=[(score/totalSim[item],item) for item,score in scores.items( )]
    # Return the rankings from highest to lowest
    rankings.sort( )
    rankings.reverse( )
    return rankings

getRecommendedItems(critics,itemsim,'Toby')

[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

### 5.21上午课堂内容

In [35]:
# https://github.com/ParticleWave/RecommendationSystemStudy/blob/d1960056b96cfaad62afbfe39225ff680240d37e/PersonalRank.py
import os
import random

class Graph:
    def __init__(self):
        self.G = dict()    #创建一个空的字典
    
    def addEdge(self, p, q):
        if p not in self.G: self.G[p] = dict()   #如果p不在G中，则在G中添加一个p的字典
        if q not in self.G: self.G[q] = dict()
        self.G[p][q] = 1   #在p中添加q=1
        self.G[q][p] = 1

    def getGraphMatrix(self):
        return self.G

In [36]:
graph = Graph()
graph.addEdge('A', 'a')
graph.addEdge('A', 'c')
graph.addEdge('B', 'a')
graph.addEdge('B', 'b')
graph.addEdge('B', 'c')
graph.addEdge('B', 'd')
graph.addEdge('C', 'c')
graph.addEdge('C', 'd')
G = graph.getGraphMatrix()
print(G.keys())

['A', 'a', 'c', 'B', 'd', 'C', 'b']


In [37]:
G

{'A': {'a': 1, 'c': 1},
 'B': {'a': 1, 'b': 1, 'c': 1, 'd': 1},
 'C': {'c': 1, 'd': 1},
 'a': {'A': 1, 'B': 1},
 'b': {'B': 1},
 'c': {'A': 1, 'B': 1, 'C': 1},
 'd': {'B': 1, 'C': 1}}

In [38]:
def PersonalRank(G, alpha, root, max_step):
    # G is the biparitite graph of users' ratings on items
    # alpha is the probability of random walk forward   随机向前走的概率
    # root is the studied User                      从哪个点出发（对哪个人推荐）
    # max_step if the steps of iterations.  走多少步
    rank = dict()
    rank = {x:0.0 for x in G.keys()}    #G的key，value为浮点数0
    rank[root] = 1.0                    #rank字典中以root为key的value＝1
    for k in range(max_step):
        tmp = {x:0.0 for x in G.keys()}    #临时函数对象
        for i,ri in G.items():            #对G的每个key和value
            for j,wij in ri.items():      #对G value的每个key和value
                if j not in tmp: tmp[j] = 0.0   #如果小字典里的key不在tep字典中，则tep中key为j的value是0
                tmp[j] += alpha * rank[i] / (len(ri)*1.0)
                if j == root: tmp[j] += 1.0 - alpha
        rank = tmp
        print(k, rank)
    return rank

In [39]:
print(PersonalRank(G, 0.8, 'A', 20))
#    print(PersonalRank(G, 0.8, 'B', 20))
#    print(PersonalRank(G, 0.8, 'C', 20))

(0, {'A': 0.3999999999999999, 'a': 0.4, 'c': 0.4, 'B': 0.0, 'd': 0.0, 'C': 0.0, 'b': 0.0})
(1, {'A': 0.6666666666666666, 'a': 0.15999999999999998, 'c': 0.15999999999999998, 'B': 0.2666666666666667, 'd': 0.0, 'C': 0.10666666666666669, 'b': 0.0})
(2, {'A': 0.5066666666666666, 'a': 0.32, 'c': 0.3626666666666667, 'B': 0.10666666666666665, 'd': 0.09600000000000003, 'C': 0.04266666666666666, 'b': 0.053333333333333344})
(3, {'A': 0.624711111111111, 'a': 0.22399999999999998, 'c': 0.24106666666666665, 'B': 0.30577777777777787, 'd': 0.03839999999999999, 'C': 0.13511111111111113, 'b': 0.02133333333333333})
(4, {'A': 0.5538844444444444, 'a': 0.31104, 'c': 0.36508444444444443, 'B': 0.1863111111111111, 'd': 0.11520000000000002, 'C': 0.07964444444444443, 'b': 0.061155555555555574})
(5, {'A': 0.6217718518518518, 'a': 0.258816, 'c': 0.29067377777777775, 'B': 0.31677629629629633, 'd': 0.06911999999999999, 'C': 0.14343585185185187, 'b': 0.03726222222222222})
(6, {'A': 0.5810394074074073, 'a': 0.312064, '

## 3. MovieLens Recommender
人多电影少，所以以物为推荐，比较经济

In [40]:
def loadMovieLens(path='D:/program/cjc2016/data/'):
    # Get movie titles
    movies={}
    for line in open(path+'movies.dat'):
        (id,title)=line.split('::')[0:2]   
        movies[id]=title
  
    # Load data
    prefs={}
    for line in open(path+'/ratings.dat'):
        (user,movieid,rating,ts)=line.split('::')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
    return prefs

In [41]:
prefs=loadMovieLens()
prefs['87']

{'Alice in Wonderland (1951)': 1.0,
 'Army of Darkness (1993)': 3.0,
 'Bad Boys (1995)': 5.0,
 'Benji (1974)': 1.0,
 'Brady Bunch Movie, The (1995)': 1.0,
 'Braveheart (1995)': 5.0,
 'Buffalo 66 (1998)': 1.0,
 'Chambermaid on the Titanic, The (1998)': 1.0,
 'Cowboy Way, The (1994)': 1.0,
 'Cyrano de Bergerac (1990)': 4.0,
 'Dear Diary (Caro Diario) (1994)': 1.0,
 'Die Hard (1988)': 3.0,
 'Diebinnen (1995)': 1.0,
 'Dr. No (1962)': 1.0,
 'Escape from the Planet of the Apes (1971)': 1.0,
 'Fast, Cheap & Out of Control (1997)': 1.0,
 'Faster Pussycat! Kill! Kill! (1965)': 1.0,
 'From Russia with Love (1963)': 1.0,
 'Fugitive, The (1993)': 5.0,
 'Get Shorty (1995)': 1.0,
 'Gladiator (2000)': 5.0,
 'Goldfinger (1964)': 5.0,
 'Good, The Bad and The Ugly, The (1966)': 4.0,
 'Hunt for Red October, The (1990)': 5.0,
 'Hurricane, The (1999)': 5.0,
 'Indiana Jones and the Last Crusade (1989)': 4.0,
 'Jaws (1975)': 5.0,
 'Jurassic Park (1993)': 5.0,
 'King Kong (1933)': 1.0,
 'King of New York (199

### user-based filtering

In [42]:
getRecommendations(prefs,'87')[0:30]

NameError: name 'getRecommendations' is not defined

### Item-based filtering

In [43]:
itemsim=calculateSimilarItems(prefs,n=50)

NameError: name 'calculateSimilarItems' is not defined

## Buiding Recommendation System with GraphLab
基于隐含语义网络

In [44]:
#set product key using GraphLab Create API
import graphlab
graphlab.product_key.set_product_key('4972-65DF-8E02-816C-AB15-021C-EC1B-0367')

In [45]:
import graphlab as gl
# set canvas to show sframes and sgraphs in ipython notebook
gl.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
%matplotlib inline

In [46]:
sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
sf

item_id,rating,user_id
a,1,0
b,3,0
c,2,0
a,5,1
b,4,1
b,1,2
c,4,2
d,3,2


In [47]:
m = graphlab.recommender.create(sf, target='rating')
recs = m.recommend()
print recs

+---------+---------+---------------+------+
| user_id | item_id |     score     | rank |
+---------+---------+---------------+------+
|    0    |    d    | 1.30389630795 |  1   |
|    1    |    c    |  4.0167632997 |  1   |
|    1    |    d    | 3.16467571259 |  2   |
|    2    |    a    | 2.46100602299 |  1   |
+---------+---------+---------------+------+
[4 rows x 4 columns]



In [48]:
m['coefficients']

{'intercept': 2.875, 'item_id': Columns:
 	item_id	str
 	linear_terms	float
 	factors	array
 
 Rows: 4
 
 Data:
 +---------+-----------------+-------------------------------+
 | item_id |   linear_terms  |            factors            |
 +---------+-----------------+-------------------------------+
 |    a    | -0.110387079418 | [-5.19597779203e-06, 0.001... |
 |    b    | -0.064489774406 | [-4.96627944813e-06, 0.001... |
 |    c    |  0.286174118519 | [5.58503188586e-06, -0.001... |
 |    d    | -0.565919041634 | [4.64454342364e-06, -0.001... |
 +---------+-----------------+-------------------------------+
 [4 rows x 3 columns], 'user_id': Columns:
 	user_id	str
 	linear_terms	float
 	factors	array
 
 Rows: 3
 
 Data:
 +---------+-----------------+-------------------------------+
 | user_id |   linear_terms  |            factors            |
 +---------+-----------------+-------------------------------+
 |    0    |  -1.00517702103 | [-1.11209214992e-06, 0.000... |
 |    1    |   0.8

## The CourseTalk dataset: loading and first look

In [49]:
#train_file = 'http://s3.amazonaws.com/dato-datasets/millionsong/10000.txt'
train_file = 'D:/program/cjc2016/data/ratings.dat'
sf = gl.SFrame.read_csv(train_file, header=False, delimiter='|', verbose=False)
sf.rename({'X1':'user_id', 'X2':'course_id', 'X3':'rating'}).show()

In [50]:
(train_set, test_set) = sf.random_split(0.8, seed=1)  #分训练集和测试集，随机种子是1，保证每次结果一致

### 1.Popularity model

In [51]:
import graphlab as gl
popularity_model = gl.popularity_recommender.create(train_set, 'user_id', 'course_id', target = 'rating')

### 2.Item similarity Model

In [52]:
item_sim_model = gl.item_similarity_recommender.create(train_set, 'user_id', 'course_id', target = 'rating', 
                                                       similarity_type='cosine')

### 3.Factorization Recommender Model    矩阵分解 

In [53]:
factorization_machine_model = gl.recommender.factorization_recommender.create(train_set, 'user_id', 'course_id',
                                                                              target='rating')

## Model Evaluation

In [54]:
import graphlab as gl
result = gl.recommender.util.compare_models(test_set, [popularity_model, item_sim_model, factorization_machine_model],
                                            user_sample=.1, skip_set=train_set)     #10%的user,去掉训练集
#rmse 均方根误差（用户实际打分－推荐算法预测打分） 越小越好
# MAE 平均绝对误差

compare_models: using 49 users to estimate model performance
PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+----------------+-------------+
| cutoff | mean_precision | mean_recall |
+--------+----------------+-------------+
|   1    |      0.0       |     0.0     |
|   2    |      0.0       |     0.0     |
|   3    |      0.0       |     0.0     |
|   4    |      0.0       |     0.0     |
|   5    |      0.0       |     0.0     |
|   6    |      0.0       |     0.0     |
|   7    |      0.0       |     0.0     |
|   8    |      0.0       |     0.0     |
|   9    |      0.0       |     0.0     |
|   10   |      0.0       |     0.0     |
+--------+----------------+-------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 1.0686165253589264)

Per User RMSE (best)
+---------+-------+------+
| user_id | count | rmse |
+---------+-------+------+
|   1491  |   1   | 0.0  |
+---------+-------+------+
[1 rows x 3 columns]


Per User RMSE (worst)
+-----

('\nOverall RMSE: ', 1.2345682364195378)

Per User RMSE (best)
+---------+-------+------+
| user_id | count | rmse |
+---------+-------+------+
|   1491  |   1   | 0.0  |
+---------+-------+------+
[1 rows x 3 columns]


Per User RMSE (worst)
+---------+-------+------+
| user_id | count | rmse |
+---------+-------+------+
|   1846  |   1   | 4.5  |
+---------+-------+------+
[1 rows x 3 columns]


Per Item RMSE (best)
+-----------+-------+------+
| course_id | count | rmse |
+-----------+-------+------+
|     9     |   1   | 0.0  |
+-----------+-------+------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+-----------+-------+------+
| course_id | count | rmse |
+-----------+-------+------+
|    127    |   1   | 4.5  |
+-----------+-------+------+
[1 rows x 3 columns]

PROGRESS: Evaluate model M2

Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+---------------

In [73]:
K = 10
users = gl.SArray(sf['user_id'].unique().head(100))

In [74]:
recs = item_sim_model.recommend(users=users, k=K)
recs.head()

user_id,course_id,score,rank
118,172,5.0,1
118,147,5.0,2
118,51,5.0,3
118,12,5.0,4
118,16,5.0,5
118,15,5.0,6
118,13,5.0,7
118,11,5.0,8
118,7,5.0,9
118,9,5.0,10


In [41]:
# Get the meta data of the courses
courses = gl.SFrame.read_csv('D:/program/cjc2016/data/cursos.dat', header=False, delimiter='|', verbose=False)
courses.rename({'X1':'course_id', 'X2':'title', 'X3':'avg_rating', 
              'X4':'workload', 'X5':'university', 'X6':'difficulty', 'X7':'provider'}).show()

courses = courses[['course_id', 'title', 'provider']]
results = recs.join(courses, on='course_id', how='inner')

# Populate observed user-course data with course info
userset = frozenset(users)
ix = sf['user_id'].apply(lambda x: x in userset, int)  
user_data = sf[ix]
user_data = user_data.join(courses, on='course_id')[['user_id', 'title', 'provider']]

In [42]:
# Print out some recommendations 
for i in range(5):
    user = list(users)[i]
    print "User: " + str(i + 1)
    user_obs = user_data[user_data['user_id'] == user].head(K)
    del user_obs['user_id']
    user_recs = results[results['user_id'] == str(user)][['title', 'provider']]

    print "We were told that the user liked these courses: "
    print user_obs.head(K)

    print "We recommend these other courses:"
    print user_recs.head(K)

    print ""

User: 1
We were told that the user liked these courses: 
+-------------------------------+----------+
|             title             | provider |
+-------------------------------+----------+
| An Introduction to Interac... | coursera |
+-------------------------------+----------+
[1 rows x 2 columns]

We recommend these other courses:
+-------+----------+
| title | provider |
+-------+----------+
+-------+----------+
[0 rows x 2 columns]


User: 2
We were told that the user liked these courses: 
+-------------------------------+----------+
|             title             | provider |
+-------------------------------+----------+
| Design: Creation of Artifa... | coursera |
+-------------------------------+----------+
[1 rows x 2 columns]

We recommend these other courses:
+-------+----------+
| title | provider |
+-------+----------+
+-------+----------+
[0 rows x 2 columns]


User: 3
We were told that the user liked these courses: 
+-------------------------------+----------+
|       

## 使用Graph Lab隐语义网络推荐

In [1]:
import graphlab
graphlab.canvas.set_target("ipynb")
rating_sf = graphlab.SFrame('ratings')
users = graphlab.SFrame('users')
items = graphlab.SFrame('items')

This trial license of GraphLab Create assigned to 12313@e2e2e.com has expired. Please contact trial@turi.com for licensing options or to request a free non-commercial license for academic use.


InvalidLicense: License check failed. Please visit https://dato.com/support for support options.

## 使用GraphLab进行电影推荐¶

In [4]:
import graphlab
graphlab.canvas.set_target("ipynb")
# set canvas to show sframes and sgraphs in ipython notebook
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
data = graphlab.SFrame.read_csv('D:/program/cjc2016/data/ratings.dat', delimiter='\n', 
                                header=False)['X1'].apply(lambda x: x.split('::')).unpack()
for col in data.column_names():
    data[col] = data[col].astype(int)
data.rename({'X.0': 'user_id', 'X.1': 'movie_id', 'X.2': 'rating', 'X.3': 'timestamp'})
data.save('ratings')

users = graphlab.SFrame.read_csv('D:/program/cjc2016/data/users.dat', delimiter='\n', 
                                 header=False)['X1'].apply(lambda x: x.split('::')).unpack()
users.rename({'X.0': 'user_id', 'X.1': 'gender', 'X.2': 'age', 'X.3': 'occupation', 'X.4': 'zip-code'})
users['user_id'] = users['user_id'].astype(int)
users.save('users')

items = graphlab.SFrame.read_csv('D:/program/cjc2016/data/movies.dat', delimiter='\n', 
                                 header=False)['X1'].apply(lambda x: x.split('::')).unpack()
items.rename({'X.0': 'movie_id', 'X.1': 'title', 'X.2': 'genre'})
items['movie_id'] = items['movie_id'].astype(int)
items.save('items')

In [6]:
data.show()

In [7]:
items.head()

movie_id,title,genre
1,Toy Story (1995),Animation|Children's|Come dy ...
2,Jumanji (1995),Adventure|Children's|Fant asy ...
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995) ...,Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children's
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [8]:
data = data.join(items, on='movie_id')

In [9]:
data

user_id,movie_id,rating,timestamp,title,genre
1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975) ...,Drama
1,661,3,978302109,James and the Giant Peach (1996) ...,Animation|Children's|Musi cal ...
1,914,3,978301968,My Fair Lady (1964),Musical|Romance
1,3408,4,978300275,Erin Brockovich (2000),Drama
1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Come dy ...
1,1197,3,978302268,"Princess Bride, The (1987) ...",Action|Adventure|Comedy|R omance ...
1,1287,5,978302039,Ben-Hur (1959),Action|Adventure|Drama
1,2804,5,978300719,"Christmas Story, A (1983)",Comedy|Drama
1,594,4,978302268,Snow White and the Seven Dwarfs (1937) ...,Animation|Children's|Musi cal ...
1,919,4,978301368,"Wizard of Oz, The (1939)",Adventure|Children's|Dram a|Musical ...


In [10]:
(train_set, test_set) = data.random_split(0.95, seed=1)

In [11]:
m = graphlab.recommender.create(train_set, 'user_id', 'movie_id', 'rating')

In [12]:
m

Class                           : RankingFactorizationRecommender

Schema
------
User ID                         : user_id
Item ID                         : movie_id
Target                          : rating
Additional observation features : 3
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 949852
Number of users                 : 6040
Number of items                 : 3701

Training summary
----------------
Training time                   : 55.3548

Model Parameters
----------------
Model class                     : RankingFactorizationRecommender
num_factors                     : 32
binary_target                   : 0
side_data_factorization         : 1
solver                          : auto
nmf                             : 0
max_iterations                  : 25

Regularization Settings
-----------------------
regularization                  : 0.0
regularization_type             : normal
linear_regulariz

In [13]:
m2 = graphlab.item_similarity_recommender.create(train_set, 'user_id', 'movie_id', 'rating',
                                 similarity_type='pearson')

In [14]:
m2

Class                           : ItemSimilarityRecommender

Schema
------
User ID                         : user_id
Item ID                         : movie_id
Target                          : rating
Additional observation features : 0
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 949852
Number of users                 : 6040
Number of items                 : 3701

Training summary
----------------
Training time                   : 2.2216

Model Parameters
----------------
Model class                     : ItemSimilarityRecommender
only_top_k                      : 100
threshold                       : 0.001
similarity_type                 : pearson
training_method                 : auto

In [15]:
result = graphlab.recommender.util.compare_models(test_set, [m, m2],
                                            user_sample=.1, skip_set=train_set)

compare_models: using 562 users to estimate model performance
PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+-----------------+------------------+
| cutoff |  mean_precision |   mean_recall    |
+--------+-----------------+------------------+
|   1    |  0.05871886121  | 0.00415394097423 |
|   2    | 0.0604982206406 | 0.0102373242329  |
|   3    | 0.0575326215896 |  0.017098216765  |
|   4    | 0.0533807829181 | 0.0208825057104  |
|   5    | 0.0519572953737 |  0.028230003439  |
|   6    | 0.0489323843416 | 0.0306982682824  |
|   7    | 0.0444839857651 | 0.0319400789648  |
|   8    | 0.0438167259786 | 0.0355913311571  |
|   9    | 0.0405298536971 | 0.0373133821459  |
|   10   | 0.0402135231317 | 0.0454706085489  |
+--------+-----------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.92123603090607)

Per User RMSE (best)
+---------+-------+------------------+
| user_id | count |       rmse       |
+---------+-------+------

('\nOverall RMSE: ', 0.8592006531033086)

Per User RMSE (best)
+---------+-------+------------------+
| user_id | count |       rmse       |
+---------+-------+------------------+
|   1672  |   1   | 0.00494259059299 |
+---------+-------+------------------+
[1 rows x 3 columns]


Per User RMSE (worst)
+---------+-------+---------------+
| user_id | count |      rmse     |
+---------+-------+---------------+
|   5216  |   1   | 3.23768928042 |
+---------+-------+---------------+
[1 rows x 3 columns]


Per Item RMSE (best)
+----------+-------+------+
| movie_id | count | rmse |
+----------+-------+------+
|   1829   |   1   | 0.0  |
+----------+-------+------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+----------+-------+------+
| movie_id | count | rmse |
+----------+-------+------+
|   311    |   1   | 3.0  |
+----------+-------+------+
[1 rows x 3 columns]



## Getting similar items

In [16]:
m.get_similar_items([1287])  # movie_id is Ben-Hur

movie_id,similar,score,rank
1287,1204,0.618701994419,1
1287,2347,0.575415968895,2
1287,1387,0.540772914886,3
1287,260,0.508570075035,4
1287,474,0.502514183521,5
1287,2211,0.483749300241,6
1287,919,0.478861451149,7
1287,3066,0.473006010056,8
1287,2398,0.466828793287,9
1287,2402,0.45985430479,10


In [17]:
m.get_similar_items([1287]).join(items, on={'similar': 'movie_id'}).sort('rank')

movie_id,similar,score,rank,title,genre
1287,1204,0.618701994419,1,Lawrence of Arabia (1962),Adventure|War
1287,2347,0.575415968895,2,"Pope of Greenwich Village, The (1984) ...",Action
1287,1387,0.540772914886,3,Jaws (1975),Action|Horror
1287,260,0.508570075035,4,Star Wars: Episode IV - A New Hope (1977) ...,Action|Adventure|Fantasy |Sci-Fi ...
1287,474,0.502514183521,5,In the Line of Fire (1993) ...,Action|Thriller
1287,2211,0.483749300241,6,Secret Agent (1936),Thriller
1287,919,0.478861451149,7,"Wizard of Oz, The (1939)",Adventure|Children's|Dram a|Musical ...
1287,3066,0.473006010056,8,Tora! Tora! Tora! (1970),War
1287,2398,0.466828793287,9,Miracle on 34th Street (1947) ...,Drama
1287,2402,0.45985430479,10,Rambo: First Blood Part II (1985) ...,Action|War


## Making recommendations

In [18]:
recs = m.recommend()

In [19]:
recs

user_id,movie_id,score,rank
1,1259,4.39043042696,1
1,593,4.15463084615,2
1,1968,4.10094562806,3
1,1266,4.04903449192,4
1,1225,4.02175642158,5
1,590,4.01436398066,6
1,1214,4.01392466463,7
1,318,4.01332781352,8
1,2081,4.00469311214,9
1,2716,3.98063624538,10


In [20]:
data[data['user_id'] == 4].join(items, on='movie_id')

user_id,movie_id,rating,timestamp,title,genre
4,260,5,978294199,Star Wars: Episode IV - A New Hope (1977) ...,Action|Adventure|Fantasy |Sci-Fi ...
4,480,4,978294008,Jurassic Park (1993),Action|Adventure|Sci-Fi
4,1036,4,978294282,Die Hard (1988),Action|Thriller
4,1097,4,978293964,E.T. the Extra- Terrestrial (1982) ...,Children's|Drama|Fantasy |Sci-Fi ...
4,1196,2,978294199,Star Wars: Episode V - The Empire Strikes Back ...,Action|Adventure|Drama |Sci-Fi|War ...
4,1198,5,978294199,Raiders of the Lost Ark (1981) ...,Action|Adventure
4,1201,5,978294230,"Good, The Bad and The Ugly, The (1966) ...",Action|Western
4,1210,3,978293924,Star Wars: Episode VI - Return of the Jedi (1 ...,Action|Adventure|Romance |Sci-Fi|War ...
4,1214,4,978294260,Alien (1979),Action|Horror|Sci- Fi|Thriller ...
4,1240,5,978294260,"Terminator, The (1984)",Action|Sci-Fi|Thriller

title.1,genre.1
Star Wars: Episode IV - A New Hope (1977) ...,Action|Adventure|Fantasy |Sci-Fi ...
Jurassic Park (1993),Action|Adventure|Sci-Fi
Die Hard (1988),Action|Thriller
E.T. the Extra- Terrestrial (1982) ...,Children's|Drama|Fantasy |Sci-Fi ...
Star Wars: Episode V - The Empire Strikes Back ...,Action|Adventure|Drama |Sci-Fi|War ...
Raiders of the Lost Ark (1981) ...,Action|Adventure
"Good, The Bad and The Ugly, The (1966) ...",Action|Western
Star Wars: Episode VI - Return of the Jedi (1 ...,Action|Adventure|Romance |Sci-Fi|War ...
Alien (1979),Action|Horror|Sci- Fi|Thriller ...
"Terminator, The (1984)",Action|Sci-Fi|Thriller


In [21]:
m.recommend(users=[4], k=20).join(items, on='movie_id')

user_id,movie_id,score,rank,title,genre
4,50,4.08176738843,13,"Usual Suspects, The (1995) ...",Crime|Thriller
4,529,4.06445240303,19,Searching for Bobby Fischer (1993) ...,Drama
4,608,4.18539200052,9,Fargo (1996),Crime|Drama|Thriller
4,745,4.06479259654,18,"Close Shave, A (1995)",Animation|Comedy|Thriller
4,858,4.07894805893,14,"Godfather, The (1972)",Action|Crime|Drama
4,910,4.06670626327,17,Some Like It Hot (1959),Comedy|Crime
4,1079,4.07393412366,16,"Fish Called Wanda, A (1988) ...",Comedy
4,1197,4.13618976697,10,"Princess Bride, The (1987) ...",Action|Adventure|Comedy|R omance ...
4,1259,4.22820708975,7,Stand by Me (1986),Adventure|Comedy|Drama
4,1288,4.4368251197,1,This Is Spinal Tap (1984),Comedy|Drama|Musical


## Recommendations for new users

In [23]:
recent_data = graphlab.SFrame()
recent_data['movie_id'] = [1291] 
recent_data['user_id'] = 99999

In [24]:
m2.recommend(users=[99999], new_observation_data=recent_data).join(items, on='movie_id').sort('rank')

user_id,movie_id,score,rank,title,genre
99999,3881,5.0,1,Bittersweet Motel (2000),Documentary
99999,1830,5.0,2,Follow the Bitch (1998),Comedy
99999,3382,5.0,3,Song of Freedom (1936),Drama
99999,3656,5.0,4,Lured (1947),Crime
99999,572,5.0,5,Foreign Student (1994),Drama
99999,3172,5.0,6,Ulysses (Ulisse) (1954),Adventure
99999,3233,5.0,7,Smashing Time (1967),Comedy
99999,989,5.0,8,Schlafes Bruder (Brother of Sleep) (1995) ...,Drama
99999,787,5.0,9,"Gate of Heavenly Peace, The (1995) ...",Documentary
99999,3280,5.0,10,"Baby, The (1973)",Horror


## Saving and loading models

In [26]:
m.save('my_model')

In [27]:
m_again = graphlab.load_model('my_model')

In [28]:
m_again

Class                           : RankingFactorizationRecommender

Schema
------
User ID                         : user_id
Item ID                         : movie_id
Target                          : rating
Additional observation features : 3
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 949852
Number of users                 : 6040
Number of items                 : 3701

Training summary
----------------
Training time                   : 55.3548

Model Parameters
----------------
Model class                     : RankingFactorizationRecommender
num_factors                     : 32
binary_target                   : 0
side_data_factorization         : 1
solver                          : auto
nmf                             : 0
max_iterations                  : 25

Regularization Settings
-----------------------
regularization                  : 0.0
regularization_type             : normal
linear_regulariz