In [31]:
# Nested dictionary: critic name -> {movie title -> rating given by
# that critic}. Not every critic has rated every movie.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [32]:
critics['Lisa Rose']['Lady in the Water']

2.5

In [33]:
critics['Toby']

{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

# find similar users

In [34]:
# Euclidean distance, worked by hand for the points (5, 4) and (4, 1)
import numpy as np
np.sqrt(np.power(5-4, 2) + np.power(4-1, 2))

3.1622776601683795

In [35]:
def sim_distance(prefs,person1,person2):
    """Return a distance-based similarity score between two people.

    prefs maps each person to a dict of {item: rating}. Only items rated
    by both person1 and person2 are compared.

    Returns 0 when the two share no rated items; otherwise returns
    1 / (1 + sum of squared rating differences), which is 1.0 when the
    shared ratings are identical and approaches 0 as they diverge.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]
    # Items both people have rated
    shared = [item for item in ratings1 if item in ratings2]
    # if they have no ratings in common, return 0
    if not shared:
        return 0
    # Add up the squares of all the differences
    sum_of_squares = np.sum([np.power(ratings1[item]-ratings2[item],2)
                             for item in shared])
    # 1.0 forces true division: with integer ratings under Python 2 the
    # original 1/(1+sum) was an integer division that truncated to 0.
    return 1.0/(1+sum_of_squares)

In [36]:
sim_distance(critics, 'Lisa Rose','Gene Seymour')

0.14814814814814814

In [37]:
def sim_pearson(prefs,p1,p2):
    """Return the Pearson correlation coefficient between p1 and p2.

    prefs maps each person to a dict of {item: rating}. The correlation is
    computed over the items rated by both people.

    Returns 0 when there are no mutually rated items, or when either
    person's shared ratings have no variance (the coefficient is then
    undefined); otherwise returns a value between -1 and 1.
    """
    # Get the list of mutually rated items
    shared = [it for it in prefs[p1] if it in prefs[p2]]
    # Number of shared items, as a float so every division below is a true
    # division even for integer ratings under Python 2.
    n = float(len(shared))
    # if they are no ratings in common, return 0
    if n == 0:
        return 0
    ratings1 = [prefs[p1][it] for it in shared]
    ratings2 = [prefs[p2][it] for it in shared]
    # Add up all the preferences
    sum1 = np.sum(ratings1)
    sum2 = np.sum(ratings2)
    # Sum up the squares
    sum1Sq = np.sum([np.power(r, 2) for r in ratings1])
    sum2Sq = np.sum([np.power(r, 2) for r in ratings2])
    # Sum up the products
    pSum = np.sum([r1*r2 for r1, r2 in zip(ratings1, ratings2)])
    # Calculate Pearson score
    num = pSum-(sum1*sum2/n)
    radicand = (sum1Sq-np.power(sum1,2)/n)*(sum2Sq-np.power(sum2,2)/n)
    # A zero radicand means no variance; a slightly negative one can only
    # come from rounding. Either way the score is treated as 0 instead of
    # dividing by zero or taking the square root of a negative number.
    if not radicand > 0:
        return 0
    return num/np.sqrt(radicand)

In [38]:
sim_pearson(critics, 'Lisa Rose','Gene Seymour')

0.39605901719066977

In [39]:
# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    """Return up to n (score, name) pairs most similar to person.

    Every other key of prefs is scored against person with the given
    similarity function; the pairs are returned in descending order.
    """
    scored = []
    for other in prefs:
        # Never compare the person with themselves
        if other == person:
            continue
        scored.append((similarity(prefs,person,other),other))
    # Ascending sort, then read it backwards so the best match comes first
    scored.sort()
    return scored[::-1][0:n]

In [40]:
topMatches(critics,'Toby',n=3) # topN

[(0.99124070716192991, 'Lisa Rose'),
 (0.92447345164190486, 'Mick LaSalle'),
 (0.89340514744156474, 'Claudia Puig')]

# recommending items

In [41]:

# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
    """Rank the items person has not rated by a similarity-weighted average.

    For every other person whose similarity to person is positive, each of
    their ratings for an item that person has not rated (or has rated 0)
    contributes rating * similarity. Each item's total is divided by the
    sum of the contributing similarities.

    Returns a list of (predicted_rating, item) pairs, highest first.
    """
    weighted_totals = {}
    similarity_sums = {}
    for other, other_ratings in prefs.items():
        # don't compare me to myself
        if other == person:
            continue
        sim = similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim <= 0:
            continue
        own_ratings = prefs[person]
        for item in other_ratings:
            # only score movies I haven't seen yet
            already_rated = item in own_ratings and own_ratings[item] != 0
            if already_rated:
                continue
            # Similarity * Score
            weighted_totals[item] = weighted_totals.get(item, 0) + other_ratings[item]*sim
            # Sum of similarities
            similarity_sums[item] = similarity_sums.get(item, 0) + sim
    # Create the normalized list
    rankings = []
    for item, total in weighted_totals.items():
        rankings.append((total/similarity_sums[item], item))
    # Return the list sorted from highest to lowest
    rankings.sort()
    return rankings[::-1]

In [42]:
# Now you can find out what movies I should watch next:
getRecommendations(critics,'Toby')

[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.5309807037655645, 'Just My Luck')]

In [43]:
# You’ll find that the results are only affected very slightly by the choice of similarity metric.
getRecommendations(critics,'Toby',similarity=sim_distance)

[(3.5002478401415877, 'The Night Listener'),
 (2.7561242939959363, 'Lady in the Water'),
 (2.4619884860743739, 'Just My Luck')]

# item-based filtering

In [44]:
# you just need to swap the people and the items. 
def transformPrefs(prefs):
    """Invert a {person: {item: rating}} dict into {item: {person: rating}}."""
    flipped = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            # Create the per-item dict on first sight, then file the
            # rating under the person who gave it.
            flipped.setdefault(item, {})[person] = rating
    return flipped

movies = transformPrefs(critics)

In [45]:

topMatches(movies,'Superman Returns')

[(0.65795169495976946, 'You, Me and Dupree'),
 (0.48795003647426888, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.17984719479905439, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

In [46]:
def calculateSimilarItems(prefs,n=10):
    """Build a dict mapping each item to its n most similar items.

    prefs is a {person: {item: rating}} dict. It is inverted to be
    item-centric, and each item is then scored against every other item
    with sim_distance. Each value is the list of (score, other_item)
    pairs returned by topMatches.
    """
    result = {}
    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    total = len(itemPrefs)
    for c, item in enumerate(itemPrefs, 1):
        # Status updates for large datasets. The parenthesised single
        # argument prints the same text on Python 2 and Python 3.
        if c % 100 == 0:
            print("%d / %d" % (c, total))
        # Find the most similar items to this one
        result[item] = topMatches(itemPrefs,item,n=n,similarity=sim_distance)
    return result

itemsim=calculateSimilarItems(critics) 
itemsim

{'Just My Luck': [(0.22222222222222221, 'Lady in the Water'),
  (0.18181818181818182, 'You, Me and Dupree'),
  (0.15384615384615385, 'The Night Listener'),
  (0.10526315789473684, 'Snakes on a Plane'),
  (0.064516129032258063, 'Superman Returns')],
 'Lady in the Water': [(0.40000000000000002, 'You, Me and Dupree'),
  (0.2857142857142857, 'The Night Listener'),
  (0.22222222222222221, 'Snakes on a Plane'),
  (0.22222222222222221, 'Just My Luck'),
  (0.090909090909090912, 'Superman Returns')],
 'Snakes on a Plane': [(0.22222222222222221, 'Lady in the Water'),
  (0.18181818181818182, 'The Night Listener'),
  (0.16666666666666666, 'Superman Returns'),
  (0.10526315789473684, 'Just My Luck'),
  (0.05128205128205128, 'You, Me and Dupree')],
 'Superman Returns': [(0.16666666666666666, 'Snakes on a Plane'),
  (0.10256410256410256, 'The Night Listener'),
  (0.090909090909090912, 'Lady in the Water'),
  (0.064516129032258063, 'Just My Luck'),
  (0.053333333333333337, 'You, Me and Dupree')],
 'Th

In [47]:
def getRecommendedItems(prefs,itemMatch,user):
    """Recommend unrated items to user from precomputed item similarities.

    prefs maps each user to {item: rating}. itemMatch maps each item to a
    list of (similarity, other_item) pairs, as built by
    calculateSimilarItems.

    For each item the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings for the items
    similar to it. Candidates whose similarities sum to zero carry no
    information and are left out.

    Returns a list of (predicted_rating, item) pairs, highest first.
    """
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    # Loop over items rated by this user
    for item, rating in userRatings.items():
        # Loop over items similar to this one
        for similarity, item2 in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            # Weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity*rating
            # Sum of all the similarities
            totalSim[item2] = totalSim.get(item2, 0) + similarity
    # Divide each total score by total weighting to get an average.
    # A zero total weight (e.g. every contributing similarity was 0)
    # would otherwise raise ZeroDivisionError, so such items are skipped.
    rankings = [(score/totalSim[item], item)
                for item, score in scores.items() if totalSim[item] != 0]
    # Return the rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings

getRecommendedItems(critics,itemsim,'Toby')

[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

# Movie recommendation with GraphLab

In [57]:
%matplotlib inline
import graphlab
graphlab.canvas.set_target("ipynb")
# set canvas to show sframes and sgraphs in ipython notebook
import matplotlib.pyplot as plt


A newer version of GraphLab Create (v1.9) is available! Your current version is v1.8.5.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


In [59]:
# Directory holding the ml-1m '.dat' files. This is the only place the
# location is spelled out; change it here when running elsewhere.
ML_1M_DIR = '/Users/apricot/documents/github/bigdata/ml-1m'

def read_movielens_dat(filename, column_names):
    """Load one '::'-delimited .dat file from ML_1M_DIR into an SFrame.

    Every line is read as one string column ('X1'), split on '::' and
    unpacked into columns 'X.0', 'X.1', ...; those are then renamed to
    column_names, in order. All values are left as strings.
    """
    lines = graphlab.SFrame.read_csv(ML_1M_DIR + '/' + filename, delimiter='\n',
                                     header=False)['X1']
    frame = lines.apply(lambda x: x.split('::')).unpack()
    frame.rename(dict(('X.%d' % i, name) for i, name in enumerate(column_names)))
    return frame

# Ratings: every column is cast to int.
data = read_movielens_dat('ratings.dat', ['user_id', 'movie_id', 'rating', 'timestamp'])
for col in data.column_names():
    data[col] = data[col].astype(int)
data.save('ratings')

# Users: only the id is cast to int.
users = read_movielens_dat('users.dat', ['user_id', 'gender', 'age', 'occupation', 'zip-code'])
users['user_id'] = users['user_id'].astype(int)
users.save('users')

# Movies: only the id is cast to int.
items = read_movielens_dat('movies.dat', ['movie_id', 'title', 'genre'])
items['movie_id'] = items['movie_id'].astype(int)
items.save('items')

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [60]:
data.show()


In [61]:
items.head()

movie_id,title,genre
1,Toy Story (1995),Animation|Children's|Come dy ...
2,Jumanji (1995),Adventure|Children's|Fant asy ...
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995) ...,Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children's
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [62]:
# Attach each movie's title and genre to its ratings. This rebinds `data`,
# so running the cell a second time joins the movie columns again.
data = data.join(items, on='movie_id')

In [63]:
data

user_id,movie_id,rating,timestamp,title,genre
1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975) ...,Drama
1,661,3,978302109,James and the Giant Peach (1996) ...,Animation|Children's|Musi cal ...
1,914,3,978301968,My Fair Lady (1964),Musical|Romance
1,3408,4,978300275,Erin Brockovich (2000),Drama
1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Come dy ...
1,1197,3,978302268,"Princess Bride, The (1987) ...",Action|Adventure|Comedy|R omance ...
1,1287,5,978302039,Ben-Hur (1959),Action|Adventure|Drama
1,2804,5,978300719,"Christmas Story, A (1983)",Comedy|Drama
1,594,4,978302268,Snow White and the Seven Dwarfs (1937) ...,Animation|Children's|Musi cal ...
1,919,4,978301368,"Wizard of Oz, The (1939)",Adventure|Children's|Dram a|Musical ...


In [64]:
# Random split of the ratings with a fixed seed so it is repeatable;
# 0.95 is the fraction passed to random_split (train_set ends up with
# 949852 of the rows, per the model summaries further down).
(train_set, test_set) = data.random_split(0.95, seed=1)

In [65]:
# NOTE(review): train_set still carries the timestamp, title and genre
# columns, and the model summary reports 3 additional observation
# features, so they appear to be used as side features. Confirm that is
# intended.
m = graphlab.recommender.create(train_set, 'user_id', 'movie_id', 'rating')# neither user-based nor item-based; the default is a latent factor model

In [66]:
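# RMSE (root-mean-square error) measures prediction accuracy: the error between the predicted ratings and the users' actual ratings, so the smaller the RMSE the better.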

In [67]:
m

Class                           : RankingFactorizationRecommender

Schema
------
User ID                         : user_id
Item ID                         : movie_id
Target                          : rating
Additional observation features : 3
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 949852
Number of users                 : 6040
Number of items                 : 3701

Training summary
----------------
Training time                   : 73.9068

Model Parameters
----------------
Model class                     : RankingFactorizationRecommender
num_factors                     : 32
binary_target                   : 0
side_data_factorization         : 1
solver                          : auto
nmf                             : 0
max_iterations                  : 25

Regularization Settings
-----------------------
regularization                  : 0.0
regularization_type             : normal
linear_regulariz

In [68]:
m2 = graphlab.item_similarity_recommender.create(train_set, 'user_id', 'movie_id', 'rating',
                                 similarity_type='pearson')
# m2 recommends by item similarity; m is GraphLab's default latent factor recommender

In [71]:
m2

Class                           : ItemSimilarityRecommender

Schema
------
User ID                         : user_id
Item ID                         : movie_id
Target                          : rating
Additional observation features : 0
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 949852
Number of users                 : 6040
Number of items                 : 3701

Training summary
----------------
Training time                   : 2.3529

Model Parameters
----------------
Model class                     : ItemSimilarityRecommender
only_top_k                      : 100
threshold                       : 0.001
similarity_type                 : pearson
training_method                 : auto

In [73]:
# Evaluate both models on the held-out test_set using a sample of the
# users (user_sample=.1; the log below reports 562 users).
# NOTE(review): skip_set=train_set presumably keeps items each user
# already rated in training out of the evaluated recommendations; confirm
# against the GraphLab documentation.
result = graphlab.recommender.util.compare_models(test_set, [m, m2],
                                            user_sample=.1, skip_set=train_set)

compare_models: using 562 users to estimate model performance
PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+-----------------+------------------+
| cutoff |  mean_precision |   mean_recall    |
+--------+-----------------+------------------+
|   1    | 0.0782918149466 | 0.00575729157217 |
|   2    | 0.0720640569395 | 0.0105311093398  |
|   3    | 0.0699881376038 | 0.0179170036041  |
|   4    | 0.0693950177936 | 0.0251115765608  |
|   5    | 0.0672597864769 | 0.0296188242818  |
|   6    | 0.0649466192171 | 0.0338761264731  |
|   7    | 0.0645653279105 |  0.041352072745  |
|   8    | 0.0609430604982 | 0.0437792398918  |
|   9    | 0.0581257413998 | 0.0480299929916  |
|   10   | 0.0564056939502 | 0.0518792269248  |
+--------+-----------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9410350524913681)

Per User RMSE (best)
+---------+-------+-----------------+
| user_id | count |       rmse      |
+---------+-------+------

('\nOverall RMSE: ', 0.8434476804449265)

Per User RMSE (best)
+---------+-------+-------------------+
| user_id | count |        rmse       |
+---------+-------+-------------------+
|   758   |   1   | 0.000898736484485 |
+---------+-------+-------------------+
[1 rows x 3 columns]


Per User RMSE (worst)
+---------+-------+---------------+
| user_id | count |      rmse     |
+---------+-------+---------------+
|   200   |   1   | 3.72375859435 |
+---------+-------+---------------+
[1 rows x 3 columns]


Per Item RMSE (best)
+----------+-------+------+
| movie_id | count | rmse |
+----------+-------+------+
|   1842   |   1   | 0.0  |
+----------+-------+------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+----------+-------+---------------+
| movie_id | count |      rmse     |
+----------+-------+---------------+
|   3051   |   1   | 3.72375859435 |
+----------+-------+---------------+
[1 rows x 3 columns]



# getting similar items

In [74]:
m.get_similar_items([1287])  # movie_id is Ben-Hur

movie_id,similar,score,rank
1287,953,0.550211429596,1
1287,110,0.526422798634,2
1287,1304,0.517914295197,3
1287,3876,0.507396161556,4
1287,2948,0.499240607023,5
1287,2655,0.489075362682,6
1287,3585,0.487730115652,7
1287,3639,0.466367900372,8
1287,2947,0.457333534956,9
1287,943,0.451839417219,10


In [75]:
m.get_similar_items([1287]).join(items, on={'similar': 'movie_id'}).sort('rank')

movie_id,similar,score,rank,title,genre
1287,953,0.550211429596,1,It's a Wonderful Life (1946) ...,Drama
1287,110,0.526422798634,2,Braveheart (1995),Action|Drama|War
1287,1304,0.517914295197,3,Butch Cassidy and the Sundance Kid (1969) ...,Action|Comedy|Western
1287,3876,0.507396161556,4,Jerry & Tom (1998),Drama
1287,2948,0.499240607023,5,From Russia with Love (1963) ...,Action
1287,2655,0.489075362682,6,Howling II: Your Sister Is a Werewolf (1985) ...,Horror
1287,3585,0.487730115652,7,"Great Locomotive Chase, The (1956) ...",Adventure|War
1287,3639,0.466367900372,8,"Man with the Golden Gun, The (1974) ...",Action
1287,2947,0.457333534956,9,Goldfinger (1964),Action
1287,943,0.451839417219,10,"Ghost and Mrs. Muir, The (1947) ...",Drama|Romance


## making recommendations

In [76]:
recs = m.recommend()

In [87]:
recs

user_id,movie_id,score,rank
1,858,4.88081762827,1
1,2858,4.69082829989,2
1,1213,4.56876999773,3
1,296,4.50618979968,4
1,788,4.46558302901,5
1,593,4.41691503561,6
1,1259,4.404100186,7
1,1221,4.34598186127,8
1,1923,4.33906546868,9
1,356,4.33732641376,10


In [78]:
# Ratings given by user 4.
# NOTE(review): `data` was already joined with `items` in an earlier cell,
# so this second join is redundant; it is what produces the duplicated
# 'title.1' / 'genre.1' columns in the output.
data[data['user_id'] == 4].join(items, on='movie_id')

user_id,movie_id,rating,timestamp,title,genre
4,260,5,978294199,Star Wars: Episode IV - A New Hope (1977) ...,Action|Adventure|Fantasy |Sci-Fi ...
4,480,4,978294008,Jurassic Park (1993),Action|Adventure|Sci-Fi
4,1036,4,978294282,Die Hard (1988),Action|Thriller
4,1097,4,978293964,E.T. the Extra- Terrestrial (1982) ...,Children's|Drama|Fantasy |Sci-Fi ...
4,1196,2,978294199,Star Wars: Episode V - The Empire Strikes Back ...,Action|Adventure|Drama |Sci-Fi|War ...
4,1198,5,978294199,Raiders of the Lost Ark (1981) ...,Action|Adventure
4,1201,5,978294230,"Good, The Bad and The Ugly, The (1966) ...",Action|Western
4,1210,3,978293924,Star Wars: Episode VI - Return of the Jedi (1 ...,Action|Adventure|Romance |Sci-Fi|War ...
4,1214,4,978294260,Alien (1979),Action|Horror|Sci- Fi|Thriller ...
4,1240,5,978294260,"Terminator, The (1984)",Action|Sci-Fi|Thriller

title.1,genre.1
Star Wars: Episode IV - A New Hope (1977) ...,Action|Adventure|Fantasy |Sci-Fi ...
Jurassic Park (1993),Action|Adventure|Sci-Fi
Die Hard (1988),Action|Thriller
E.T. the Extra- Terrestrial (1982) ...,Children's|Drama|Fantasy |Sci-Fi ...
Star Wars: Episode V - The Empire Strikes Back ...,Action|Adventure|Drama |Sci-Fi|War ...
Raiders of the Lost Ark (1981) ...,Action|Adventure
"Good, The Bad and The Ugly, The (1966) ...",Action|Western
Star Wars: Episode VI - Return of the Jedi (1 ...,Action|Adventure|Romance |Sci-Fi|War ...
Alien (1979),Action|Horror|Sci- Fi|Thriller ...
"Terminator, The (1984)",Action|Sci-Fi|Thriller


In [79]:
m.recommend(users=[4], k=20).join(items, on='movie_id')

user_id,movie_id,score,rank,title,genre
4,1,4.41114833094,5,Toy Story (1995),Animation|Children's|Come dy ...
4,318,4.35419570542,6,"Shawshank Redemption, The (1994) ...",Drama
4,357,4.16431414224,16,Four Weddings and a Funeral (1994) ...,Comedy|Romance
4,457,4.35095976926,7,"Fugitive, The (1993)",Action|Thriller
4,913,4.18459271178,15,"Maltese Falcon, The (1941) ...",Film-Noir|Mystery
4,919,4.56882541514,3,"Wizard of Oz, The (1939)",Adventure|Children's|Dram a|Musical ...
4,969,4.64023889861,2,"African Queen, The (1951)",Action|Adventure|Romance| War ...
4,1136,4.12968237027,19,Monty Python and the Holy Grail (1974) ...,Comedy
4,1172,4.12496899701,20,Cinema Paradiso (1988),Comedy|Drama|Romance
4,1197,4.31999694444,8,"Princess Bride, The (1987) ...",Action|Adventure|Comedy|R omance ...


## recommendations for new users

In [81]:
# Build a one-row observation table for user_id 99999 holding a single
# movie_id (1291); no rating column is supplied.
recent_data = graphlab.SFrame()
recent_data['movie_id'] = [1291] 
recent_data['user_id'] = 99999

In [82]:
m2.recommend(users=[99999], new_observation_data=recent_data).join(items, on='movie_id').sort('rank')

user_id,movie_id,score,rank,title,genre
99999,3607,5.0,1,One Little Indian (1973),Comedy|Drama|Western
99999,1830,5.0,2,Follow the Bitch (1998),Comedy
99999,3382,5.0,3,Song of Freedom (1936),Drama
99999,3656,5.0,4,Lured (1947),Crime
99999,572,5.0,5,Foreign Student (1994),Drama
99999,989,5.0,6,Schlafes Bruder (Brother of Sleep) (1995) ...,Drama
99999,3172,5.0,7,Ulysses (Ulisse) (1954),Adventure
99999,3233,5.0,8,Smashing Time (1967),Comedy
99999,787,5.0,9,"Gate of Heavenly Peace, The (1995) ...",Documentary
99999,3280,5.0,10,"Baby, The (1973)",Horror


### saving and loading models

In [83]:
m.save('my_model')

In [84]:
m_again = graphlab.load_model('my_model')

In [85]:
m_again

Class                           : RankingFactorizationRecommender

Schema
------
User ID                         : user_id
Item ID                         : movie_id
Target                          : rating
Additional observation features : 3
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 949852
Number of users                 : 6040
Number of items                 : 3701

Training summary
----------------
Training time                   : 73.9068

Model Parameters
----------------
Model class                     : RankingFactorizationRecommender
num_factors                     : 32
binary_target                   : 0
side_data_factorization         : 1
solver                          : auto
nmf                             : 0
max_iterations                  : 25

Regularization Settings
-----------------------
regularization                  : 0.0
regularization_type             : normal
linear_regulariz