# 协同过滤

## 搜集偏好

In [1]:
# Sample preference data: critic name -> {movie title: rating}.
# Key order is significant for the displayed outputs further down.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [2]:
# Look up one rating: Lisa Rose's score for 'Lady in the Water'.
critics['Lisa Rose']['Lady in the Water']

2.5

In [3]:
# Show every rating recorded for Toby.
critics['Toby']

{'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0, 'Superman Returns': 4.0}

## 欧氏距离评价

<font size=5>
$EDS=\frac{1}{1+\sqrt{\sum_i{(p_{a(i)}-p_{b(i)})^2}}}$
</font>

In [4]:
def sim_distance(prefs, person1, person2):
  """Return a Euclidean-distance-based similarity score for two people.

  Only items rated by both people contribute. The score is
  1 / (1 + d), where d is the Euclidean distance between the two
  rating vectors over those shared items, so identical ratings give 1.

  Args:
    prefs: Mapping of person -> {item: numeric rating}.
    person1: Key of the first person in ``prefs``.
    person2: Key of the second person in ``prefs``.

  Returns:
    0 when the two people have no item in common, otherwise a float
    in (0, 1].
  """
  ratings_a = prefs[person1]
  shared = [item for item in ratings_a if item in prefs[person2]]

  # Nothing in common: report no similarity at all.
  if len(shared) == 0:
    return 0

  ratings_b = prefs[person2]
  squared_gap = 0
  for item in shared:
    squared_gap += (ratings_a[item] - ratings_b[item]) ** 2

  distance = squared_gap ** 0.5
  return 1 / (1 + distance)

In [5]:
# Distance-based similarity between Lisa Rose and Gene Seymour.
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.29429805508554946

## 皮尔逊相关系数评价

<font size=5>
$PCS=\frac{\sum{p_{a(i)}p_{b(i)}}-\frac{1}{n}\sum{p_{a(i)}}\sum{p_{b(i)}}}{\sqrt{\sum{p_{a(i)}^2}-\frac{1}{n}(\sum{p_{a(i)}})^2}\sqrt{\sum{p_{b(i)}^2}-\frac{1}{n}(\sum{p_{b(i)}})^2}}$
</font>

In [6]:
def sim_pearson(prefs, p1, p2):
  """Return the Pearson correlation coefficient between two people's ratings.

  Only items rated by both people are considered.

  Args:
    prefs: Mapping of person -> {item: numeric rating}.
    p1: Key of the first person in ``prefs``.
    p2: Key of the second person in ``prefs``.

  Returns:
    The correlation as a real number, or 0 when the two people share no
    items or when either person's shared ratings show no variation
    (which includes the case of a single shared item).
  """
  shared = [item for item in prefs[p1] if item in prefs[p2]]
  n = len(shared)
  if n == 0:
    return 0

  ratings1 = [prefs[p1][it] for it in shared]
  ratings2 = [prefs[p2][it] for it in shared]

  sum1 = sum(ratings1)
  sum2 = sum(ratings2)

  sum1Sq = sum([r ** 2 for r in ratings1])
  sum2Sq = sum([r ** 2 for r in ratings2])

  pSum = sum([r1 * r2 for r1, r2 in zip(ratings1, ratings2)])

  num = pSum - (sum1 * sum2 / n)

  # n times the variance of each rating vector. Mathematically these are
  # never negative, but floating-point rounding can push a zero variance
  # slightly below zero. A negative value under the square root would
  # produce a complex number, so treat any non-positive term as
  # "no variation" and report no correlation.
  var1 = sum1Sq - sum1 ** 2 / n
  var2 = sum2Sq - sum2 ** 2 / n
  if var1 <= 0 or var2 <= 0:
    return 0

  den = (var1 * var2) ** 0.5
  if not den:
    # The product underflowed to zero; avoid dividing by it.
    return 0

  return num / den

In [7]:
# Pearson similarity between Lisa Rose and Gene Seymour.
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

## 相似评论者排名

In [8]:
def topMatches(prefs, person, n=5, similarity=sim_pearson):
  """Return the entries of ``prefs`` most similar to ``person``.

  Args:
    prefs: Mapping of key -> {item: rating}.
    person: Key in ``prefs`` to compare everyone else against.
    n: Maximum number of matches to return.
    similarity: Callable ``(prefs, key_a, key_b) -> score``.

  Returns:
    A list of at most ``n`` ``(score, other)`` tuples sorted from the
    highest score to the lowest.
  """
  ranked = []
  for other in prefs:
    # Never compare the subject with itself.
    if other == person:
      continue
    ranked.append((similarity(prefs, person, other), other))

  return sorted(ranked, reverse=True)[:n]

In [9]:
# The three critics most similar to Toby, using the default Pearson score.
topMatches(critics, 'Toby', n=3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

## 推荐物品

In [10]:
def getRecommendations(prefs, person, similarity=sim_pearson):
  """Rank items ``person`` has not rated by a similarity-weighted average.

  Each other entry in ``prefs`` with a positive similarity to ``person``
  contributes its ratings, weighted by that similarity. An item counts
  as unrated when it is missing from ``prefs[person]`` or its stored
  rating is falsy (for example 0).

  Args:
    prefs: Mapping of key -> {item: rating}.
    person: Key in ``prefs`` to produce recommendations for.
    similarity: Callable ``(prefs, key_a, key_b) -> score``.

  Returns:
    A list of ``(predicted_rating, item)`` tuples sorted from the
    highest predicted rating to the lowest.
  """
  weighted_totals = {}
  weight_sums = {}

  for other in prefs:
    if other == person:
      continue

    weight = similarity(prefs, person, other)
    # Ignore people who are unrelated or negatively correlated.
    if weight <= 0:
      continue

    for item in prefs[other]:
      already_rated = item in prefs[person] and prefs[person][item]
      if already_rated:
        continue
      weighted_totals[item] = weighted_totals.get(item, 0) + prefs[other][item] * weight
      weight_sums[item] = weight_sums.get(item, 0) + weight

  # Normalise each total by the summed weights of the people who rated it.
  rankings = [(weighted_totals[item] / weight_sums[item], item)
              for item in weighted_totals]
  return sorted(rankings, reverse=True)

In [11]:
# Predicted ratings for films Toby has not rated, weighted by Pearson similarity.
getRecommendations(critics, 'Toby')

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [12]:
# Same recommendation, using the distance-based similarity instead.
getRecommendations(critics, 'Toby', similarity=sim_distance)

[(3.457128694491423, 'The Night Listener'),
 (2.778584003814924, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]

## 匹配商品

### 人员物品对调

In [13]:
def transformPrefs(prefs):
  """Swap the two levels of a nested ratings mapping.

  Args:
    prefs: Mapping of person -> {item: rating}.

  Returns:
    A new mapping of item -> {person: rating} holding the same ratings.
    ``prefs`` itself is not modified.
  """
  flipped = {}
  for person in prefs:
    ratings = prefs[person]
    for item in ratings:
      if item not in flipped:
        flipped[item] = {}
      flipped[item][person] = ratings[item]
  return flipped

In [14]:
# Flip the data to movie -> {critic: rating} so movies can be compared with each other.
movies = transformPrefs(critics)
movies

{'Lady in the Water': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.0,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0},
 'Snakes on a Plane': {'Lisa Rose': 3.5,
  'Gene Seymour': 3.5,
  'Michael Phillips': 3.0,
  'Claudia Puig': 3.5,
  'Mick LaSalle': 4.0,
  'Jack Matthews': 4.0,
  'Toby': 4.5},
 'Just My Luck': {'Lisa Rose': 3.0,
  'Gene Seymour': 1.5,
  'Claudia Puig': 3.0,
  'Mick LaSalle': 2.0},
 'Superman Returns': {'Lisa Rose': 3.5,
  'Gene Seymour': 5.0,
  'Michael Phillips': 3.5,
  'Claudia Puig': 4.0,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 5.0,
  'Toby': 4.0},
 'You, Me and Dupree': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.5,
  'Claudia Puig': 2.5,
  'Mick LaSalle': 2.0,
  'Jack Matthews': 3.5,
  'Toby': 1.0},
 'The Night Listener': {'Lisa Rose': 3.0,
  'Gene Seymour': 3.0,
  'Michael Phillips': 4.0,
  'Claudia Puig': 4.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0}}

In [15]:
# Movies ranked by how closely their ratings correlate with 'Superman Returns'.
topMatches(movies, 'Superman Returns')

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

In [16]:
# On the flipped data this ranks critics who have not rated 'Just My Luck' by predicted rating.
getRecommendations(movies, 'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]