## 基于物品的协同过滤

In [1]:
import math
import os

import pandas as pd

In [2]:
# Movie ratings dataset. Set the U_DATA_PATH environment variable to load the
# file from another location; when it is unset the original local path is used.
data_path = os.environ.get("U_DATA_PATH", "F:\\tmp\\u.data")

In [3]:
# Load the tab-separated ratings file into a DataFrame. The file has no header
# row, so the four column names are supplied explicitly.
udata = pd.read_csv(
    data_path,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
    header=None,
    sep="\t",
)
udata.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Count the user_id entries recorded for each item_id (Series indexed by item_id).
item_user_cnt = udata.groupby("item_id")["user_id"].agg("count")

In [5]:
# Preview the first rows of the per-item counts.
item_user_cnt.head()

item_id
1    452
2    131
3     90
4    209
5     86
Name: user_id, dtype: int64

In [6]:
# Turn the per-item count Series into a two-column frame: the index becomes the
# "item_id" column and the counts are stored under "user_cnt".
t = item_user_cnt.reset_index(name="user_cnt")
t.head()

Unnamed: 0,item_id,user_cnt
0,1,452
1,2,131
2,3,90
3,4,209
4,5,86


In [7]:
# Show the count row for item_id 300.
t[t["item_id"] == 300]

Unnamed: 0,item_id,user_cnt
299,300,431


构建数据格式 {user_id: {item_id: rating}}

In [8]:
# Nested rating table, filled in by the next cell: {user_id: {item_id: rating}}.
train = {}

In [9]:
# Fill `train` as {user_id: {item_id: rating}}, with both ids stored as strings.
# itertuples() is used instead of iterrows(): iterrows() builds a Series for
# every row (slow) and coerces the whole row to one dtype, so a single float
# column would turn the ids into strings such as "196.0".
for row in udata.itertuples(index=False):
    # setdefault creates the user's inner dict the first time the user is seen.
    train.setdefault(str(row.user_id), {})[str(row.item_id)] = row.rating

In [10]:
# Items rated by user_id=196 and the rating given to each.
train["196"]

{'1007': 4,
 '1022': 4,
 '108': 4,
 '110': 1,
 '111': 4,
 '1118': 4,
 '116': 3,
 '1241': 3,
 '13': 2,
 '153': 5,
 '173': 2,
 '202': 3,
 '238': 4,
 '242': 3,
 '25': 4,
 '251': 3,
 '257': 2,
 '269': 3,
 '285': 5,
 '286': 5,
 '287': 3,
 '306': 4,
 '340': 3,
 '381': 4,
 '382': 4,
 '393': 4,
 '411': 4,
 '428': 4,
 '580': 2,
 '655': 5,
 '66': 3,
 '663': 5,
 '67': 5,
 '692': 5,
 '70': 3,
 '762': 3,
 '8': 5,
 '845': 4,
 '94': 3}

### 1、对item_1和item_2相同的user计数

In [11]:
# 两个item相同user的个数，分子
C = dict()
# 单个item总的用户个数，分母
N = dict()

In [12]:
# Build the count tables from `train` ({user_id: {item_id: rating}}):
#   N[i]    - number of users who rated item i
#   C[i][j] - number of users who rated both item i and item j (i != j)
# Both tables are emptied first so that re-running this cell recomputes the
# counts instead of adding every user a second time.
C.clear()
N.clear()

for items in train.values():
    for i in items:
        N[i] = N.get(i, 0) + 1

        # Every rated item gets an entry in C, even if it has no co-rated item.
        co_counts = C.setdefault(i, {})
        for j in items:
            if i != j:
                co_counts[j] = co_counts.get(j, 0) + 1

In [13]:
# Number of users who rated both item 302 and item 504.
C['302']['504']

42

In [14]:
# All co-rating counts for item '302', keyed by the other item's id.
C['302']

{'566': 57,
 '250': 71,
 '148': 41,
 '263': 10,
 '470': 37,
 '983': 7,
 '281': 46,
 '385': 62,
 '588': 52,
 '406': 14,
 '925': 12,
 '977': 16,
 '322': 94,
 '53': 49,
 '333': 141,
 '591': 73,
 '742': 74,
 '770': 33,
 '550': 48,
 '237': 113,
 '1277': 8,
 '1253': 5,
 '71': 52,
 '554': 35,
 '257': 95,
 '44': 37,
 '117': 108,
 '327': 110,
 '288': 186,
 '225': 25,
 '988': 27,
 '31': 49,
 '939': 26,
 '546': 78,
 '100': 159,
 '338': 53,
 '717': 23,
 '118': 74,
 '226': 53,
 '300': 182,
 '299': 46,
 '596': 34,
 '95': 62,
 '243': 33,
 '1016': 52,
 '79': 107,
 '306': 64,
 '106': 23,
 '829': 10,
 '934': 12,
 '1399': 6,
 '754': 40,
 '595': 19,
 '121': 111,
 '568': 65,
 '303': 99,
 '332': 84,
 '540': 19,
 '1046': 18,
 '98': 113,
 '56': 123,
 '880': 35,
 '203': 68,
 '1336': 4,
 '1083': 4,
 '291': 42,
 '356': 32,
 '684': 64,
 '38': 42,
 '477': 29,
 '258': 210,
 '298': 77,
 '820': 26,
 '1033': 6,
 '887': 45,
 '330': 26,
 '689': 54,
 '159': 32,
 '12': 113,
 '55': 64,
 '294': 160,
 '1213': 3,
 '1042': 13,

In [15]:
# Number of users who rated item '302'.
N['302']

297

In [16]:
# Number of users who rated item '504'.
N['504']

122

### 2、计算相似度

In [17]:
# Item-to-item similarity table, filled in by the next cell: W[i][j].
W = {}

In [18]:
# Item-item similarity from the co-occurrence counts:
#
#     W[i][j] = C[i][j] / sqrt(N[i] * N[j])
#
# i.e. the number of shared users divided by the geometric mean of the two
# items' user counts. This is the cosine (Ochiai) similarity of the items'
# user sets, not the Jaccard index (which would divide by the size of the
# union of the two user sets).
for i, related_items in C.items():
    W[i] = {
        j: cij / math.sqrt(N[i] * N[j])
        for j, cij in related_items.items()
    }

In [19]:
# Similarities between item '504' and each item it shares users with,
# shown as a {item_id: similarity} dict.
W["504"]

{'474': 0.4420059671208345,
 '317': 0.3047887380849036,
 '281': 0.25475491628290936,
 '486': 0.2829242063828704,
 '181': 0.3739375287269727,
 '625': 0.18996201139620134,
 '1346': 0.12146644952683741,
 '252': 0.14405274448481928,
 '152': 0.3699260221926026,
 '91': 0.29526819756754014,
 '8': 0.37318816486149997,
 '842': 0.24393057429501447,
 '477': 0.16719788795310572,
 '261': 0.06903285917739924,
 '432': 0.41180983169273844,
 '151': 0.28581573894518836,
 '98': 0.4905367606786807,
 '418': 0.3347915745636082,
 '465': 0.2651394713124213,
 '1142': 0.10919021808652282,
 '660': 0.37328844382740006,
 '652': 0.20995272258175168,
 '172': 0.4300594001819017,
 '498': 0.4920090524360641,
 '144': 0.3252407657266859,
 '195': 0.4331266727724779,
 '496': 0.43484747721079253,
 '237': 0.3003086413252952,
 '200': 0.4037071567446037,
 '473': 0.18550800209006713,
 '286': 0.30960560176216756,
 '502': 0.3957278328443196,
 '311': 0.10454167469786334,
 '651': 0.33924863545776557,
 '993': 0.08915343973839863,
 '

In [20]:
import operator

# Top 10 neighbours of item "504": its (item_id, similarity) pairs ordered by
# similarity, highest first.
sorted(
    W["504"].items(),
    key=operator.itemgetter(1),
    reverse=True,
)[:10]

[('654', 0.5749792108382483),
 ('435', 0.5359354214420653),
 ('185', 0.5329206224906753),
 ('23', 0.5234545671218749),
 ('234', 0.5194123864197661),
 ('134', 0.5147276243214554),
 ('194', 0.5132089665069938),
 ('133', 0.5054112324166712),
 ('211', 0.5046339459307547),
 ('482', 0.5041452147357779)]