In [1]:
import pandas as pd
import numpy as np
import os

# 读取数据

In [2]:
# 用户数据集
users = pd.read_table("ml-1m/users.dat",sep = '::',header=None,engine='python')
users.columns = (['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [3]:
# 电影数据集
movies = pd.read_table("ml-1m/movies.dat",sep = '::',header=None,engine='python')
movies.columns = (['MovieID', 'Title', 'Genres'])
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
# 用户对电影的评分
ratings = pd.read_table("ml-1m/ratings.dat",sep = '::',header=None,engine='python')
ratings.columns = (['UserID', 'MovieID', 'Rating', 'Timestamp'])
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
import random

In [6]:
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [7]:
# 数据集划分
def SplitData(data, M, k, seed):
    data = np.array(data)
    test = []
    train = []
    random.seed(seed)
    for data1 in data:
        if random.randint(0, M) == k:
            test.append([data1[0], data1[1]])
        else:
            train.append([data1[0], data1[1]])
    return train, test

In [8]:
train, test = SplitData(ratings, 7, 2, 2)

In [9]:
print(len(train))
print(len(test))
print(len(train) + len(test))

875165
125044
1000209


# 基于邻域的算法

### 基于用户的协同滤波算法

In [10]:
import math
# 计算用户之间的相似度：余弦相似度

def UserSimilarity(train, usersL):
    # 建立物体到用户的倒排表
    item_users = dict()
    for u, items in train:
        if items not in item_users:
            item_users[items] = set()
        item_users[items].add(u)
    
    C = dict() # 计算用户之间的兴趣相似度
    N = dict() # 保存每个用户感兴趣物品的数量
    for i, users in item_users.items():
        for u in users:
            if u not in N:
                N[u] = 0
            N[u] +=1
            
            for v in users: # C[u,v]和C[v,u]需要分别计算
                if u == v:
                    continue
                if (u, v) not in C:
                    C[u, v] = 0
                C[u, v] += 1
    
    # 计算相似性矩阵
    W = dict()
    for u, cuv in C.items():
        W[u] = cuv / math.sqrt(N[u[0]]*N[u[1]])
    
    return W           

In [102]:
# 根据训练数据计算用户兴趣相似度矩阵
W = UserSimilarity(train, users)
print(len(W))

34258192


In [103]:
import numpy as np
# 实现UserCF算法
def Recommend(user, train, W, K):
    train_arr = np.array(train)
    rank = dict()
    interacted_items = train_arr[train_arr[:, 0] == user] # 得到user现有的喜欢的物品
    
    W1 = dict()
    for u, cuv in W.items(): # 找出当前user与其它用户的兴趣相似度以及用户（用户， 相似度）
        if u[0] == user:
            W1[u[1]] = cuv
    
    W1 = sorted(W1.items(), key = lambda x : x[1], reverse=True)[0:K] # 按相似度大小，找到最相似的K个用户
    
    for v, wuv in W1:
        for i in set(train_arr[train_arr[:, 0] == v, 1]): # 找出相似用户感兴趣的电影
            if i in interacted_items: # 确保不是user现有的喜欢的电影
                continue
            if i not in rank:
                rank[i] = 0
            rank[i] += wuv #计算当前电影的感兴趣程度
    return rank

In [105]:
# 利用和当前用户兴趣相似度比较大的前K个用户来预测当前用户喜欢的电影
# 在训练集上的结果
K = 20
user = 5
rank_train = Recommend(user, train, W, K)
rank_train_sorted = sorted(rank_train.items(), key = lambda x : x[1], reverse=True)
print(len(rank_train_sorted))
#for i in rank_sorted:
#    print(i)
    
# 在测试集上的结果
rank_test = Recommend(user, test, W, K)
rank_test_sorted = sorted(rank_test.items(), key = lambda x : x[1], reverse=True)
print(len(rank_test_sorted))

tvt = set()

for m in rank_train.keys():
    if m in rank_test.keys():
        tvt.add(m)

# 同时存在
total_both = len(tvt)

# 至少存在一个
total_one = len(rank_train_sorted) + len(rank_test_sorted) - total_both

# 准确率
precision = total_both / len(rank_train_sorted)

# 召回率
recall = total_both / len(rank_test_sorted)

# 覆盖率
overcall = total_both / 3883

print('准确率：', precision)
print('召回率：', recall)
print('覆盖度: ', overcall)

1341
511
准确率： 0.26547352721849365
召回率： 0.6966731898238747
覆盖度:  0.09168168941540046


In [86]:
print(rank_train.keys())

dict_keys([1537, 515, 1544, 2571, 2067, 21, 3095, 3105, 3107, 3108, 1084, 1090, 2628, 3654, 1096, 1610, 3147, 589, 1103, 3678, 95, 1124, 110, 3699, 647, 648, 2194, 1687, 1690, 163, 1188, 165, 2728, 1193, 3255, 3256, 3257, 1207, 2236, 1213, 1217, 1225, 2268, 1244, 1245, 1246, 736, 3809, 1247, 1253, 2278, 235, 1784, 1792, 2312, 265, 1801, 780, 1293, 2321, 292, 2852, 2353, 3893, 2359, 318, 2881, 1357, 1873, 1370, 3418, 1372, 2396, 349, 2916, 356, 2427, 380, 1917, 3451, 2943, 1408, 902, 3468, 1954, 1955, 1957, 434, 442, 2501, 457, 459, 2002, 3030, 982, 3035, 2028, 498, 1527, 3578, 3071, 2947, 2692, 2951, 1036, 2366, 1214, 3527, 1097, 1240, 1387, 3702, 3072, 1, 3074, 3717, 1030, 1035, 3600, 1296, 17, 1043, 912, 2966, 920, 3610, 1947, 2082, 1441, 34, 2469, 1959, 296, 3753, 3624, 3501, 48, 3508, 2100, 569, 1088, 3524, 838, 199, 2506, 588, 1101, 3536, 3408, 595, 597, 2017, 3682, 1380, 3685, 2406, 364, 3565, 1007, 2802, 377, 383, 6, 1573, 1722, 1221, 1997, 3793, 861, 1270])


In [84]:
print(rank_test.keys())

dict_keys([1953, 1442, 1597, 3334, 1385, 1834, 1962, 2126, 3471, 368, 1552, 3735, 920, 1945, 2490, 3068, 2717, 480, 1201, 1569, 1188, 1028, 266, 1674, 1806, 914, 3604, 1688, 1566, 377, 442, 457, 474])


In [87]:
print( 1537 in rank_train.keys())

True


In [74]:
rank = Recommend(3, test[:600], W, 4)
rank_sorted = sorted(rank.items(), key = lambda x : x[1], reverse=True)
print(len(rank_sorted))
for i in rank_sorted:
    print(i)

33
(368, 0.2591427100024083)
(1953, 0.14848604296791068)
(1442, 0.14848604296791068)
(1597, 0.14848604296791068)
(3334, 0.14848604296791068)
(1385, 0.14848604296791068)
(1834, 0.14848604296791068)
(1962, 0.14848604296791068)
(2126, 0.14848604296791068)
(3471, 0.14848604296791068)
(1552, 0.14848604296791068)
(3735, 0.14848604296791068)
(920, 0.14848604296791068)
(1945, 0.14848604296791068)
(2490, 0.14848604296791068)
(3068, 0.14848604296791068)
(2717, 0.14848604296791068)
(480, 0.13109470506889243)
(1201, 0.13109470506889243)
(1569, 0.11065666703449763)
(1188, 0.11065666703449763)
(1028, 0.11065666703449763)
(266, 0.11065666703449763)
(1674, 0.11065666703449763)
(1806, 0.11065666703449763)
(914, 0.11065666703449763)
(3604, 0.11065666703449763)
(1688, 0.11065666703449763)
(1566, 0.11065666703449763)
(377, 0.109971479845643)
(442, 0.109971479845643)
(457, 0.109971479845643)
(474, 0.109971479845643)


In [54]:
a = np.array([[1,2,3],[4,5,6],[1,2,3]])
set(a[a[:, 0] == 1, 2])

{3}