In [11]:
import os
import math
import random
import pandas as pd

In [1]:
class UserBasedCF:
    def __init__(self, path):
        self.train = {}
        self.test = {}
        self.generate_dataset(path)

    def load_file(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                yield line.strip()

    def generate_dataset(self, path, pivot=0.7):
        i = 0
        for line in self.load_file(path):
            user, movie, rating, _ = line.split("::")
            if i < 10:
                print(' {}, {}, {}, {}'.format(user, movie, rating, _))
            i += 1
            if random.random() < pivot:
                self.train.setdefault(user, {})
                self.train[user][movie] = int(rating)
            else:
                self.test.setdefault(user, {})
                self.test[user][movie] = int(rating)

    def user_similarity(self):
        # 建立物品-用户倒排表
        self.item_users = dict()
        for user, items in self.train.items():
            for i in items.keys():
                if i not in self.item_users:
                    self.item_users[i] = set()
                self.item_users[i].add(user)
        
        C = dict()  # 用户-用户共现矩阵
        N = dict()  # 用户产生行为的物品个数
        for i, users in self.item_users.items():
            for u in users:
                N.setdefault(u, 0)
                N[u] += 1
                C.setdefault(u, {})
                for v in users:
                    if u == v:
                        continue
                    C[u].setdefault(v, 0)
                    C[u][v] += 1
        
        self.W = dict() # 相似度矩阵
        for u, related_users in C.items():
            self.W.setdefault(u, {})
            for v, cuv in related_users.items():
                self.W[u][v] = cuv / math.sqrt(N[u] * N[v])
        
        return self.W, C, N

    def recommend(self, u, k=3, n=10):
        rank = dict()
        action_item = self.train[u].keys()  # user产生过行为的item
        for v, wuv in sorted(self.W[u].items(), key=lambda x:x[1], reverse=True)[:k]:
            for i, rvi in self.train[v].items():
                if i in action_item:
                    continue
                rank.setdefault(i, 0)
                rank[i] += wuv * rvi
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:n])

    def recall_precision(self, k=8, nitem=10):
        hit = 0
        recall = 0
        precision = 0
        for user, items in self.test.items():
            rank = self.recommend(user, k=k, n=nitem)
            hit += len(set(rank.keys()) & set(items.keys()))
            recall += len(items)
            precision += nitem
        return hit / recall, hit / precision

In [3]:
def print_2_dim_dic(dic, n=3):
    n = 0
    for u, v_cnt in dic.items():
        if n >= 3:
            break
        n += 1
        m = 1
        for v, cnt in v_cnt.items():
            if m >= 3:
                break
            m += 1
            print(u, v, cnt)

In [4]:
def print_1_dim_dic(dic, n=3):
    n = 0
    for u,i_cnt in dic.items():
        if n >= 3:
            break
        n += 1    
        print(u, i_cnt)

In [6]:
def sort_2_dim_dic(dic, k, n=5):
    return sorted(dic[k].items(), key=lambda x:x[1],reverse=True)[:n]

def sort_1_dim_dic(dic, n=5):
    return sorted(dic.items(), key=lambda x:x[1],reverse=True)[:n]

def trans_dic_2_dim(dic):
     return pd.DataFrame(dic).T.fillna(0)

In [9]:
path = os.path.join(r"C:\Users\cmj\Documents\Codes\Dataset\ml-1m", "ratings.dat")
ucf = UserBasedCF(path)

 1, 1193, 5, 978300760
 1, 661, 3, 978302109
 1, 914, 3, 978301968
 1, 3408, 4, 978300275
 1, 2355, 5, 978824291
 1, 1197, 3, 978302268
 1, 1287, 5, 978302039
 1, 2804, 5, 978300719
 1, 594, 4, 978302268
 1, 919, 4, 978301368


In [12]:
W, C, N = ucf.user_similarity()

In [13]:
df_c = trans_dic_2_dim(C)

In [14]:
df_c.shape

(6040, 6040)

In [15]:
df_c.iloc[:10, :10]

Unnamed: 0,2041,300,1305,4956,319,4093,2008,5759,5838,3001
620,33.0,14.0,1.0,12.0,47.0,15.0,20.0,43.0,13.0,16.0
2041,0.0,66.0,2.0,60.0,149.0,53.0,46.0,128.0,62.0,65.0
300,66.0,0.0,2.0,35.0,66.0,30.0,22.0,53.0,34.0,38.0
1305,2.0,2.0,0.0,1.0,4.0,2.0,2.0,2.0,1.0,2.0
4956,60.0,35.0,1.0,0.0,77.0,24.0,29.0,59.0,40.0,28.0
319,149.0,66.0,4.0,77.0,0.0,47.0,59.0,144.0,59.0,90.0
4093,53.0,30.0,2.0,24.0,47.0,0.0,18.0,32.0,23.0,31.0
2008,46.0,22.0,2.0,29.0,59.0,18.0,0.0,43.0,19.0,26.0
5759,128.0,53.0,2.0,59.0,144.0,32.0,43.0,0.0,67.0,57.0
5838,62.0,34.0,1.0,40.0,59.0,23.0,19.0,67.0,0.0,21.0


In [16]:
sort_2_dim_dic(C, '1')

[('531', 29), ('2529', 26), ('1605', 26), ('4277', 25), ('5795', 24)]

In [17]:
print_1_dim_dic(N)

620 127
2041 383
300 172


In [18]:
sort_1_dim_dic(N)

[('4169', 1616), ('1680', 1303), ('4277', 1224), ('1941', 1126), ('889', 1065)]

In [19]:
df_w = trans_dic_2_dim(W)

In [20]:
df_w.shape

(6040, 6040)

In [21]:
df_w.iloc[:10, :10]

Unnamed: 0,2041,300,1305,4956,319,4093,2008,5759,5838,3001
620,0.149628,0.094724,0.022911,0.080957,0.184676,0.133774,0.162008,0.190544,0.091773,0.095289
2041,0.0,0.257146,0.026387,0.233093,0.337133,0.272182,0.214569,0.326617,0.252037,0.222914
300,0.257146,0.0,0.039375,0.202899,0.222841,0.2299,0.153133,0.201809,0.206246,0.194466
1305,0.026387,0.039375,0.0,0.01963,0.045733,0.0519,0.04714,0.025788,0.020541,0.034658
4956,0.233093,0.202899,0.01963,0.0,0.259228,0.183388,0.201272,0.224005,0.24194,0.142876
319,0.337133,0.222841,0.045733,0.259228,0.0,0.209168,0.238493,0.318424,0.207844,0.267474
4093,0.272182,0.2299,0.0519,0.183388,0.209168,0.0,0.165145,0.160605,0.1839,0.209107
2008,0.214569,0.153133,0.04714,0.201272,0.238493,0.165145,0.0,0.196022,0.137986,0.159297
5759,0.326617,0.201809,0.025788,0.224005,0.318424,0.160605,0.196022,0.0,0.266179,0.191041
5838,0.252037,0.206246,0.020541,0.24194,0.207844,0.1839,0.137986,0.266179,0.0,0.112128


In [22]:
sort_2_dim_dic(W, '520')

[('1988', 0.3176145442571894),
 ('5643', 0.30254703190195087),
 ('1317', 0.3019207918666382),
 ('4867', 0.2941092394640641),
 ('5333', 0.2921545008130789)]

In [24]:
recommend = ucf.recommend("520")
recommend

{'2599': 4.610411840128893,
 '3545': 4.610411840128893,
 '2858': 4.308491048262255,
 '1131': 4.307864808226942,
 '2352': 4.292797295871703,
 '922': 4.005944016360303,
 '3114': 4.005944016360303,
 '1': 3.990876504005065,
 '1641': 3.990876504005065,
 '3019': 3.9902502639697524}

In [26]:
ucf.recall_precision()

(0.06856232878534003, 0.34144039735099335)

- 增加惩罚

In [None]:
class UserBasedCF:
    def __init__(self, path):
        self.train = {}
        self.test = {}
        self.generate_dataset(path)

    def load_file(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                yield line.strip()

    def generate_dataset(self, path, pivot=0.7):
        i = 0
        for line in self.load_file(path):
            user, movie, rating, _ = line.split("::")
            if i < 10:
                print(' {}, {}, {}, {}'.format(user, movie, rating, _))
            i += 1
            if random.random() < pivot:
                self.train.setdefault(user, {})
                self.train[user][movie] = int(rating)
            else:
                self.test.setdefault(user, {})
                self.test[user][movie] = int(rating)

    def user_similarity(self):
        self.item_users = dict()
        for user, items in self.train.items():
            for i in items.keys():
                if i not in self.item_users:
                    self.item_users[i] = set()
                self.item_users[i].add(user)
        
        C = dict()
        W = dict()
        for i, users in self.item_users.items():
            for u in users:
                N.setdefault(u, 0)
                N[u] += 1
                C.setdefault(u, {})
                for v in users:
                    if u == v:
                        continue
                    C[u].setdefault(v, 0)
                    C[u][v] += 1 / math.log(1 + len(u))
        self.W = dict()
        for u, related_users in C.items():
            self.W.setdefault(u, {})
            for v, cuv in related_users.items():
                self.W[u][v] = cuv / math.sqrt(N[u] * N[v])
        return self.W, C, N

    def recommend(self, u, k=3, n=10):
        rank = dict()
        action_item = self.train[u].keys()  # user产生过行为的item
        for v, wuv in sorted(self.W[u].items(), key=lambda x:x[1], reverse=True)[:k]:
            for i, rvi in self.train[v].items():
                if i in action_item:
                    continue
                rank.setdefault(i, 0)
                rank[i] += wuv * rvi
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:n])

    def recall_precision(self, k=8, nitem=10):
        hit = 0
        recall = 0
        precision = 0
        for user, items in self.test.items():
            rank = self.recommend(user, k=k, n=nitem)
            hit += len(set(rank.keys()) & set(items.keys()))
            recall += len(items)
            precision += nitem
        return hit / recall, hit / precision