In [57]:
import os
import pandas as pd

from datetime import datetime
from tqdm import tqdm

from collections import defaultdict
import math
import numpy as np

import random

In [58]:
df = pd.read_csv('ml-1m.inter',sep='\t')

In [59]:
df = df.rename(columns={k:k.split(':')[0] for k in df.columns})

In [60]:
pos_df = df[df['rating']>3].reset_index(drop=True)
pos_df.shape

(575281, 4)

In [61]:
pos_df['user_count'] = pos_df['user_id'].map(pos_df['user_id'].value_counts())

In [62]:
pos_df = pos_df[pos_df['user_count']>5].reset_index(drop=True)
pos_df.shape

(575242, 5)

In [63]:
pos_df = pos_df.sort_values(by=['user_id','timestamp'],ascending=True)

In [64]:
pos_dict = pos_df.groupby('user_id')['item_id'].apply(list).to_dict()

In [65]:
# 构造样本
train_user_list = []
train_item_list = []
train_label_list = []

test_user_list = []
test_item_list = []
test_label_list = []
user_list = pos_df['user_id'].unique()
item_list = pos_df['item_id'].unique()
item_num = pos_df['item_id'].nunique()

for user in tqdm(user_list):
    # 训练集正样本
    for i in range(len(pos_dict[user])-1):
        train_user_list.append(user)
        train_item_list.append(pos_dict[user][i])
        train_label_list.append(1)
        
    # 测试集正样本
    test_user_list.append(user)
    test_item_list.append(pos_dict[user][-1])
    test_label_list.append(1)

        
train_df = pd.DataFrame()
train_df['user_id'] = train_user_list
train_df['item_id'] = train_item_list
train_df['label'] = train_label_list

test_df = pd.DataFrame()
test_df['user_id'] = test_user_list
test_df['item_id'] = test_item_list
test_df['label'] = test_label_list


100%|████████████████████████████████████████████████████████████████| 6028/6028 [00:00<00:00, 29268.80it/s]


In [66]:
def get_sim_item(df, user_col, item_col):
    # user_item_dict :key:user, values:item_list
    user_item_dict = df.groupby(user_col)[item_col].agg(list).to_dict()
    sim_item = {} # 如果不用字典，则需要：n_item x n_item；如果字典，只需要存储非0值 sim_item[item1][itme2]
    item_cnt = defaultdict(int)
    for user, items in tqdm(user_item_dict.items()):
        '''
        itmes : user的历史行为序列
        '''
        for i in items:
            item_cnt[i] += 1
            sim_item.setdefault(i, {}) # sim_item[i] = {}
            for relate_item in items:
                if i == relate_item:
                    continue
                sim_item[i].setdefault(relate_item, 0) #如果sim_item[i]字段的key没有relate_item,则赋值0
                sim_item[i][relate_item] += 1 / math.log(1 + len(items)) #User行为热度打压
                
    sim_item_corr = sim_item.copy()
    for i, related_items in tqdm(sim_item.items()):
        for j, cij in related_items.items():
            sim_item_corr[i][j] = cij / math.sqrt(item_cnt[i] * item_cnt[j]) #Item热度打压

    return sim_item_corr,user_item_dict

In [67]:
sim_item_corr,user_item_dict = get_sim_item(train_df,'user_id','item_id')

100%|███████████████████████████████████████████████████████████████████| 6028/6028 [01:22<00:00, 72.64it/s]
100%|█████████████████████████████████████████████████████████████████| 3530/3530 [00:02<00:00, 1452.15it/s]


In [68]:
order = train_df['item_id'].value_counts().reset_index()
order = order.sort_values('item_id', ascending=False)
popular_items = list(order['index'])

In [72]:
def recommend(user, sim_item_corr, popular_items, top_k, user_item_dict, item_num=20):  
    rank = {}  
    for i in user_item_dict[user]:  
        if i not in sim_item_corr.keys():
            continue
        for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1], reverse=True)[0:item_num]:  
            if j not in user_item_dict[user]:  
                rank.setdefault(j, 0)  
                rank[j] += wij
    
    rank = sorted(rank.items(), key=lambda d: d[1], reverse=True)[:top_k]
    rank = np.array(rank)
    item_list = list(rank[:,0].astype('int32'))
    score_list = rank[:,1]
    
    if len(item_list)<top_k:
        index = 0
        while(len(item_list)<top_k):
            item_list.append(popular_items[index])
            item_list = list(set(item_list))
            index +=1
            
    return item_list, score_list

In [73]:
item_list, score_list=recommend(22,sim_item_corr,popular_items,100,user_item_dict)

In [74]:
top_k = 20
preds = defaultdict(list)
for user_id in tqdm(test_df['user_id'].values):
    item_list, score_list = recommend(user_id,sim_item_corr,popular_items,top_k,user_item_dict)
    preds[user_id] = item_list

100%|███████████████████████████████████████████████████████████████████| 6028/6028 [08:39<00:00, 11.60it/s]


In [80]:
def hitrate(preds,test_df,topk=20):
    hit_num = 0
    for i in range(len(test_df)):
        if test_df['item_id'].iloc[i] in preds[test_df['user_id'].iloc[i]]:
            hit_num +=1
    return hit_num / len(test_df)

In [81]:
hitrate(preds, test_df)

0.11131386861313869

In [29]:
def predict_u2i(user,item,sim_item_corr,user_item_dict):
    score = 0
    for user_item in user_item_dict[user]:
        if item in sim_item_corr[item]:
            score += sim_item_corr[item][user_item]
    return score

In [30]:
predict_u2i(22,11,sim_item_corr,user_item_dict)

0