In [31]:
import pandas as pd
import numpy as np
import math
import datetime

In [41]:
# ---- shared state filled in by the functions below -------------------------
user_index = {}      # user (deviceid) -> integer id, numbered from 0
position_index = {}  # position (item token) -> integer id, numbered from 0
link_out = {}        # user id -> list of position ids (user -> position adjacency)
link_in = {}         # position id -> list of user ids (position -> user adjacency)
trans_matrix = np.matrix(0)         # placeholder; readLink() replaces it
user_sim_matrix = np.matrix(0)      # user-to-user similarity matrix (placeholder)
position_sim_matrix = np.matrix(0)  # position-to-position similarity matrix (placeholder)
user_damp = 0.8      # damping factor for users
position_damp = 0.8  # damping factor for positions

# ---- item id -> title lookup, used when writing the results -----------------
df = pd.read_csv(r'F:\work\8.19 simrank\title.csv')
df = df[df['c_title'].notna()]  # keep only rows that actually have a title
title_dict = {}
for itemid, title in zip(df['itemid'], df['c_title']):
    # Keys are stringified so they compare equal to the string tokens that
    # come out of splitting the comma-separated 'list' column.
    title_dict[str(itemid)] = title

In [33]:
def indexUserAndPosition():
    '''
    Read the user -> viewed-positions log and assign integer ids.

    Every distinct value of the 'deviceid' column gets an entry in the global
    ``user_index`` and every distinct token of the comma-separated 'list'
    column gets an entry in the global ``position_index``.

    Ids are allocated as "current size of the mapping", so they start at 0 on
    an empty mapping and continue after the highest existing id when the
    function is called again (e.g. the cell is re-run in the same kernel).
    Restarting the counter at 0 on each call would hand out ids that are
    already in use as soon as a new user or position appears.
    '''
    df = pd.read_csv(r'F:\work\8.19 simrank\7天user有效.csv')
    for user, raw_items in zip(df['deviceid'], df['list']):
        # setdefault evaluates len() before inserting, i.e. the next free id.
        user_index.setdefault(user, len(user_index))
        for ele in raw_items.strip().split(','):
            position_index.setdefault(ele, len(position_index))

In [34]:
def readLink():
    '''
    Read the user -> viewed-positions log and build the graph structures.

    Side effects on module globals:
      * ``link_out[userid]``  - list of distinct position ids seen by the user
      * ``link_in[itemid]``   - list of distinct user ids that saw the position
      * ``trans_matrix``      - replaced by a dense
        ``len(position_index) x len(position_index)`` array where
        ``trans_matrix[p2, p1]`` accumulates, over every user ``u`` linked to
        ``p1``, ``1/|users(p1)| * 1/(|items(u)| - 1)`` for each other
        position ``p2`` of that user.

    Requires ``user_index`` and ``position_index`` to already contain every
    user / position token of the file (a missing key raises ``KeyError``).
    '''
    global trans_matrix

    df = pd.read_csv(r'F:\work\8.19 simrank\7天user有效.csv')

    # Companion sets give O(1) membership tests while the global lists keep
    # first-seen order. They are seeded from whatever the globals already
    # hold so that calling this function again does not add duplicates.
    items_of_user = {uid: set(items) for uid, items in link_out.items()}
    users_of_item = {pid: set(users) for pid, users in link_in.items()}

    for user, raw_items in zip(df['deviceid'], df['list']):
        userid = user_index[user]
        out_list = link_out.setdefault(userid, [])
        out_seen = items_of_user.setdefault(userid, set())
        for item in raw_items.strip().split(','):
            itemid = position_index[item]
            # The membership test must use the integer id: comparing the raw
            # string token against a list of ids never matches.
            if itemid not in out_seen:
                out_seen.add(itemid)
                out_list.append(itemid)
            in_list = link_in.setdefault(itemid, [])
            in_seen = users_of_item.setdefault(itemid, set())
            if userid not in in_seen:
                in_seen.add(userid)
                in_list.append(userid)

    size = len(position_index)
    matrix = np.zeros((size, size))
    for p1, viewers in users_of_item.items():
        pct1 = 1.0 / len(viewers)
        for u1 in viewers:
            neighbours = items_of_user[u1]
            # A user with a single position has nowhere else to go.
            if len(neighbours) > 1:
                pct2 = 1.0 / (len(neighbours) - 1)
                prob = pct1 * pct2
                for p2 in neighbours:
                    if p2 != p1:
                        matrix[p2, p1] += prob
    trans_matrix = matrix

In [35]:
def initSimMatrix():
    '''
    Set the starting value of the position similarity matrix.

    Replaces the global ``position_sim_matrix`` with
    ``(1 - position_damp) * I``, where ``I`` is the identity matrix with one
    row per key in ``link_in``. Only the position matrix is initialised here.
    '''
    global position_sim_matrix

    n_positions = len(link_in)  # number of position nodes
    self_weight = 1 - position_damp
    position_sim_matrix = np.identity(n_positions) * self_weight

In [40]:
def updateSim():
    '''
    Perform one similarity-update iteration on the position matrix.

    With d = ``position_damp``, T = ``trans_matrix``, S = the current
    ``position_sim_matrix`` and I the identity sized by ``len(link_in)``,
    the global ``position_sim_matrix`` is replaced by

        d * (T^T . S) + (1 - d) * I
    '''
    global position_sim_matrix

    n_positions = len(link_in)
    propagated = np.dot(trans_matrix.T, position_sim_matrix)
    restart = np.identity(n_positions) * (1 - position_damp)
    position_sim_matrix = position_damp * propagated + restart

In [37]:
def printResult(out_path=r'C:\Users\JDD\Desktop\simrank_single_1.txt', top_n=20):
    '''
    Write the most similar positions for every titled position to a text file.

    For each position in ``position_index`` that has an entry in
    ``title_dict``, all other positions with a similarity strictly greater
    than 0 (row = this position, column = the other one in
    ``position_sim_matrix``) are ranked from most to least similar. Of the
    first ``top_n`` ranked positions, those that also have a title are written
    as ``<title>\\t<other title>:<similarity>``, one per line.

    Parameters:
        out_path: destination file, overwritten, UTF-8 encoded. Defaults to
            the path the notebook originally used.
        top_n: how many ranked neighbours to consider per position (default 20).
    '''
    # The context manager guarantees the file is closed even if a lookup or
    # write raises part-way through.
    with open(out_path, "w", encoding='utf-8') as f_out:
        for p1, p1id in position_index.items():
            if p1 not in title_dict:
                continue
            neighbour = []
            for p2, p2id in position_index.items():
                if p2id == p1id:
                    continue
                sim = position_sim_matrix[p1id, p2id]
                # Matrix entries are numeric, so no None check is needed;
                # NaN compares False here and is skipped as well.
                if sim > 0:
                    neighbour.append((p2, sim))
            # Sort by similarity, highest first (stable for ties).
            neighbour.sort(key=lambda pair: pair[1], reverse=True)
            for p2, sim in neighbour[:top_n]:
                if p2 in title_dict:
                    f_out.write(title_dict[p1] + "\t" + title_dict[p2] + ":" + str(sim) + '\n')

In [38]:
def simrank(iteration):
    '''
    Run the whole pipeline and log progress to stdout.

    Steps: build the user/position indexes, read the links, initialise the
    position similarity matrix, apply ``iteration`` update rounds (printing
    the matrix after initialisation and after every round), then write the
    results file.

    Parameters:
        iteration: number of similarity-update rounds to run.
    '''
    def timestamp():
        # Wall-clock time formatted for the progress log.
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    print(timestamp(), 'init_sim_matrix:')
    indexUserAndPosition()
    readLink()
    initSimMatrix()
    print(position_sim_matrix)

    for round_no in range(1, iteration + 1):
        print(timestamp(), 'iteration{}-sim_matrix:'.format(round_no), '--------------------------------------------->')
        updateSim()
        print(position_sim_matrix)

    # The end time is captured before the results are written, so the final
    # log line does not include the time spent in printResult().
    finished_at = timestamp()
    printResult()
    print(finished_at, '计算结束')

In [42]:
# Run the full pipeline with 10 similarity-update iterations.
simrank(10)

2020-09-02 15:48:38 init_sim_matrix:
[[0.2 0.  0.  ... 0.  0.  0. ]
 [0.  0.2 0.  ... 0.  0.  0. ]
 [0.  0.  0.2 ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.2 0.  0. ]
 [0.  0.  0.  ... 0.  0.2 0. ]
 [0.  0.  0.  ... 0.  0.  0.2]]
2020-09-02 15:50:24 iteration1-sim_matrix: --------------------------------------------->
[[2.00000000e-01 8.99848419e-04 5.35679323e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [9.43948413e-05 2.00000000e-01 1.21590671e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.70374940e-04 1.01779578e-03 2.00000000e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 2.00000000e-01
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  2.00000000e-01 6.15384615e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  6.15384615e-03 2.00000000e-01]]
2020-09-02 15:51:19 iteration2-sim_matrix: --------------------------------------