In [1]:
import web
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
import urllib
# import sys
import json
import tensorflow.keras as keras 
# 获取隐向量并归一化
model = keras.models.load_model('model_d30_interrupted.model')
X = model.layers[2].embeddings.numpy()
normalized_X = [x / np.sqrt(x.dot(x)) for x in X]

# 建立zid到index的映射
anchor_list = pd.read_csv('anchor_list.csv')
anchor_list = anchor_list.reset_index()
anchor_list['index'] = anchor_list['index'] + 1
zid2index = {zid: index for zid, index in zip(anchor_list['live_uid'], anchor_list['index'])}
index2zid = {index: zid for index, zid in zip(anchor_list['index'], anchor_list['live_uid'])}

In [5]:
len(zid2index)

26195

In [6]:
anchor_list

Unnamed: 0,index,live_uid,count
0,1,730183576,558593
1,2,16673072,123029
2,3,4994793,95888
3,4,21678553,94230
4,5,735415548,90641
...,...,...,...
26190,26191,7519946,1
26191,26192,45927808,1
26192,26193,700122389,1
26193,26194,34400945,1


In [7]:
nbrs = NearestNeighbors(n_neighbors=10, n_jobs=-1).fit(normalized_X)


In [8]:
zid = 721745639

In [9]:
query_point = normalized_X[zid2index[zid]]
distances, indices = nbrs.kneighbors([query_point])

similar_anchor_scores = [(2 - dist**2) / 2 for dist in distances[0]]
similar_anchor_indices = indices[0]
similar_anchor_zid = [index2zid[idx] for idx in similar_anchor_indices]

In [11]:
similar_anchor_zid

[721745639,
 728765898,
 727627432,
 728765601,
 601417360,
 722742017,
 722099292,
 728764985,
 722741708,
 722099252]

In [20]:
def query_by_url(url):
    res_data = urllib.request.urlopen(url, timeout=30)
    res = res_data.read()
    res_json = json.loads(res)
    return res_json

def query_info_by_ids(ids):
    info_dic = {}
    for zid in ids:
        try:
            info_url = "http://apigateway.inke.srv/user/infos?id=%s" % zid
            info_list = query_by_url(info_url)
            if info_list["dm_error"] == 0 and len(info_list["users"]) > 0:
                for info in info_list["users"]:
                    info_dic[str(info["id"])] = info
        except Exception as e:
            print(e)
            continue
    return info_dic

In [22]:
info_dic = query_info_by_ids(similar_anchor_zid)


In [24]:
len(info_dic)

10

In [33]:
detail = info_dic[str(zid)]

In [37]:
html_str = "<!DOCTYPE html><html>" \
   "<head><meta http-equiv="'"Content-Type"'" content="'"text/html;charset=utf-8"'">" \
   "<title>基于item2vec的相似主播</title>" \
   "</head>" \
   "<body>" \
   "<hr><table border=""∂1"">" \
   "<tr bgcolor=""#C0C0C0"">" \
   "<th>zid</th>" \
   "<th>头像</th>" \
   "<th>直播间</th>" \
   "<th>余项相似度</th>" \
   "</tr>"

for zid, score in zip(similar_anchor_zid, similar_anchor_scores):
    detail = info_dic[str(zid)]
    html_str += "<tr align=""center""  bgcolor=""#FFFFF0"">"
    html_str += '<td><div style="width:180px;word-wrap:break-word;" >%s</td>' % str(zid) # 添加uid
    html_str += "<td><img src=""%s"" height=""150"" width=""150""></td>" % detail["portrait"] # 添加头像
    html_str += '<td><a href="http://www.inke.com/live.html?uid=%s">VIEW LIVE</a></td>' % zid # 添加直播间链接
    html_str += '<td><div style="width:180px;word-wrap:break-word;" >%s</td>' % str(score) # 添加相似度
    html_str += "</tr>"
html_str += "</table></body></html>"

In [38]:
html_str

'<!DOCTYPE html><html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"><title>基于item2vec的相似主播</title></head><body><hr><table border=∂1><tr bgcolor=#C0C0C0><th>zid</th><th>头像</th><th>直播间</th><th>余项相似度</th></tr><tr align=center  bgcolor=#FFFFF0><td><div style="width:180px;word-wrap:break-word;" >721745639</td><td><img src=http://img.ikstatic.cn/MTU2NjM3NTQxNDY2MyMgMzEjanBn.jpg height=150 width=150></td><td><a href="http://www.inke.com/live.html?uid=721745639">VIEW LIVE</a></td><td><div style="width:180px;word-wrap:break-word;" >1.0</td></tr><tr align=center  bgcolor=#FFFFF0><td><div style="width:180px;word-wrap:break-word;" >728765898</td><td><img src=http://img.ikstatic.cn/MTU2Njg5NjU5NDM2NyMgNDAjanBn.jpg height=150 width=150></td><td><a href="http://www.inke.com/live.html?uid=728765898">VIEW LIVE</a></td><td><div style="width:180px;word-wrap:break-word;" >0.9759654912293284</td></tr><tr align=center  bgcolor=#FFFFF0><td><div style="width:180px;word-wrap:break-wor