In [6]:
import csv
from collections import defaultdict
import numpy as np
import numpy.ma as ma
import pandas as pd
import tabulate
import tensorflow as tf
from numpy import genfromtxt
from sklearn.preprocessing import StandardScaler
from tensorflow.python.keras import Model
# Display pandas floating-point values with one decimal place.
pd.set_option("display.precision", 1)

In [7]:
# Build the item neural network: two ReLU hidden layers (256 and 128 units)
# followed by a linear layer that emits a num_outputs-wide vector.
num_outputs = 32
tf.random.set_seed(1)  # set TensorFlow's global random seed
item_NN = tf.keras.models.Sequential(
    [tf.keras.layers.Dense(width, activation='relu') for width in (256, 128)]
    + [tf.keras.layers.Dense(num_outputs)]
)

In [8]:
# Load the data
with open('data/content_item_train_header.txt', newline='') as f:
    # The first CSV row of the header file holds the column names.
    item_features = list(csv.reader(f))[0]
item_train = genfromtxt('data/content_item_train.csv', delimiter=',')
item_vecs = genfromtxt('data/content_item_vecs.csv', delimiter=',')

# Read the movie list CSV into a dict: movie_id -> {"title": ..., "genres": ...}
movie_dict = defaultdict(dict)
count = 0  # number of CSV rows consumed so far, header included
with open('data/content_movie_list.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for count, line in enumerate(reader, start=1):
        if count == 1:
            continue  # skip header
        movie_id = int(line[0])
        entry = movie_dict[movie_id]
        entry["title"] = line[1]
        entry["genres"] = line[2]

ivs = 3  # item genre vector start
i_s = 1  # start of columns to use in training, items
# Every column from i_s onward is a model input feature.
num_item_features = item_train.shape[1] - i_s

In [9]:
# Preprocessing: standardize the item training features.
# NOTE(review): item_train is overwritten with its scaled version, so running
# this cell a second time refits the scaler on already-scaled data.
scalerItem = StandardScaler().fit(item_train)  # fit() returns the scaler itself
item_train = scalerItem.transform(item_train)

In [10]:
# Wrap the item network in a standalone model that maps one row of item
# features to an L2-normalized output vector.
# NOTE(review): no compile()/fit() or weight-loading call for item_NN is
# visible in this part of the notebook -- confirm the weights are trained
# before relying on the vectors this model produces.
# `shape` is given as a one-element tuple (the documented form) rather than a
# bare int.
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))
vm_m = item_NN(input_item_m)
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)  # normalize along the feature axis
# Use the public tf.keras.Model so the model class comes from the same Keras
# implementation as the tf.keras Input layer used in this cell.
model_m = tf.keras.Model(input_item_m, vm_m)
model_m.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 41,376
Trainable params: 41,376
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Sanity check: show the number of item input features and the symbolic
# output tensor of the item model.
print(num_item_features)
print(vm_m)

16
KerasTensor(type_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name=None), name='tf.math.l2_normalize/l2_normalize:0', description="created by layer 'tf.math.l2_normalize'")


In [13]:
# Squared distance between two samples
def sq_dist(a, b):
    """Return the squared Euclidean distance between two vectors.

    The element-wise differences are squared and summed along the first
    axis in a single vectorized operation, computed in float64.

    Args:
        a: array-like of numbers.
        b: array-like of numbers with the same shape as ``a``.

    Returns:
        The sum over the first axis of ``(a - b) ** 2``; a scalar when the
        inputs are 1-D.

    Raises:
        ValueError: if ``a`` and ``b`` do not have the same shape.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    # A longer `b` used to be silently truncated; reject mismatches instead.
    if a.shape != b.shape:
        raise ValueError(
            f"sq_dist expects inputs of equal shape, got {a.shape} and {b.shape}"
        )
    return np.sum(np.square(a - b), axis=0)

# Look up the genre attribute of an item
def get_item_genre(item, ivs, item_features):
    """Return the name and offset of the first genre flagged on ``item``.

    Args:
        item: array of item values; the entries from index ``ivs`` onward
            are compared against 1.
        ivs: index at which the genre entries start.
        item_features: sequence of column names, indexed like ``item``.

    Returns:
        ``(genre, offset)`` where ``offset`` is the position, relative to
        ``ivs``, of the first entry equal to 1 and ``genre`` is
        ``item_features[ivs + offset]``.
    """
    genre_flags = item[ivs:] == 1
    flagged_positions = np.where(genre_flags)[0]
    first_flagged = flagged_positions[0]  # IndexError when no entry equals 1
    return item_features[ivs + first_flagged], first_flagged

In [14]:
# Scale the item vectors with the already-fitted item scaler, then run every
# column from i_s onward through the item model to get one output vector per
# row.
scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:, i_s:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

size of all predicted movie feature vectors: (1883, 32)


In [17]:
# Inspect the predicted item vectors.
print(vms)

[[-0.27082956 -0.08036376  0.00062436 ... -0.24820146  0.03875639
   0.08023484]
 [-0.17197903  0.01741263 -0.30231732 ... -0.30880255  0.20028582
   0.09363958]
 [-0.23659278  0.09339767  0.09067516 ... -0.22965577 -0.03743649
   0.12535354]
 ...
 [-0.04982761  0.2480078  -0.12861396 ... -0.14392868  0.17976898
   0.03090984]
 [ 0.04796767  0.28846937 -0.05266033 ... -0.09936561  0.00628154
   0.17299837]
 [-0.04397301  0.25368923 -0.04348531 ... -0.02347828  0.08373854
  -0.02823697]]


In [9]:
# Pairwise squared distances between all predicted item vectors.
# Uses the expansion ||u - v||^2 = ||u||^2 + ||v||^2 - 2 * (u . v) so the whole
# matrix comes from one matrix product instead of dim * dim Python-level
# function calls (the nested-loop version took a very long time to run).
dim = len(vms)
vms64 = np.asarray(vms, dtype=np.float64)  # float64 limits cancellation error
sq_norms = np.sum(np.square(vms64), axis=1)
dist = sq_norms[:, np.newaxis] + sq_norms[np.newaxis, :] - 2.0 * (vms64 @ vms64.T)
np.maximum(dist, 0.0, out=dist)  # round-off can leave tiny negative values
np.fill_diagonal(dist, 0.0)  # an item's distance to itself is exactly zero

[[0.         1.28204696 0.97717536 ... 1.26099219 1.30794076 0.84152463]
 [1.28204696 0.         1.16661835 ... 1.82232032 1.10724113 1.3384086 ]
 [0.97717536 1.16661835 0.         ... 1.28102649 0.83886599 1.48232408]
 ...
 [1.26099219 1.82232032 1.28102649 ... 0.         1.18314847 0.75234022]
 [1.30794076 1.10724113 0.83886599 ... 1.18314847 0.         1.3525712 ]
 [0.84152463 1.3384086  1.48232408 ... 0.75234022 1.3525712  0.        ]]


In [11]:
# Mask the diagonal so an item is never picked as its own nearest neighbour.
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))

count = 50  # maximum number of items to list
disp = [["movie1", "genres", "movie2", "genres"]]
# Clamp to the number of rows so that having fewer than `count` items does
# not raise an IndexError.
for i in range(min(count, m_dist.shape[0])):
    min_idx = np.argmin(m_dist[i])  # closest other item to item i
    movie1_id = int(item_vecs[i, 0])  # column 0 is used as the movie id
    movie2_id = int(item_vecs[min_idx, 0])
    # Genres are read from the unscaled item_vecs rows.
    genre1, _ = get_item_genre(item_vecs[i, :], ivs, item_features)
    genre2, _ = get_item_genre(item_vecs[min_idx, :], ivs, item_features)

    disp.append([movie_dict[movie1_id]['title'], genre1,
                 movie_dict[movie2_id]['title'], genre2])

# NOTE(review): every displayed column holds text and floatfmt lists five
# formats for four columns, so it appears to have no effect -- confirm before
# removing it.
table = tabulate.tabulate(disp, tablefmt='grid', headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
print(table)

+----------------------------------------+-----------+----------------------------------------+-----------+
| movie1                                 | genres    | movie2                                 | genres    |
| Save the Last Dance (2001)             | Drama     | Planet of the Apes (2001)              | Drama     |
+----------------------------------------+-----------+----------------------------------------+-----------+
| Save the Last Dance (2001)             | Romance   | Wedding Planner, The (2001)            | Romance   |
+----------------------------------------+-----------+----------------------------------------+-----------+
| Wedding Planner, The (2001)            | Comedy    | Spy Kids (2001)                        | Comedy    |
+----------------------------------------+-----------+----------------------------------------+-----------+
| Wedding Planner, The (2001)            | Romance   | Save the Last Dance (2001)             | Romance   |
+---------------------------