In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
print(tf.__version__)
tf.compat.v1.disable_eager_execution()

2.4.1


## 数据清洗

In [2]:
#读取评分表
ratings_df = pd.read_csv('ml-latest-small\\ratings.csv')
ratings_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [3]:
#读取电影表
movies_df = pd.read_csv('ml-latest-small\\movies.csv')
movies_df.tail()

Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [4]:
#加入索引行
movies_df['MovieRow'] = movies_df.index
movies_df.tail()

Unnamed: 0,movieId,title,genres,MovieRow
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,9737
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,9738
9739,193585,Flint (2017),Drama,9739
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,9740
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,9741


## 特征提取

In [5]:
#提取索引、电影id和标题
movies_df = movies_df[['MovieRow', 'movieId', 'title']]
movies_df.tail()

Unnamed: 0,MovieRow,movieId,title
9737,9737,193581,Black Butler: Book of the Atlantic (2017)
9738,9738,193583,No Game No Life: Zero (2017)
9739,9739,193585,Flint (2017)
9740,9740,193587,Bungo Stray Dogs: Dead Apple (2018)
9741,9741,193609,Andrew Dice Clay: Dice Rules (1991)


In [6]:
#暂存到文件
movies_df.to_csv('ml-latest-small\\moviesProcessed.csv', index = False, header = True, encoding = 'utf-8')

In [7]:
#合并两表
ratings_df = pd.merge(ratings_df, movies_df, on = 'movieId')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,MovieRow,title
0,1,1,4.0,964982703,0,Toy Story (1995)
1,5,1,4.0,847434962,0,Toy Story (1995)
2,7,1,4.5,1106635946,0,Toy Story (1995)
3,15,1,2.5,1510577970,0,Toy Story (1995)
4,17,1,4.5,1305696483,0,Toy Story (1995)


In [8]:
#特征提取
ratings_df = ratings_df[['userId', 'MovieRow', 'rating']]
ratings_df.head()

Unnamed: 0,userId,MovieRow,rating
0,1,0,4.0
1,5,0,4.0
2,7,0,4.5
3,15,0,2.5
4,17,0,4.5


## 创建评分矩阵rating和评分记录矩阵record

In [9]:
#获取评分的用户和电影的数量
userNo = ratings_df['userId'].max() + 1
movieNo = ratings_df['MovieRow'].max() + 1

rating = np.zeros((movieNo, userNo))

#遍历矩阵，将评分填入表中
for  index, row in ratings_df.iterrows():
    rating[int(row['MovieRow']), int(row['userId'])] = row['rating']

rating

array([[0. , 4. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [0. , 4. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [10]:
#record记录用户是否评分，1表示评分，0表示没有
record = rating > 0
record = np.array(record, dtype = int)
record

array([[0, 1, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## 构建模型

In [11]:
#标准化评分
def normalizeRatings(rating, record):
    rating = np.nan_to_num(rating)
    #获取电影数量m和用户数量n
    m, n = rating.shape
    #rating_mean 电影平均分
    #rating_norm 标准化后的电影评分
    rating_mean = np.zeros((m, 1))
    rating_norm = np.zeros((m, n))
    for i in range(m):
        index = record[i, :] != 0
        #如果一个评分都没有就跳过
        if True not in index: 
            continue
        rating_mean[i] = np.mean(rating[i, index])
        rating_norm[i, index] -= rating_mean[i]
    return rating_norm, rating_mean

rating_norm, rating_mean = normalizeRatings(rating, record)
rating_norm

array([[ 0.        , -3.92093023,  0.        , ..., -3.92093023,
        -3.92093023, -3.92093023],
       [ 0.        ,  0.        ,  0.        , ..., -3.43181818,
         0.        ,  0.        ],
       [ 0.        , -3.25961538,  0.        , ..., -3.25961538,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## 损失函数
## $J(\theta)=\dfrac{1}{2}\sum_{j=1}^{u}\sum_{i,r(i,j)=1}{}((\theta^{j})^{T}x^{i}-y^{(i,j)})^2+\dfrac{\lambda}{2}\sum_{j=1}^{u}\sum_{k=1}^{n}(\theta_{k}^{j})^2$
### 其中，x表示电影的标签，大小1\*n，$\theta$表示用户j的喜好，大小也是1\*n，这里的n就是tags的数量。y(i,j)表示用户j对电影i的评分，u是用户数量，后面一部分是正则化项，防止过拟合。

In [12]:
#设置参数，其中X_parameters对应x^i，Theta_parameters对应\theta^j
num_features = 10
X_parameters = tf.Variable(tf.random.normal([movieNo, num_features], stddev = 0.35))
Theta_parameters = tf.Variable(tf.random.normal([userNo, num_features], stddev = 0.35))
X_parameters

<tf.Variable 'Variable:0' shape=(9742, 10) dtype=float32>

In [32]:
#损失函数
with tf.GradientTape() as tape:
    loss = 1/2 * tf.reduce_sum(((tf.matmul(X_parameters, Theta_parameters, transpose_b=True) - rating_norm) * record) ** 2) + 1/2 * (tf.reduce_sum(X_parameters ** 2) + tf.reduce_sum(Theta_parameters ** 2))
#Adam算法优化器，设置学习率为0.0001
optimizer = tf.optimizers.Adam(1e-4)
var_list = [X_parameters, Theta_parameters]
train = optimizer.minimize(loss, var_list = var_list, tape = tape)
#训练模型
tf.compat.v1.summary.scalar('loss', loss)

<tf.Tensor 'loss_2:0' shape=() dtype=string>

## 训练模型

In [33]:
#保存结果
#指定一个文件用来保存图。
filename = './movie_tensorboard'
writer = tf.compat.v1.summary.FileWriter(filename)
#定义一个session
sess = tf.compat.v1.Session()
#运行session
init = tf.compat.v1.global_variables_initializer()
#merge_all 可以将所有summary全部保存到磁盘，以便tensorboard显示。
summaryMerged = tf.compat.v1.summary.merge_all()
sess.run(init)
print(type(summaryMerged))

<class 'tensorflow.python.framework.ops.Tensor'>


In [34]:
#递归5000次
for i in range(5000):
    _, movie_summary = sess.run([train, summaryMerged])
    writer.add_summary(movie_summary, i)

## 模型评估

In [35]:
Current_X_parameters, Current_Theta_parameters = sess.run(var_list)
predicts = np.dot(Current_X_parameters, Current_Theta_parameters.T) + rating_mean
errors = np.sqrt(np.sum((predicts - rating) ** 2))
errors

3791.2537246049706

## 电影推荐

In [42]:
userId = input('请输入用户编号：(smaller than 610)')
sortedResult = predicts[:, int(userId)].argsort()[::-1]
print('为该用户推荐的评分最高的20部电影是：'.center(80, '='))
index = 0
for i in sortedResult:
    print('score: %.3f, movie name: %s' % (predicts[i, int(userId)], movies_df.iloc[i]['title']))
    index += 1
    if index == 20: break

请输入用户编号：(smaller than 610)14
score: 4.891, movie name: Devil and Daniel Johnston, The (2005)
score: 4.744, movie name: Supercop 2 (Project S) (Chao ji ji hua) (1993)
score: 4.683, movie name: Che: Part One (2008)
score: 4.554, movie name: Harlan County U.S.A. (1976)
score: 4.546, movie name: Frozen River (2008)
score: 4.476, movie name: Laggies (2014)
score: 4.469, movie name: Palindromes (2004)
score: 4.461, movie name: Crippled Avengers (Can que) (Return of the 5 Deadly Venoms) (1981)
score: 4.435, movie name: Battle For Sevastopol (2015)
score: 4.423, movie name: Year of the Horse (1997)
score: 4.403, movie name: Awfully Big Adventure, An (1995)
score: 4.402, movie name: Black Tar Heroin: The Dark End of the Street (2000)
score: 4.375, movie name: Déjà Vu (1997)
score: 4.354, movie name: Gena the Crocodile (1969)
score: 4.340, movie name: Animals are Beautiful People (1974)
score: 4.298, movie name: Siam Sunset (1999)
score: 4.248, movie name: Man and a Woman, A (Un homme et une fem