# Movie_Recommendation
## Read the data

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

### Reading the full dataset can raise a MemoryError
#### Attempt to read the data in chunks (this also fails)

In [None]:
# Abandoned attempt (kept commented out for reference): read the CSV at `path`
# through a pandas iterator in chunks of 1000 rows, collect the chunks in a
# list and concatenate them into a single DataFrame.
# NOTE(review): `f = open(path)` is never used or closed, and `index` is only
# a counter for the commented-out debug print. The message printed before the
# concat means "start merging".
# def data(path):
#     f = open(path)
    
#     data = pd.read_csv(path, sep=',', engine='python', iterator=True)
#     loop = True
#     chunkSize = 1000
#     chunks = []
#     index = 0
#     while loop:
#         try:
# #             print(index)
#             chunk = data.get_chunk(chunkSize)
#             chunks.append(chunk)
#             index += 1

#         except StopIteration:
#             loop = False
#             print("Iteration is stopped.")
#     print('开始合并')
#     data = pd.concat(chunks, ignore_index=True)
#     return data

In [None]:
# ratings_df = data('ml-25m/ratings.csv')
# ratings_df

In [None]:
# Load the movie table first and attach each movie's row label as an explicit
# 'movieRow' column; it is used later as the row index of the rating matrix.
movies_df = pd.read_csv('ml-25m/movies.csv')
movies_df = movies_df.assign(movieRow=movies_df.index)

# Load the ratings table.
ratings_df = pd.read_csv('ml-25m/ratings.csv')

In [None]:
# Inspect the column dtypes pandas inferred for the ratings table.
ratings_df.dtypes

In [None]:
# Shrink the ratings table by downcasting only the columns this notebook uses,
# each to a dtype that can still hold its values:
#   * ids -> int32: int16 tops out at 32767, so larger ids would wrap around
#     and the later merge on 'movieId' would pair ratings with wrong movies;
#   * rating -> float32: an integer dtype would truncate fractional ratings
#     (e.g. 3.5 -> 3).
# Columns not listed here keep their original dtype.
# NOTE(review): with ids no longer wrapped, the dense movie x user matrix
# built further down may be larger than before - confirm it fits in memory.
ratings_df = ratings_df.astype({
    'userId': np.int32,
    'movieId': np.int32,
    'rating': np.float32,
})
ratings_df

In [None]:
# Preview the last rows of the ratings table.
ratings_df.tail()

In [None]:
# Preview the last rows of the movies table.
movies_df.tail()

## Reduce the data volume

In [None]:
# Keep only the first 1,000,000 rating rows (in file order) so the rest of
# the notebook works on a smaller table, then preview the end of the result.
ratings_df = ratings_df.head(1000000)
ratings_df.tail()

In [None]:
# Keep only the movie columns needed downstream: the matrix row number,
# the id used for joining, and the title used for display.
wanted_columns = ['movieRow', 'movieId', 'title']
movies_df = movies_df.loc[:, wanted_columns]
del wanted_columns
movies_df.tail()

In [None]:
# Replace movieId in ratings_df with the movie's row number: join the two
# tables on 'movieId' (pandas' default join type), then keep only the user,
# the movie row and the rating.
ratings_df = ratings_df.merge(movies_df, on='movieId')
ratings_df = ratings_df.loc[:, ['userId', 'movieRow', 'rating']]

In [None]:
# Display the merged (userId, movieRow, rating) table.
ratings_df

## Build rating matrix

In [None]:
# Sizes of the rating matrix. userId and movieRow are used directly as array
# indices later on, so each dimension must be (largest index + 1).
userNo = ratings_df['userId'].max() + 1
movieNo = ratings_df['movieRow'].max() + 1
# The label previously read "min movieNo" although the value is derived from
# the maximum movieRow; both values are matrix dimensions.
print('userNo: {}, movieNo: {}'.format(userNo, movieNo))

In [None]:
# Dense rating matrix: one row per movie (movieRow), one column per user
# (userId). Cells for which no rating exists stay 0.
rating = np.zeros((movieNo, userNo))

# Fill every (movieRow, userId) cell with its rating in one vectorized
# assignment instead of walking the table with DataFrame.iterrows(), which is
# very slow on large tables. If a (movie, user) pair occurs more than once the
# last occurrence wins, exactly as it did with the row-by-row loop.
movie_rows = ratings_df['movieRow'].to_numpy().astype(int)
user_ids = ratings_df['userId'].to_numpy().astype(int)
rating[movie_rows, user_ids] = ratings_df['rating'].to_numpy()

# Drop the temporary index arrays; only `rating` is needed from here on.
del movie_rows, user_ids

In [None]:
# Display the movie x user rating matrix.
rating

In [None]:
# Boolean mask: True where a rating is stored (value > 0).
rated_mask = rating > 0
print(rated_mask)
# Same mask as integers: 0 = the user has not rated this movie,
# 1 = the user has rated this movie.
record = rated_mask.astype(int)
del rated_mask
print(record)

## Normalize ratings 

In [None]:
def normalizeRatings(rating, record):
    """Mean-center every movie's ratings over the users who rated it.

    Args:
        rating: 2-D NumPy array of raw ratings, one row per movie and one
            column per user.
        record: NumPy array of the same shape; a non-zero entry marks a cell
            of `rating` that holds an actual rating.

    Returns:
        A tuple ``(rating_norm, rating_mean)``:

        * ``rating_norm`` has the shape of `rating`; rated cells hold
          ``rating - movie mean`` and unrated cells hold 0.
        * ``rating_mean`` has shape ``(m, 1)`` and holds each movie's mean
          rating, or 0 for a movie nobody rated.
    """
    # m: number of movies, n: number of users
    m, n = rating.shape

    rating_mean = np.zeros((m, 1))
    rating_norm = np.zeros((m, n))
    for i in range(m):
        # Columns (users) that actually rated movie i.
        idx = record[i, :] != 0

        # A movie without ratings keeps mean 0 and an all-zero row; taking
        # the mean of an empty selection would yield NaN plus a warning.
        if not idx.any():
            continue

        rating_mean[i] = np.mean(rating[i, idx])

        # Normalized rating = raw rating - movie mean. Subtracting the mean
        # from the zero-initialised row alone would store -mean instead.
        rating_norm[i, idx] = rating[i, idx] - rating_mean[i]

    return rating_norm, rating_mean

In [None]:
# Compute the per-movie normalized ratings and the per-movie mean ratings.
rating_norm, rating_mean = normalizeRatings(rating, record)

In [None]:
# Display the per-movie mean ratings.
rating_mean

In [None]:
# Display the normalized rating matrix.
rating_norm

In [None]:
# Replace any NaN entries with 0 (the default replacement of np.nan_to_num).
rating_norm = np.nan_to_num(rating_norm)
rating_norm

In [None]:
# Replace any NaN means (e.g. from movies without ratings) with 0.
rating_mean = np.nan_to_num(rating_mean)
rating_mean

## Build the model

In [None]:
# Number of latent features learned per movie and per user.
num_features = 10
# Movie feature matrix, shape (movieNo, num_features), randomly initialised.
X_parameters = tf.Variable(
    tf.random.uniform([movieNo, num_features]))
# User preference matrix, shape (userNo, num_features), randomly initialised.
Theta_parameters = tf.Variable(
    tf.random.uniform([userNo, num_features]))
# tf.Variable() creates the trainable variables.
# tf.random.uniform() is the name used in TF 2.0.

In [None]:
# matmul: dot product a and b, transpose_b: transpose matrix b if True.
# tf.reduce_sum: sum function
# Loss = 1/2 * sum of squared errors between the predicted normalized ratings
# (X_parameters @ Theta_parameters^T) and rating_norm, counted only where
# record is non-zero, plus 1/2 * the sum of squares of both parameter matrices
# as a regularization term.
loss = 1 / 2 * tf.reduce_sum(
    ((tf.matmul(X_parameters, Theta_parameters, transpose_b=True) -
      rating_norm) * record)**2) + 1 / 2 * (tf.reduce_sum(X_parameters**2) +
                                            tf.reduce_sum(Theta_parameters**2))
# Original note: "content-based recommendation algorithm model".
# NOTE(review): both factor matrices are learned from ratings alone, which
# looks like collaborative filtering rather than content-based - confirm.

In [None]:
# Use the Adam algorithm, constructed with 1e-4, to minimize the loss.
# NOTE(review): tf.train.AdamOptimizer is a TF 1.x-style API while other
# cells mention TF 2.0 - confirm which TensorFlow version this notebook targets.
optimizer = tf.train.AdamOptimizer(1e-4)
# https://blog.csdn.net/lenbow/article/details/52218551
# Training op: each run performs one optimization step on `loss`.
train = optimizer.minimize(loss)

## Train the model

In [None]:
# How to use tf.summary: https://www.cnblogs.com/lyc-seu/p/8647792.html
# Register the loss as a scalar summary named 'loss'.
tf.summary.scalar('loss', loss)
# Used to display scalar information.

In [None]:
# tf.summary.merge_all
# merge_all collects all registered summaries so they can be written to disk
# and shown in TensorBoard.
# tensorboard --logdir to show tensorboard
summaryMerged = tf.summary.merge_all()

# Directory the summaries are written to.
# NOTE(review): the directory name is spelled 'tensorborad'; use the same
# spelling when pointing tensorboard --logdir at it.
filename = './movie_tensorborad'
writer = tf.summary.FileWriter(filename)

# NOTE(review): neither `writer` nor `sess` is closed anywhere in the visible
# cells - consider closing them once training and evaluation are done.
sess = tf.Session()

# should use variables_init to init variables when you used tf.Variable() in your program.
init = tf.global_variables_initializer()
sess.run(init)

In [None]:
# Run 5000 training steps, logging the merged summary after every step.
for i in range(5000):
    # One optimization step; also evaluate the merged summaries.
    _, movie_summary = sess.run([train, summaryMerged])
    # Store the evaluated summaryMerged result in movie_summary and write it
    # out with the step number i.
    writer.add_summary(movie_summary, i)

## Evaluate the model performance

In [None]:
# Fetch the trained parameter values from the session.
Current_X_parameters, Current_Theta_parameters = sess.run(
    [X_parameters, Theta_parameters])
# Current_X_parameters is the movie feature matrix (movieNo x num_features);
# Current_Theta_parameters is the user preference matrix
# (userNo x num_features).
# Predicted rating = movie features . user preferences + the movie's mean
# rating (the model was trained on mean-centered ratings).
# np.dot is NumPy's matrix product; np.dot(x, y) is equivalent to x.dot(y).
predicts = np.dot(Current_X_parameters,
                  Current_Theta_parameters.T) + rating_mean
# Root of the summed squared prediction error, restricted to cells that hold
# an actual rating (record == 1). Unrated cells are stored as 0 in `rating`
# and would otherwise be scored as if the true rating were 0.
errors = np.sqrt(np.sum(((predicts - rating) * record)**2))
errors

## Build the recommender system

In [None]:
# Ask which user to recommend for (the prompt asks for the user number).
user_id = input('您要想哪位用户进行推荐？请输入用户编号：')
# Parse the id once instead of on every loop iteration.
user_col = int(user_id)
# Reject ids outside the matrix explicitly: a negative id would otherwise
# silently wrap around and select a different user's column.
if not 0 <= user_col < predicts.shape[1]:
    raise IndexError('user id {} is out of range [0, {})'.format(
        user_col, predicts.shape[1]))

# argsort() returns indices ordered from the smallest to the largest value;
# argsort()[::-1] reverses that, i.e. best predicted rating first.
sortedResult = predicts[:, user_col].argsort()[::-1]
# center() pads the heading on both sides (here with '=') to a width of 80.
print('为该用户推荐的评分最高的20部电影是：'.center(80,'='))
# Rows of predicts correspond to movieRow, which is the row position in
# movies_df, so .iloc[i] looks up the matching title.
# .iloc usage: https://www.cnblogs.com/harvey888/p/6006200.html
titles = movies_df['title']
# Print the 20 movies with the highest predicted rating for this user.
for i in sortedResult[:20]:
    print('评分: %.2f, 电影名: %s' % (predicts[i, user_col], titles.iloc[i]))