In [66]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [67]:
# Load the raw table. get_ratings_matrix() skips the first column and reads the next
# three positionally as (user id, item id, rating).
# NOTE(review): confirm the CSV really has that column layout.
df = pd.read_csv('timing_flattern_3列.csv')

In [68]:
def get_ratings_matrix(df, train_size=0.75, random_state=None):
    """Split the ratings into train/test and build the dense training matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        The first column is skipped; the next three columns are read
        positionally as (user id, item id, rating).
    train_size : float or int, default 0.75
        Forwarded to ``train_test_split`` as ``train_size``.  (It used to be
        ignored in favour of a hard-coded 80/20 split.)
    random_state : int or None, default None
        Forwarded to ``train_test_split`` so the split can be made reproducible.

    Returns
    -------
    tuple
        ``(R, train_set, test_set, n_dims, n_users, n_movies,
        user_to_row, movie_to_column)`` where ``R`` has shape
        ``(n_users, n_movies)`` and holds the training ratings (cells without
        a training rating stay 0), ``train_set`` / ``test_set`` are arrays of
        rows, and the two dicts map raw ids to row / column indices of ``R``.
    """
    # Drop the first column; what remains is (user id, item id, rating, ...).
    df_values = df.values[:, 1:]
    n_dims = 10  # number of latent dimensions handed back to the caller

    # np.unique returns sorted ids, so the index assignment is deterministic.
    uniq_users = np.unique(df_values[:, 0])
    uniq_movies = np.unique(df_values[:, 1])
    user_to_row = {user_id: row for row, user_id in enumerate(uniq_users)}
    movie_to_column = {movie_id: col for col, movie_id in enumerate(uniq_movies)}

    n_users = len(uniq_users)
    n_movies = len(uniq_movies)

    train_set, test_set = train_test_split(
        df_values, train_size=train_size, random_state=random_state
    )

    # Only training ratings go into R.  Index through the id mappings rather
    # than casting raw ids to int, so ids need not already be 0..n-1 and R
    # stays consistent with lookups made through user_to_row / movie_to_column.
    R = np.zeros((n_users, n_movies))
    for row in train_set:
        R[user_to_row[row[0]], movie_to_column[row[1]]] = row[2]

    return R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column


In [69]:
# Build the ratings matrix and the train/test split; 0.8 is passed as train_size.
R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column = get_ratings_matrix(df, 0.8)

# Sanity-check the matrix dimensions (rows = users, columns = movies).
print("R.shape",R.shape)
print("n_users:",n_users)
print("n_movies:",n_movies)
# Optional debug output (left disabled):
# print("user_to_row:",user_to_row)
# print("movie_to_column:",movie_to_column)
# print("R:",R)

R.shape (1500, 18)
n_users: 1500
n_movies: 18


In [70]:
# Shared model state, filled by initialize_parameters() and read by the other cells.
parameters = dict()


# Initialize U and V
def initialize_parameters(lambda_U, lambda_V):
    """Reset the latent factor matrices and store the regularisation weights.

    Writes four entries into the module-level ``parameters`` dict:

    * ``'U'``: zeros of shape ``(n_dims, n_users)``
    * ``'V'``: normal draws with mean 0 and scale ``1 / lambda_V``, shape
      ``(n_dims, n_movies)``
    * ``'lambda_U'`` / ``'lambda_V'``: the arguments, stored unchanged

    ``n_dims``, ``n_users`` and ``n_movies`` are read from notebook globals.
    Returns nothing.
    """
    # NOTE(review): np.random.normal's second argument is a standard deviation,
    # so 1 / lambda_V is used as a std here, not a variance -- confirm intent.
    user_factors = np.zeros(shape=(n_dims, n_users), dtype=np.float64)
    item_factors = np.random.normal(loc=0.0, scale=1.0 / lambda_V, size=(n_dims, n_movies))

    parameters.update(
        U=user_factors,
        V=item_factors,
        lambda_U=lambda_U,
        lambda_V=lambda_V,
    )

In [71]:
# Update U and V
def update_parameters():
    """Run one alternating least-squares sweep over the factors in ``parameters``.

    For every user ``i`` the column ``U[:, i]`` is replaced by the solution of

        (V_j V_j^T + lambda_U * I) u = V_j r_i

    where ``V_j`` holds the factor columns of the items that user has rated
    (``R[i, :] > 0``) and ``r_i`` the matching ratings.  Afterwards every item
    column ``V[:, j]`` is updated the same way from the freshly updated ``U``.

    ``parameters['U']`` and ``parameters['V']`` are modified in place; the
    function reads ``R``, ``n_users``, ``n_movies`` and ``n_dims`` from the
    notebook globals and returns nothing.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a system matrix is singular (possible when the matching lambda is 0).
    """
    U = parameters['U']
    V = parameters['V']
    lambda_U = parameters['lambda_U']
    lambda_V = parameters['lambda_V']

    # The ridge terms do not depend on the loop index, so build them once.
    ridge_U = lambda_U * np.identity(n_dims)
    ridge_V = lambda_V * np.identity(n_dims)

    for i in range(n_users):
        rated = R[i, :] > 0  # items with a (positive) training rating from user i
        V_j = V[:, rated]
        # Solve the linear system directly instead of forming an explicit inverse.
        U[:, i] = np.linalg.solve(np.dot(V_j, V_j.T) + ridge_U, np.dot(V_j, R[i, rated]))

    for j in range(n_movies):
        raters = R[:, j] > 0  # users with a (positive) training rating for item j
        U_i = U[:, raters]
        V[:, j] = np.linalg.solve(np.dot(U_i, U_i.T) + ridge_V, np.dot(U_i, R[raters, j]))

    parameters['U'] = U
    parameters['V'] = V

In [72]:
# Log-a-posteriori objective (larger is better).

def log_a_posteriori():
    """Return the log-posterior objective for the current ``parameters``.

    Computes

        -0.5 * ( sum_observed (R_ij - u_i . v_j)^2
                 + lambda_U * ||U||_F^2
                 + lambda_V * ||V||_F^2 )

    where "observed" means ``R > 0``.  ``R`` is read from the notebook
    globals; ``U``, ``V`` and both lambdas come from ``parameters``.

    The regularisation terms are squared Frobenius norms, i.e. the sum of the
    squared entries.  Summing every entry of ``U @ U.T`` (as was done before)
    also adds the off-diagonal cross terms and is therefore not that norm.
    """
    lambda_U = parameters['lambda_U']
    lambda_V = parameters['lambda_V']
    U = parameters['U']
    V = parameters['V']

    observed = R > 0
    residuals = R[observed] - np.dot(U.T, V)[observed]

    squared_error = np.sum(residuals ** 2)
    penalty_U = lambda_U * np.sum(U ** 2)
    penalty_V = lambda_V * np.sum(V ** 2)

    return -0.5 * (squared_error + penalty_U + penalty_V)

The `predict` function returns the predicted rating for a given `user_id` and `movie_id`. The raw prediction is min-max rescaled to the 0–5 range using the `min_rating` and `max_rating` values stored in `parameters`.


In [73]:
def predict(user_id, movie_id):
    """Predict the rescaled rating of ``movie_id`` by ``user_id``.

    The raw score is the inner product of the user's column of
    ``parameters['U']`` and the movie's column of ``parameters['V']``; the
    columns are located through ``user_to_row`` / ``movie_to_column``.  The
    score is then min-max scaled with ``parameters['min_rating']`` and
    ``parameters['max_rating']`` and multiplied by 5.

    Returns 0 when ``max_rating`` equals ``min_rating``.
    """
    user_factors = parameters['U']
    movie_factors = parameters['V']

    user_vec = user_factors[:, user_to_row[user_id]]
    movie_vec = movie_factors[:, movie_to_column[movie_id]]
    # (1, n_dims) @ (n_dims, 1) -> 1x1 matrix; pull out the scalar.
    raw_score = (user_vec.reshape(1, -1) @ movie_vec.reshape(-1, 1))[0][0]

    upper = parameters['max_rating']
    lower = parameters['min_rating']

    # A degenerate scale would divide by zero, so short-circuit to 0.
    if upper == lower:
        return 0
    return ((raw_score - lower) / (upper - lower)) * 5.0

The `evaluate` function calculates the RMSE (root-mean-square error) of the model's predictions on a given dataset (train or test).


In [74]:
def evaluate(dataset):
    """Return the RMSE between the true ratings in ``dataset`` and ``predict``.

    Parameters
    ----------
    dataset : pandas.DataFrame or array-like
        * DataFrame: must provide the columns ``'userId'``, ``'movieId'`` and
          ``'rating'``.
        * Anything else is converted with ``np.asarray`` and read positionally
          as ``(user id, movie id, rating)`` per row, which is the layout of
          the arrays returned by ``get_ratings_matrix``.

    Returns
    -------
    float
        Root-mean-square error of the predictions.

    Raises
    ------
    ValueError
        If the dataset is empty or an array input does not have at least
        three columns.
    """
    if isinstance(dataset, pd.DataFrame):
        user_ids = dataset['userId'].to_numpy()
        movie_ids = dataset['movieId'].to_numpy()
        ground_truths = dataset['rating'].to_numpy(dtype=float)
    else:
        rows = np.asarray(dataset)
        if rows.ndim != 2 or rows.shape[1] < 3:
            raise ValueError("dataset must have rows of (user id, movie id, rating)")
        user_ids = rows[:, 0]
        movie_ids = rows[:, 1]
        ground_truths = rows[:, 2].astype(float)

    if len(ground_truths) == 0:
        raise ValueError("cannot evaluate an empty dataset")

    predictions = np.array(
        [predict(user_id, movie_id) for user_id, movie_id in zip(user_ids, movie_ids)],
        dtype=float,
    )

    # RMSE computed directly with numpy so the result does not depend on the
    # `squared=` keyword of sklearn's mean_squared_error.
    return float(np.sqrt(np.mean((ground_truths - predictions) ** 2)))