In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import pandas as pd
import numpy as np
import math
from scipy.spatial import distance
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


### EXAMPLE ONE - USER MOVIE RATINGS

In [8]:
def MatrixFactorization(data_file, target_u, target_m):
    
    #to read the file with index col
    df_utility = pd.read_csv(data_file).set_index("movies") 
    print(df_utility.head())
    
    cols = list(df_utility.columns)
    indices = list(df_utility.index)

    #Decompose the utility matrix into three matrices (U, Σ, V⊺) using SVD
    U, S, Vh = np.linalg.svd(df_utility, full_matrices=False)
    
    # create n x n Sigma matrix
    Sigma = np.diag(S)
    
    #approximate the original matrix by considering only the first k singular values
    #reconstruct the utility matrix using the reduced matrices
    reconstruct_matrix = U.dot(Sigma.dot(Vh))
    
    #round the values in the reconstructed matrix to the nearest integer
    reconstruct_matrix = np.round(reconstruct_matrix, 2)
    
    #Repair: estimations that fall between 1 and 5 (e.g., you can set any value < 1 to 1 and any value > 5 to 5).
    for i in range(len(reconstruct_matrix)):
        for j in range(len(reconstruct_matrix[i])):
            if reconstruct_matrix[i][j] <= 0:
                reconstruct_matrix[i][j] = 1
            elif reconstruct_matrix[i][j] > 5:
                reconstruct_matrix[i][j] = 5
                
    df_movie_prediction = pd.DataFrame(reconstruct_matrix, index=indices, columns=cols)
    rating = df_movie_prediction.loc[target_m, target_u]
    
    return df_movie_prediction, rating

In [9]:
data_file = "f22_utility_mat.csv"

# target user and movie, respectively
target_u, target_m = "u5", "m2" 

df_MFPrediction, rating_MF = MatrixFactorization(data_file, target_u, target_m)
df_MFPrediction
rating_MF

        u1  u2  u3  u4  u5  u6  u7  u8  u9  u10  u11  u12
movies                                                   
m1       1   0   3   0   0   5   0   0   5    0    4    0
m2       0   0   5   4   0   0   4   0   0    2    1    3
m3       2   4   0   1   2   0   3   0   4    3    5    0
m4       0   2   4   0   5   0   0   4   0    0    2    0
m5       0   0   4   3   4   2   0   0   0    0    2    5


Unnamed: 0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12
m1,1.0,1.0,3.0,1.0,1.0,5.0,1.0,1.0,5.0,1.0,4.0,1.0
m2,1.0,1.0,5.0,4.0,1.0,1.0,4.0,1.0,1.0,2.0,1.0,3.0
m3,2.0,4.0,1.0,1.0,2.0,1.0,3.0,1.0,4.0,3.0,5.0,1.0
m4,1.0,2.0,4.0,1.0,5.0,1.0,1.0,4.0,1.0,1.0,2.0,1.0
m5,1.0,1.0,4.0,3.0,4.0,2.0,1.0,1.0,1.0,1.0,2.0,5.0
m6,1.0,1.0,3.0,1.0,3.0,1.0,1.0,2.0,1.0,1.0,4.0,1.0


1.0

### EXAMPLE TWO - BOOK RATINGS

In [13]:
#Implement matrix factorization algorithm using Singular Value Decomposition (SVD)
#to predict the missing ratings and recommend the top 2 genres for a target user.
def GenreMatrixFactorization(csv_file_name, target_u, k=2):
    
    #read the genre ratings csv
    genre_ratings = pd.read_csv(csv_file_name)
    print(genre_ratings.head())
    genre_ratings.drop(columns=['genres'])   
    
    
    cols = list(genre_ratings.columns)[1:]
    indices = list(genre_ratings["genres"])
    
    #Decompose the utility matrix into three matrices (U, Σ, V⊺) using SVD
    U, S, Vh = np.linalg.svd(genre_ratings.drop(columns=['genres']), full_matrices=False)
    
    # create n x n Sigma matrix
    Sigma = np.diag(S)
    
    #approximate the original matrix by considering only the first k singular values
    #reconstruct the utility matrix using the reduced matrices
    reconstruct_matrix = U[:,:k].dot(Sigma[0:k,:k].dot(Vh[:k,:]))
    
    #round the values in the reconstructed matrix to the nearest integer
    reconstruct_matrix = np.round(reconstruct_matrix)
    
    #adjust the estimated ratings to ensure they fall within the 1 to 10 range
    for i in range(len(reconstruct_matrix)):
        for j in range(len(reconstruct_matrix[i])):
            if reconstruct_matrix[i][j] <= 0:
                reconstruct_matrix[i][j] = 1
            elif reconstruct_matrix[i][j] > 10:
                reconstruct_matrix[i][j] = 10
    
    df_genre_prediction = pd.DataFrame(reconstruct_matrix, index=indices, columns=cols)
    
    #Identify the top K genres that the target user is predicted to rate the highest and return them as a dictionary
    #along with the predicted ratings dataframe.
    ratings_target_u = df_genre_prediction[target_u]
    top_genres = {indices[i]:ratings_target_u[i] for i in range(len(indices))}
    top_genres = dict(sorted(top_genres.items(), key=lambda item: item[1], reverse=True)[:k])
    
    return df_genre_prediction, top_genres

In [14]:
target_u = "u5"
df_genre_prediction, top_genres = GenreMatrixFactorization("genre_ratings.csv", target_u, k=2)
print(df_genre_prediction)
print()
print(top_genres)

  genres  u1  u2  u3  u4  u5  u6  u7  u8  u9  u10
0     g1   5   7   6   5   3   6   3   9  10    3
1     g2   8   5   5  10   0   0   0   8   8    9
2     g3  10   8   4   8   0   6   0  10   5    3
3     g4   1   8   4   5   0   6   6   6  10    7
4     g5   2   3   7   0   6   6   1   0   2    3
     u1   u2   u3   u4   u5   u6   u7    u8   u9  u10
g1  6.0  7.0  7.0  6.0  2.0  5.0  3.0   7.0  8.0  5.0
g2  7.0  7.0  3.0  9.0  1.0  3.0  1.0  10.0  8.0  6.0
g3  7.0  7.0  4.0  8.0  1.0  4.0  1.0   9.0  8.0  6.0
g4  6.0  7.0  7.0  5.0  2.0  5.0  3.0   6.0  8.0  5.0
g5  1.0  3.0  7.0  1.0  3.0  4.0  4.0   1.0  4.0  2.0
g6  2.0  4.0  7.0  1.0  3.0  4.0  4.0   1.0  5.0  2.0

{'g5': 3.0, 'g6': 3.0}


  top_genres = {indices[i]:ratings_target_u[i] for i in range(len(indices))}


In [15]:
target_u = "u5"
df_genre_prediction, top_genres = GenreMatrixFactorization("genre_ratings.csv", target_u, k=2)
print(df_genre_prediction)
print()
print(top_genres)

  genres  u1  u2  u3  u4  u5  u6  u7  u8  u9  u10
0     g1   5   7   6   5   3   6   3   9  10    3
1     g2   8   5   5  10   0   0   0   8   8    9
2     g3  10   8   4   8   0   6   0  10   5    3
3     g4   1   8   4   5   0   6   6   6  10    7
4     g5   2   3   7   0   6   6   1   0   2    3


  top_genres = {indices[i]:ratings_target_u[i] for i in range(len(indices))}


     u1   u2   u3   u4   u5   u6   u7    u8   u9  u10
g1  6.0  7.0  7.0  6.0  2.0  5.0  3.0   7.0  8.0  5.0
g2  7.0  7.0  3.0  9.0  1.0  3.0  1.0  10.0  8.0  6.0
g3  7.0  7.0  4.0  8.0  1.0  4.0  1.0   9.0  8.0  6.0
g4  6.0  7.0  7.0  5.0  2.0  5.0  3.0   6.0  8.0  5.0
g5  1.0  3.0  7.0  1.0  3.0  4.0  4.0   1.0  4.0  2.0
g6  2.0  4.0  7.0  1.0  3.0  4.0  4.0   1.0  5.0  2.0

{'g5': 3.0, 'g6': 3.0}
