In [133]:
import numpy as np
import pandas as pd

# Read in the data
column_list_ratings = ["UserID", "MovieID", "Ratings","Timestamp"]
ratings_data = pd.read_csv('ratings.dat', sep='::', names=column_list_ratings, engine='python')
column_list_movies = ["MovieID","Title","Genres"]
movies_data = pd.read_csv('movies.dat', sep='::', names=column_list_movies, engine='python', encoding='latin-1')
column_list_users = ["UserID","Gender","Age","Occupation","Zixp-code"]
user_data = pd.read_csv("users.dat", sep="::", names=column_list_users, engine="python")

# Extracting all the unique iser_ids and movie_ids from the data
user_ids = ratings_data['UserID'].unique()
movie_ids = ratings_data['MovieID'].unique()

#Rating matrix filled with zero's with dimension of user_ids, movie_ids
ratings_matrix = np.zeros((len(user_ids), len(movie_ids)), dtype = np.uint8)

# Populating the rating matrix with the rating given by a user to a movie
for index, row in ratings_data.iterrows():
    user_id = row['UserID']
    movie_id = row['MovieID']
    rating = row['Ratings']
    user_idx = np.where(user_ids == user_id)[0][0]
    movie_idx = np.where(movie_ids == movie_id)[0][0]
    ratings_matrix[user_idx, movie_idx] = rating

# Printing the shape of ratings_matrix
print(ratings_matrix.shape)

# Printing the ratings_matrix
print(ratings_matrix)

(6040, 3706)
[[5 3 3 ... 0 0 0]
 [5 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 3 4 ... 0 0 0]
 [4 0 0 ... 0 0 0]]


In [128]:
movie_id = 1377
user_ids_1377 = ratings_data[ratings_data['MovieID'] == movie_id]['UserID'].head(3).values

movie_idx = np.where(movie_ids == movie_id)[0][0]

for user_1 in user_ids_1377:
    ratings_1377 = ratings_matrix[user_1-1, movie_idx]
    print("User",user_1, "Rated the movie: ",ratings_1377 )

User 10 Rated the movie:  3
User 13 Rated the movie:  3
User 18 Rated the movie:  2


In [49]:
# Calculate mean by movie
mean_ratings = np.zeros((num_movies,))
for j in range(num_movies):
    idx = np.where(ratings_matrix[:, j] != 0)
    mean_ratings[j] = np.mean(ratings_matrix[idx, j])
    
# Replace missing values with the mean rating for the given movie
for i in range(num_users):
    for j in range(num_movies):
        if ratings_matrix[i, j] == 0:
            ratings_matrix[i, j] = mean_ratings[j]
            
# Subtract mean from original ratings and divide it by the standard deviation
std_ratings = np.zeros((num_users, num_movies))
for j in range(num_movies):
    idx = np.where(ratings_matrix[:, j] != 0)
    std_ratings[idx, j] = (ratings_matrix[idx, j] - mean_ratings[j]) / np.std(ratings_matrix[idx, j])
    
# Replace NaN values with 0
std_ratings = np.nan_to_num(std_ratings, nan=0)

  std_ratings[idx, j] = (ratings_matrix[idx, j] - mean_ratings[j]) / np.std(ratings_matrix[idx, j])


In [50]:
# Compute the SVD of the normalised matrix
u, s, vh = np.linalg.svd(std_ratings)

# Print the shapes
print("U shape: ", u.shape)
print("S shape: ", s.shape)
print("V shape: ", vh.shape)

U shape:  (6040, 6040)
S shape:  (3706,)
V shape:  (3706, 3706)


In [130]:
# Define the k values
k_values = [100, 1000, 2000, 3000]

# Loop over the k values
for k in k_values:
    # Reconstruct the rank-k matrix
    R_k = u[:, :k] @ np.diag(s[:k]) @ vh[:k, :]
    if k == 1000:
        Rk_1000 = R_k
    # Make predictions for the selected users and movie
    for user_id in [10,13,18]:
        user_idx = user_id
        movie_idx = 1376  # index starts from 0
        
        # Get the original rating
        rating_original = ratings_data[(ratings_data["UserID"] == user_id) & 
                                        (ratings_data["MovieID"] == 1377)]["Ratings"].values[0]
        
        # Get the predicted rating
        rating_predicted = R_k[user_idx, movie_idx]
        
        # Print the original and predicted ratings
        print("For k = {}, User {}'s original rating: {}, predicted rating: {:.2f}".format(k, user_id, rating_original, rating_predicted))

For k = 100, User 10's original rating: 3, predicted rating: -0.82
For k = 100, User 13's original rating: 3, predicted rating: -0.70
For k = 100, User 18's original rating: 2, predicted rating: -0.77
For k = 1000, User 10's original rating: 3, predicted rating: 0.11
For k = 1000, User 13's original rating: 3, predicted rating: -0.70
For k = 1000, User 18's original rating: 2, predicted rating: 0.20
For k = 2000, User 10's original rating: 3, predicted rating: -0.38
For k = 2000, User 13's original rating: 3, predicted rating: -0.62
For k = 2000, User 18's original rating: 2, predicted rating: -0.71
For k = 3000, User 10's original rating: 3, predicted rating: -0.71
For k = 3000, User 13's original rating: 3, predicted rating: -0.64
For k = 3000, User 18's original rating: 2, predicted rating: -0.70


In [94]:
def top_movie_similarity(data, movie_id, top_n=5):
    # Get the column index for the given movie_id
    movie_idx = movie_id-1
    
    # Get the column vector for the given movie
    movie_vec = data[:, movie_idx]
    # Calculate the cosine similarity between the given movie and all other movies
    sim_scores = np.nan_to_num(np.dot(data.T, movie_vec) / (np.linalg.norm(data, axis=0) * np.linalg.norm(movie_vec)))
    
    # Sort the movies based on their similarity with the given movie
    top_indices = np.argsort(-sim_scores)[:top_n]
    top_scores = sim_scores[top_indices]
    
    return top_indices, top_scores
def print_similar_movies(movie_titles, top_indices):
    print('Most Similar movies: ')
    for i, idx in enumerate(top_indices):
        print(f'{i+1}. {movie_titles[idx]}')
        
# Find the top 5 similar movies for Batman Returns
movie_id = 1377
top_n = 5
top_indices, top_scores = top_movie_similarity(Rk_1000, movie_id, top_n)

movie_titles = movies_data["Title"].tolist()

# Print the top 5 similar movies
print_similar_movies(movie_titles, top_indices)

Most Similar movies: 
1. Bastard Out of Carolina (1996)
2. Cyrano de Bergerac (1990)
3. Somewhere in Time (1980)
4. Xiu Xiu: The Sent-Down Girl (Tian yu) (1998)
5. That Old Feeling (1997)


  sim_scores = np.nan_to_num(np.dot(data.T, movie_vec) / (np.linalg.norm(data, axis=0) * np.linalg.norm(movie_vec)))


In [146]:
# Calculate cosine similarity between user 5954 and all other users
def top_user_similarity(data, user_id):
    # Get row of user 5954
    user_row = data[user_id-1]
    # Calculate cosine similarity between user 5954 and all other users
    sim_scores = np.dot(data, user_row)/(np.linalg.norm(data, axis=1)*np.linalg.norm(user_row))
    # Sort similarity scores in descending order
    sorted_scores = np.argsort(sim_scores)[::-1]
    # Return the most similar user
    return sorted_scores[1]

# Find top movie recommendations for user 5954
def get_movie_recommendations(data, user_id, top_n=5):
    # Find most similar user
    similar_user = top_user_similarity(data, user_id)
    # Get rows of user 5954 and most similar user
    #print(user)
    user_row = data[user_id-1]
    similar_user_row = data[similar_user]
    # Find movies that similar user rated highly but user 5954 has not seen
    unseen_movies = np.where(user_row == 0)[0]
    similar_user_ratings = similar_user_row[unseen_movies]
    # Sort unseen movies by rating from similar user in descending order
    sorted_movies = np.argsort(similar_user_ratings)[::-1]
    # Return top n movie recommendations
    return unseen_movies[sorted_movies][:top_n]

# Get movie recommendations for user 5954
user_id = 5954
#user = top_user_similarity(Rk_1000, user_id)
recommendations = get_movie_recommendations(Rk_1000, user_id)
#print(user)
for movie in recommendations:
    movie_id = movie+1
    print(movies_data[movies_data['MovieID']==movie_id]['Title'].values[0])

4176
Angel Heart (1987)
Anguish (Angustia) (1986)
Saludos Amigos (1943)
Dinosaur (2000)
Road Trip (2000)


In [149]:
# Reconstructing four rank-k rating matrices
k_values = [100, 1000, 2000, 3000]
R_k = []

for k in k_values:
    # Construct the rank-k matrix using the formula Rk = Uk * Sk * V(k)T
    Sk = np.diag(s[:k])
    Uk = u[:, :k]
    V_kT = vh[:k, :]
    R_k.append(Uk @ Sk @ V_kT)

# Make predictions for the 3 users selected in Question 1 for movie with ID 1377 (Batman Returns)
user_ids = [320, 230, 450]
movie_id = 1377
for i, user_id in enumerate(user_ids):
    original_rating = ratings_matrix[user_id - 1][movie_id - 1]
    for j, R in enumerate(R_k):
        predicted_rating = R[user_id - 1][movie_id - 1]
        print(f"User {i + 1}, k={k_values[j]} - Original Rating: {original_rating:.2f}, Predicted Rating: {predicted_rating:.2f}")

User 1, k=100 - Original Rating: 0.00, Predicted Rating: -0.63
User 1, k=1000 - Original Rating: 0.00, Predicted Rating: -0.73
User 1, k=2000 - Original Rating: 0.00, Predicted Rating: -0.67
User 1, k=3000 - Original Rating: 0.00, Predicted Rating: -0.66
User 2, k=100 - Original Rating: 0.00, Predicted Rating: -0.49
User 2, k=1000 - Original Rating: 0.00, Predicted Rating: -0.13
User 2, k=2000 - Original Rating: 0.00, Predicted Rating: -1.02
User 2, k=3000 - Original Rating: 0.00, Predicted Rating: -0.59
User 3, k=100 - Original Rating: 0.00, Predicted Rating: -0.70
User 3, k=1000 - Original Rating: 0.00, Predicted Rating: -0.88
User 3, k=2000 - Original Rating: 0.00, Predicted Rating: -0.94
User 3, k=3000 - Original Rating: 0.00, Predicted Rating: -0.71


In [152]:
from sklearn.metrics.pairwise import cosine_similarity

def top_movie_similarity(data, movie_id, top_n=5):
    # Replace NaN values with 0
    data = np.nan_to_num(data)
    # Calculate cosine similarity
    similarities = cosine_similarity(data)
    # Get indices of the top_n most similar movies
    similar_indices = similarities[movie_id - 1].argsort()[-top_n-1:-1][::-1]
    return similar_indices

def print_similar_movies(movie_titles, top_indices):
    print('Most Similar movies: ')
    for i, idx in enumerate(top_indices):
        print(f"{i+1}. {movie_titles[idx]}")
        
# Print the top 5 movies for Batman Returns
movie_id = 1376
top_n = 5
similar_indices = top_movie_similarity(R_k[1], movie_id, top_n=top_n)
print(similar_indices)

[ 778 3193 2738 5373 5866]


In [153]:
# Compute column averages
means = np.sum(ratings_matrix, axis=0) / np.count_nonzero(ratings_matrix, axis=0)

# Replace missing ratings with column means
filled_ratings_matrix = np.where(ratings_matrix == 0, means[np.newaxis, :], ratings_matrix)

# Subtract column means from ratings matrix
mean_centered_ratings_matrix = filled_ratings_matrix - means[np.newaxis, :]

mean_centered_ratings_matrix

array([[ 0.60927536, -0.4647619 , -1.15408805, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.60927536,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.4647619 , -0.15408805, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39072464,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [157]:
ratings_matrix = (ratings_matrix - ratings_matrix.mean(axis = 0))/ratings_matrix.std(axis = 0) #normalize the data


# In[27]:


ratings_matrix[np.isnan(ratings_matrix)] = 0 #replace nan values with 0.


# In[274]:


ratings_matrix

array([[ 1.84748076,  2.64189476,  1.96194218, ..., -0.01286819,
        -0.01286819, -0.01286819],
       [ 1.84748076, -0.2948044 , -0.33489154, ..., -0.01286819,
        -0.01286819, -0.01286819],
       [-0.61843982, -0.2948044 , -0.33489154, ..., -0.01286819,
        -0.01286819, -0.01286819],
       ...,
       [-0.61843982, -0.2948044 , -0.33489154, ..., -0.01286819,
        -0.01286819, -0.01286819],
       [-0.61843982,  2.64189476,  2.72755342, ..., -0.01286819,
        -0.01286819, -0.01286819],
       [ 1.35429665, -0.2948044 , -0.33489154, ..., -0.01286819,
        -0.01286819, -0.01286819]])