In [45]:
import numpy as np
import pandas as pd

# read the data
column_list_ratings = ["UserID", "MovieID", "Ratings","Timestamp"]
ratings_data = pd.read_csv('ratings.dat',sep='::',names = column_list_ratings, engine='python')

# get the number of unique users and movies
user_ids = ratings_data['UserID'].unique()
movie_ids = ratings_data['MovieID'].unique()

ratings_matrix = np.zeros((len(user_ids), len(movie_ids)), dtype = np.uint8)

for index, row in ratings_data.iterrows():
    user_id = row['UserID']
    movie_id = row['MovieID']
    rating = row['Ratings']
    user_idx = np.where(user_ids == user_id)[0][0]
    movie_idx = np.where(movie_ids == movie_id)[0][0]
    ratings_matrix[user_idx, movie_idx] = rating
ratings_matrix

array([[5, 3, 3, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [48]:
movie_id = 1377
user_ids_1377 = ratings_data[ratings_data['MovieID'] == movie_id]['UserID'].head(3).values
user_ids_1377
movie_idx = np.where(movie_ids == movie_id)[0][0]
ratings_1377 = ratings_matrix[user_ids_1377-1, movie_idx]
print(ratings_1377)
print(user_ids_1377)

[3 3 2]
[10 13 18]


In [49]:
# Calculate mean by movie
mean_ratings = np.zeros((num_movies,))
for j in range(num_movies):
    idx = np.where(ratings_matrix[:, j] != 0)
    mean_ratings[j] = np.mean(ratings_matrix[idx, j])
    
# Replace missing values with the mean rating for the given movie
for i in range(num_users):
    for j in range(num_movies):
        if ratings_matrix[i, j] == 0:
            ratings_matrix[i, j] = mean_ratings[j]
            
# Subtract mean from original ratings and divide it by the standard deviation
std_ratings = np.zeros((num_users, num_movies))
for j in range(num_movies):
    idx = np.where(ratings_matrix[:, j] != 0)
    std_ratings[idx, j] = (ratings_matrix[idx, j] - mean_ratings[j]) / np.std(ratings_matrix[idx, j])
    
# Replace NaN values with 0
std_ratings = np.nan_to_num(std_ratings, nan=0)

  std_ratings[idx, j] = (ratings_matrix[idx, j] - mean_ratings[j]) / np.std(ratings_matrix[idx, j])


In [50]:
# Compute the SVD of the normalised matrix
u, s, vh = np.linalg.svd(std_ratings)

# Print the shapes
print("U shape: ", u.shape)
print("S shape: ", s.shape)
print("V shape: ", vh.shape)

U shape:  (6040, 6040)
S shape:  (3706,)
V shape:  (3706, 3706)


In [69]:
# Define the k values
k_values = [100, 1000, 2000, 3000]

# Loop over the k values
for k in k_values:
    # Reconstruct the rank-k matrix
    R_k = u[:, :k] @ np.diag(s[:k]) @ vh[:k, :]
    if k == 1000:
        Rk_1000 = R_k
    # Make predictions for the selected users and movie
    for user_id in [10,13,18]:
        user_idx = user_id - 1
        movie_idx = 1376  # index starts from 0
        
        # Get the original rating
        rating_original = ratings_data[(ratings_data["UserID"] == user_id) & 
                                        (ratings_data["MovieID"] == 1377)]["Ratings"].values[0]
        
        # Get the predicted rating
        rating_predicted = R_k[user_idx, movie_idx]
        
        # Print the original and predicted ratings
        print("For k = {}, User {}'s original rating: {}, predicted rating: {:.2f}".format(k, user_id, rating_original, rating_predicted))

For k = 100, User 10's original rating: 3, predicted rating: -0.63
For k = 100, User 13's original rating: 3, predicted rating: -0.67
For k = 100, User 18's original rating: 2, predicted rating: -1.21
For k = 1000, User 10's original rating: 3, predicted rating: -0.11
For k = 1000, User 13's original rating: 3, predicted rating: -0.67
For k = 1000, User 18's original rating: 2, predicted rating: -1.25
For k = 2000, User 10's original rating: 3, predicted rating: -0.47
For k = 2000, User 13's original rating: 3, predicted rating: -0.73
For k = 2000, User 18's original rating: 2, predicted rating: -0.88
For k = 3000, User 10's original rating: 3, predicted rating: -0.68
For k = 3000, User 13's original rating: 3, predicted rating: -0.71
For k = 3000, User 18's original rating: 2, predicted rating: -0.76


In [82]:
def top_movie_similarity(data, movie_id, top_n=5):
    # Get the column index for the given movie_id
    movie_idx = movie_id-1
    
    # Get the column vector for the given movie
    movie_vec = data[:, movie_idx]
    # Calculate the cosine similarity between the given movie and all other movies
    sim_scores = np.nan_to_num(np.dot(data.T, movie_vec) / (np.linalg.norm(data, axis=0) * np.linalg.norm(movie_vec)))
    
    # Sort the movies based on their similarity with the given movie
    top_indices = np.argsort(-sim_scores)[:top_n]
    top_scores = sim_scores[top_indices]
    
    return top_indices, top_scores
def print_similar_movies(movie_titles, top_indices):
    print('Most Similar movies: ')
    for i, idx in enumerate(top_indices):
        print(f'{i+1}. {movie_titles[idx]}')
        
# Find the top 5 similar movies for Batman Returns
movie_id = 1377
top_indices, top_scores = top_movie_similarity(Rk_1000, movie_id, top_n=5)

movie_titles = movies_data["Title"].tolist()

# Print the top 5 similar movies
print_similar_movies(movie_titles, top_indices)

Most Similar movies: 
1. Bastard Out of Carolina (1996)
2. Cyrano de Bergerac (1990)
3. Somewhere in Time (1980)
4. Xiu Xiu: The Sent-Down Girl (Tian yu) (1998)
5. That Old Feeling (1997)


  sim_scores = np.nan_to_num(np.dot(data.T, movie_vec) / (np.linalg.norm(data, axis=0) * np.linalg.norm(movie_vec)))
