In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def power_iteration(A, max_iter=100, tol=1e-6):
    """
    Power Iteration method to find the dominant singular vector of matrix A.

    Parameters:
        A (numpy.ndarray): Input matrix.
        max_iter (int): Maximum number of iterations (default: 100).
        tol (float): Tolerance for convergence (default: 1e-6).

    Returns:
        eig_val (float): The greatest (in absolute value) eigenvalue of A.
        v (numpy.ndarray): Nonzero vector which is a corresponding eigenvector of eig_val.
    """
    v = np.random.rand(A.shape[1])
    
    for _ in range(max_iter):
        Av = np.dot(A, v)  
        norm_av = 0
        for e in Av:
            norm_av += e ** 2
        v_new = Av / np.sqrt(norm_av)

        # Check for convergence
        if np.linalg.norm(v_new - v) < tol:
            break

        v = v_new

    # Compute the corresponding singular value
    eig_val = np.dot(np.dot(A, v), v) / np.dot(v, v)

    return eig_val, v

In [3]:
def svd_with_deflation(A, num_singular_values=1, max_iter=100, tol=1e-6):
    """
    Perform Singular Value Decomposition (SVD) using power iteration with deflation.

    Parameters:
    - A (numpy.ndarray): The input matrix for which the SVD will be computed.
    - num_singular_values (int): Number of singular values to compute (default: 1).
    - max_iter (int): Maximum number of iterations for power iteration (default: 100).
    - tol (float): Tolerance for convergence in power iteration (default: 1e-6).

    Returns:
    - tuple: A tuple containing the matrices U, Sigma, and V^T, representing the SVD of `A`.
    """

    ATA = np.dot(A.T, A)

    n = ATA.shape[0]
    eigen_values = np.zeros(n)
    eigen_vectors = np.zeros((n, n))

    for i in range(num_singular_values):
        # Use power iteration to find the dominant singular vector and value
        singular_value, singular_vector = power_iteration(ATA, max_iter, tol)

        # Store the computed singular vectors and values
        eigen_values[i] = singular_value
        eigen_vectors[:, i] = singular_vector

        # Deflation: Subtract the contribution of the computed singular vector and value
        outer_product = singular_value * np.outer(singular_vector, singular_vector)
        ATA = ATA - outer_product

    # Sort singular values and corresponding vectors in descending order
    sorted_indices = np.argsort(eigen_values)[::-1]
    Sigma = np.array(eigen_values)[sorted_indices]
    Vt = np.array(eigen_vectors)[sorted_indices]

    # Calculate Sigma
    Sigma = np.sqrt(eigen_values)

    # Assemble U, Sigma, and V^T
    U = A.dot(eigen_vectors) / Sigma
    V = eigen_vectors.T

    return U, Sigma, V

In [4]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

# Create user-item matrix
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Convert the user-item matrix to a NumPy array
A = user_item_matrix.to_numpy()

# Perform SVD with deflation
U, Sigma, Vt = svd_with_deflation(A, num_singular_values=50, max_iter=100, tol=1e-3)

  U = A.dot(eigen_vectors) / Sigma


In [16]:
k = 50
U = U[:, :k]
Sigma = Sigma[:k]
Vt = Vt[:k, :]

In [17]:
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm_vector1 * norm_vector2)
    return similarity

In [18]:
# Reconstruct the user-item matrix
reconstructed_matrix = U.dot(np.diag(Sigma)).dot(Vt)

user_id_to_recommend = 111
user_similarity = cosine_similarity(reconstructed_matrix, reconstructed_matrix[user_id_to_recommend])

# Recommend top N movies based on cosine similarity
top_n = 20
recommendations = np.argsort(user_similarity)[::-1][:top_n]

print("Recommended movies for user {}: {}".format(user_id_to_recommend, recommendations))

Recommended movies for user 111: [413 304 598 379 572 248  67 560 609 273  17 479 447 473  61  90 482 433
 329 589]


In [19]:
# Display recommended movies
recommended_movie_ids = user_item_matrix.columns[recommendations]
recommended_movies_info = movies[movies['movieId'].isin(recommended_movie_ids)]
recommended_movies_info[['movieId', 'title', 'genres']]

Unnamed: 0,movieId,title,genres
17,18,Four Rooms (1995),Comedy
61,69,Friday (1995),Comedy
67,75,Big Bully (1996),Comedy|Drama
90,102,Mr. Wrong (1996),Comedy
248,287,Nina Takes a Lover (1994),Comedy|Romance
273,314,"Secret of Roan Inish, The (1994)",Children|Drama|Fantasy|Mystery
304,346,Backbeat (1993),Drama|Musical
329,371,"Paper, The (1994)",Comedy|Drama
379,435,Coneheads (1993),Comedy|Sci-Fi
413,475,In the Name of the Father (1993),Drama
