In [1]:
from tensorflow import keras
from numpy import loadtxt
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
df = pd.read_csv("ml-latest-small/ratings.csv")
df_small = df.head(900)


In [3]:
df.iloc[:,:10]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
df.shape

(100836, 4)

In [6]:

matrix = df.pivot(index='userId', columns='movieId', values='rating')

matrix_array = matrix.to_numpy()

matrix_array[matrix_array == 0] = np.nan



In [7]:
import pandas as pd
movies_df = pd.read_csv("ml-latest-small/movies.csv")

movies_df.head(8)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children


In [8]:
movies_df.shape

(9742, 3)

In [9]:
import pandas as pd

# Load ratings and movies data
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")
ratings_df.head(8)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176


In [10]:

def factorize(A, k):
    """Factorize the matrix A into W and H"""
    # Randomly initialize W and H
    W = tf.Variable(tf.random.normal((A.shape[0], k), dtype=tf.float32))
    H = tf.Variable(tf.random.normal((k, A.shape[1]), dtype=tf.float32))
    # Define the optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    
    # Train the model
    for i in range(1000):
        with tf.GradientTape() as tape:
            # Compute the loss
            diff_matrix = tf.matmul(W, H) - A
            mask = ~tf.math.is_nan(A)
            masked_diff = tf.boolean_mask(diff_matrix, mask)
            loss = tf.norm(masked_diff)
        
        # Compute gradients
        gradients = tape.gradient(loss, [W, H])
        
        # Update the parameters
        optimizer.apply_gradients(zip(gradients, [W, H]))
        
    return W, H, loss


In [11]:
k = 10 
W, H, loss = factorize(matrix_array, k)


In [12]:
import numpy as np
from sklearn.metrics import mean_squared_error

def calculate_rmse_all_users_movies(W, H, observed_ratings):
    # Get the predicted ratings matrix
    predicted_ratings = np.dot(W, H)
    
    # Remove NaN values from the observed ratings and the corresponding predicted ratings
    observed_ratings_non_nan = observed_ratings[~np.isnan(observed_ratings)]
    predicted_ratings_non_nan = predicted_ratings[~np.isnan(observed_ratings)]
    
    # Calculate the RMSE
    rmse = np.sqrt(mean_squared_error(observed_ratings_non_nan, predicted_ratings_non_nan))
    return rmse


# Assuming matrix_array is the observed ratings matrix
rmse = calculate_rmse_all_users_movies(W, H, matrix_array)
print("RMSE for all users and movies:", rmse)


RMSE for all users and movies: 0.4946889826920509


In [13]:
def get_top_predicted_movies(user_id, W, H, movies_df, N=10):
    user_index = user_id - 1  # Users are 1-indexed in the ratings DataFrame

    # Get the predicted ratings for the user
    user_ratings = np.dot(W[user_index], H)

    # Sort the movie ratings in descending order and get the top N movie indices
    top_movie_indices = np.argsort(user_ratings)[::-1][:N]

    # Get the movie IDs, titles, and predicted ratings for the top N movies
    top_movies = movies_df.iloc[top_movie_indices][["movieId", "title"]]
    predicted_ratings = user_ratings[top_movie_indices]

    # Create a DataFrame to display the results
    top_movies_with_ratings = pd.DataFrame({
        "movieId": top_movies["movieId"],
        "title": top_movies["title"]
    })

    # Print the DataFrame
    print(f"Top {N} predicted movies for user {user_id}")
    print(top_movies_with_ratings)

# Example usage:
user_id = 27  # Specify the user ID for which you want to generate recommendations
get_top_predicted_movies(user_id, W, H, movies_df)


Top 10 predicted movies for user 27
      movieId                                              title
3744     5214                                    Oh, God! (1977)
2020     2692                   Run Lola Run (Lola rennt) (1998)
8791   129737                         Unfinished Business (2015)
1713     2302                             My Cousin Vinny (1992)
7581    86237                                 Connections (1978)
8116   101025                       Jack the Giant Slayer (2013)
2781     3720                                    Sunshine (1999)
1536     2071                      And the Band Played On (1993)
8452   112183  Birdman: Or (The Unexpected Virtue of Ignoranc...
1547     2083                 Muppet Christmas Carol, The (1992)


In [14]:
import numpy as np
from sklearn.metrics import mean_squared_error


def calculate_rmse(predicted_ratings, observed_ratings):
    """
    Calculate the Root Mean Squared Error (RMSE) between predicted and observed ratings.

    Parameters:
    predicted_ratings (numpy.ndarray): Predicted ratings.
    observed_ratings (numpy.ndarray): Observed ratings.

    Returns:
    float: RMSE between predicted and observed ratings.
    """
    # Remove NaN values from the observed ratings and the corresponding predicted ratings
    observed_ratings_non_nan = observed_ratings[~np.isnan(observed_ratings)]
    predicted_ratings_non_nan = predicted_ratings[~np.isnan(observed_ratings)]
    
    # Check if there are non-NaN values in both arrays
    if len(observed_ratings_non_nan) == 0 or len(predicted_ratings_non_nan) == 0:
        return np.nan  # Return NaN if either array is empty
    
    # Calculate the RMSE
    rmse = np.sqrt(mean_squared_error(observed_ratings_non_nan, predicted_ratings_non_nan))
    return rmse



In [16]:
def calculate_mse(predicted_ratings, original_ratings):
    """
    Calculate the Mean Squared Error (MSE) between predicted and original ratings.

    Parameters:
    predicted_ratings (numpy.ndarray): Predicted ratings.
    original_ratings (numpy.ndarray): Original ratings.

    Returns:
    float: MSE between predicted and original ratings.
    """
    # Remove NaN values from the original ratings and the corresponding predicted ratings
    original_ratings_non_nan = original_ratings[~np.isnan(original_ratings)]
    predicted_ratings_non_nan = predicted_ratings[~np.isnan(original_ratings)]

    # Calculate the MSE
    mse = np.mean((original_ratings_non_nan - predicted_ratings_non_nan) ** 2)
    return mse

In [15]:
import numpy as np
from sklearn.metrics import mean_squared_error

def calculate_rmse(predicted_ratings, observed_ratings):
    
    # Remove NaN values from the observed ratings and the corresponding predicted ratings
    observed_ratings_non_nan = observed_ratings[~np.isnan(observed_ratings)]
    predicted_ratings_non_nan = predicted_ratings[~np.isnan(observed_ratings)]
    
    # Check if there are non-NaN values in both arrays
    if len(observed_ratings_non_nan) == 0 or len(predicted_ratings_non_nan) == 0:
        return np.nan  # Return NaN if either array is empty
    
    # Calculate the RMSE
    rmse = np.sqrt(mean_squared_error(observed_ratings_non_nan, predicted_ratings_non_nan))
    return rmse

def make_nan_percentage(matrix_array, percentage=0.1):
    
    num_rows, num_cols = matrix_array.shape
    num_non_nan_values = int(np.ceil(percentage * np.sum(~np.isnan(matrix_array))))
    non_nan_indices = np.argwhere(~np.isnan(matrix_array))
    np.random.shuffle(non_nan_indices)
    nan_indices = non_nan_indices[:num_non_nan_values]
    matrix_array_with_nans = np.copy(matrix_array)
    for idx in nan_indices:
        matrix_array_with_nans[tuple(idx)] = np.nan
    return matrix_array_with_nans, nan_indices

# Example usage:
# Assume matrix_array is your original matrix
matrix_array_with_nans, nan_indices = make_nan_percentage(matrix_array)

# Now you can use matrix_array_with_nans for training and nan_indices for testing
# Train your model using matrix_array_with_nans
k = 10 
W, H, loss = factorize(matrix_array_with_nans, k)


In [22]:
matrix_array_with_nans

array([[4. , nan, 4. , ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, nan, nan]])

In [17]:
import numpy as np
from sklearn.metrics import mean_squared_error

def calculate_rmse(predicted_ratings, observed_ratings):
    # Remove NaN values from the observed ratings and the corresponding predicted ratings
    observed_ratings_non_nan = observed_ratings[~np.isnan(observed_ratings)]
    predicted_ratings_non_nan = predicted_ratings[~np.isnan(observed_ratings)]
    
    # Calculate the RMSE
    rmse = np.sqrt(mean_squared_error(observed_ratings_non_nan, predicted_ratings_non_nan))
    return rmse


In [18]:
def predict_movie_ratings(user, df, W, H):
    idx = df.index.get_loc(user)
    user_ratings = df.iloc[idx]
    user_ratings = user_ratings.dropna()
    user_ratings = user_ratings.to_frame().T
    user_ratings = user_ratings.reindex(columns=df.columns, fill_value=float('nan'))
    user_ratings = user_ratings.to_numpy()

    predicted_ratings = tf.matmul(W, H).numpy()
    predicted_ratings_user = predicted_ratings[idx].flatten()
    
    # Calculate RMSE
    rmse = calculate_rmse(predicted_ratings_user, user_ratings.flatten())
    
    return pd.DataFrame({"Observed": user_ratings.flatten(), "Predicted": predicted_ratings_user}), rmse


In [19]:
df_ob_predict, rmse = predict_movie_ratings(1, matrix, W, H)
rmse


0.6442465141746468

In [None]:
df_ob_predict.head(30)
#These are the observed and predicted ratings of movies in the dataset. 

Unnamed: 0,Observed,Predicted
0,4.0,5.026918
1,,4.50973
2,4.0,4.184297
3,,3.018063
4,,4.010834
5,4.0,4.620251
6,,3.565875
7,,4.698216
8,,2.770305
9,,4.376901
